[tracker/tracker-0.6] Use libenca to check for Cyrillic encodings for ID3v1 tags

From: Martyn James Russell <mr src gnome org>
To: svn-commits-list gnome org
Subject: [tracker/tracker-0.6] Use libenca to check for Cyrillic encodings for ID3v1 tags
Date: Wed, 17 Jun 2009 07:28:45 -0400 (EDT)
commit 9e40472245605667c652b62e5c96732434b1c9e3
Author: Philip Van Hoof <philip codeminded be>
Date:   Wed Jun 17 12:20:59 2009 +0100

    Use libenca to check for Cyrillic encodings for ID3v1 tags
    
    This partially fixes NB#119318. Other tags, which can potentially overwrite
    the ID3v1 tag data we initially retrieved, also need these checks and
    conversions.

 configure.ac                              |   12 ++
 src/tracker-extract/Makefile.am           |    9 ++
 src/tracker-extract/tracker-extract-mp3.c |  164 ++++++++++++++++++-----------
 3 files changed, 124 insertions(+), 61 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 48854b7..ebda38e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -203,6 +203,17 @@ AC_SUBST(GCONF_LIBS)
 
 AM_CONDITIONAL(HAVE_GCONF, test "$have_gconf" = "yes")
 
+# Check for enca, used to detect Cyrillic (e.g. Russian) character encodings in MP3 tags
+PKG_CHECK_MODULES(ENCA, [enca >= 1.9], have_enca=yes, have_enca=no)
+AC_SUBST(ENCA_CFLAGS)
+AC_SUBST(ENCA_LIBS)
+
+AM_CONDITIONAL(HAVE_ENCA, test "$have_enca" = "yes")
+
+if test "x$have_enca" = "xyes"; then
+  AC_DEFINE(HAVE_ENCA, 1, [Enca language detection aid])
+fi
+
 # Check for Raptor
 PKG_CHECK_MODULES(RAPTOR, [raptor >= 1.4.17], have_raptor=yes, have_raptor=no)
 AC_SUBST(RAPTOR_CFLAGS)
@@ -1471,6 +1482,7 @@ Build Configuration:
 
 	Enable unit tests:			$have_unit_tests
 	Enable unac accent stripper:	  	$have_unac
+	Enable enca encoding detection:	  	$have_enca
 	Enable full text search:               	$enable_fts
 	Enable detailed metadata:              	$enable_detailed_metadata
 
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index aff57f5..72247a9 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -28,6 +28,10 @@ INCLUDES = 								\
 	$(XINE_CFLAGS) 							\
 	$(TOTEM_PL_PARSER_CFLAGS)
 
+if HAVE_ENCA
+INCLUDES += $(ENCA_CFLAGS)
+endif
+
 modules_LTLIBRARIES = 							\
 	libextract-abw.la 						\
 	libextract-mp3.la				 		\
@@ -125,6 +129,11 @@ libextract_mp3_la_SOURCES = tracker-extract-mp3.c $(albumart_sources) $(escape_s
 libextract_mp3_la_LDFLAGS = $(module_flags) $(albumart_flags)
 libextract_mp3_la_LIBADD = $(albumart_libs) $(GLIB2_LIBS) $(GCOV_LIBS)
 
+if HAVE_ENCA
+libextract_mp3_la_LIBADD += $(ENCA_LIBS)
+#libextract_mp3_la_LIBADD += -L/usr/lib/libenca.a
+endif
+
 # Vorbis (OGG)
 libextract_vorbis_la_SOURCES = tracker-extract-vorbis.c $(escape_sources)
 libextract_vorbis_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index 1bbbd25..514eed4 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -40,6 +40,10 @@
 #include <sys/mman.h>
 #endif /* G_OS_WIN32 */
 
+#ifdef HAVE_ENCA
+#include <enca.h>
+#endif
+
 #include <libtracker-common/tracker-file-utils.h>
 #include <libtracker-common/tracker-utils.h>
 
@@ -441,7 +445,7 @@ un_unsync (const unsigned char *source,
 		*dest = source[offset];
 
 		if ((source[offset] == 0xFF) && 
-		    (source[offset+1] == 0x00)) {
+		    (source[offset + 1] == 0x00)) {
 			offset++;
 			new_size--;
 		}
@@ -452,12 +456,52 @@ un_unsync (const unsigned char *source,
 	*dest_size = new_size;
 }
 
+static char*
+get_encoding (const char *data, size_t size)
+{
+	gchar *encoding_string = NULL;
+	/* Guess the charset of @data; the result is allocated with g_strdup(). */
+#ifdef HAVE_ENCA
+	const char **langs;
+	size_t s, i;
+
+	langs = enca_get_languages (&s);
+	/* Try one analyser per language; stop at the first known charset. */
+	for (i = 0; i < s && !encoding_string; i++) {
+		EncaAnalyser analyser;
+		EncaEncoding encoding;
+
+		analyser = enca_analyser_alloc (langs[i]);
+		encoding = enca_analyse_const (analyser, data, size);
+
+		if (enca_charset_is_known (encoding.charset)) {
+			encoding_string = g_strdup (enca_charset_name (encoding.charset, 
+								       ENCA_NAME_STYLE_ICONV));
+		}
+
+		enca_analyser_free (analyser);
+	}
+
+	free (langs);
+#endif
+	/* Nothing detected, or built without enca: fall back to ISO-8859-1. */
+	if (!encoding_string) {
+		encoding_string = g_strdup ("ISO-8859-1");
+	}
+
+	return encoding_string;
+}
+
 static gboolean
 get_id3 (const gchar *data,
 	 size_t       size,
 	 id3tag      *id3)
 {
+#ifdef HAVE_ENCA
+	GString *s;
+#endif /* HAVE_ENCA */
 	const gchar *pos;
+	gchar *encoding = NULL;
 	gchar buf[5];
 
 	if (!data) {
@@ -474,55 +518,53 @@ get_id3 (const gchar *data,
 		return FALSE;
 	}
 
-	pos += 3;
+#ifdef HAVE_ENCA
+	/* Get the encoding for ALL the data we are extracting here */
+	s = g_string_new ("");
+	g_string_append_len (s, pos, 30);
+	g_string_append_len (s, pos + 30, 30);
+	g_string_append_len (s, pos + 60, 30);
+
+	encoding = get_encoding (s->str, 90);
+	g_string_free (s, TRUE);
+#else  /* HAVE_ENCA */
+	encoding = get_encoding (NULL, 0);
+#endif /* HAVE_ENCA */
 
-	id3->title = g_convert (pos, 30,
-				"UTF-8",
-				"ISO-8859-1",
-				NULL, NULL, NULL);
+	/* Now convert all the data separately */
+	pos += 3;
+	id3->title = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
 
 	pos += 30;
-	id3->artist = g_convert (pos, 30,
-				 "UTF-8",
-				 "ISO-8859-1",
-				 NULL, NULL, NULL);
+	id3->artist = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
+
 	pos += 30;
-	id3->album = g_convert (pos, 30,
-				"UTF-8",
-				"ISO-8859-1",
-				NULL, NULL, NULL);
+	id3->album = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
+
 	pos += 30;
-	id3->year = g_convert (pos, 4,
-			       "UTF-8",
-			       "ISO-8859-1",
-			       NULL, NULL, NULL);
+	id3->year = g_convert (pos, 4, "UTF-8", encoding, NULL, NULL, NULL);
 
 	pos += 4;
 
-	if (pos[28] != (guint)0) {
-		id3->comment = g_convert (pos, 30,
-					  "UTF-8",
-					  "ISO-8859-1",
-					  NULL, NULL, NULL);
-
+	if (pos[28] != 0) {
+		id3->comment = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
 		id3->trackno = NULL;
 	} else {
-		id3->comment = g_convert (pos, 28,
-					  "UTF-8",
-					  "ISO-8859-1",
-					  NULL, NULL, NULL);
+		id3->comment = g_convert (pos, 28, "UTF-8", encoding, NULL, NULL, NULL);
+
 		snprintf (buf, 5, "%d", pos[29]);
-		id3->trackno = strdup(buf);
+		id3->trackno = g_strdup (buf);
 	}
 
 	pos += 30;
-
 	id3->genre = g_strdup (get_genre_name ((guint) pos[0]));
 
 	if (!id3->genre) {
 		id3->genre = g_strdup ("");
 	}
 
+	g_free (encoding);
+
 	return TRUE;
 }
 
@@ -833,28 +875,28 @@ get_id3v24_tags (const gchar *data,
 
 				switch (data[pos + 10]) {
 				case 0x00:
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = g_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
 							  "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				case 0x01 :
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = g_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
 							  "UTF-16",
 							  NULL, NULL, NULL);
 					break;
 				case 0x02 :
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = g_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
 							  "UTF-16BE",
 							  NULL, NULL, NULL);
 					break;
 				case 0x03 :
-					word = strndup (&data[pos+11], csize-1);
+					word = strndup (&data[pos + 11], csize - 1);
 					break;
 
 				default:
@@ -862,8 +904,8 @@ get_id3v24_tags (const gchar *data,
 					 * try to convert from
 					 * iso-8859-1
 					 */
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = g_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
 							  "ISO-8859-1",
 							  NULL, NULL, NULL);
@@ -939,21 +981,21 @@ get_id3v24_tags (const gchar *data,
 						  "ISO-8859-1",
 						  NULL, NULL, NULL);
 				break;
-			case 0x01 :
+			case 0x01:
 				word = g_convert (text,
 						  csize - offset,
 						  "UTF-8",
 						  "UTF-16",
 						  NULL, NULL, NULL);
 				break;
-			case 0x02 :
+			case 0x02:
 				word = g_convert (text,
 						  csize-offset,
 						  "UTF-8",
 						  "UTF-16BE",
 						  NULL, NULL, NULL);
 				break;
-			case 0x03 :
+			case 0x03:
 				word = g_strndup (text, csize - offset);
 				break;
 				
@@ -1091,28 +1133,28 @@ get_id3v23_tags (const gchar *data,
 
 				switch (data[pos + 10]) {
 				case 0x00:
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = g_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
 							  "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				case 0x01 :
-/* 					word = g_convert (&data[pos+11], */
-/* 							  csize-1, */
+/* 					word = g_convert (&data[pos + 11], */
+/* 							  csize - 1, */
 /* 							  "UTF-8", */
 /* 							  "UCS-2", */
 /* 							  NULL, NULL, NULL); */
-					word = ucs2_to_utf8 (&data[pos+11],
-							     csize-1);
+					word = ucs2_to_utf8 (&data[pos + 11],
+							     csize - 1);
 					break;
 				default:
 					/* Bad encoding byte,
 					 * try to convert from
 					 * iso-8859-1
 					 */
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = g_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
 							  "ISO-8859-1",
 							  NULL, NULL, NULL);
@@ -1230,11 +1272,11 @@ get_id3v23_tags (const gchar *data,
 			guint        offset;
 			gint         mime_len;
 
-			text_type =  data[pos +10];
-			mime      = &data[pos +11];
+			text_type =  data[pos + 10];
+			mime      = &data[pos + 11];
 			mime_len  = strlen (mime);
-			pic_type  =  data[pos +11 + mime_len + 1];
-			desc      = &data[pos +11 + mime_len + 1 + 1];
+			pic_type  =  data[pos + 11 + mime_len + 1];
+			desc      = &data[pos + 11 + mime_len + 1 + 1];
 			
 			if (pic_type == 3 || (pic_type == 0 && filedata->albumartsize == 0)) {
 				offset = pos + 11 + mime_len + 2 + strlen (desc) + 1;
@@ -1319,8 +1361,8 @@ get_id3v20_tags (const gchar *data,
 				 */
 				switch (data[pos + 6]) {
 				case 0x00:
-					word = g_convert (&data[pos+7],
-							  csize-1,
+					word = g_convert (&data[pos + 7],
+							  csize - 1,
 							  "UTF-8",
 							  "ISO-8859-1",
 							  NULL, NULL, NULL);
@@ -1331,16 +1373,16 @@ get_id3v20_tags (const gchar *data,
 /* 							  "UTF-8", */
 /* 							  "UCS-2", */
 /* 							  NULL, NULL, NULL); */
-					word = ucs2_to_utf8 (&data[pos+7],
-							     csize-1);
+					word = ucs2_to_utf8 (&data[pos + 7],
+							     csize - 1);
 					break;
 				default:
 					/* Bad encoding byte,
 					 * try to convert from
 					 * iso-8859-1
 					 */
-					word = g_convert (&data[pos+7],
-							  csize-1,
+					word = g_convert (&data[pos + 7],
+							  csize - 1,
 							  "UTF-8",
 							  "ISO-8859-1",
 							  NULL, NULL, NULL);
@@ -1696,7 +1738,7 @@ extract_mp3 (const gchar *filename,
 
 	close (fd);
 
-	if (buffer == NULL || buffer == (void*) -1) {
+	if (buffer == NULL || buffer == (void*) - 1) {
 		return;
 	}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]