[tracker] Ported enca encoding detection from 0.6 to master



commit 85bc8247adc6b2bbc7fcd071ca25de1401edfa37
Author: Philip Van Hoof <philip codeminded be>
Date:   Mon Jul 27 12:28:53 2009 +0200

    Ported enca encoding detection from 0.6 to master

 configure.ac                              |   11 +
 src/tracker-extract/Makefile.am           |    9 +
 src/tracker-extract/tracker-extract-mp3.c |  298 ++++++++++++++++++++--------
 3 files changed, 233 insertions(+), 85 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 013f889..5818cb5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -219,6 +219,17 @@ PKG_CHECK_MODULES(UUID, [uuid])
 AC_SUBST(UUID_CFLAGS)
 AC_SUBST(UUID_LIBS)
 
+# Check for enca, which detects the character encoding of (e.g. Russian/Cyrillic) text in mp3 tags
+PKG_CHECK_MODULES(ENCA, [enca >= 1.9], have_enca=yes, have_enca=no)
+AC_SUBST(ENCA_CFLAGS)
+AC_SUBST(ENCA_LIBS)
+
+AM_CONDITIONAL(HAVE_ENCA, test "$have_enca" = "yes")
+
+if test "x$have_enca" = "xyes"; then
+  AC_DEFINE(HAVE_ENCA, 1, [Enca language detection aid])
+fi
+
 # Check for Raptor
 PKG_CHECK_MODULES(RAPTOR, [raptor >= 1.4.18])
 AC_SUBST(RAPTOR_CFLAGS)
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 2b4627a..f9e48e1 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -32,6 +32,10 @@ if HAVE_STREAMANALYZER
 INCLUDES += $(STREAMANALYZER_CFLAGS) -DHAVE_STREAMANALYZER
 endif
 
+if HAVE_ENCA
+INCLUDES += $(ENCA_CFLAGS)
+endif
+
 modules_LTLIBRARIES = 							\
 	libextract-abw.la 						\
 	libextract-mp3.la				 		\
@@ -127,6 +131,11 @@ libextract_mp3_la_LDFLAGS = $(module_flags) $(albumart_flags)
 libextract_mp3_la_LIBADD = $(albumart_libs) $(GLIB2_LIBS) $(GCOV_LIBS) \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la
 
+if HAVE_ENCA
+libextract_mp3_la_LIBADD += $(ENCA_LIBS)
+#libextract_mp3_la_LIBADD += -L/usr/lib/libenca.a
+endif
+
 # Vorbis (OGG)
 libextract_vorbis_la_SOURCES = tracker-extract-vorbis.c $(escape_sources)
 libextract_vorbis_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index 162fbf5..e97c5ef 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -40,6 +40,10 @@
 #include <sys/mman.h>
 #endif /* G_OS_WIN32 */
 
+#ifdef HAVE_ENCA
+#include <enca.h>
+#endif
+
 #include <libtracker-common/tracker-file-utils.h>
 #include <libtracker-common/tracker-statement-list.h>
 #include <libtracker-common/tracker-ontology.h>
@@ -91,6 +95,7 @@ typedef struct {
 	gchar *comment;
 	gchar *trackno;
 	gchar *genre;
+	gchar *encoding;
 } id3tag;
 
 typedef struct {
@@ -459,7 +464,7 @@ un_unsync (const unsigned char *source,
 		*dest = source[offset];
 
 		if ((source[offset] == 0xFF) && 
-		    (source[offset+1] == 0x00)) {
+		    (source[offset + 1] == 0x00)) {
 			offset++;
 			new_size--;
 		}
@@ -470,15 +475,95 @@ un_unsync (const unsigned char *source,
 	*dest_size = new_size;
 }
 
-/* convert string from ISO-8859-1 to UTF-8 and strip leading and trailing whitespace */
-static gchar *
-convert_and_strip (const gchar *str,
-                   gssize       len)
+static char*
+get_encoding (const char *data, 
+	      gssize      size, 
+	      gboolean   *encoding_found)
+{
+	gchar *encoding = NULL;
+#ifdef HAVE_ENCA
+	const char **langs;
+	size_t s, i;
+#endif
+	/* Guess the charset of data (size bytes); the result is g_strdup()ed. */
+	if (encoding_found) {
+		*encoding_found = FALSE;
+	}
+	/* encoding_found may be NULL; it only becomes TRUE if enca knows the charset. */
+#ifdef HAVE_ENCA
+	/* Try every language enca offers and stop at the first known charset. */
+	langs = enca_get_languages (&s);
+
+	for (i = 0; i < s && !encoding; i++) {
+		EncaAnalyser analyser;
+		EncaEncoding eencoding;
+
+		analyser = enca_analyser_alloc (langs[i]);
+		eencoding = enca_analyse_const (analyser, data, size);
+
+		if (enca_charset_is_known (eencoding.charset)) {
+			if (encoding_found) {
+				*encoding_found = TRUE;
+			}
+			/* Keep our own copy of the iconv-style charset name. */
+			encoding = g_strdup (enca_charset_name (eencoding.charset, 
+								ENCA_NAME_STYLE_ICONV));
+		}
+
+		enca_analyser_free (analyser);
+	}
+
+	free (langs);
+#endif
+	/* Nothing detected, or built without enca: fall back to ISO-8859-1. */
+	if (!encoding) {
+		encoding = g_strdup ("ISO-8859-1");
+	}
+
+	return encoding;
+}
+
+static gchar*
+t_convert (const gchar  *str,
+           gssize        len,
+           const gchar  *to_codeset,
+           const gchar  *from_codeset,
+           gsize        *bytes_read,
+           gsize        *bytes_written,
+           GError      **error_out)
 {
-	return g_strstrip (g_convert (str, len,
-				      "UTF-8",
-				      "ISO-8859-1",
-				      NULL, NULL, NULL));
+	GError *error = NULL;
+	gchar *word;
+	/* g_convert() wrapper: on failure, retry once with a charset from get_encoding(). */
+	/* First attempt: use the codeset the caller asked for. */
+
+	word = g_convert (str,
+			  len,
+			  to_codeset,
+			  from_codeset,
+			  bytes_read, 
+			  bytes_written, 
+			  &error);
+
+	if (error) {
+		gchar *encoding;
+		/* The requested codeset failed: guess one from the bytes themselves. */
+		encoding = get_encoding (str, len, NULL);
+		g_free (word);
+		/* Only a failure of this retry is reported through error_out. */
+		word = g_convert (str,
+				  len,
+				  to_codeset,
+				  encoding,
+				  bytes_read, 
+				  bytes_written, 
+				  error_out);
+
+		g_free (encoding);
+		g_error_free (error);
+	}
+
+	return word;
 }
 
 static gboolean
@@ -486,8 +571,12 @@ get_id3 (const gchar *data,
 	 size_t       size,
 	 id3tag      *id3)
 {
+#ifdef HAVE_ENCA
+	GString *s;
+	gboolean encoding_was_found;
+#endif /* HAVE_ENCA */
+	gchar *encoding;
 	const gchar *pos;
-	gchar buf[5];
 
 	if (!data) {
 		return FALSE;
@@ -503,40 +592,69 @@ get_id3 (const gchar *data,
 		return FALSE;
 	}
 
+	/* Now convert all the data separately */
 	pos += 3;
 
-	id3->title = convert_and_strip (pos, 30);
+	/* We don't use our magic t_convert here because we have a better way
+	 * to collect a bit more data before we let enca loose on it for v1. */
+
+#ifdef HAVE_ENCA
+	/* Get the encoding for ALL the data we are extracting here */
+	s = g_string_new_len (pos, 30);
+	g_string_append_len (s, pos + 30, 30);
+	g_string_append_len (s, pos + 60, 30);
+
+	encoding = get_encoding (s->str, 90, &encoding_was_found);
+
+	if (encoding_was_found) {
+		id3->encoding = encoding;
+	}
+
+	g_string_free (s, TRUE);
+#else  /* HAVE_ENCA */
+	encoding = get_encoding (NULL, 0, NULL);
+#endif /* HAVE_ENCA */
+
+	id3->title = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
 
 	pos += 30;
-	id3->artist = convert_and_strip (pos, 30);
+	id3->artist = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
+
 	pos += 30;
-	id3->album = convert_and_strip (pos, 30);
+	id3->album = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
+
 	pos += 30;
-	id3->year = convert_and_strip (pos, 4);
+	id3->year = g_convert (pos, 4, "UTF-8", encoding, NULL, NULL, NULL);
 
 	pos += 4;
 
-	if (pos[28] != (guint)0) {
-		id3->comment = convert_and_strip (pos, 30);
-
+	if (pos[28] != 0) {
+		id3->comment = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
 		id3->trackno = NULL;
 	} else {
-		id3->comment = convert_and_strip (pos, 28);
+		gchar buf[5];
+
+		id3->comment = g_convert (pos, 28, "UTF-8", encoding, NULL, NULL, NULL);
+
 		snprintf (buf, 5, "%d", pos[29]);
-		id3->trackno = strdup(buf);
+		id3->trackno = g_strdup (buf);
 	}
 
 	pos += 30;
-
 	id3->genre = g_strdup (get_genre_name ((guint) pos[0]));
 
 	if (!id3->genre) {
 		id3->genre = g_strdup ("");
 	}
 
+	if (id3->encoding != encoding) {
+		g_free (encoding); /* ownership was not handed to id3, so free it here */
+	}
+
 	return TRUE;
 }
 
+
 static gboolean
 mp3_parse_header (const gchar *data,
 		  size_t       size,
@@ -755,6 +873,7 @@ mp3_parse (const gchar *data,
 static void
 get_id3v24_tags (const gchar *data,
 		 size_t       size,
+ 		 id3tag      *info,
 		 const gchar *uri,
 		 TrackerSparqlBuilder  *metadata,
 		 file_data   *filedata)
@@ -830,28 +949,28 @@ get_id3v24_tags (const gchar *data,
 
 				switch (data[pos + 10]) {
 				case 0x00:
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = t_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
-							  "ISO-8859-1",
+							  info->encoding ? info->encoding : "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				case 0x01 :
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = t_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
-							  "UTF-16",
+							  "UTF-16", /* frame declares UTF-16: do not override with the ID3v1 guess */
 							  NULL, NULL, NULL);
 					break;
 				case 0x02 :
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = t_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
-							  "UTF-16BE",
+							  "UTF-16BE", /* frame declares UTF-16BE: do not override with the ID3v1 guess */
 							  NULL, NULL, NULL);
 					break;
 				case 0x03 :
-					word = strndup (&data[pos+11], csize-1);
+					word = g_strndup (&data[pos + 11], csize - 1); /* GLib allocator, as in the other UTF-8 case */
 					break;
 
 
 				default:
@@ -859,10 +978,10 @@ get_id3v24_tags (const gchar *data,
 					 * try to convert from
 					 * iso-8859-1
 					 */
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = t_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
-							  "ISO-8859-1",
+							  info->encoding ? info->encoding : "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				}
@@ -950,27 +1069,27 @@ get_id3v24_tags (const gchar *data,
 
 			switch (text_encode) {
 			case 0x00:
-				word = g_convert (text,
+				word = t_convert (text,
 						  csize - offset,
 						  "UTF-8",
-						  "ISO-8859-1",
+						  info->encoding ? info->encoding : "ISO-8859-1",
 						  NULL, NULL, NULL);
 				break;
-			case 0x01 :
-				word = g_convert (text,
+			case 0x01:
+				word = t_convert (text,
 						  csize - offset,
 						  "UTF-8",
-						  "UTF-16",
+						  "UTF-16", /* frame declares UTF-16: do not override with the ID3v1 guess */
 						  NULL, NULL, NULL);
 				break;
-			case 0x02 :
-				word = g_convert (text,
+			case 0x02:
+				word = t_convert (text,
 						  csize-offset,
 						  "UTF-8",
-						  "UTF-16BE",
+						  "UTF-16BE", /* frame declares UTF-16BE: do not override with the ID3v1 guess */
 						  NULL, NULL, NULL);
 				break;
-			case 0x03 :
+			case 0x03:
 				word = g_strndup (text, csize - offset);
 				break;
 				
@@ -979,10 +1098,10 @@ get_id3v24_tags (const gchar *data,
 				 * try to convert from
 				 * iso-8859-1
 				 */
-				word = g_convert (text,
+				word = t_convert (text,
 						  csize - offset,
 						  "UTF-8",
-						  "ISO-8859-1",
+						  info->encoding ? info->encoding : "ISO-8859-1",
 						  NULL, NULL, NULL);
 				break;
 			}
@@ -1031,6 +1150,7 @@ get_id3v24_tags (const gchar *data,
 static void
 get_id3v23_tags (const gchar *data,
 		 size_t       size,
+		 id3tag      *info,
 		 const gchar *uri,
 		 TrackerSparqlBuilder  *metadata,
 		 file_data   *filedata)
@@ -1105,30 +1225,30 @@ get_id3v23_tags (const gchar *data,
 
 				switch (data[pos + 10]) {
 				case 0x00:
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = t_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
-							  "ISO-8859-1",
+							  info->encoding ? info->encoding : "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				case 0x01 :
-/* 					word = g_convert (&data[pos+11], */
-/* 							  csize-1, */
+/* 					word = g_convert (&data[pos + 11], */
+/* 							  csize - 1, */
 /* 							  "UTF-8", */
 /* 							  "UCS-2", */
 /* 							  NULL, NULL, NULL); */
-					word = ucs2_to_utf8 (&data[pos+11],
-							     csize-1);
+					word = ucs2_to_utf8 (&data[pos + 11],
+							     csize - 1);
 					break;
 				default:
 					/* Bad encoding byte,
 					 * try to convert from
 					 * iso-8859-1
 					 */
-					word = g_convert (&data[pos+11],
-							  csize-1,
+					word = t_convert (&data[pos + 11],
+							  csize - 1,
 							  "UTF-8",
-							  "ISO-8859-1",
+							  info->encoding ? info->encoding : "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				}
@@ -1216,10 +1336,10 @@ get_id3v23_tags (const gchar *data,
 
 			switch (text_encode) {
 			case 0x00:
-				word = g_convert (text,
+				word = t_convert (text,
 						  csize - offset,
 						  "UTF-8",
-						  "ISO-8859-1",
+						  info->encoding ? info->encoding : "ISO-8859-1",
 						  NULL, NULL, NULL);
 				break;
 			case 0x01 :
@@ -1236,10 +1356,10 @@ get_id3v23_tags (const gchar *data,
 				 * try to convert from
 				 * iso-8859-1
 				 */
-				word = g_convert (text,
+				word = t_convert (text,
 						  csize - offset,
 						  "UTF-8",
-						  "ISO-8859-1",
+						  info->encoding ? info->encoding : "ISO-8859-1",
 						  NULL, NULL, NULL);
 				break;
 			}
@@ -1264,11 +1384,11 @@ get_id3v23_tags (const gchar *data,
 			guint        offset;
 			gint         mime_len;
 
-			text_type =  data[pos +10];
-			mime      = &data[pos +11];
+			text_type =  data[pos + 10];
+			mime      = &data[pos + 11];
 			mime_len  = strlen (mime);
-			pic_type  =  data[pos +11 + mime_len + 1];
-			desc      = &data[pos +11 + mime_len + 1 + 1];
+			pic_type  =  data[pos + 11 + mime_len + 1];
+			desc      = &data[pos + 11 + mime_len + 1 + 1];
 			
 			if (pic_type == 3 || (pic_type == 0 && filedata->albumartsize == 0)) {
 				offset = pos + 11 + mime_len + 2 + strlen (desc) + 1;
@@ -1287,6 +1407,7 @@ get_id3v23_tags (const gchar *data,
 static void
 get_id3v20_tags (const gchar *data,
 		size_t	     size,
+		id3tag      *info,
 		const gchar *uri,
 		TrackerSparqlBuilder  *metadata,
 		 file_data   *filedata)
@@ -1350,10 +1471,10 @@ get_id3v20_tags (const gchar *data,
 				 */
 				switch (data[pos + 6]) {
 				case 0x00:
-					word = g_convert (&data[pos+7],
-							  csize-1,
+					word = t_convert (&data[pos + 7],
+							  csize - 1,
 							  "UTF-8",
-							  "ISO-8859-1",
+							  info->encoding ? info->encoding : "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				case 0x01 :
@@ -1362,18 +1483,18 @@ get_id3v20_tags (const gchar *data,
 /* 							  "UTF-8", */
 /* 							  "UCS-2", */
 /* 							  NULL, NULL, NULL); */
-					word = ucs2_to_utf8 (&data[pos+7],
-							     csize-1);
+					word = ucs2_to_utf8 (&data[pos + 7],
+							     csize - 1);
 					break;
 				default:
 					/* Bad encoding byte,
 					 * try to convert from
 					 * iso-8859-1
 					 */
-					word = g_convert (&data[pos+7],
-							  csize-1,
+					word = t_convert (&data[pos + 7],
+							  csize - 1,
 							  "UTF-8",
-							  "ISO-8859-1",
+							  info->encoding ? info->encoding : "ISO-8859-1",
 							  NULL, NULL, NULL);
 					break;
 				}
@@ -1471,6 +1592,7 @@ get_id3v20_tags (const gchar *data,
 static void
 parse_id3v24 (const gchar *data,
 	      size_t       size,
+	      id3tag      *info,
 	      const gchar *uri,
 	      TrackerSparqlBuilder  *metadata,
 	      file_data   *filedata,
@@ -1523,10 +1645,10 @@ parse_id3v24 (const gchar *data,
 		gchar  *body;
 
 		un_unsync (&data[pos], tsize, (unsigned char **)&body, &unsync_size);
-		get_id3v24_tags (body, unsync_size, uri, metadata, filedata);
+		get_id3v24_tags (body, unsync_size, info, uri, metadata, filedata);
 		g_free (body);
 	} else {
-		get_id3v24_tags (&data[pos], tsize, uri, metadata, filedata);
+		get_id3v24_tags (&data[pos], tsize, info, uri, metadata, filedata);
 	}
 
 	*offset_delta = tsize + 10;
@@ -1535,6 +1657,7 @@ parse_id3v24 (const gchar *data,
 static void
 parse_id3v23 (const gchar *data,
 	      size_t       size,
+	      id3tag      *info,
 	      const gchar *uri,
 	      TrackerSparqlBuilder  *metadata,
 	      file_data   *filedata,
@@ -1597,10 +1720,10 @@ parse_id3v23 (const gchar *data,
 		gchar  *body;
 
 		un_unsync (&data[pos], tsize, (unsigned char **)&body, &unsync_size);
-		get_id3v23_tags (body, unsync_size, uri, metadata, filedata);
+		get_id3v23_tags (body, unsync_size, info, uri, metadata, filedata);
 		g_free (body);
 	} else {
-		get_id3v23_tags (&data[pos], tsize, uri, metadata, filedata);
+		get_id3v23_tags (&data[pos], tsize, info, uri, metadata, filedata);
 	}
 
 	*offset_delta = tsize + 10;
@@ -1609,6 +1732,7 @@ parse_id3v23 (const gchar *data,
 static void
 parse_id3v20 (const gchar *data,
 	      size_t	      size,
+	      id3tag      *info,
 	      const gchar *uri,
 	      TrackerSparqlBuilder  *metadata,
 	      file_data   *filedata,
@@ -1643,10 +1767,10 @@ parse_id3v20 (const gchar *data,
 		gchar  *body;
 
 		un_unsync (&data[pos], tsize, (unsigned char **)&body, &unsync_size);
-		get_id3v20_tags (body, unsync_size, uri, metadata, filedata);
+		get_id3v20_tags (body, unsync_size, info, uri, metadata, filedata);
 		g_free (body);
 	} else {
-		get_id3v20_tags (&data[pos], tsize, uri, metadata, filedata);
+		get_id3v20_tags (&data[pos], tsize, info, uri, metadata, filedata);
 	}
 
 	*offset_delta = tsize + 10;
@@ -1655,6 +1779,7 @@ parse_id3v20 (const gchar *data,
 static goffset
 parse_id3v2 (const gchar *data,
 	     size_t	     size,
+	     id3tag      *info,
 	     const gchar *uri,
 	     TrackerSparqlBuilder  *metadata,
 	     file_data   *filedata)
@@ -1664,9 +1789,9 @@ parse_id3v2 (const gchar *data,
 
 	do {
 		size_t offset_delta = 0;
-		parse_id3v24 (data+offset, size-offset, uri, metadata, filedata, &offset_delta);
-		parse_id3v23 (data+offset, size-offset, uri, metadata, filedata, &offset_delta);
-		parse_id3v20 (data+offset, size-offset, uri, metadata, filedata, &offset_delta);		
+		parse_id3v24 (data+offset, size-offset, info, uri, metadata, filedata, &offset_delta);
+		parse_id3v23 (data+offset, size-offset, info, uri, metadata, filedata, &offset_delta);
+		parse_id3v20 (data+offset, size-offset, info, uri, metadata, filedata, &offset_delta);
 
 		if (offset_delta == 0) {
 			done = TRUE;
@@ -1701,6 +1826,7 @@ extract_mp3 (const gchar *uri,
 	info.comment = NULL;
 	info.genre = NULL;
 	info.trackno = NULL;
+	info.encoding = NULL;
 
 	filedata.size = 0;
 	filedata.id3v2_size = 0;
@@ -1776,9 +1902,6 @@ extract_mp3 (const gchar *uri,
 	                               RDF_TYPE, 
 	                               NFO_PREFIX "Audio");
 
-	/* Get other embedded tags */
-	audio_offset = parse_id3v2 (buffer, buffer_size, uri, metadata, &filedata);
-
 	if (!tracker_is_empty_string (info.title)) {
 		tracker_statement_list_insert (metadata, uri,
 				     NIE_PREFIX "title",
@@ -1828,6 +1951,12 @@ extract_mp3 (const gchar *uri,
 				     info.trackno);
 	}
 
+	/* Get other embedded tags */
+	audio_offset = parse_id3v2 (buffer, buffer_size, &info, uri, metadata, &filedata);
+
+	/* Get mp3 stream info */
+	mp3_parse (buffer, buffer_size, audio_offset, uri, metadata, &filedata);
+
 	g_free (info.title);
 	g_free (info.year);
 	g_free (info.album);
@@ -1836,9 +1965,6 @@ extract_mp3 (const gchar *uri,
 	g_free (info.trackno);
 	g_free (info.genre);
 
-	/* Get mp3 stream info */
-	mp3_parse (buffer, buffer_size, audio_offset, uri, metadata, &filedata);
-
 	/* TODO */
 #ifdef HAVE_GDKPIXBUF
 	tracker_process_albumart (filedata.albumartdata, filedata.albumartsize, filedata.albumartmime,
@@ -1857,6 +1983,8 @@ extract_mp3 (const gchar *uri,
 	g_free (filedata.albumartdata);
 	g_free (filedata.albumartmime);
 
+	g_free (info.encoding);
+
 #ifndef G_OS_WIN32
 	munmap (buffer, buffer_size);
 #endif



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]