[tracker/tracker-0.6] Use libenca to check for Cyrillic encodings for ID3v1 tags
- From: Martyn James Russell <mr src gnome org>
- To: svn-commits-list gnome org
- Subject: [tracker/tracker-0.6] Use libenca to check for Cyrillic encodings for ID3v1 tags
- Date: Wed, 17 Jun 2009 07:28:45 -0400 (EDT)
commit 9e40472245605667c652b62e5c96732434b1c9e3
Author: Philip Van Hoof <philip codeminded be>
Date: Wed Jun 17 12:20:59 2009 +0100
Use libenca to check for Cyrillic encodings for ID3v1 tags
This partially fixes NB#119318. Other tags also need the same checks and
conversions, and those tags can potentially overwrite the ID3v1 tag data we
initially retrieved.
configure.ac | 12 ++
src/tracker-extract/Makefile.am | 9 ++
src/tracker-extract/tracker-extract-mp3.c | 164 ++++++++++++++++++-----------
3 files changed, 124 insertions(+), 61 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 48854b7..ebda38e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -203,6 +203,17 @@ AC_SUBST(GCONF_LIBS)
AM_CONDITIONAL(HAVE_GCONF, test "$have_gconf" = "yes")
+# Check for enca, used to detect Cyrillic (e.g. Russian) encodings in mp3 tags
+PKG_CHECK_MODULES(ENCA, [enca >= 1.9], have_enca=yes, have_enca=no)
+AC_SUBST(ENCA_CFLAGS)
+AC_SUBST(ENCA_LIBS)
+
+AM_CONDITIONAL(HAVE_ENCA, test "$have_enca" = "yes")
+
+# Use the POSIX "=" operator here: "==" is a bashism that fails with
+# stricter /bin/sh implementations, silently leaving HAVE_ENCA undefined.
+if test "x$have_enca" = "xyes"; then
+	AC_DEFINE(HAVE_ENCA, 1, [Enca language detection aid])
+fi
+
# Check for Raptor
PKG_CHECK_MODULES(RAPTOR, [raptor >= 1.4.17], have_raptor=yes, have_raptor=no)
AC_SUBST(RAPTOR_CFLAGS)
@@ -1471,6 +1482,7 @@ Build Configuration:
Enable unit tests: $have_unit_tests
Enable unac accent stripper: $have_unac
+ Enable enca encoding detection: $have_enca
Enable full text search: $enable_fts
Enable detailed metadata: $enable_detailed_metadata
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index aff57f5..72247a9 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -28,6 +28,10 @@ INCLUDES = \
$(XINE_CFLAGS) \
$(TOTEM_PL_PARSER_CFLAGS)
+if HAVE_ENCA
+INCLUDES += $(ENCA_CFLAGS)
+endif
+
modules_LTLIBRARIES = \
libextract-abw.la \
libextract-mp3.la \
@@ -125,6 +129,11 @@ libextract_mp3_la_SOURCES = tracker-extract-mp3.c $(albumart_sources) $(escape_s
libextract_mp3_la_LDFLAGS = $(module_flags) $(albumart_flags)
libextract_mp3_la_LIBADD = $(albumart_libs) $(GLIB2_LIBS) $(GCOV_LIBS)
+if HAVE_ENCA
+libextract_mp3_la_LIBADD += $(ENCA_LIBS)
+#libextract_mp3_la_LIBADD += -L/usr/lib/libenca.a
+endif
+
# Vorbis (OGG)
libextract_vorbis_la_SOURCES = tracker-extract-vorbis.c $(escape_sources)
libextract_vorbis_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index 1bbbd25..514eed4 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -40,6 +40,10 @@
#include <sys/mman.h>
#endif /* G_OS_WIN32 */
+#ifdef HAVE_ENCA
+#include <enca.h>
+#endif
+
#include <libtracker-common/tracker-file-utils.h>
#include <libtracker-common/tracker-utils.h>
@@ -441,7 +445,7 @@ un_unsync (const unsigned char *source,
*dest = source[offset];
if ((source[offset] == 0xFF) &&
- (source[offset+1] == 0x00)) {
+ (source[offset + 1] == 0x00)) {
offset++;
new_size--;
}
@@ -452,12 +456,52 @@ un_unsync (const unsigned char *source,
*dest_size = new_size;
}
+/*
+ * get_encoding:
+ * @data: raw bytes to analyse (does not need to be NUL-terminated)
+ * @size: number of bytes available at @data
+ *
+ * Guesses the character set of @data. When built with HAVE_ENCA, each
+ * language returned by enca_get_languages() is tried in turn and the
+ * first charset that libenca reports as known is used. If nothing is
+ * detected, or enca support is compiled out, "ISO-8859-1" is returned.
+ *
+ * Returns: a newly allocated iconv-style charset name, free with g_free().
+ */
+static char*
+get_encoding (const char *data, size_t size)
+{
+	gchar *encoding_string = NULL;
+
+#ifdef HAVE_ENCA
+	/* With nothing to analyse, go straight to the fallback below */
+	if (data && size > 0) {
+		const char **langs;
+		size_t s = 0, i;
+
+		langs = enca_get_languages (&s);
+
+		for (i = 0; langs && i < s && !encoding_string; i++) {
+			EncaAnalyser analyser;
+			EncaEncoding encoding;
+
+			analyser = enca_analyser_alloc (langs[i]);
+
+			/* Allocation can fail; never hand a NULL analyser to enca */
+			if (!analyser) {
+				continue;
+			}
+
+			/* enca works on unsigned bytes, so cast explicitly */
+			encoding = enca_analyse_const (analyser,
+			                               (const unsigned char *) data,
+			                               size);
+
+			if (enca_charset_is_known (encoding.charset)) {
+				encoding_string = g_strdup (enca_charset_name (encoding.charset,
+				                                               ENCA_NAME_STYLE_ICONV));
+			}
+
+			enca_analyser_free (analyser);
+		}
+
+		/* Only the array is released here, not the strings in it */
+		free (langs);
+	}
+#endif
+
+	if (!encoding_string) {
+		encoding_string = g_strdup ("ISO-8859-1");
+	}
+
+	return encoding_string;
+}
+
static gboolean
get_id3 (const gchar *data,
size_t size,
id3tag *id3)
{
+#ifdef HAVE_ENCA
+ GString *s;
+#endif /* HAVE_ENCA */
const gchar *pos;
+ gchar *encoding = NULL;
gchar buf[5];
if (!data) {
@@ -474,55 +518,53 @@ get_id3 (const gchar *data,
return FALSE;
}
- pos += 3;
+#ifdef HAVE_ENCA
+ /* Get the encoding for ALL the data we are extracting here */
+ s = g_string_new ("");
+ g_string_append_len (s, pos, 30);
+ g_string_append_len (s, pos + 30, 30);
+ g_string_append_len (s, pos + 60, 30);
+
+ encoding = get_encoding (s->str, 90);
+ g_string_free (s, TRUE);
+#else /* HAVE_ENCA */
+ encoding = get_encoding (NULL, 0);
+#endif /* HAVE_ENCA */
- id3->title = g_convert (pos, 30,
- "UTF-8",
- "ISO-8859-1",
- NULL, NULL, NULL);
+ /* Now convert all the data separately */
+ pos += 3;
+ id3->title = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
pos += 30;
- id3->artist = g_convert (pos, 30,
- "UTF-8",
- "ISO-8859-1",
- NULL, NULL, NULL);
+ id3->artist = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
+
pos += 30;
- id3->album = g_convert (pos, 30,
- "UTF-8",
- "ISO-8859-1",
- NULL, NULL, NULL);
+ id3->album = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
+
pos += 30;
- id3->year = g_convert (pos, 4,
- "UTF-8",
- "ISO-8859-1",
- NULL, NULL, NULL);
+ id3->year = g_convert (pos, 4, "UTF-8", encoding, NULL, NULL, NULL);
pos += 4;
- if (pos[28] != (guint)0) {
- id3->comment = g_convert (pos, 30,
- "UTF-8",
- "ISO-8859-1",
- NULL, NULL, NULL);
-
+ if (pos[28] != 0) {
+ id3->comment = g_convert (pos, 30, "UTF-8", encoding, NULL, NULL, NULL);
id3->trackno = NULL;
} else {
- id3->comment = g_convert (pos, 28,
- "UTF-8",
- "ISO-8859-1",
- NULL, NULL, NULL);
+ id3->comment = g_convert (pos, 28, "UTF-8", encoding, NULL, NULL, NULL);
+
snprintf (buf, 5, "%d", pos[29]);
- id3->trackno = strdup(buf);
+ id3->trackno = g_strdup (buf);
}
pos += 30;
-
id3->genre = g_strdup (get_genre_name ((guint) pos[0]));
if (!id3->genre) {
id3->genre = g_strdup ("");
}
+ g_free (encoding);
+
return TRUE;
}
@@ -833,28 +875,28 @@ get_id3v24_tags (const gchar *data,
switch (data[pos + 10]) {
case 0x00:
- word = g_convert (&data[pos+11],
- csize-1,
+ word = g_convert (&data[pos + 11],
+ csize - 1,
"UTF-8",
"ISO-8859-1",
NULL, NULL, NULL);
break;
case 0x01 :
- word = g_convert (&data[pos+11],
- csize-1,
+ word = g_convert (&data[pos + 11],
+ csize - 1,
"UTF-8",
"UTF-16",
NULL, NULL, NULL);
break;
case 0x02 :
- word = g_convert (&data[pos+11],
- csize-1,
+ word = g_convert (&data[pos + 11],
+ csize - 1,
"UTF-8",
"UTF-16BE",
NULL, NULL, NULL);
break;
case 0x03 :
- word = strndup (&data[pos+11], csize-1);
+ word = strndup (&data[pos + 11], csize - 1);
break;
default:
@@ -862,8 +904,8 @@ get_id3v24_tags (const gchar *data,
* try to convert from
* iso-8859-1
*/
- word = g_convert (&data[pos+11],
- csize-1,
+ word = g_convert (&data[pos + 11],
+ csize - 1,
"UTF-8",
"ISO-8859-1",
NULL, NULL, NULL);
@@ -939,21 +981,21 @@ get_id3v24_tags (const gchar *data,
"ISO-8859-1",
NULL, NULL, NULL);
break;
- case 0x01 :
+ case 0x01:
word = g_convert (text,
csize - offset,
"UTF-8",
"UTF-16",
NULL, NULL, NULL);
break;
- case 0x02 :
+ case 0x02:
word = g_convert (text,
csize-offset,
"UTF-8",
"UTF-16BE",
NULL, NULL, NULL);
break;
- case 0x03 :
+ case 0x03:
word = g_strndup (text, csize - offset);
break;
@@ -1091,28 +1133,28 @@ get_id3v23_tags (const gchar *data,
switch (data[pos + 10]) {
case 0x00:
- word = g_convert (&data[pos+11],
- csize-1,
+ word = g_convert (&data[pos + 11],
+ csize - 1,
"UTF-8",
"ISO-8859-1",
NULL, NULL, NULL);
break;
case 0x01 :
-/* word = g_convert (&data[pos+11], */
-/* csize-1, */
+/* word = g_convert (&data[pos + 11], */
+/* csize - 1, */
/* "UTF-8", */
/* "UCS-2", */
/* NULL, NULL, NULL); */
- word = ucs2_to_utf8 (&data[pos+11],
- csize-1);
+ word = ucs2_to_utf8 (&data[pos + 11],
+ csize - 1);
break;
default:
/* Bad encoding byte,
* try to convert from
* iso-8859-1
*/
- word = g_convert (&data[pos+11],
- csize-1,
+ word = g_convert (&data[pos + 11],
+ csize - 1,
"UTF-8",
"ISO-8859-1",
NULL, NULL, NULL);
@@ -1230,11 +1272,11 @@ get_id3v23_tags (const gchar *data,
guint offset;
gint mime_len;
- text_type = data[pos +10];
- mime = &data[pos +11];
+ text_type = data[pos + 10];
+ mime = &data[pos + 11];
mime_len = strlen (mime);
- pic_type = data[pos +11 + mime_len + 1];
- desc = &data[pos +11 + mime_len + 1 + 1];
+ pic_type = data[pos + 11 + mime_len + 1];
+ desc = &data[pos + 11 + mime_len + 1 + 1];
if (pic_type == 3 || (pic_type == 0 && filedata->albumartsize == 0)) {
offset = pos + 11 + mime_len + 2 + strlen (desc) + 1;
@@ -1319,8 +1361,8 @@ get_id3v20_tags (const gchar *data,
*/
switch (data[pos + 6]) {
case 0x00:
- word = g_convert (&data[pos+7],
- csize-1,
+ word = g_convert (&data[pos + 7],
+ csize - 1,
"UTF-8",
"ISO-8859-1",
NULL, NULL, NULL);
@@ -1331,16 +1373,16 @@ get_id3v20_tags (const gchar *data,
/* "UTF-8", */
/* "UCS-2", */
/* NULL, NULL, NULL); */
- word = ucs2_to_utf8 (&data[pos+7],
- csize-1);
+ word = ucs2_to_utf8 (&data[pos + 7],
+ csize - 1);
break;
default:
/* Bad encoding byte,
* try to convert from
* iso-8859-1
*/
- word = g_convert (&data[pos+7],
- csize-1,
+ word = g_convert (&data[pos + 7],
+ csize - 1,
"UTF-8",
"ISO-8859-1",
NULL, NULL, NULL);
@@ -1696,7 +1738,7 @@ extract_mp3 (const gchar *filename,
close (fd);
- if (buffer == NULL || buffer == (void*) -1) {
+ if (buffer == NULL || buffer == (void*) - 1) {
return;
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]