[tracker] libtracker-extract: Add ICU-based encoding detection



commit 86302c3bfce0d3be9a647bbf8b33075916efb8e9
Author: Carlos Garnacho <carlos lanedo com>
Date:   Tue Oct 1 12:04:40 2013 +0200

    libtracker-extract: Add ICU-based encoding detection
    
    When libicu is available, it is tried before enca: a UCharsetDetector
    is used to detect the most likely encoding for a given string. In
    testing, it provides more reliable values than enca, where you
    first have to make a rough guess on the locale, yielding more false
    positives.

 configure.ac                                     |   10 +++-
 src/libtracker-extract/Makefile.am               |    6 ++
 src/libtracker-extract/tracker-encoding-libicu.c |   72 ++++++++++++++++++++++
 src/libtracker-extract/tracker-encoding-libicu.h |   33 ++++++++++
 src/libtracker-extract/tracker-encoding.c        |   12 +++-
 5 files changed, 131 insertions(+), 2 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 1387d40..18100cb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -913,6 +913,8 @@ case "x$with_unicode_support" in
      *) AC_MSG_ERROR([Wrong value for --with-unicode-support: $with_unicode_support]) ;;
 esac
 
+AM_CONDITIONAL(HAVE_LIBICU, test "x$have_libicu" = "xyes")
+
 # Add to libtracker-fts
 LIBTRACKER_FTS_CFLAGS="$LIBTRACKER_FTS_CFLAGS $UNICODE_SUPPORT_CFLAGS"
 LIBTRACKER_FTS_LIBS="$LIBTRACKER_FTS_LIBS $UNICODE_SUPPORT_LIBS"
@@ -1466,6 +1468,12 @@ else
    have_enca="no  (disabled)"
 fi
 
+if test "x$have_enca" = "xyes" || test "x$have_libicu" = "xyes"; then
+   have_charset_detection="yes"
+else
+   have_charset_detection="no"
+fi
+
 AM_CONDITIONAL(HAVE_ENCA, test "$have_enca" = "yes")
 
 ####################################################################
@@ -2560,7 +2568,6 @@ Build Configuration:
 
        Support for HAL:                        $have_hal
        Support for UPower:                     $have_upower
-       Support for Cyrillic languages (enca):  $have_enca
        Support for network status detection:   $have_network_manager
         Support for keyring data (libsecret):   $have_libsecret
        Unicode support library:                $with_unicode_support
@@ -2595,6 +2602,7 @@ Metadata Extractors:
        Support AbiWord document parsing:       $have_abiword
        Support DVI parsing:                    $have_dvi
        Support MP3 parsing:                    $have_mp3
+       Support MP3 tag charset detection:      $have_charset_detection (icu: $have_libicu, enca: $have_enca)
        Support PS parsing:                     $have_ps
        Support text parsing:                   $have_text
        Support icon parsing:                   $have_icon
diff --git a/src/libtracker-extract/Makefile.am b/src/libtracker-extract/Makefile.am
index f3be025..c6ab418 100644
--- a/src/libtracker-extract/Makefile.am
+++ b/src/libtracker-extract/Makefile.am
@@ -58,6 +58,12 @@ libtracker_extract_@TRACKER_API_VERSION@_la_SOURCES += \
        tracker-encoding-meegotouch.h
 endif
 
+if HAVE_LIBICU
+libtracker_extract_@TRACKER_API_VERSION@_la_SOURCES += \
+       tracker-encoding-libicu.c                      \
+       tracker-encoding-libicu.h
+endif
+
 libtracker_extract_@TRACKER_API_VERSION@_la_LDFLAGS =  \
        -version-info $(LT_CURRENT):$(LT_REVISION):$(LT_AGE) \
        -export-symbols-regex '^tracker_.*'
diff --git a/src/libtracker-extract/tracker-encoding-libicu.c b/src/libtracker-extract/tracker-encoding-libicu.c
new file mode 100644
index 0000000..13327e3
--- /dev/null
+++ b/src/libtracker-extract/tracker-encoding-libicu.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013 Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include "unicode/utypes.h"
+#include "unicode/localpointer.h"
+#include "unicode/uenum.h"
+#include "unicode/ucsdet.h"
+
+#include <glib.h>
+#include "tracker-encoding-libicu.h"
+
+/* Guess the charset of buffer with ICU; returns a newly-allocated name or NULL. */
+gchar *
+tracker_encoding_guess_icu (const gchar *buffer,
+                           gsize        size)
+{
+       UCharsetDetector *detector = NULL;
+       const UCharsetMatch *match;
+       gchar *charset = NULL;
+       UErrorCode status = U_ZERO_ERROR;
+
+       /* ICU calls bail out early when handed a failing status, hence the init above */
+       detector = ucsdet_open (&status);
+       if (U_FAILURE (status))
+               goto failure;
+
+       if (size >= G_MAXINT32)
+               goto failure;
+
+       ucsdet_setText (detector, buffer, (int32_t) size, &status);
+       if (U_FAILURE (status))
+               goto failure;
+
+       /* A NULL match without an error means no charset fits the input */
+       match = ucsdet_detect (detector, &status);
+       if (U_FAILURE (status) || !match)
+               goto failure;
+
+       /* The name string belongs to ICU, so copy it before closing the detector */
+       charset = g_strdup (ucsdet_getName (match, &status));
+       if (U_FAILURE (status)) {
+               g_free (charset);
+               charset = NULL;
+       }
+
+       if (charset)
+               g_debug ("Guessing charset as '%s'", charset);
+
+failure:
+       if (detector)
+               ucsdet_close (detector);
+
+       return charset;
+}
diff --git a/src/libtracker-extract/tracker-encoding-libicu.h b/src/libtracker-extract/tracker-encoding-libicu.h
new file mode 100644
index 0000000..0b9b9f4
--- /dev/null
+++ b/src/libtracker-extract/tracker-encoding-libicu.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2013 Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __LIBTRACKER_EXTRACT_ENCODING_ICU_H__
+#define __LIBTRACKER_EXTRACT_ENCODING_ICU_H__
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+G_GNUC_INTERNAL
+gchar *tracker_encoding_guess_icu (const gchar *buffer,
+                                  gsize        size);
+
+G_END_DECLS
+
+#endif /* __LIBTRACKER_EXTRACT_ENCODING_ICU_H__ */
diff --git a/src/libtracker-extract/tracker-encoding.c b/src/libtracker-extract/tracker-encoding.c
index dba19ee..b5c82d5 100644
--- a/src/libtracker-extract/tracker-encoding.c
+++ b/src/libtracker-extract/tracker-encoding.c
@@ -30,10 +30,14 @@
 #include "tracker-encoding-meegotouch.h"
 #endif
 
+#ifdef HAVE_LIBICU
+#include "tracker-encoding-libicu.h"
+#endif
+
 gboolean
 tracker_encoding_can_guess (void)
 {
-#if defined (HAVE_ENCA) || defined (HAVE_MEEGOTOUCH)
+#if defined (HAVE_ENCA) || defined (HAVE_MEEGOTOUCH) || defined (HAVE_LIBICU)
        return TRUE;
 #else
        return FALSE;
@@ -50,10 +54,16 @@ tracker_encoding_guess (const gchar *buffer,
        encoding = tracker_encoding_guess_meegotouch (buffer, size);
 #endif /* HAVE_MEEGOTOUCH */
 
+#ifdef HAVE_LIBICU
+       if (!encoding)
+               encoding = tracker_encoding_guess_icu (buffer, size);
+#endif
+
 #ifdef HAVE_ENCA
        if (!encoding)
                encoding = tracker_encoding_guess_enca (buffer, size);
 #endif /* HAVE_ENCA */
 
+
        return encoding;
 }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]