[tracker] libtracker-extract: Add ICU-based encoding detection
- From: Carlos Garnacho <carlosg src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] libtracker-extract: Add ICU-based encoding detection
- Date: Tue, 1 Oct 2013 10:35:31 +0000 (UTC)
commit 86302c3bfce0d3be9a647bbf8b33075916efb8e9
Author: Carlos Garnacho <carlos lanedo com>
Date: Tue Oct 1 12:04:40 2013 +0200
libtracker-extract: Add ICU-based encoding detection
This is used before enca if libicu is available: a UCharsetDetector
is used to detect the most likely encoding for a given string. In
testing, it provides more reliable values than enca, where you
first have to make a rough guess at the locale, yielding more false
positives.
configure.ac | 10 +++-
src/libtracker-extract/Makefile.am | 6 ++
src/libtracker-extract/tracker-encoding-libicu.c | 72 ++++++++++++++++++++++
src/libtracker-extract/tracker-encoding-libicu.h | 33 ++++++++++
src/libtracker-extract/tracker-encoding.c | 12 +++-
5 files changed, 131 insertions(+), 2 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 1387d40..18100cb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -913,6 +913,8 @@ case "x$with_unicode_support" in
*) AC_MSG_ERROR([Wrong value for --with-unicode-support: $with_unicode_support]) ;;
esac
+AM_CONDITIONAL(HAVE_LIBICU, test "x$have_libicu" = "xyes")
+
# Add to libtracker-fts
LIBTRACKER_FTS_CFLAGS="$LIBTRACKER_FTS_CFLAGS $UNICODE_SUPPORT_CFLAGS"
LIBTRACKER_FTS_LIBS="$LIBTRACKER_FTS_LIBS $UNICODE_SUPPORT_LIBS"
@@ -1466,6 +1468,12 @@ else
have_enca="no (disabled)"
fi
+if test "x$have_enca" = "xyes" || test "x$have_libicu" = "xyes"; then
+ have_charset_detection="yes"
+else
+ have_charset_detection="no"
+fi
+
AM_CONDITIONAL(HAVE_ENCA, test "$have_enca" = "yes")
####################################################################
@@ -2560,7 +2568,6 @@ Build Configuration:
Support for HAL: $have_hal
Support for UPower: $have_upower
- Support for Cyrillic languages (enca): $have_enca
Support for network status detection: $have_network_manager
Support for keyring data (libsecret): $have_libsecret
Unicode support library: $with_unicode_support
@@ -2595,6 +2602,7 @@ Metadata Extractors:
Support AbiWord document parsing: $have_abiword
Support DVI parsing: $have_dvi
Support MP3 parsing: $have_mp3
+ Support MP3 tag charset detection: $have_charset_detection (icu: $have_libicu, enca: $have_enca)
Support PS parsing: $have_ps
Support text parsing: $have_text
Support icon parsing: $have_icon
diff --git a/src/libtracker-extract/Makefile.am b/src/libtracker-extract/Makefile.am
index f3be025..c6ab418 100644
--- a/src/libtracker-extract/Makefile.am
+++ b/src/libtracker-extract/Makefile.am
@@ -58,6 +58,12 @@ libtracker_extract_@TRACKER_API_VERSION@_la_SOURCES += \
tracker-encoding-meegotouch.h
endif
+if HAVE_LIBICU
+libtracker_extract_@TRACKER_API_VERSION@_la_SOURCES += \
+ tracker-encoding-libicu.c \
+ tracker-encoding-libicu.h
+endif
+
libtracker_extract_@TRACKER_API_VERSION@_la_LDFLAGS = \
-version-info $(LT_CURRENT):$(LT_REVISION):$(LT_AGE) \
-export-symbols-regex '^tracker_.*'
diff --git a/src/libtracker-extract/tracker-encoding-libicu.c b/src/libtracker-extract/tracker-encoding-libicu.c
new file mode 100644
index 0000000..13327e3
--- /dev/null
+++ b/src/libtracker-extract/tracker-encoding-libicu.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013 Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include "unicode/utypes.h"
+#include "unicode/localpointer.h"
+#include "unicode/uenum.h"
+#include "unicode/ucsdet.h"
+
+#include <glib.h>
+#include "tracker-encoding-libicu.h"
+
+/* Guess the charset of @buffer (@size bytes) with ICU's UCharsetDetector.
+ * Returns a newly allocated charset name (free with g_free()), or NULL. */
+gchar *
+tracker_encoding_guess_icu (const gchar *buffer,
+                            gsize        size)
+{
+	UCharsetDetector *detector = NULL;
+	const UCharsetMatch *match;
+	gchar *charset = NULL;
+	/* ICU calls return early unless status starts out as U_ZERO_ERROR */
+	UErrorCode status = U_ZERO_ERROR;
+
+	/* ucsdet_setText() takes an int32_t length */
+	if (size >= G_MAXINT32)
+		goto failure;
+
+	detector = ucsdet_open (&status);
+	if (U_FAILURE (status))
+		goto failure;
+
+	ucsdet_setText (detector, buffer, (int32_t) size, &status);
+	if (U_FAILURE (status))
+		goto failure;
+
+	match = ucsdet_detect (detector, &status);
+	if (U_FAILURE (status) || !match)
+		goto failure;
+
+	charset = g_strdup (ucsdet_getName (match, &status));
+	if (U_FAILURE (status)) {
+		g_free (charset);
+		charset = NULL;
+	}
+
+	if (charset)
+		g_debug ("Guessing charset as '%s'", charset);
+
+failure:
+	if (detector)
+		ucsdet_close (detector);
+
+	return charset;
+}
diff --git a/src/libtracker-extract/tracker-encoding-libicu.h b/src/libtracker-extract/tracker-encoding-libicu.h
new file mode 100644
index 0000000..0b9b9f4
--- /dev/null
+++ b/src/libtracker-extract/tracker-encoding-libicu.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2013 Carlos Garnacho <carlos lanedo com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __LIBTRACKER_EXTRACT_ENCODING_ICU_H__
+#define __LIBTRACKER_EXTRACT_ENCODING_ICU_H__
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+G_GNUC_INTERNAL
+gchar *tracker_encoding_guess_icu (const gchar *buffer,
+ gsize size);
+
+G_END_DECLS
+
+#endif /* __LIBTRACKER_EXTRACT_ENCODING_ICU_H__ */
diff --git a/src/libtracker-extract/tracker-encoding.c b/src/libtracker-extract/tracker-encoding.c
index dba19ee..b5c82d5 100644
--- a/src/libtracker-extract/tracker-encoding.c
+++ b/src/libtracker-extract/tracker-encoding.c
@@ -30,10 +30,14 @@
#include "tracker-encoding-meegotouch.h"
#endif
+#ifdef HAVE_LIBICU
+#include "tracker-encoding-libicu.h"
+#endif
+
gboolean
tracker_encoding_can_guess (void)
{
-#if defined (HAVE_ENCA) || defined (HAVE_MEEGOTOUCH)
+#if defined (HAVE_ENCA) || defined (HAVE_MEEGOTOUCH) || defined (HAVE_LIBICU)
return TRUE;
#else
return FALSE;
@@ -50,10 +54,16 @@ tracker_encoding_guess (const gchar *buffer,
encoding = tracker_encoding_guess_meegotouch (buffer, size);
#endif /* HAVE_MEEGOTOUCH */
+#ifdef HAVE_LIBICU
+ if (!encoding)
+ encoding = tracker_encoding_guess_icu (buffer, size);
+#endif
+
#ifdef HAVE_ENCA
if (!encoding)
encoding = tracker_encoding_guess_enca (buffer, size);
#endif /* HAVE_ENCA */
+
return encoding;
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]