[tracker] extract-mp3: Bail out on encoding detection if confidence is too low
- From: Carlos Garnacho <carlosg src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] extract-mp3: Bail out on encoding detection if confidence is too low
- Date: Sun, 5 Jul 2015 10:34:25 +0000 (UTC)
commit ede17cc22b0c6245c030fbb45d0db60a35316c73
Author: Carlos Garnacho <carlosg gnome org>
Date: Sun Jul 5 12:21:27 2015 +0200
extract-mp3: Bail out on encoding detection if confidence is too low
Libicu encoding detection is able to report how confident it is in
its result. We should use that to bail out when the confidence is
too low, as that means the returned encoding is probably bogus, and
we have an encoding to fall back on.
This fixes detection on the file reported on bug #735515, where
a couple of 'ï' chars (valid ISO-8859-1) make libicu detect UTF-16BE,
although with an extremely low confidence.
https://bugzilla.gnome.org/show_bug.cgi?id=735515
src/libtracker-extract/tracker-encoding-libicu.c | 15 +++++++++++++--
src/libtracker-extract/tracker-encoding-libicu.h | 3 ++-
src/libtracker-extract/tracker-encoding.c | 12 +++++++++---
src/libtracker-extract/tracker-encoding.h | 3 ++-
src/tracker-extract/tracker-extract-mp3.c | 11 ++++++++++-
5 files changed, 36 insertions(+), 8 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-encoding-libicu.c
b/src/libtracker-extract/tracker-encoding-libicu.c
index 8eb0add..3490dac 100644
--- a/src/libtracker-extract/tracker-encoding-libicu.c
+++ b/src/libtracker-extract/tracker-encoding-libicu.c
@@ -29,13 +29,15 @@
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
- gsize size)
+ gsize size,
+ gdouble *confidence)
{
UCharsetDetector *detector = NULL;
const UCharsetMatch *match;
gchar *charset = NULL;
UErrorCode status = 0;
const char *p_match = NULL;
+ int32_t conf = 0;
detector = ucsdet_open (&status);
@@ -60,12 +62,21 @@ tracker_encoding_guess_icu (const gchar *buffer,
if (p_match == NULL || U_FAILURE (status))
goto failure;
+ conf = ucsdet_getConfidence (match, &status);
+
+ if (U_FAILURE (status))
+ goto failure;
+
charset = g_strdup ((const gchar *) p_match);
if (charset)
- g_debug ("Guessing charset as '%s'", charset);
+ g_debug ("Guessing charset as '%s' (Confidence: %f)",
+ charset, (gdouble) conf / 100);
failure:
+ if (confidence)
+ *confidence = (gdouble) conf / 100;
+
if (detector)
ucsdet_close (detector);
diff --git a/src/libtracker-extract/tracker-encoding-libicu.h
b/src/libtracker-extract/tracker-encoding-libicu.h
index 0b9b9f4..3b3f942 100644
--- a/src/libtracker-extract/tracker-encoding-libicu.h
+++ b/src/libtracker-extract/tracker-encoding-libicu.h
@@ -26,7 +26,8 @@ G_BEGIN_DECLS
G_GNUC_INTERNAL
gchar *tracker_encoding_guess_icu (const gchar *buffer,
- gsize size);
+ gsize size,
+ gdouble *confidence);
G_END_DECLS
diff --git a/src/libtracker-extract/tracker-encoding.c b/src/libtracker-extract/tracker-encoding.c
index d8da3c4..ac4f976 100644
--- a/src/libtracker-extract/tracker-encoding.c
+++ b/src/libtracker-extract/tracker-encoding.c
@@ -46,9 +46,11 @@ tracker_encoding_can_guess (void)
gchar *
tracker_encoding_guess (const gchar *buffer,
- gsize size)
+ gsize size,
+ gdouble *confidence)
{
gchar *encoding = NULL;
+ gdouble conf = 1;
#ifdef HAVE_MEEGOTOUCH
encoding = tracker_encoding_guess_meegotouch (buffer, size);
@@ -56,14 +58,18 @@ tracker_encoding_guess (const gchar *buffer,
#ifdef HAVE_LIBICU_CHARSET_DETECTION
if (!encoding)
- encoding = tracker_encoding_guess_icu (buffer, size);
+ encoding = tracker_encoding_guess_icu (buffer, size, &conf);
#endif /* HAVE_LIBICU_CHARSET_DETECTION */
#ifdef HAVE_ENCA
- if (!encoding)
+ if (!encoding || conf < 0.5) {
+ conf = 1;
encoding = tracker_encoding_guess_enca (buffer, size);
+ }
#endif /* HAVE_ENCA */
+ if (confidence)
+ *confidence = conf;
return encoding;
}
diff --git a/src/libtracker-extract/tracker-encoding.h b/src/libtracker-extract/tracker-encoding.h
index 3964452..ed7e51e 100644
--- a/src/libtracker-extract/tracker-encoding.h
+++ b/src/libtracker-extract/tracker-encoding.h
@@ -33,7 +33,8 @@ gboolean tracker_encoding_can_guess (void);
/* Returns NULL if it couldn't guess it */
gchar *tracker_encoding_guess (const gchar *buffer,
- gsize size);
+ gsize size,
+ gdouble *confidence);
G_END_DECLS
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index f3d1bcb..04c4c09 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -675,13 +675,22 @@ get_encoding (const gchar *data,
gsize size,
gboolean *encoding_found)
{
+ gdouble confidence = 1;
gchar *encoding;
/* Try to guess encoding */
encoding = (data && size ?
- tracker_encoding_guess (data, size) :
+ tracker_encoding_guess (data, size, &confidence) :
NULL);
+ if (confidence < 0.5) {
+ /* Confidence on the results was too low, bail out and
+ * fallback to the default ISO-8859-1/Windows-1252 encoding.
+ */
+ g_free (encoding);
+ encoding = NULL;
+ }
+
/* Notify if a proper detection was done */
if (encoding_found) {
*encoding_found = (encoding ? TRUE : FALSE);;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]