[tracker] libtracker-common: Only use encoding guessing when confidence >30%



commit 91ce0b167644e68365bdb729fcbfab37de387708
Author: Philip Van Hoof <philip codeminded be>
Date:   Thu Mar 31 11:16:39 2011 +0200

    libtracker-common: Only use encoding guessing when confidence >30%

 .../tracker-encoding-meegotouch.cpp                |   42 +++++++++++++-------
 src/tracker-extract/tracker-extract-mp3.c          |    4 +-
 2 files changed, 30 insertions(+), 16 deletions(-)
---
diff --git a/src/libtracker-common/tracker-encoding-meegotouch.cpp b/src/libtracker-common/tracker-encoding-meegotouch.cpp
index 28573d0..29502f5 100644
--- a/src/libtracker-common/tracker-encoding-meegotouch.cpp
+++ b/src/libtracker-common/tracker-encoding-meegotouch.cpp
@@ -24,6 +24,7 @@
 
 #include <glib.h>
 #include "tracker-encoding-meegotouch.h"
+#include "tracker-locale.h"
 
 /*
  * See http://apidocs.meego.com/git-tip/mtf/class_m_charset_detector.html
@@ -35,6 +36,8 @@ tracker_encoding_guess_meegotouch (const gchar *buffer,
 {
 	/* Initialize detector */
 	MCharsetDetector detector ((const char *)buffer, (int)size);
+	gchar *locale;
+	gchar *encoding = NULL;
 
 	if (detector.hasError ()) {
 		g_warning ("Charset detector error when creating: %s",
@@ -50,26 +53,37 @@ tracker_encoding_guess_meegotouch (const gchar *buffer,
 		return NULL;
 	}
 
-	gchar *encoding = g_strdup (bestMatch.name ().toUtf8 ().data ());
+	locale = tracker_locale_get (TRACKER_LOCALE_LANGUAGE);
+	detector.setDeclaredLocale (locale);
+
+	if (bestMatch.confidence () > 30) {
+		encoding = g_strdup (bestMatch.name ().toUtf8 ().data ());
 
 #if 0
-	QList<MCharsetMatch> mCharsetMatchList = detector.detectAll();
+		QList<MCharsetMatch> mCharsetMatchList = detector.detectAll();
 
-	if (detector.hasError ()) {
-		g_warning ("Charset detector error when detecting all: %s",
-		           detector.errorString ().toUtf8 (). data ());
-	}
+		if (detector.hasError ()) {
+			g_warning ("Charset detector error when detecting all: %s",
+			           detector.errorString ().toUtf8 (). data ());
+		}
 
-	g_debug ("Detecting all charsets...");
-	for (gint i = 0; i < mCharsetMatchList.size (); ++i) {
-		g_debug ("  Charset '%s' with %d%% confidence...",
-		         mCharsetMatchList[i].name (). toUtf8 ().data (),
-		         mCharsetMatchList[i].confidence ());
-	}
+		g_debug ("Detecting all charsets...");
+		for (gint i = 0; i < mCharsetMatchList.size (); ++i) {
+			g_debug ("  Charset '%s' with %d%% confidence...",
+			         mCharsetMatchList[i].name (). toUtf8 ().data (),
+			         mCharsetMatchList[i].confidence ());
+		}
 #endif
 
-	g_debug ("Guessing charset as '%s' with %d%% confidence",
-	         encoding, bestMatch.confidence ());
+		g_debug ("Guessing charset as '%s' with %d%% confidence",
+		         encoding, bestMatch.confidence ());
+	} else {
+		g_debug ("Ignoring charset as '%s' with %d%% (< 30%%) confidence",
+		         bestMatch.name ().toUtf8 ().data (),
+		         bestMatch.confidence ());
+	}
+
+	g_free (locale);
 
 	return encoding;
 }
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index 791d999..51cf1e7 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -513,10 +513,10 @@ read_id3v1_buffer (int     fd,
 static gchar *
 ucs2_to_utf8(const gchar *data, guint len)
 {
-	const gchar   *encoding = NULL;
+	const gchar *encoding = NULL;
 	guint16  c;
 	gboolean be;
-	gchar   *utf8 = NULL;
+	gchar *utf8 = NULL;
 
 	memcpy (&c, data, 2);
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]