[tracker/parser-libunistring-review] Added libicu-based unac stripping



commit 19d11799d550a97b09bc7703bc5b38c7ed6392ea
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 4 09:40:51 2010 +0200

    Added libicu-based unac stripping

 configure.ac                                     |    2 +-
 src/libtracker-fts/tracker-parser-glib.c         |    4 +-
 src/libtracker-fts/tracker-parser-libunistring.c |    6 +-
 src/libtracker-fts/tracker-parser-utils.c        |  131 ++++++++++++++++++----
 src/libtracker-fts/tracker-parser-utils.h        |   21 +++-
 5 files changed, 133 insertions(+), 31 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index ba39dba..f6af8ab 100644
--- a/configure.ac
+++ b/configure.ac
@@ -832,7 +832,7 @@ else
       unicode_library=libicu
       AC_CHECK_HEADER(unicode/ubrk.h, [have_libicu=yes],[have_libicu=no])
 
-      LIBICU_CFLAGS="-Iunicode"
+      LIBICU_CFLAGS=""
       LIBICU_LIBS="-licuuc"
 
       AC_SUBST(LIBICU_CFLAGS)
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index f685fd4..83a969b 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -538,7 +538,9 @@ tracker_parser_process_word (TrackerParser *parser,
 		                            word, bytes);
 
 		if (do_strip) {
-			stripped_word = tracker_parser_unaccent_string (word, bytes, &len);
+			stripped_word = tracker_parser_unaccent_utf8_word (word,
+			                                                   bytes,
+			                                                   &len);
 
 			/* Log after UNAC stripping */
 			tracker_parser_message_hex (" After UNAC stripping",
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index efb05aa..6fec131 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -344,9 +344,9 @@ tracker_parser_process_word (TrackerParser *parser,
 	if (do_strip) {
 		gsize stripped_word_length;
 
-		stripped = tracker_parser_unaccent_string (normalized,
-		                                           new_word_length,
-		                                           &stripped_word_length);
+		stripped = tracker_parser_unaccent_utf8_word (normalized,
+		                                              new_word_length,
+		                                              &stripped_word_length);
 
 		if (stripped) {
 			/* Log after UNAC stripping */
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
index d2486ab..0a37440 100644
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -25,55 +25,140 @@
 #include <unac.h>
 #endif
 
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#endif
+
 #include <libtracker-common/tracker-common.h>
 #include "tracker-parser-utils.h"
 
+
+/* Output is always UTF-8. */
 gchar *
-tracker_parser_unaccent_string (const gchar *str,
-                                gsize        ilength,
-                                gsize        *p_olength)
+tracker_parser_unaccent_utf16be_word (const gchar *string,
+                                      gsize        ilength,
+                                      gsize        *p_olength)
 {
-#ifdef HAVE_UNAC
 	GError *error = NULL;
-	gchar *str_utf16;
-	gsize utf16_len, unaccented_len, final_len;
 	gchar *unaccented_str = NULL;
-	gchar *s = NULL;
+	gchar *str_utf8 = NULL;
+	gsize unaccented_len;
+	gsize utf8_len;
 
 	*p_olength = 0;
 
-	/* unac_string() does roughly the same than below, plus it
-	 * corrupts memory in 64bit systems, so avoid it for now.
-	 */
-	str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
+	if (unac_string_utf16 (string, ilength,
+	                       &unaccented_str, &unaccented_len) != 0) {
+		g_warning ("UNAC failed to strip accents");
+		return NULL;
+	}
+
+	/* Convert from UTF-16BE to UTF-8 */
+	str_utf8 = g_convert (unaccented_str,
+	                      unaccented_len,
+	                      "UTF-8",
+	                      "UTF-16BE",
+	                      NULL,
+	                      &utf8_len,
+	                      &error);
+	g_free (unaccented_str);
 
 	if (error) {
-		g_warning ("Could not convert to UTF-16: %s", error->message);
+		g_warning ("Could not convert back to UTF-8: %s",
+		           error->message);
 		g_error_free (error);
 		return NULL;
 	}
 
-	if (unac_string_utf16 (str_utf16, utf16_len,
-	                       &unaccented_str, &unaccented_len) != 0) {
-		g_warning ("UNAC failed to strip accents");
-		g_free (str_utf16);
-		return NULL;
+	*p_olength = utf8_len;
+	return str_utf8;
+}
+
+
+#ifdef HAVE_LIBICU
+/* NOTE: Internally, UChars are UTF-16, but a conversion is still done just in
+ *  case, as libunac needs UTF-16BE. Output is always UTF-8. */
+gchar *
+tracker_parser_unaccent_UChar_word (const UChar *string,
+                                    gsize        ilength,
+                                    gsize        *p_olength)
+{
+#ifdef HAVE_UNAC
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *converter;
+	gchar *str_utf16;
+	gchar *str_utf8 = NULL;
+	gsize utf16_len;
+
+	*p_olength = 0;
+
+	/* Open converter UChar to UTF-16BE */
+	converter = ucnv_open ("UTF-16BE", &icu_error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-16BE converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+               return NULL;
 	}
 
+	/* Allocate buffer, same size as input string */
+	str_utf16 = g_malloc (ilength);
+
+	/* Convert from UChar to UTF-16BE */
+	utf16_len = ucnv_fromUChars (converter,
+	                             str_utf16,
+	                             ilength,
+	                             string,
+	                             ilength,
+	                             &icu_error);
+	if (U_FAILURE (icu_error)) {
+		g_warning ("Cannot convert from UChar to UTF-16BE: '%s'",
+		           u_errorName (icu_error));
+	} else {
+		str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
+		                                                 utf16_len,
+		                                                 p_olength);
+	}
+	ucnv_close (converter);
 	g_free (str_utf16);
+	return str_utf8;
+#else
+	return NULL;
+#endif
+}
+#endif
 
-	s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
-	g_free (unaccented_str);
+gchar *
+tracker_parser_unaccent_utf8_word (const gchar *str,
+                                   gsize        ilength,
+                                   gsize        *p_olength)
+{
+#ifdef HAVE_UNAC
+	GError *error = NULL;
+	gchar *str_utf16 = NULL;
+	gchar *str_utf8 = NULL;
+	gsize utf16_len;
+
+	*p_olength = 0;
+
+	/* unac_string() does roughly the same as the code below, plus it
+	 * corrupts memory on 64-bit systems, so avoid it for now.
+	 */
+	str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
 
 	if (error) {
-		g_warning ("Could not convert back to UTF-8: %s", error->message);
+		g_warning ("Could not convert to UTF-16: %s", error->message);
 		g_error_free (error);
 		return NULL;
-	}
+	} else {
 
-	*p_olength = final_len;
+		str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
+		                                                 utf16_len,
+		                                                 p_olength);
+	}
 
-	return s;
+	g_free (str_utf16);
+	return str_utf8;
 #else
 	return NULL;
 #endif
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index 77eb662..2e7a2c6 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -20,14 +20,29 @@
 #ifndef __TRACKER_PARSER_UTILS_H__
 #define __TRACKER_PARSER_UTILS_H__
 
+#include "config.h"
+
 #include <glib.h>
 
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#endif
+
 G_BEGIN_DECLS
 
-gchar *tracker_parser_unaccent_string (const gchar *str,
-                                       gsize        ilength,
-                                       gsize        *p_olength);
+gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
+                                             gsize        ilength,
+                                             gsize        *p_olength);
 
+gchar *tracker_parser_unaccent_utf8_word (const gchar *string,
+                                          gsize        ilength,
+                                          gsize        *p_olength);
+
+#ifdef HAVE_LIBICU
+gchar *tracker_parser_unaccent_UChar_word (const UChar *string,
+                                           gsize        ilength,
+                                           gsize        *p_olength);
+#endif
 
 
 /* Define to 1 if you want to enable debugging logs showing HEX contents



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]