[tracker/drop-unac] Fixes GB#619244: Use a custom unaccenting method instead of libunac

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/drop-unac] Fixes GB#619244: Use a custom unaccenting method instead of libunac
Date: Mon, 7 Jun 2010 09:10:18 +0000 (UTC)
commit b85f3bd11cf7c4da72748abc8c80aaaa725303c7
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue Jun 1 17:17:27 2010 +0300

    Fixes GB#619244: Use a custom unaccenting method instead of libunac
    
            * Notes: Output strings are now always normalized using
            compatibility decomposition (NFKD). This actually is the best
            normalization type for text search.
    
            * If unaccenting requested, all combining diacritical marks
            are removed from the string.
    
            * This new method avoids extra conversion to UTF-16, and does
            mark removal in-place without any extra allocation.
    
            * libunac dependency is completely removed.

 configure.ac                                     |   34 ----
 src/libtracker-fts/tracker-parser-glib.c         |  121 +++++++++----
 src/libtracker-fts/tracker-parser-libicu.c       |  207 ++++++++++++++--------
 src/libtracker-fts/tracker-parser-libunistring.c |   95 +++++++---
 src/libtracker-fts/tracker-parser-utils.c        |  149 ----------------
 src/libtracker-fts/tracker-parser-utils.h        |   25 ++--
 tests/libtracker-fts/tracker-parser-test.c       |   55 +++---
 7 files changed, 324 insertions(+), 362 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 191f476..bf5709f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -142,7 +142,6 @@ HAL_REQUIRED=0.5
 UPOWER_REQUIRED=0.9.0
 GDKPIXBUF_REQUIRED=2.12.0
 QUILL_REQUIRED=1.0.0
-UNAC_REQUIRED=1.0.0
 POPPLER_REQUIRED=0.12.2
 CAIRO_REQUIRED=1.0
 GDK_REQUIRED=1.0
@@ -735,38 +734,6 @@ AC_SUBST(SQLITE3_CFLAGS)
 AC_SUBST(SQLITE3_LIBS)
 
 ##################################################################
-# Enable UNAC support?
-##################################################################
-
-AC_ARG_ENABLE(unac,
-	      AS_HELP_STRING([--enable-unac],
-			     [enable UNAC support, required for stripping accents [[default=auto]]]),,
-	      [enable_unac=auto])
-
-if test "x$enable_unac" != "xno"; then
-   PKG_CHECK_MODULES(UNAC,
-		     [unac >= $UNAC_REQUIRED],
-		     [have_unac=yes],
-		     [have_unac=no])
-   AC_SUBST(UNAC_LIBS)
-   AC_SUBST(UNAC_CFLAGS)
-
-   if test "x$have_unac" = "xyes"; then
-      AC_DEFINE(HAVE_UNAC, [], [Define if we have UNAC for accent stripping])
-   fi
-else
-   have_unac="no  (disabled)"
-fi
-
-if test "x$enable_unac" = "xyes"; then
-   if test "x$have_unac" != "xyes"; then
-      AC_MSG_ERROR([Couldn't find UNAC >= $UNAC_REQUIRED.])
-   fi
-fi
-
-AM_CONDITIONAL(HAVE_UNAC, test "x$have_unac" = "xyes")
-
-##################################################################
 # Enable Gnome Keyring support to store credentials (for web miners)
 ##################################################################
 
@@ -1963,7 +1930,6 @@ Build Configuration:
 	Support for HAL:			$have_hal
 	Support for UPower:			$have_upower
 	Support for file monitoring:            $have_file_monitor
-	Support for accent stripping (unac):  	$have_unac
 	Support for Cyrillic languages (enca): 	$have_enca
 	Support for network status detection:	$have_network_manager
 	Unicode support library: 		$with_unicode_support
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index d521c9c..06858f8 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -42,9 +42,6 @@
 #define IS_ASCII_IGNORE(c)       ((c) <= 0x002C)
 #define IS_HYPHEN(c)             ((c) == 0x002D)
 #define IS_UNDERSCORE(c)         ((c) == 0x005F)
-#define IS_NEWLINE(c)            ((c) == 0x000D)
-#define IS_O(c)                          ((c) == 0x006F)
-#define IS_R(c)                          ((c) == 0x0072)
 
 typedef enum {
 	TRACKER_PARSER_WORD_ASCII_HIGHER,
@@ -162,64 +159,108 @@ get_encoding (const gchar *txt)
 
 }
 
+static gboolean
+tracker_parser_unaccent_nfkd_word (gchar *word,
+                                   gsize *word_length)
+{
+	/* The input word in this method MUST be normalized in NFKD form */
+	gsize i;
+	gsize j;
+
+	g_return_val_if_fail (word, FALSE);
+	g_return_val_if_fail (word_length, FALSE);
+	g_return_val_if_fail (*word_length > 0, FALSE);
+
+	i = 0;
+	j = 0;
+	while (i < *word_length) {
+		gunichar unichar;
+		gchar *next_utf8;
+		gint utf8_len;
+
+		/* Get next character of the word as UCS4 */
+		unichar = g_utf8_get_char_validated (&word[i], -1);
+
+		/* Invalid UTF-8 character or end of original string. */
+		if (unichar == (gunichar)-1 ||
+		    unichar == (gunichar)-2) {
+			break;
+		}
+
+		/* Find next UTF-8 character */
+		next_utf8 = g_utf8_next_char (&word[i]);
+		utf8_len = next_utf8 - &word[i];
+
+		/* If the given unichar is a combining diacritical mark,
+		 *  just update the original index, not the output one */
+		if (IS_CDM_UCS4 ((guint32)unichar)) {
+			i += utf8_len;
+			continue;
+		}
+
+		/* If already found a previous combining
+		 *  diacritical mark, indexes are different so
+		 *  need to copy characters. As output and input
+		 *  buffers may overlap, need to use memmove
+		 *  instead of memcpy */
+		if (i != j) {
+			memmove (&word[j], &word[i], utf8_len);
+		}
+
+		/* Update both indexes */
+		i += utf8_len;
+		j += utf8_len;
+	}
+
+	/* Force proper string end */
+	word[j] = '\0';
+	/* Set new output length */
+	*word_length = j;
+
+	return TRUE;
+}
+
 static gchar *
 process_word_utf8 (TrackerParser *parser,
-		   const gchar   *word,
-		   gint           length,
+                   const gchar   *word,
+                   gint           length,
                    gboolean       do_strip,
                    gboolean      *stop_word)
 {
 	gchar *stem_word;
 	gchar *str;
-	gchar *stripped_word;
 	gsize  bytes, len;
 
 	g_return_val_if_fail (parser != NULL, NULL);
 	g_return_val_if_fail (word != NULL, NULL);
 
 	str = NULL;
-	stripped_word = NULL;
 
 	if (word) {
-		if (length == -1) {
-			bytes = strlen (word);
-		} else {
-			bytes = length;
-		}
+		bytes = length == -1 ? strlen (word) : length;
 
 		/* Log original word */
 		tracker_parser_message_hex ("ORIGINAL word",
 		                            word, bytes);
 
-		if (parser->enable_unaccent && do_strip) {
-			stripped_word = tracker_parser_unaccent_utf8_word (word,
-			                                                   bytes,
-			                                                   &len);
-
-			/* Log after UNAC stripping */
-			tracker_parser_message_hex (" After UNAC stripping",
-			                            stripped_word, len);
-		} else {
-			stripped_word = NULL;
+		str = g_utf8_normalize (word, bytes, G_NORMALIZE_NFKD);
+		if (!str) {
+			return NULL;
 		}
 
-		if (!stripped_word) {
-			str = g_utf8_normalize (word,
-			                        bytes,
-			                        G_NORMALIZE_NFC);
-		} else {
-			str = g_utf8_normalize (stripped_word,
-			                        len,
-			                        G_NORMALIZE_NFC);
-			g_free (stripped_word);
-		}
+		/* Update string length */
+		bytes = strlen (str);
 
 		/* Log after normalization */
-		tracker_parser_message_hex ("  After NFC normalization",
-		                            str, strlen ((gchar *)str));
+		tracker_parser_message_hex (" After NFKD normalization",
+		                            str, bytes);
 
-		if (!str) {
-			return NULL;
+		if (parser->enable_unaccent &&
+		    do_strip &&
+		    tracker_parser_unaccent_nfkd_word (str, &bytes)) {
+			/* Log after UNAC stripping */
+			tracker_parser_message_hex ("  After UNAC stripping",
+			                            str, bytes);
 		}
 
 		/* Check if stop word */
@@ -232,9 +273,9 @@ process_word_utf8 (TrackerParser *parser,
 			return str;
 		}
 
-		len = strlen (str);
-
-		stem_word = tracker_language_stem_word (parser->language, str, len);
+		stem_word = tracker_language_stem_word (parser->language,
+		                                        str,
+		                                        bytes);
 
 		if (stem_word) {
 			g_free (str);
@@ -414,7 +455,7 @@ parser_next (TrackerParser *parser,
 		case TRACKER_PARSER_WORD_ASCII_HIGHER:
 			c += 32;
 
-                        /* Fall through */
+			/* Fall through */
 		case TRACKER_PARSER_WORD_ASCII_LOWER:
 		case TRACKER_PARSER_WORD_HYPHEN:
 		case TRACKER_PARSER_WORD_UNDERSCORE:
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 55151ec..f69fb2a 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -138,6 +138,109 @@ get_word_info (const UChar           *word,
 	return TRUE;
 }
 
+static gboolean
+tracker_parser_unaccent_nfkd_word (UChar *word,
+                                   gsize *word_length)
+{
+	/* The input word in this method MUST be normalized in NFKD form */
+	gsize i;
+	gsize j;
+
+	g_return_val_if_fail (word, FALSE);
+	g_return_val_if_fail (word_length, FALSE);
+	g_return_val_if_fail (*word_length > 0, FALSE);
+
+	i = 0;
+	j = 0;
+	while (i < *word_length) {
+		UChar32 unichar;
+		gint utf16_len; /* given in UChars */
+		gsize aux_i;
+
+		/* Get next character of the word as UCS4 */
+		aux_i = i;
+		U16_NEXT (word, aux_i, *word_length, unichar);
+		utf16_len = aux_i - i;
+
+		/* Invalid UTF-16 character or end of original string. */
+		if (utf16_len <= 0) {
+			break;
+		}
+
+		/* If the given unichar is a combining diacritical mark,
+		 *  just update the original index, not the output one */
+		if (IS_CDM_UCS4 ((guint32)unichar)) {
+			i += utf16_len;
+			continue;
+		}
+
+		/* If already found a previous combining
+		 *  diacritical mark, indexes are different so
+		 *  need to copy characters. As output and input
+		 *  buffers may overlap, need to use memmove
+		 *  instead of memcpy */
+		if (i != j) {
+			memmove (&word[j], &word[i], sizeof (UChar) * utf16_len);
+		}
+
+		/* Update both indexes */
+		i += utf16_len;
+		j += utf16_len;
+	}
+
+	/* Force proper string end */
+	word[j] = (UChar)0;
+	/* Set new output length */
+	*word_length = j;
+
+	return TRUE;
+}
+
+static gchar *
+convert_UChar_to_utf8 (const UChar *word,
+                       gsize        uchar_len,
+                       gsize       *utf8_len)
+{
+	gchar *utf8_str;
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *converter;
+	gsize new_utf8_len;
+
+	g_return_val_if_fail (word, NULL);
+	g_return_val_if_fail (utf8_len, NULL);
+
+	/* Open converter from UChar to UTF-8 */
+	converter = ucnv_open ("UTF-8", &icu_error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+		return NULL;
+	}
+
+	/* A 2-byte UTF-16 code unit expands to at most 3 bytes in UTF-8 (a surrogate
+	 *  pair, 4 bytes, becomes 4), so 2 output bytes per input byte suffice. */
+	utf8_str = g_malloc (2 * uchar_len * sizeof (UChar) + 1);
+
+	/* Convert from UChar to UTF-8 (NIL-terminated) */
+	new_utf8_len = ucnv_fromUChars (converter,
+	                                utf8_str,
+	                                2 * uchar_len * sizeof (UChar) + 1,
+	                                word,
+	                                uchar_len,
+	                                &icu_error);
+	if (U_FAILURE (icu_error)) {
+		g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+		           u_errorName (icu_error));
+		g_free (utf8_str);
+		ucnv_close (converter);
+		return NULL;
+	}
+
+	*utf8_len = new_utf8_len;
+	ucnv_close (converter);
+	return utf8_str;
+}
+
 static gchar *
 process_word_uchar (TrackerParser         *parser,
                     const UChar           *word,
@@ -148,13 +251,12 @@ process_word_uchar (TrackerParser         *parser,
 	UErrorCode error = U_ZERO_ERROR;
 	UChar normalized_buffer [WORD_BUFFER_LENGTH];
 	gchar *utf8_str = NULL;
-	gchar *stemmed = NULL;
-	size_t new_word_length;
+	gsize new_word_length;
 
 	/* Log original word */
 	tracker_parser_message_hex ("ORIGINAL word",
 	                            (guint8 *)word,
-				    length * sizeof (UChar));
+	                            length * sizeof (UChar));
 
 
 	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
@@ -178,12 +280,12 @@ process_word_uchar (TrackerParser         *parser,
 		/* Log after casefolding */
 		tracker_parser_message_hex (" After Casefolding",
 		                            (guint8 *)casefolded_buffer,
-					    new_word_length * sizeof (UChar));
+		                            new_word_length * sizeof (UChar));
 
-		/* NFC normalization... */
+		/* NFKD normalization... */
 		new_word_length = unorm_normalize (casefolded_buffer,
 		                                   new_word_length,
-		                                   UNORM_NFC,
+		                                   UNORM_NFKD,
 		                                   0,
 		                                   normalized_buffer,
 		                                   WORD_BUFFER_LENGTH,
@@ -200,7 +302,7 @@ process_word_uchar (TrackerParser         *parser,
 		/* Log after casefolding */
 		tracker_parser_message_hex (" After Normalization",
 		                            (guint8 *)normalized_buffer,
-					    new_word_length * sizeof (UChar));
+		                            new_word_length * sizeof (UChar));
 	} else {
 		/* For ASCII-only, just tolower() each character */
 		new_word_length = u_strToLower (normalized_buffer,
@@ -218,67 +320,29 @@ process_word_uchar (TrackerParser         *parser,
 		/* Log after casefolding */
 		tracker_parser_message_hex (" After lowercase",
 		                            (guint8 *)normalized_buffer,
-					    new_word_length * sizeof (UChar));
+		                            new_word_length * sizeof (UChar));
 	}
 
 	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
-	if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
-		gsize stripped_word_length;
-
-		/* Get unaccented string in UTF-8 */
-		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
-		                                               new_word_length,
-		                                               &stripped_word_length);
-		if (utf8_str) {
-			new_word_length = stripped_word_length;
-
-			/* Log after unaccenting */
-			tracker_parser_message_hex ("   After UNAC",
-						    utf8_str,
-						    new_word_length);
-		}
+	if (parser->enable_unaccent &&
+	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
+	    tracker_parser_unaccent_nfkd_word (normalized_buffer,
+	                                       &new_word_length)) {
+		/* Log after unaccenting */
+		tracker_parser_message_hex ("  After UNAC",
+		                            (guint8 *)normalized_buffer,
+		                            new_word_length * sizeof (UChar));
 	}
 
-	/* If stripping failed or not needed, convert to UTF-8 */
-	if (!utf8_str) {
-		UErrorCode icu_error = U_ZERO_ERROR;
-		UConverter *converter;
-		gsize utf8_len;
-
-		/* Open converter UChar to UTF-16BE */
-		converter = ucnv_open ("UTF-8", &icu_error);
-		if (!converter) {
-			g_warning ("Cannot open UTF-8 converter: '%s'",
-			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
-			return NULL;
-		}
-		/* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
-		 *  in UTF-8. */
-		utf8_str = g_malloc (2 * new_word_length * sizeof (UChar) + 1);
-
-		/* Convert from UChar to UTF-8 (NIL-terminated) */
-		utf8_len = ucnv_fromUChars (converter,
-		                            utf8_str,
-		                            2 * new_word_length * sizeof (UChar) + 1,
-		                            normalized_buffer,
-		                            new_word_length,
-		                            &icu_error);
-		if (U_FAILURE (icu_error)) {
-			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
-			           u_errorName (icu_error));
-			g_free (utf8_str);
-			ucnv_close (converter);
-			return NULL;
-		}
+	/* Finally, convert to UTF-8 */
+	utf8_str = convert_UChar_to_utf8 (normalized_buffer,
+	                                  new_word_length,
+	                                  &new_word_length);
 
-		new_word_length = utf8_len;
-		ucnv_close (converter);
-
-		/* Log after unaccenting */
-		tracker_parser_message_hex ("   After UTF8 conversion",
-		                            utf8_str,
-					    new_word_length);
-	}
+	/* Log after UTF-8 conversion */
+	tracker_parser_message_hex ("   After UTF8 conversion",
+	                            utf8_str,
+	                            new_word_length);
 
 	/* Check if stop word */
 	if (parser->ignore_stop_words) {
@@ -287,21 +351,24 @@ process_word_uchar (TrackerParser         *parser,
 	}
 
 	/* Stemming needed? */
-	if (parser->enable_stemmer) {
+	if (utf8_str &&
+	    parser->enable_stemmer) {
+		gchar *stemmed;
+
 		/* Input for stemmer ALWAYS in UTF-8, as well as output */
 		stemmed = tracker_language_stem_word (parser->language,
 		                                      utf8_str,
 		                                      new_word_length);
 
 		/* Log after stemming */
-		tracker_parser_message_hex ("   After stemming",
+		tracker_parser_message_hex ("    After stemming",
 		                            stemmed, strlen (stemmed));
-	}
 
-	/* If stemmed wanted and succeeded, free previous and return it */
-	if (stemmed) {
-		g_free (utf8_str);
-		return stemmed;
+		/* If stemmed wanted and succeeded, free previous and return it */
+		if (stemmed) {
+			g_free (utf8_str);
+			return stemmed;
+		}
 	}
 
 	return utf8_str;
@@ -510,7 +577,7 @@ tracker_parser_reset (TrackerParser *parser,
 	if (!converter) {
 		g_warning ("Cannot open UTF-8 converter: '%s'",
 		           U_FAILURE (error) ? u_errorName (error) : "none");
-               return;
+		return;
 	}
 
 	/* Allocate UChars and offsets buffers */
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 07f638d..240ea44 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -139,6 +139,61 @@ get_word_info (TrackerParser         *parser,
 	return TRUE;
 }
 
+static gboolean
+tracker_parser_unaccent_nfkd_word (gchar *word,
+                                   gsize *word_length)
+{
+	/* The input word in this method MUST be normalized in NFKD form */
+	gsize i;
+	gsize j;
+
+	g_return_val_if_fail (word, FALSE);
+	g_return_val_if_fail (word_length, FALSE);
+	g_return_val_if_fail (*word_length > 0, FALSE);
+
+	i = 0;
+	j = 0;
+	while (i < *word_length) {
+		ucs4_t unichar;
+		gint utf8_len;
+
+		/* Get next character of the word as UCS4 */
+		utf8_len = u8_strmbtouc (&unichar, &word[i]);
+
+		/* Invalid UTF-8 character or end of original string. */
+		if (utf8_len <= 0) {
+			break;
+		}
+
+		/* If the given unichar is a combining diacritical mark,
+		 *  just update the original index, not the output one */
+		if (IS_CDM_UCS4 ((guint32)unichar)) {
+			i += utf8_len;
+			continue;
+		}
+
+		/* If already found a previous combining
+		 *  diacritical mark, indexes are different so
+		 *  need to copy characters. As output and input
+		 *  buffers may overlap, need to use memmove
+		 *  instead of memcpy */
+		if (i != j) {
+			memmove (&word[j], &word[i], utf8_len);
+		}
+
+		/* Update both indexes */
+		i += utf8_len;
+		j += utf8_len;
+	}
+
+	/* Force proper string end */
+	word[j] = '\0';
+	/* Set new output length */
+	*word_length = j;
+
+	return TRUE;
+}
+
 static gchar *
 process_word_utf8 (TrackerParser         *parser,
                    const gchar           *word,
@@ -148,7 +203,6 @@ process_word_utf8 (TrackerParser         *parser,
 {
 	gchar word_buffer [WORD_BUFFER_LENGTH];
 	gchar *normalized = NULL;
-	gchar *stripped = NULL;
 	gchar *stemmed = NULL;
 	size_t new_word_length;
 
@@ -171,13 +225,13 @@ process_word_utf8 (TrackerParser         *parser,
 		/* Leave space for last NIL */
 		new_word_length = WORD_BUFFER_LENGTH - 1;
 
-		/* Casefold and NFC normalization in output.
+		/* Casefold and NFKD normalization in output.
 		 *  NOTE: if the output buffer is not big enough, u8_casefold will
 		 *  return a newly-allocated buffer. */
 		normalized = u8_casefold ((const uint8_t *)word,
 		                          length,
 		                          uc_locale_language (),
-		                          UNINORM_NFC,
+		                          UNINORM_NFKD,
 		                          word_buffer,
 		                          &new_word_length);
 
@@ -192,7 +246,7 @@ process_word_utf8 (TrackerParser         *parser,
 		}
 
 		/* Log after Normalization */
-		tracker_parser_message_hex (" After Casefolding and NFC normalization",
+		tracker_parser_message_hex (" After Casefolding and NFKD normalization",
 		                            normalized, new_word_length);
 	} else {
 		/* For ASCII-only, just tolower() each character */
@@ -215,31 +269,25 @@ process_word_utf8 (TrackerParser         *parser,
 	normalized[new_word_length] = '\0';
 
 	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
-	if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
-		gsize stripped_word_length;
-
-		stripped = tracker_parser_unaccent_utf8_word (normalized,
-		                                              new_word_length,
-		                                              &stripped_word_length);
-
-		if (stripped) {
-			/* Log after UNAC stripping */
-			tracker_parser_message_hex ("  After UNAC stripping",
-			                            stripped, stripped_word_length);
-			new_word_length = stripped_word_length;
-		}
+	if (parser->enable_unaccent &&
+	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
+	    tracker_parser_unaccent_nfkd_word (normalized,
+	                                       &new_word_length)) {
+		/* Log after UNAC stripping */
+		tracker_parser_message_hex ("  After UNAC stripping",
+		                            normalized, new_word_length);
 	}
 
 	/* Check if stop word */
 	if (parser->ignore_stop_words) {
 		*stop_word = tracker_language_is_stop_word (parser->language,
-		                                            stripped ? stripped : normalized);
+		                                            normalized);
 	}
 
 	/* Stemming needed? */
 	if (parser->enable_stemmer) {
 		stemmed = tracker_language_stem_word (parser->language,
-		                                      stripped ? stripped : normalized,
+		                                      normalized,
 		                                      new_word_length);
 
 		/* Log after stemming */
@@ -249,21 +297,12 @@ process_word_utf8 (TrackerParser         *parser,
 
 	/* If stemmed wanted and succeeded, free previous and return it */
 	if (stemmed) {
-		g_free (stripped);
 		if (normalized != word_buffer) {
 			g_free (normalized);
 		}
 		return stemmed;
 	}
 
-	/* If stripped wanted and succeeded, free previous and return it */
-	if (stripped) {
-		if (normalized != word_buffer) {
-			g_free (normalized);
-		}
-		return stripped;
-	}
-
 	/* It may be the case that no stripping and no stemming was needed, and
 	 * that the output buffer in stack was enough for case-folding and
 	 * normalization. In this case, need to strdup() the string to return it */
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
index 9c24bd0..76a8ecb 100644
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -21,158 +21,9 @@
 
 #include <string.h>
 
-#ifdef HAVE_UNAC
-#include <unac.h>
-#endif
-
-#ifdef HAVE_LIBICU
-#include <unicode/utypes.h>
-#include <unicode/ucnv.h>
-#endif
-
 #include <libtracker-common/tracker-common.h>
 #include "tracker-parser-utils.h"
 
-
-/* Output is always UTF-8. */
-gchar *
-tracker_parser_unaccent_utf16be_word (const gchar *string,
-                                      gsize        ilength,
-                                      gsize        *p_olength)
-{
-#ifdef HAVE_UNAC
-	GError *error = NULL;
-	gchar *unaccented_str = NULL;
-	gchar *str_utf8 = NULL;
-	gsize unaccented_len;
-	gsize utf8_len;
-
-	*p_olength = 0;
-
-	if (unac_string_utf16 (string, ilength,
-	                       &unaccented_str, &unaccented_len) != 0) {
-		g_warning ("UNAC failed to strip accents");
-		return NULL;
-	}
-
-	/* Convert from UTF-16BE to UTF-8 */
-	str_utf8 = g_convert (unaccented_str,
-	                      unaccented_len,
-	                      "UTF-8",
-	                      "UTF-16BE",
-	                      NULL,
-	                      &utf8_len,
-	                      &error);
-	g_free (unaccented_str);
-
-	if (error) {
-		g_warning ("Could not convert back to UTF-8: %s",
-		           error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	*p_olength = utf8_len;
-	return str_utf8;
-#else
-	return NULL;
-#endif
-}
-
-
-#ifdef HAVE_LIBICU
-/* NOTE: Internally, UChars are UTF-16, but conversion needed just in case,
- *  as libunac needs UTF-16BE. Output is always UTF-8.*/
-gchar *
-tracker_parser_unaccent_UChar_word (const UChar *string,
-                                    gsize        ilength,
-                                    gsize        *p_olength)
-{
-#ifdef HAVE_UNAC
-	UErrorCode icu_error = U_ZERO_ERROR;
-	UConverter *converter;
-	gchar *str_utf16;
-	gchar *str_utf8 = NULL;
-	gsize utf16_len;
-
-	*p_olength = 0;
-
-	/* Open converter UChar to UTF-16BE */
-	converter = ucnv_open ("UTF-16BE", &icu_error);
-	if (!converter) {
-		g_warning ("Cannot open UTF-16BE converter: '%s'",
-		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
-               return NULL;
-	}
-
-	/* Allocate buffer, same size as input string.
-	 * Note that ilength specifies number of UChars not
-	 *  number of bytes */
-	str_utf16 = g_malloc ((ilength + 1) * 2);
-
-	/* Convert from UChar to UTF-16BE */
-	utf16_len = ucnv_fromUChars (converter,
-	                             str_utf16,
-	                             (ilength + 1) * 2,
-	                             string,
-	                             ilength,
-	                             &icu_error);
-	if (U_FAILURE (icu_error)) {
-		g_warning ("Cannot convert from UChar to UTF-16BE: '%s' "
-		           "(ilength: %" G_GSIZE_FORMAT ")",
-		           u_errorName (icu_error),
-		           ilength);
-	} else {
-		str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
-		                                                 utf16_len,
-		                                                 p_olength);
-	}
-	ucnv_close (converter);
-	g_free (str_utf16);
-	return str_utf8;
-#else
-	return NULL;
-#endif
-}
-#endif
-
-gchar *
-tracker_parser_unaccent_utf8_word (const gchar *str,
-                                   gsize        ilength,
-                                   gsize        *p_olength)
-{
-#ifdef HAVE_UNAC
-	GError *error = NULL;
-	gchar *str_utf16 = NULL;
-	gchar *str_utf8 = NULL;
-	gsize utf16_len;
-
-	*p_olength = 0;
-
-	/* unac_string() does roughly the same than below, plus it
-	 * corrupts memory in 64bit systems, so avoid it for now.
-	 */
-	str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
-	if (error) {
-		g_warning ("Could not convert to UTF-16: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	} else {
-
-		str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
-		                                                 utf16_len,
-		                                                 p_olength);
-	}
-
-	g_free (str_utf16);
-	return str_utf8;
-#else
-	return NULL;
-#endif
-}
-
-
 /*
  * Definition of the possible reserved words.
  *  Length of word is explicitly given to avoid strlen() calls
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index 50805c1..f3f884e 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -38,22 +38,19 @@ G_BEGIN_DECLS
                                ((c) >= 0x4E00 && (c) <= 0x9FA5)  ||	\
                                ((c) >= 0x20000 && (c) <= 0x2A6D6))
 
+/* ASCII underscore? */
 #define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
 
-
-gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
-                                             gsize        ilength,
-                                             gsize        *p_olength);
-
-gchar *tracker_parser_unaccent_utf8_word (const gchar *string,
-                                          gsize        ilength,
-                                          gsize        *p_olength);
-
-#ifdef HAVE_LIBICU
-gchar *tracker_parser_unaccent_UChar_word (const UChar *string,
-                                           gsize        ilength,
-                                           gsize        *p_olength);
-#endif
+/* Combining diacritical mark?
+ *  Basic range: [0x0300,0x036F]
+ *  Supplement:  [0x1DC0,0x1DFF]
+ *  For Symbols: [0x20D0,0x20FF]
+ *  Half marks:  [0xFE20,0xFE2F]
+ */
+#define IS_CDM_UCS4(c)        (((c) >= 0x0300 && (c) <= 0x036F)  ||	\
+                               ((c) >= 0x1DC0 && (c) <= 0x1DFF)  ||	\
+                               ((c) >= 0x20D0 && (c) <= 0x20FF)  ||	\
+                               ((c) >= 0xFE20 && (c) <= 0xFE2F))
 
 
 gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 813ce38..5390989 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -170,6 +170,7 @@ expected_word_check (TrackerParserTestFixture *fixture,
 {
 	const TestDataExpectedWord *testdata = data;
 	const gchar *word;
+	gchar *expected_nfkd;
 	gint position;
 	gint byte_offset_start;
 	gint byte_offset_end;
@@ -195,8 +196,15 @@ expected_word_check (TrackerParserTestFixture *fixture,
 	                            &stop_word,
 	                            &word_length);
 
+	/* Expected word MUST always be in NFKD normalization */
+	expected_nfkd = g_utf8_normalize (testdata->expected,
+	                                  -1,
+	                                  G_NORMALIZE_NFKD);
+
 	/* Check if input is same as expected */
-	g_assert_cmpstr (word, == , testdata->expected);
+	g_assert_cmpstr (word, == , expected_nfkd);
+
+	g_free (expected_nfkd);
 }
 
 /* -------------- STOP WORD TESTS ----------------- */
@@ -247,7 +255,6 @@ stop_word_check (TrackerParserTestFixture *fixture,
 
 /* -------------- LIST OF TESTS ----------------- */
 
-#ifdef HAVE_UNAC
 /* Normalization-related tests (unaccenting) */
 static const TestDataExpectedWord test_data_normalization[] = {
 	{ "école",                "ecole", FALSE, TRUE  },
@@ -263,36 +270,30 @@ static const TestDataExpectedWord test_data_normalization[] = {
 
 /* Unaccenting-related tests */
 static const TestDataExpectedWord test_data_unaccent[] = {
-	{ "Murciélago", "murcielago", FALSE, TRUE  },
-	{ "camión",     "camion",     FALSE, TRUE  },
-	{ "desagüe",    "desague",    FALSE, TRUE  },
+	{ "Murciélago",   "murcielago", FALSE, TRUE  },
+	{ "camión",       "camion",     FALSE, TRUE  },
+	{ "desagüe",      "desague",    FALSE, TRUE  },
+	{ "Ὰ",            "α",          FALSE, TRUE  }, /* greek capital alpha with U+0300, composed */
+	{ "ὰ",            "α",          FALSE, TRUE  }, /* greek small alpha with U+0300, composed */
+	{ "Ὶ",            "ι",          FALSE, TRUE  }, /* greek capital iota with U+0300, composed */
+	{ "ὶ",            "ι",          FALSE, TRUE  }, /* greek small iota with U+0300, composed */
+	{ "Ὼ",            "ω",          FALSE, TRUE  }, /* greek capital omega with U+0300, composed */
+	{ "ὼ",            "ω",          FALSE, TRUE  }, /* greek small omega with U+0300, composed */
+#ifdef FULL_UNICODE_TESTS /* glib/pango does not like NFD strings */
+	{ "Ὰ",          "α",          FALSE, TRUE  }, /* capital alpha with U+0300, decomposed */
+	{ "ὰ",          "α",          FALSE, TRUE  }, /* small alpha with U+0300, decomposed */
+	{ "Ὶ",          "ι",          FALSE, TRUE  }, /* capital iota with U+0300, decomposed */
+	{ "ὶ",          "ι",          FALSE, TRUE  }, /* small iota with U+0300, decomposed */
+	{ "Ὼ",          "ω",          FALSE, TRUE  }, /* capital omega with U+0300, decomposed */
+	{ "ὼ",          "ω",          FALSE, TRUE  }, /* small omega with U+0300, decomposed */
+	{ "aN͡Ga",       "anga",       FALSE, TRUE  }, /* 0x0361 affects two characters */
+	{ "aNG͡a",       "anga",       FALSE, TRUE  }, /* 0x0361 affects two characters */
+#endif
 	{ "Murciélago", "murciélago", FALSE, FALSE },
 	{ "camión",     "camión",     FALSE, FALSE },
 	{ "desagüe",    "desagüe",    FALSE, FALSE },
 	{ NULL,         NULL,         FALSE, FALSE }
 };
-#else
-/* Normalization-related tests (not unaccenting) */
-static const TestDataExpectedWord test_data_normalization[] = {
-	{ "école",                "école", FALSE, FALSE },
-	{ "ÉCOLE",                "école", FALSE, FALSE },
-	{ "École",                "école", FALSE, FALSE },
-#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
-	{ "e" "\xCC\x81" "cole",  "école", FALSE, FALSE },
-	{ "E" "\xCC\x81" "COLE",  "école", FALSE, FALSE },
-	{ "E" "\xCC\x81" "cole",  "école", FALSE, FALSE },
-#endif
-	{ "école",                "école", FALSE, TRUE  },
-	{ "ÉCOLE",                "école", FALSE, TRUE  },
-	{ "École",                "école", FALSE, TRUE  },
-#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
-	{ "e" "\xCC\x81" "cole",  "école", FALSE, TRUE  },
-	{ "E" "\xCC\x81" "COLE",  "école", FALSE, TRUE  },
-	{ "E" "\xCC\x81" "cole",  "école", FALSE, TRUE  },
-#endif
-	{ NULL,                   NULL,    FALSE, FALSE }
-};
-#endif /* !HAVE_UNAC */
 
 /* Stemming-related tests */
 static const TestDataExpectedWord test_data_stemming[] = {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]