[tracker/parser-unicode-libs-review] FTS parser: Added new GNU libunistring and libicu based parsers

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/parser-unicode-libs-review] FTS parser: Added new GNU libunistring and libicu based parsers
Date: Fri, 14 May 2010 17:21:32 +0000 (UTC)
commit b497d3a1842d376869258b2ff09d93715d073fb9
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Fri May 14 18:45:18 2010 +0200

    FTS parser: Added new GNU libunistring and libicu based parsers
    
    	* configure.ac: New --with-unicode-support option added to configure,
    	which enables choosing between three parser implementations (glib,
    	libunistring and libicu), defaulting to libunistring if none specified.
    
    	* src/libtracker-fts/tracker-parser-utils.[h|c]: New files including
    	the common utilities for all parsers.
    
    	* src/libtracker-fts/tracker-parser-glib.c: Original custom/pango
    	parser, file renamed from `src/libtracker-fts/tracker-parser.c'
    
    	* src/libtracker-fts/tracker-parser-libicu.c: libicu-based parser
    	implementation.
    
    	* src/libtracker-fts/tracker-parser-libunistring.c: libunistring-based
    	parser implementation.
    
    	* src/libtracker-fts/tracker-parser.h: API of the parser implementation
    	extended to enable/disable indexing numbers.
    
    	* src/libtracker-fts/Makefile.am: Setup the compilation of the new
    	parsers.

 configure.ac                                       |   72 +++
 src/libtracker-fts/Makefile.am                     |   37 +-
 src/libtracker-fts/tracker-fts.c                   |    6 +-
 .../{tracker-parser.c => tracker-parser-glib.c}    |  129 ++---
 src/libtracker-fts/tracker-parser-libicu.c         |  637 ++++++++++++++++++++
 src/libtracker-fts/tracker-parser-libunistring.c   |  499 +++++++++++++++
 src/libtracker-fts/tracker-parser-utils.c          |  235 +++++++
 src/libtracker-fts/tracker-parser-utils.h          |   77 +++
 src/libtracker-fts/tracker-parser.h                |   14 +-
 9 files changed, 1602 insertions(+), 104 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 9e48269..16c5eda 100644
--- a/configure.ac
+++ b/configure.ac
@@ -857,6 +857,77 @@ fi
 
 AM_CONDITIONAL(HAVE_MINER_FLICKR, test "x$have_miner_flickr" = "xyes")
 
+##################################################################
+# GLib, libunistring or libicu?
+# By default, AUTO with this order of preference:
+#  1)  libunistring
+#  2)  libicu
+#  3)  glib
+##################################################################
+
+# Check for libunistring...
+AC_CHECK_HEADER(uniwbrk.h,
+                AC_CHECK_LIB(unistring, u8_wordbreaks))
+have_libunistring=${ac_cv_lib_unistring_u8_wordbreaks:-no}
+LIBUNISTRING_CFLAGS="$CFLAGS"
+LIBUNISTRING_LIBS="$LIBS"
+
+# Check for libicu... Note that AC_CHECK_LIB cannot be used as
+#  symbol name includes libicu library version... don't want to
+#  look for ubrk_next_4_2
+AC_CHECK_HEADER(unicode/ubrk.h, [have_libicu=yes],[have_libicu=no])
+LIBICU_CFLAGS=""
+LIBICU_LIBS="-licuuc"
+
+# Configure option
+AC_ARG_WITH([unicode-support],
+            AS_HELP_STRING([--with-unicode-support],
+                           [Unicode support library? (libunistring|libicu|glib) [[default=auto]]]),,
+            [with_unicode_support=auto])
+
+# If auto, decide ourselves
+if test "x$with_unicode_support" = "xauto"; then
+   if test "x$have_libunistring" = "xyes"; then
+      with_unicode_support=libunistring
+   else
+      if test "x$have_libicu" = "xyes"; then
+         with_unicode_support=libicu
+      else
+         with_unicode_support=glib
+      fi
+   fi
+fi
+
+case "x$with_unicode_support" in
+     # Use libunistring
+     "xlibunistring")
+        AC_SUBST(LIBUNISTRING_CFLAGS)
+        AC_SUBST(LIBUNISTRING_LIBS)
+        if test "x$have_libunistring" = "xyes"; then
+           AC_DEFINE(HAVE_LIBUNISTRING, [], [libunistring Unicode support library])
+        else
+           AC_MSG_ERROR([***libunistring requested but not found - exiting!])
+        fi
+     ;;
+     # Use libicu
+     "xlibicu")
+        AC_SUBST(LIBICU_CFLAGS)
+        AC_SUBST(LIBICU_LIBS)
+        if test "x$have_libicu" = "xyes"; then
+          AC_DEFINE(HAVE_LIBICU, [], [libicu Unicode support library])
+        else
+          AC_MSG_ERROR([***libicu requested but not found - exiting!])
+        fi
+     ;;
+     # Use glib
+     "xglib") ;;
+     # Invalid option value
+     *) AC_MSG_ERROR([***wrong value for --with-unicode-support: $with_unicode_support - exiting!]) ;;
+esac
+# Build the parser that was SELECTED above, not every library that was merely detected
+AM_CONDITIONAL(HAVE_LIBUNISTRING, test "x$with_unicode_support" = "xlibunistring")
+AM_CONDITIONAL(HAVE_LIBICU, test "x$with_unicode_support" = "xlibicu")
+
 ####################################################################
 # Miner Evolution
 ####################################################################
@@ -1852,6 +1923,7 @@ Build Configuration:
 	Support for accent stripping (unac):  	$have_unac
 	Support for Cyrillic languages (enca): 	$have_enca
 	Support for network status detection:	$have_network_manager
+	Unicode support library: 		$with_unicode_support
 
 Applications:
 
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 4938097..62c6d7a 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -7,11 +7,20 @@ INCLUDES =								\
 	$(WARN_CFLAGS)							\
 	$(GLIB2_CFLAGS)							\
 	$(GCOV_CFLAGS)							\
-	$(PANGO_CFLAGS)							\
 	$(DBUS_CFLAGS)							\
 	$(UNAC_CFLAGS)							\
 	$(SQLITE3_CFLAGS)
 
+if HAVE_LIBUNISTRING
+  INCLUDES += $(LIBUNISTRING_CFLAGS)
+else
+if HAVE_LIBICU
+  INCLUDES += $(LIBICU_CFLAGS)
+else
+  INCLUDES += $(PANGO_CFLAGS)
+endif
+endif
+
 noinst_LTLIBRARIES = libtracker-fts.la
 
 libtracker_fts_la_SOURCES = 						\
@@ -21,15 +30,35 @@ libtracker_fts_la_SOURCES = 						\
 	tracker-fts-config.h						\
 	tracker-fts-hash.c						\
 	tracker-fts-hash.h						\
-	tracker-parser.c						\
+	tracker-parser-utils.c						\
+	tracker-parser-utils.h						\
 	tracker-parser.h
 
+if HAVE_LIBUNISTRING
+  libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
+else
+if HAVE_LIBICU
+  libtracker_fts_la_SOURCES += tracker-parser-libicu.c
+else
+  libtracker_fts_la_SOURCES += tracker-parser-glib.c
+endif
+endif
+
 libtracker_fts_la_LIBADD =						\
 	$(top_builddir)/src/libtracker-common/libtracker-common.la	\
 	$(SQLITE3_LIBS)							\
 	$(DBUS_LIBS)							\
 	$(GTHREAD_LIBS)							\
 	$(GCOV_LIBS)							\
-	$(PANGO_LIBS)							\
 	$(UNAC_LIBS)							\
-	$(GLIB2_LIBS)							
+	$(GLIB2_LIBS)
+
+if HAVE_LIBUNISTRING
+  libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+else
+if HAVE_LIBICU
+  libtracker_fts_la_LIBADD += $(LIBICU_LIBS)
+else
+  libtracker_fts_la_LIBADD += $(PANGO_LIBS)
+endif
+endif
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index c2f6f60..ab2b329 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -3666,7 +3666,7 @@ static void snippetOffsetsOfColumn(
   pVtab = pQuery->pFts;
   nColumn = pVtab->nColumn;
 
-  tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, FALSE);
+  tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, TRUE, TRUE);
 
   aTerm = pQuery->pTerms;
   nTerm = pQuery->nTerms;
@@ -4363,7 +4363,7 @@ static int tokenizeSegment(
   int firstIndex = pQuery->nTerms;
   int nTerm = 1;
 
-  tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, TRUE);
+  tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, FALSE, TRUE);
 
   while( 1 ){
     const char *pToken;
@@ -4816,7 +4816,7 @@ int Catid,
 
   if (!zText) return SQLITE_OK;
 
-  tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, FALSE);
+  tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, TRUE, TRUE);
 
   while( 1 ){
 
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser-glib.c
similarity index 86%
rename from src/libtracker-fts/tracker-parser.c
rename to src/libtracker-fts/tracker-parser-glib.c
index bd9326f..a2144af 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
- * Copyright (C) 2008, Nokia <ivan frade nokia com>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -21,15 +21,11 @@
 #include "config.h"
 
 #include <string.h>
-#include <pango/pango.h>
 
-#ifdef HAVE_UNAC
-#include <unac.h>
-#endif
+#include <pango/pango.h>
 
 #include "tracker-parser.h"
-
-#define INDEX_NUMBER_MIN_LENGTH 6
+#include "tracker-parser-utils.h"
 
 /* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
  * 0x9FA5, 0x20000 - <= 0x2A6D6
@@ -80,7 +76,8 @@ struct TrackerParser {
 	guint                  max_words_to_index;
 	guint                  max_word_length;
 	gboolean               delimit_words;
-	gboolean               parse_reserved_words;
+	gboolean               skip_reserved_words;
+	gboolean               skip_numbers;
 
 	/* Private members */
 	gchar                   *word;
@@ -138,58 +135,6 @@ get_word_type (gunichar c)
 	return TRACKER_PARSER_WORD_IGNORE;
 }
 
-static inline gchar *
-strip_word (const gchar *str,
-            gint         length,
-            guint32     *len)
-{
-#ifdef HAVE_UNAC
-	GError *error = NULL;
-	gchar *str_utf16;
-	gsize utf16_len, unaccented_len, final_len;
-	gchar *unaccented_str = NULL;
-	gchar *s = NULL;
-
-	*len = 0;
-
-	/* unac_string() does roughly the same than below, plus it
-	 * corrupts memory in 64bit systems, so avoid it for now.
-	 */
-	str_utf16 = g_convert (str, length, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
-	if (error) {
-		g_warning ("Could not convert to UTF-16: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	if (unac_string_utf16 (str_utf16, utf16_len,
-	                       &unaccented_str, &unaccented_len) != 0) {
-		g_warning ("UNAC failed to strip accents");
-		g_free (str_utf16);
-		return NULL;
-	}
-
-	g_free (str_utf16);
-
-	s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
-	g_free (unaccented_str);
-
-	if (error) {
-		g_warning ("Could not convert back to UTF-8: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	*len = (guint32) final_len;
-
-	return s;
-#else
-	*len = length;
-	return NULL;
-#endif
-}
-
 static TrackerParserEncoding
 get_encoding (const gchar *txt)
 {
@@ -219,21 +164,6 @@ get_encoding (const gchar *txt)
 }
 
 static gboolean
-is_stop_word (TrackerLanguage *language,
-              const gchar     *word)
-{
-	GHashTable *stop_words;
-
-	if (!word) {
-		return FALSE;
-	}
-
-	stop_words = tracker_language_get_stop_words (language);
-
-	return g_hash_table_lookup (stop_words, word) != NULL;
-}
-
-static gboolean
 pango_next (TrackerParser *parser,
             gint          *byte_offset_start,
             gint          *byte_offset_end)
@@ -348,14 +278,14 @@ parser_next (TrackerParser *parser,
 				/* word break */
 
 				/* check if word is reserved */
-				if (is_valid && parser->parse_reserved_words) {
+				if (is_valid && parser->skip_reserved_words) {
 					if (length == 2 && word[0] == 'o' && word[1] == 'r') {
-						break;
+						is_valid = FALSE;
 					}
 				}
 
 				if (!is_valid ||
-				    word_type == TRACKER_PARSER_WORD_NUM) {
+				    (parser->skip_numbers && word_type == TRACKER_PARSER_WORD_NUM)) {
 					word_type = TRACKER_PARSER_WORD_IGNORE;
 					is_valid = TRUE;
 					length = 0;
@@ -382,12 +312,12 @@ parser_next (TrackerParser *parser,
 			 * underscore if we are filtering.
 			 */
 
-			if (type == TRACKER_PARSER_WORD_NUM) {
+			if (parser->skip_numbers && type == TRACKER_PARSER_WORD_NUM) {
 				is_valid = FALSE;
 				continue;
 			} else {
 				if (type == TRACKER_PARSER_WORD_HYPHEN) {
-					is_valid = parser->parse_reserved_words;
+					is_valid = !parser->skip_reserved_words;
 					continue;
 				}
 			}
@@ -463,8 +393,6 @@ parser_next (TrackerParser *parser,
 		gchar       *utf8;
 		gchar       *processed_word;
 
-
-
 		utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
 
 		if (!utf8) {
@@ -535,7 +463,8 @@ tracker_parser_reset (TrackerParser *parser,
                       gboolean       delimit_words,
                       gboolean       enable_stemmer,
                       gboolean       enable_stop_words,
-                      gboolean       parse_reserved_words)
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
 {
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
@@ -543,21 +472,23 @@ tracker_parser_reset (TrackerParser *parser,
 	g_free (parser->attrs);
 	parser->attrs = NULL;
 
+	parser->cursor = txt;
+	parser->encoding = get_encoding (txt);
+
 	parser->enable_stemmer = enable_stemmer;
 	parser->enable_stop_words = enable_stop_words;
 	parser->delimit_words = delimit_words;
-	parser->encoding = get_encoding (txt);
+
 	parser->txt_size = txt_size;
 	parser->txt = txt;
-	parser->parse_reserved_words = parse_reserved_words;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
 
 	g_free (parser->word);
 	parser->word = NULL;
 
 	parser->word_position = 0;
 
-	parser->cursor = txt;
-
 	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
 		PangoLogAttr *attrs;
 
@@ -583,14 +514,14 @@ tracker_parser_reset (TrackerParser *parser,
 
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
-                             const char    *word,
+                             const gchar   *word,
                              gint           length,
                              gboolean       do_strip)
 {
 	gchar *stem_word;
 	gchar *str;
 	gchar *stripped_word;
-	guint  bytes, len;
+	gsize  bytes, len;
 
 	g_return_val_if_fail (parser != NULL, NULL);
 	g_return_val_if_fail (word != NULL, NULL);
@@ -605,8 +536,18 @@ tracker_parser_process_word (TrackerParser *parser,
 			bytes = length;
 		}
 
+		/* Log original word */
+		tracker_parser_message_hex ("ORIGINAL word",
+		                            word, bytes);
+
 		if (do_strip) {
-			stripped_word = strip_word (word, bytes, &len);
+			stripped_word = tracker_parser_unaccent_utf8_word (word,
+			                                                   bytes,
+			                                                   &len);
+
+			/* Log after UNAC stripping */
+			tracker_parser_message_hex (" After UNAC stripping",
+			                            stripped_word, len);
 		} else {
 			stripped_word = NULL;
 		}
@@ -622,6 +563,10 @@ tracker_parser_process_word (TrackerParser *parser,
 			g_free (stripped_word);
 		}
 
+		/* Log after normalization */
+		tracker_parser_message_hex ("  After NFC normalization",
+		                            str, strlen ((gchar *)str));
+
 		if (!str) {
 			return NULL;
 		}
@@ -672,7 +617,9 @@ tracker_parser_next (TrackerParser *parser,
 			str = parser->word;
 		}
 
-		if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
+		if (str &&
+		    parser->enable_stop_words &&
+		    tracker_language_is_stop_word (parser->language, str)) {
 			*stop_word = TRUE;
 		} else {
 			parser->word_position++;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
new file mode 100644
index 0000000..190931c
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/ubrk.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+#include <unicode/unorm.h>
+
+#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
+
+/* Type of words detected */
+typedef enum {
+	TRACKER_PARSER_WORD_TYPE_ASCII,         /* every character of the word is ASCII */
+	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,    /* has a non-ASCII character and does not start with CJK: UNAC stripping applies */
+	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC, /* starts with a CJK character: no UNAC stripping */
+} TrackerParserWordType;
+
+/* Max possible length of a UChar encoded string (just a safety limit) */
+#define WORD_BUFFER_LENGTH 512
+
+
+static gchar *process_word_uchar (TrackerParser *parser,
+                                  const UChar   *word,
+                                  gint           length,
+                                  TrackerParserWordType type);
+
+
+struct TrackerParser {
+	const gchar           *txt;      /* input text given to tracker_parser_reset(); not freed by the parser */
+	gint                   txt_size; /* size of txt as given to tracker_parser_reset() */
+
+	TrackerLanguage       *language; /* reference taken in tracker_parser_new() */
+	gboolean               enable_stemmer;
+	gboolean               enable_stop_words;
+	guint                  max_words_to_index;
+	guint                  max_word_length;
+	gboolean               delimit_words;
+	gboolean               skip_reserved_words;
+	gboolean               skip_numbers;
+
+	/* Private members */
+	gchar                 *word;          /* last processed word, owned by the parser */
+	gint                   word_length;   /* strlen() of word */
+	guint                  word_position; /* counter returned as 'position' by tracker_parser_next() */
+
+	/* Text as UChars */
+	UChar                 *utxt;
+	gint                   utxt_size; /* number of UChars in utxt */
+	/* Original offset of each UChar in the input txt string */
+	gint32                *offsets;
+
+	/* The word-break iterator */
+	UBreakIterator        *bi;
+
+	/* Cursor, as index in the utxt array (counted in UChars, not in bytes) */
+	gsize                  cursor;
+};
+
+/* Tells whether 'word' may start a token and, if so, its type; FALSE only if it has no first character */
+static gboolean
+get_word_info (const UChar           *word,
+               gsize                  word_length,
+               gboolean               skip_numbers,
+               gboolean              *p_is_allowed_word_start,
+               TrackerParserWordType *p_word_type)
+{
+	UCharIterator iter;
+	UChar32 unichar;
+	guint8 unichar_gc;
+
+	/* Get first character of the word as UCS4 */
+	uiter_setString (&iter, word, word_length);
+	unichar = uiter_current32 (&iter);
+	if (unichar == U_SENTINEL) {
+		return FALSE; /* neither output parameter has been written */
+	}
+
+	/* We only want the words where the first character
+	 *  in the word is either a letter, a number or a symbol.
+	 * This is needed because the word break algorithm also
+	 *  considers word breaks after for example commas or other
+	 *  punctuation marks.
+	 * Note that looking at the first character in the string
+	 *  should be compatible with all Unicode normalization
+	 *  methods.
+	 */
+	unichar_gc = u_charType (unichar);
+	if (unichar_gc == U_UPPERCASE_LETTER ||
+	    unichar_gc == U_LOWERCASE_LETTER ||
+	    unichar_gc == U_TITLECASE_LETTER ||
+	    unichar_gc == U_MODIFIER_LETTER ||
+	    unichar_gc == U_OTHER_LETTER ||
+	    IS_UNDERSCORE_UCS4 ((guint32)unichar) ||
+	    (!skip_numbers &&
+	     (unichar_gc == U_DECIMAL_DIGIT_NUMBER ||
+	      unichar_gc == U_LETTER_NUMBER ||
+	      unichar_gc == U_OTHER_NUMBER))) {
+		*p_is_allowed_word_start = TRUE;
+	} else {
+		*p_is_allowed_word_start = FALSE;
+		return TRUE; /* *p_word_type is left unset in this case */
+	}
+
+	/* Word starts with a CJK character? */
+	if (IS_CJK_UCS4 ((guint32)unichar)) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+		return TRUE;
+	}
+
+	/* Is ASCII-only string? */
+	while (unichar != U_SENTINEL)
+	{
+		if (!IS_ASCII_UCS4 ((guint32)unichar)) {
+			*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; /* first non-ASCII character decides */
+			return TRUE;
+		}
+		unichar = uiter_next32 (&iter);
+	}
+
+	*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
+	return TRUE;
+}
+
+/*
+ * Advances the word-break iterator until a word passes every filter
+ * (maximum length, allowed first character, reserved words) and can
+ * be processed. On success stores the processed UTF-8 word in
+ * parser->word, sets the byte offsets of the original word in the
+ * input text and returns TRUE; returns FALSE when no word is left.
+ */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	gsize word_length_uchar = 0;
+	gsize word_length_utf8 = 0;
+	gchar *processed_word = NULL;
+	gsize current_word_offset_utf8 = 0;
+
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	/* Loop to look for next valid word */
+	while (!processed_word &&
+	       parser->cursor < parser->utxt_size) {
+		TrackerParserWordType type;
+		gboolean is_allowed;
+		gsize next_word_offset_uchar;
+		gsize next_word_offset_utf8;
+		gsize truncated_length;
+
+		/* Set current word offset in the original UTF-8 string */
+		current_word_offset_utf8 = parser->offsets[parser->cursor];
+
+		/* Find next word break. */
+		next_word_offset_uchar = ubrk_next (parser->bi);
+		if (next_word_offset_uchar >= parser->utxt_size) {
+			/* Last word support... */
+			next_word_offset_uchar = parser->utxt_size;
+			next_word_offset_utf8 = parser->txt_size;
+		}
+		else {
+			next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
+		}
+
+		/* Word end is the first byte after the word, which is either the
+		 *  start of next word or the end of the string */
+		word_length_uchar = next_word_offset_uchar - parser->cursor;
+		word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
+
+		/* Skip the word if longer than the maximum allowed */
+		if (word_length_utf8 >= parser->max_word_length) {
+			/* Skip this word and keep on looping */
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
+		/* Get word info... */
+		if (!get_word_info (&parser->utxt[parser->cursor],
+		                    word_length_uchar,
+		                    parser->skip_numbers,
+		                    &is_allowed,
+		                    &type)) {
+			/* Quit loop just in case */
+			parser->cursor = parser->utxt_size;
+			break;
+		}
+
+		/* Skip the word if not an allowed word start */
+		if (!is_allowed) {
+			/* Skip this word and keep on looping */
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
+		/* check if word is reserved (looking at ORIGINAL UTF-8 buffer here! */
+		if (parser->skip_reserved_words &&
+		    tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
+		                                          word_length_utf8)) {
+			/* Skip this word and keep on looping */
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
+		/* Truncate to the capacity (in UChars, not bytes) of the buffers
+		 * used by process_word_uchar(), to avoid extremely long words */
+		truncated_length = (word_length_uchar < WORD_BUFFER_LENGTH ?
+		                    word_length_uchar :
+		                    WORD_BUFFER_LENGTH);
+
+		/* Process the word here. If it fails, we can still go
+		 *  to the next one. Returns newly allocated UTF-8
+		 *  string always.
+		 * Enable UNAC stripping only if no ASCII and no CJK
+		 * Note we are passing UChar encoded string here!
+		 */
+		processed_word = process_word_uchar (parser,
+		                                     &(parser->utxt[parser->cursor]),
+		                                     truncated_length,
+		                                     type);
+		if (!processed_word) {
+			/* Skip this word and keep on looping */
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+	}
+
+	/* If we got a word here, set output */
+	if (processed_word) {
+		/* Set outputs */
+		*byte_offset_start = current_word_offset_utf8;
+		*byte_offset_end = current_word_offset_utf8 + word_length_utf8;
+
+		/* Update cursor */
+		parser->cursor += word_length_uchar;
+
+		parser->word_length = strlen (processed_word);
+		parser->word = processed_word;
+
+		return TRUE;
+	}
+
+	/* No more words... */
+	return FALSE;
+}
+
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+                    gint             max_word_length)
+{	/* Allocates a parser (g_new0) holding its own reference on 'language'; release with tracker_parser_free() */
+	TrackerParser *parser;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
+
+	parser = g_new0 (TrackerParser, 1);
+
+	parser->language = g_object_ref (language);
+
+	parser->max_word_length = max_word_length;
+	parser->word_length = 0;
+
+	parser->utxt = NULL; /* the text state below is filled in by tracker_parser_reset() */
+	parser->offsets = NULL;
+	parser->utxt_size = 0;
+	parser->bi = NULL;
+	parser->cursor = 0;
+
+	return parser;
+}
+
+void
+tracker_parser_free (TrackerParser *parser)
+{	/* Releases the parser and everything it owns; the input text (parser->txt) is not freed */
+	g_return_if_fail (parser != NULL);
+
+	if (parser->language) {
+		g_object_unref (parser->language); /* drops the reference taken in tracker_parser_new() */
+	}
+
+	if (parser->bi) {
+		ubrk_close (parser->bi);
+	}
+
+	g_free (parser->utxt);
+	g_free (parser->offsets);
+
+	g_free (parser->word);
+
+	g_free (parser);
+}
+
+/* Resets the parser on a new UTF-8 text: stores the options, converts the
+ * text to UChars (keeping the byte offset of each one) and opens the
+ * word-break iterator. State left by a previous reset is released first. */
+void
+tracker_parser_reset (TrackerParser *parser,
+                      const gchar   *txt,
+                      gint           txt_size,
+                      gboolean       delimit_words,
+                      gboolean       enable_stemmer,
+                      gboolean       enable_stop_words,
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UConverter *converter;
+	UChar *last_uchar;
+	const gchar *last_utf8;
+
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
+	parser->word_position = 0;
+	parser->cursor = 0;
+
+	/* Release the state of the previous text, else every reset leaks it */
+	g_free (parser->word);
+	parser->word = NULL;
+	if (parser->bi) {
+		ubrk_close (parser->bi);
+		parser->bi = NULL;
+	}
+	g_free (parser->utxt);
+	g_free (parser->offsets);
+	parser->utxt = NULL;
+	parser->offsets = NULL;
+	parser->utxt_size = 0;
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (error) ? u_errorName (error) : "none");
+		return;
+	}
+
+	/* Allocate UChars and offsets buffers */
+	parser->utxt_size = txt_size + 1;
+	parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
+	parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));
+
+	/* last_uchar and last_utf8 will be also an output parameter! */
+	last_uchar = parser->utxt;
+	last_utf8 = parser->txt;
+
+	/* Convert to UChars storing offsets */
+	ucnv_toUnicode (converter, &last_uchar, &parser->utxt[txt_size],
+	                &last_utf8, &parser->txt[txt_size],
+	                parser->offsets, FALSE, &error);
+	if (U_SUCCESS (error)) {
+		/* Proper UChar array size is now given by 'last_uchar' */
+		parser->utxt_size = last_uchar - parser->utxt;
+
+		/* Open word-break iterator */
+		parser->bi = ubrk_open (UBRK_WORD, setlocale (LC_ALL, NULL),
+		                        parser->utxt, parser->utxt_size, &error);
+		if (U_SUCCESS (error)) {
+			/* Find FIRST word in the UChar array */
+			parser->cursor = ubrk_first (parser->bi);
+		}
+	}
+
+	/* If any error happened, reset buffers */
+	if (U_FAILURE (error)) {
+		g_warning ("Error initializing libicu support: '%s'",
+		           u_errorName (error));
+		g_free (parser->utxt);
+		g_free (parser->offsets);
+		parser->utxt = NULL;
+		parser->offsets = NULL;
+		parser->utxt_size = 0;
+	}
+
+	ucnv_close (converter);
+}
+
+/*
+ * Casefolds (or just lowercases, for ASCII-only words), NFC-normalizes,
+ * unaccents (only words of type OTHER_UNAC) and optionally stems the
+ * given UChar word. Returns a newly allocated UTF-8 string, or NULL.
+ */
+static gchar *
+process_word_uchar (TrackerParser         *parser,
+                    const UChar           *word,
+                    gint                   length,
+                    TrackerParserWordType  type)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UChar normalized_buffer [WORD_BUFFER_LENGTH];
+	gchar *utf8_str = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+
+		/* Casefold... */
+		new_word_length = u_strFoldCase (casefolded_buffer,
+		                                 WORD_BUFFER_LENGTH,
+		                                 word, length,
+		                                 U_FOLD_CASE_DEFAULT, &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error casefolding: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+		if (new_word_length > WORD_BUFFER_LENGTH)
+			new_word_length = WORD_BUFFER_LENGTH;
+
+		/* NFC normalization... */
+		new_word_length = unorm_normalize (casefolded_buffer,
+		                                   new_word_length,
+		                                   UNORM_NFC, 0,
+		                                   normalized_buffer,
+		                                   WORD_BUFFER_LENGTH, &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error normalizing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+		if (new_word_length > WORD_BUFFER_LENGTH)
+			new_word_length = WORD_BUFFER_LENGTH;
+	} else {
+		/* For ASCII-only, just tolower() each character */
+		new_word_length = u_strToLower (normalized_buffer,
+		                                WORD_BUFFER_LENGTH,
+		                                word, length,
+		                                NULL, &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error lowercasing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+	}
+
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+		gsize stripped_word_length;
+
+		/* Get unaccented string in UTF-8 */
+		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
+		                                               new_word_length,
+		                                               &stripped_word_length);
+		if (utf8_str) {
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* If stripping failed or not needed, convert to UTF-8 */
+	if (!utf8_str) {
+		UErrorCode icu_error = U_ZERO_ERROR;
+		UConverter *converter;
+		gsize utf8_capacity;
+		gsize utf8_len;
+
+		/* Open converter UChar to UTF-8 */
+		converter = ucnv_open ("UTF-8", &icu_error);
+		if (!converter) {
+			g_warning ("Cannot open UTF-8 converter: '%s'",
+			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+			return NULL;
+		}
+		/* One UChar can need up to 3 bytes in UTF-8 (e.g. CJK), so
+		 * the UTF-16 size is not enough. Extra byte for the NUL. */
+		utf8_capacity = 3 * new_word_length + 1;
+		utf8_str = g_malloc (utf8_capacity);
+
+		/* Convert from UChar to UTF-8 (NIL-terminated) */
+		utf8_len = ucnv_fromUChars (converter,
+		                            utf8_str, utf8_capacity,
+		                            normalized_buffer, new_word_length,
+		                            &icu_error);
+		if (U_FAILURE (icu_error)) {
+			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+			           u_errorName (icu_error));
+			g_free (utf8_str);
+			ucnv_close (converter);
+			return NULL;
+		}
+
+		new_word_length = utf8_len;
+		ucnv_close (converter);
+	}
+
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		/* Input for stemmer ALWAYS in UTF-8, as well as output */
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      utf8_str,
+		                                      new_word_length);
+
+		/* Log after stemming, which may have failed (see check below) */
+		if (stemmed) {
+			tracker_parser_message_hex ("   After stemming",
+			                            stemmed, strlen (stemmed));
+		}
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (utf8_str);
+		return stemmed;
+	}
+
+	return utf8_str;
+}
+
+
+/* Processes a single word with process_word_uchar(). Both input and output
+ * are always UTF-8; returns a newly allocated string, or NULL on error. */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+                             const gchar   *word,
+                             gint           length,
+                             gboolean       do_strip)
+{
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *converter;
+	UChar *uchar_word;
+	gsize uchar_capacity;
+	gsize uchar_len;
+	gchar *processed;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &icu_error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+		return NULL;
+	}
+
+	/* Compute length if not already as input */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* A UTF-8 string never needs more UChars than it has bytes; the
+	 * capacity is counted in UChars and includes the terminating NUL */
+	uchar_capacity = length + 1;
+	uchar_word = g_malloc (uchar_capacity * sizeof (UChar));
+
+	/* Convert from UTF-8 to UChars*/
+	uchar_len = ucnv_toUChars (converter, uchar_word, uchar_capacity,
+	                           word, length, &icu_error);
+	ucnv_close (converter);
+	if (U_FAILURE (icu_error)) {
+		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		           u_errorName (icu_error));
+		g_free (uchar_word);
+		return NULL;
+	}
+
+	/* 'do_strip' only selects whether UNAC stripping is applied */
+	processed = process_word_uchar (parser, uchar_word, uchar_len,
+	                                do_strip ? TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
+	                                TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC);
+	g_free (uchar_word);
+	return processed;
+}
+
+const gchar *
+tracker_parser_next (TrackerParser *parser,
+                     gint          *position,
+                     gint          *byte_offset_start,
+                     gint          *byte_offset_end,
+                     gboolean      *stop_word,
+                     gint          *word_length)
+{	/* Returns the next processed word (owned by the parser, freed on the next call) or NULL if none is left */
+	const gchar  *str;
+	gint     byte_start = 0, byte_end = 0;
+
+	str = NULL;
+
+	g_free (parser->word); /* the word returned by the previous call is released here */
+	parser->word = NULL;
+
+	if (parser_next (parser, &byte_start, &byte_end)) {
+		str = parser->word;
+	}
+
+	if (str &&
+	    parser->enable_stop_words &&
+	    tracker_language_is_stop_word (parser->language, str)) {
+		*stop_word = TRUE;
+	} else {
+		parser->word_position++; /* NOTE(review): also incremented when no word was found (str == NULL) */
+		*stop_word = FALSE;
+	}
+
+	*word_length = parser->word_length; /* NOTE(review): keeps the previous value when no word was found */
+	*position = parser->word_position;
+	*byte_offset_start = byte_start;
+	*byte_offset_end = byte_end;
+
+	return str;
+}
+
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
new file mode 100644
index 0000000..4a6ff35
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+
+/* libunistring versions prior to 9.1.2 need this hack */
+#define _UNUSED_PARAMETER_
+#include <unistr.h>
+#include <uniwbrk.h>
+#include <unictype.h>
+#include <unicase.h>
+
+#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
+
+/* Type of words detected. The type selects which processing steps
+ * process_word_utf8() applies to the word. */
+typedef enum {
+	/* Every byte of the word is ASCII-7: the word is only lowercased */
+	TRACKER_PARSER_WORD_TYPE_ASCII,
+	/* Non-ASCII word: casefolded + NFC-normalized, then accents are stripped */
+	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
+	/* Non-ASCII word (chosen when the first character is CJK): casefolded +
+	 * NFC-normalized, but NOT accent-stripped */
+	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
+} TrackerParserWordType;
+
+/* Size in bytes of the on-stack buffer used while processing a word
+ * (just a safety limit). parser_next() truncates longer words to
+ * WORD_BUFFER_LENGTH - 1 bytes so that the trailing NIL always fits. */
+#define WORD_BUFFER_LENGTH 512
+
+/* Forward declaration: parser_next() needs it before its definition.
+ * Normalizes (and optionally unaccents/stems) `length' bytes of `word';
+ * returns a newly allocated string, or NULL if the word must be skipped. */
+static gchar *process_word_utf8 (TrackerParser         *parser,
+                                 const gchar           *word,
+                                 gint                  length,
+                                 TrackerParserWordType type);
+
+/* State of a libunistring-based parser instance. */
+struct TrackerParser {
+	/* Text being parsed and its size in bytes. The pointer is stored
+	 * as-is by tracker_parser_reset(); the text is not copied. */
+	const gchar           *txt;
+	gint                   txt_size;
+
+	/* Language object (a reference is held), used for stemming and
+	 * stop word detection */
+	TrackerLanguage       *language;
+	gboolean               enable_stemmer;
+	gboolean               enable_stop_words;
+	/* NOTE(review): not read anywhere in this file as far as visible */
+	guint                  max_words_to_index;
+	/* Words whose byte length is >= this value are skipped */
+	guint                  max_word_length;
+	/* Stored by tracker_parser_reset(); NOTE(review): not read in the
+	 * visible parts of this file */
+	gboolean               delimit_words;
+	gboolean               skip_reserved_words;
+	/* When TRUE, words starting with a number are not allowed */
+	gboolean               skip_numbers;
+
+	/* Private members */
+	/* Last word returned (owned by the parser), its length in bytes and
+	 * the running word position counter */
+	gchar                   *word;
+	gint                    word_length;
+	guint                   word_position;
+
+	/* Cursor, as index of the input array of bytes */
+	gsize                  cursor;
+	/* libunistring flags array */
+	/* One flag per input byte, filled in by u8_wordbreaks(); a non-zero
+	 * flag marks a word break before that byte */
+	gchar                 *word_break_flags;
+	/* general category of the  start character in words */
+	uc_general_category_t  allowed_start;
+};
+
+/*
+ * Inspects the word that starts at parser->cursor.
+ *
+ * Outputs:
+ *  - p_word_length: length in bytes of the word, i.e. the distance from the
+ *    cursor to the next word break (or to the end of the text).
+ *  - p_is_allowed_word_start: FALSE if the first character of the word is
+ *    neither an underscore nor in the parser->allowed_start category.
+ *  - p_word_type: only set when the word start is allowed.
+ *
+ * Returns FALSE if the first character could not be decoded, in which case
+ * the caller should stop parsing; none of the outputs except
+ * p_is_allowed_word_start is written in that case.
+ */
+static gboolean
+get_word_info (TrackerParser         *parser,
+               gsize                 *p_word_length,
+               gboolean              *p_is_allowed_word_start,
+               TrackerParserWordType *p_word_type)
+{
+	ucs4_t first_unichar;
+	gint first_unichar_len;
+	gsize i;
+	gboolean ascii_only;
+
+	/* Defaults */
+	*p_is_allowed_word_start = TRUE;
+
+	/* Get first character of the word as UCS4.
+	 * libunistring works on uint8_t strings, so cast explicitly instead
+	 * of relying on an implicit (and invalid) pointer conversion. */
+	first_unichar_len = u8_strmbtouc (&first_unichar,
+	                                  (const uint8_t *) &(parser->txt[parser->cursor]));
+	if (first_unichar_len <= 0) {
+		/* u8_strmbtouc() returns 0 when the cursor is at a NIL byte and
+		 *  a negative value when the bytes are not valid UTF-8; in both
+		 *  cases better just force stop here */
+		return FALSE;
+	}
+
+	/* If first character has length 1, it's ASCII-7 */
+	ascii_only = (first_unichar_len == 1);
+
+	/* Find next word break, and in the same loop checking if only ASCII
+	 *  characters */
+	i = parser->cursor + first_unichar_len;
+	while (i < (gsize) parser->txt_size &&
+	       !parser->word_break_flags [i]) {
+
+		/* Look at the raw byte as unsigned, so that the check does not
+		 *  depend on whether plain char is signed on this platform */
+		if (ascii_only &&
+		    !IS_ASCII_UCS4 ((guchar) parser->txt[i])) {
+			ascii_only = FALSE;
+		}
+
+		i++;
+	}
+
+	/* Word end is the first byte after the word, which is either the
+	 *  start of next word or the end of the string */
+	*p_word_length = i - parser->cursor;
+
+	/* We only want the words where the first character
+	 *  in the word is either a letter, a number or a symbol.
+	 * This is needed because the word break algorithm also
+	 *  considers word breaks after for example commas or other
+	 *  punctuation marks.
+	 * Note that looking at the first character in the string
+	 *  should be compatible with all Unicode normalization
+	 *  methods.
+	 */
+	if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
+	    !uc_is_general_category (first_unichar,
+	                             parser->allowed_start)) {
+		*p_is_allowed_word_start = FALSE;
+		return TRUE;
+	}
+
+	/* Decide word type */
+	if (ascii_only) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
+	} else if (IS_CJK_UCS4 (first_unichar)) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+	} else {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
+	}
+	return TRUE;
+}
+
+/*
+ * Moves the cursor forward until a word that can be returned is found.
+ *
+ * A word is skipped when its first character is not an allowed word start,
+ * when its byte length is >= parser->max_word_length, when it is a reserved
+ * word (and skip_reserved_words is set) or when processing it fails.
+ *
+ * On success returns TRUE, stores the processed word in parser->word
+ * (newly allocated) and its length in parser->word_length, and sets the
+ * byte offsets of the ORIGINAL (unprocessed) word in the input text.
+ * Returns FALSE, with both offsets set to 0, when no more words are found.
+ */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	gsize word_length = 0;
+	gchar *processed_word = NULL;
+
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	/* Loop to look for next valid word */
+	while (!processed_word &&
+	       parser->cursor < (gsize) parser->txt_size) {
+		TrackerParserWordType type;
+		const gchar *word_start;
+		gsize truncated_length;
+		gboolean is_allowed;
+
+		/* Get word info */
+		if (!get_word_info (parser,
+		                    &word_length,
+		                    &is_allowed,
+		                    &type)) {
+			/* Quit loop just in case */
+			parser->cursor = parser->txt_size;
+			break;
+		}
+
+		word_start = &(parser->txt[parser->cursor]);
+
+		/* Skip the word and keep on looping if it is not an allowed
+		 *  word start, if it reaches the maximum allowed length or
+		 *  if it is a reserved word that must be skipped. */
+		if (!is_allowed ||
+		    word_length >= parser->max_word_length ||
+		    (parser->skip_reserved_words &&
+		     tracker_parser_is_reserved_word_utf8 (word_start,
+		                                           word_length))) {
+			parser->cursor += word_length;
+			continue;
+		}
+
+		/* compute truncated word length if needed (to avoid extremely
+		 *  long words)*/
+		truncated_length = word_length;
+		if (truncated_length >= WORD_BUFFER_LENGTH) {
+			truncated_length = WORD_BUFFER_LENGTH - 1;
+
+			/* Never cut in the middle of a multi-byte UTF-8 sequence:
+			 *  while the first byte left out is a continuation byte
+			 *  (10xxxxxx), move the cut point backwards. Otherwise
+			 *  the truncated word would be invalid UTF-8 and
+			 *  case-folding would fail, losing the whole word. */
+			while (truncated_length > 0 &&
+			       (((guchar) word_start[truncated_length]) & 0xC0) == 0x80) {
+				truncated_length--;
+			}
+		}
+
+		/* Process the word here. If it fails, we can still go
+		 *  to the next one. Returns newly allocated string
+		 *  always */
+		if (truncated_length > 0) {
+			processed_word = process_word_utf8 (parser,
+			                                    word_start,
+			                                    truncated_length,
+			                                    type);
+		}
+
+		if (!processed_word) {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+		}
+	}
+
+	/* No more words... */
+	if (!processed_word) {
+		return FALSE;
+	}
+
+	/* Set outputs: offsets always refer to the full original word,
+	 *  even if it was truncated for processing */
+	*byte_offset_start = parser->cursor;
+	*byte_offset_end = parser->cursor + word_length;
+
+	/* Update cursor */
+	parser->cursor += word_length;
+
+	parser->word_length = strlen (processed_word);
+	parser->word = processed_word;
+
+	return TRUE;
+}
+
+/*
+ * Creates a new parser bound to `language' (a reference is taken) which
+ * skips words reaching `max_word_length'. The text to parse is set later
+ * with tracker_parser_reset(). Free with tracker_parser_free().
+ *
+ * Returns NULL if `language' is not a TrackerLanguage or if
+ * `max_word_length' is not positive.
+ */
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+                    gint             max_word_length)
+{
+	TrackerParser *self;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
+
+	/* g_new0() hands back a zero-filled structure */
+	self = g_new0 (TrackerParser, 1);
+
+	self->max_word_length = max_word_length;
+	self->language = g_object_ref (language);
+
+	/* No text has been set yet */
+	self->word_break_flags = NULL;
+	self->word_length = 0;
+
+	return self;
+}
+
+/*
+ * Releases the parser and everything it owns: the last returned word, the
+ * word break flags and the reference on the language object.
+ */
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	g_return_if_fail (parser != NULL);
+
+	/* Buffers owned by the parser */
+	g_free (parser->word);
+	g_free (parser->word_break_flags);
+
+	/* Drop the reference taken in tracker_parser_new() */
+	if (parser->language != NULL) {
+		g_object_unref (parser->language);
+	}
+
+	g_free (parser);
+}
+
+/*
+ * Prepares the parser to iterate over a new text.
+ *
+ * `txt' is NOT copied: the pointer is stored as-is, so the text must stay
+ * valid for as long as words are requested from the parser.
+ * `txt_size' is the size of the text in bytes; if negative, `txt' MUST be
+ * NIL-terminated and its length is computed here.
+ *
+ * The word position counter and the cursor are restarted, the previously
+ * returned word is freed, and word breaks are computed for the whole text.
+ * Words may start with a letter, and also with a number unless
+ * `skip_numbers' is set.
+ */
+void
+tracker_parser_reset (TrackerParser *parser,
+                      const gchar   *txt,
+                      gint           txt_size,
+                      gboolean       delimit_words,
+                      gboolean       enable_stemmer,
+                      gboolean       enable_stop_words,
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
+{
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+
+	/* A negative size would otherwise be converted to a huge unsigned
+	 * value when allocating the flags array and when computing word
+	 * breaks. Treat it as "NIL-terminated input", same convention as
+	 * the `length' argument of tracker_parser_process_word(). */
+	if (txt_size < 0) {
+		txt_size = strlen (txt);
+	}
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
+
+	g_free (parser->word);
+	parser->word = NULL;
+
+	parser->word_position = 0;
+
+	parser->cursor = 0;
+
+	g_free (parser->word_break_flags);
+
+	/* Create array of flags, same size as original text. */
+	parser->word_break_flags = g_malloc (txt_size);
+
+	/* Get wordbreak flags in the whole string */
+	u8_wordbreaks ((const uint8_t *)txt,
+	               (size_t) txt_size,
+	               (char *)parser->word_break_flags);
+
+	/* Prepare a custom category which is a combination of the
+	 * desired ones */
+	parser->allowed_start = UC_LETTER;
+	if (!parser->skip_numbers) {
+		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
+	}
+}
+
+/*
+ * Public entry point to process a single word.
+ *
+ * The word is always handled as a non-ASCII word (case-folding and NFC
+ * normalization); `do_strip' selects whether accents are stripped as well.
+ * `length' is in bytes, or negative if `word' is NIL-terminated.
+ * Returns whatever process_word_utf8() returns.
+ */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+                             const gchar    *word,
+                             gint           length,
+                             gboolean       do_strip)
+{
+	TrackerParserWordType type;
+
+	if (do_strip) {
+		type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
+	} else {
+		type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+	}
+
+	return process_word_utf8 (parser, word, length, type);
+}
+
+/*
+ * Processes one word of the input text.
+ *
+ * Steps, depending on `type' and on the parser configuration:
+ *  1. ASCII words are lowercased; any other word is case-folded and
+ *     NFC-normalized with libunistring.
+ *  2. TRACKER_PARSER_WORD_TYPE_OTHER_UNAC words get their accents stripped.
+ *  3. If the stemmer is enabled, the result is stemmed.
+ *
+ * `length' is the number of bytes of `word' to process; if negative,
+ * `word' MUST be NIL-terminated.
+ *
+ * Returns a newly allocated NIL-terminated string (the result of the last
+ * step that produced an output), or NULL if the word must be skipped.
+ */
+static gchar *
+process_word_utf8 (TrackerParser         *parser,
+                   const gchar           *word,
+                   gint                  length,
+                   TrackerParserWordType type)
+{
+	gchar word_buffer [WORD_BUFFER_LENGTH];
+	gchar *normalized = NULL;
+	gchar *stripped = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	/* If length is set as -1, the input word MUST be NIL-terminated.
+	 * Otherwise, this restriction is not needed as the length to process
+	 *  is given as input argument */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* Log original word */
+	tracker_parser_message_hex ("ORIGINAL word",
+	                            word, length);
+
+	/* Normalization and case-folding ONLY for non-ASCII */
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		/* Leave space for last NIL */
+		new_word_length = WORD_BUFFER_LENGTH - 1;
+
+		/* Casefold and NFC normalization in output.
+		 *  NOTE: if the output buffer is not big enough, u8_casefold will
+		 *  return a newly-allocated buffer. */
+		normalized = (gchar *) u8_casefold ((const uint8_t *)word,
+		                                    length,
+		                                    uc_locale_language (),
+		                                    UNINORM_NFC,
+		                                    (uint8_t *) word_buffer,
+		                                    &new_word_length);
+
+		/* Case folding + Normalization failed, skip this word.
+		 *  This is a runtime condition (e.g. invalid UTF-8 in the
+		 *  input), so it must be checked explicitly:
+		 *  g_return_val_if_fail() is meant for programming errors and
+		 *  is compiled out when GLib checks are disabled. */
+		if (normalized == NULL) {
+			g_warning ("Could not casefold and normalize word, skipping it");
+			return NULL;
+		}
+
+		/* If output buffer is not the same as the one passed to
+		 *  u8_casefold, we know it was newly-allocated, so need
+		 *  to resize it in 1 byte to add last NIL */
+		if (normalized != word_buffer) {
+			normalized = g_realloc (normalized, new_word_length + 1);
+		}
+
+		/* Log after Normalization */
+		tracker_parser_message_hex (" After Casefolding and NFC normalization",
+		                            normalized, new_word_length);
+	}
+	else {
+		/* For ASCII-only, just tolower() each character */
+		gsize i;
+
+		/* The stack buffer must hold `length' bytes PLUS the last NIL,
+		 *  so it can only be used if length < WORD_BUFFER_LENGTH */
+		normalized = length >= WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
+
+		for (i = 0; i < (gsize) length; i++) {
+			normalized[i] = g_ascii_tolower (word[i]);
+		}
+
+		new_word_length = length;
+
+		/* Log after tolower */
+		tracker_parser_message_hex (" After Lowercasing",
+		                            normalized, new_word_length);
+	}
+
+	/* Set output NIL */
+	normalized[new_word_length] = '\0';
+
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+		gsize stripped_word_length;
+
+		stripped = tracker_parser_unaccent_utf8_word (normalized,
+		                                              new_word_length,
+		                                              &stripped_word_length);
+
+		if (stripped) {
+			/* Log after UNAC stripping */
+			tracker_parser_message_hex ("  After UNAC stripping",
+			                            stripped, stripped_word_length);
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      stripped ? stripped : normalized,
+		                                      new_word_length);
+
+		/* Log after stemming; the stemmer output is treated as
+		 *  possibly NULL below, so do not strlen() it blindly */
+		if (stemmed) {
+			tracker_parser_message_hex ("   After stemming",
+			                            stemmed, strlen (stemmed));
+		}
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (stripped);
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stemmed;
+	}
+
+	/* If stripped wanted and succeeded, free previous and return it */
+	if (stripped) {
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stripped;
+	}
+
+	/* It may be the case that no stripping and no stemming was needed, and
+	 * that the output buffer in stack was enough for case-folding and
+	 * normalization. In this case, need to strdup() the string to return it */
+	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
+}
+
+/*
+ * Returns the next word of the text being parsed, or NULL when
+ * parser_next() finds no more words.
+ *
+ * The word handed out by the previous call is freed on entry; the returned
+ * string is parser->word and stays owned by the parser.
+ *
+ * Outputs:
+ *  - position: parser->word_position, incremented on every call that does
+ *    not yield a stop word (including a call that yields no word).
+ *  - byte_offset_start / byte_offset_end: offsets reported by parser_next().
+ *  - stop_word: TRUE only if a word was found, stop words are enabled and
+ *    the language reports the word as a stop word.
+ *  - word_length: parser->word_length.
+ */
+const gchar *
+tracker_parser_next (TrackerParser *parser,
+                     gint          *position,
+                     gint          *byte_offset_start,
+                     gint          *byte_offset_end,
+                     gboolean      *stop_word,
+                     gint          *word_length)
+{
+	gint offset_begin = 0;
+	gint offset_end = 0;
+	const gchar *found = NULL;
+	gboolean found_stop_word;
+
+	/* Release the word returned last time */
+	g_free (parser->word);
+	parser->word = NULL;
+
+	if (parser_next (parser, &offset_begin, &offset_end)) {
+		found = parser->word;
+	}
+
+	/* The language is only queried if there is a word and stop words
+	 * are enabled */
+	found_stop_word = (found != NULL &&
+	                   parser->enable_stop_words &&
+	                   tracker_language_is_stop_word (parser->language, found));
+
+	if (!found_stop_word) {
+		parser->word_position++;
+	}
+
+	*stop_word = found_stop_word;
+	*word_length = parser->word_length;
+	*position = parser->word_position;
+	*byte_offset_start = offset_begin;
+	*byte_offset_end = offset_end;
+
+	return found;
+}
+
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
new file mode 100644
index 0000000..e6c8521
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#ifdef HAVE_UNAC
+#include <unac.h>
+#endif
+
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#endif
+
+#include <libtracker-common/tracker-common.h>
+#include "tracker-parser-utils.h"
+
+
+/* Output is always UTF-8. */
+gchar *
+tracker_parser_unaccent_utf16be_word (const gchar *string,
+                                      gsize        ilength,
+                                      gsize        *p_olength)
+{
+	GError *error = NULL;
+	gchar *unaccented_str = NULL;
+	gchar *str_utf8 = NULL;
+	gsize unaccented_len;
+	gsize utf8_len;
+
+	*p_olength = 0;
+
+	if (unac_string_utf16 (string, ilength,
+	                       &unaccented_str, &unaccented_len) != 0) {
+		g_warning ("UNAC failed to strip accents");
+		return NULL;
+	}
+
+	/* Convert from UTF-16BE to UTF-8 */
+	str_utf8 = g_convert (unaccented_str,
+	                      unaccented_len,
+	                      "UTF-8",
+	                      "UTF-16BE",
+	                      NULL,
+	                      &utf8_len,
+	                      &error);
+	g_free (unaccented_str);
+
+	if (error) {
+		g_warning ("Could not convert back to UTF-8: %s",
+		           error->message);
+		g_error_free (error);
+		return NULL;
+	}
+
+	*p_olength = utf8_len;
+	return str_utf8;
+}
+
+
+#ifdef HAVE_LIBICU
+/*
+ * Strips accents from a word given as an array of ICU UChars.
+ *
+ * NOTE: Internally, UChars are UTF-16, but conversion needed just in case,
+ *  as libunac needs UTF-16BE. Output is always UTF-8.
+ *
+ * `ilength' is the number of UChars (not bytes) in `string'.
+ * Returns a newly allocated string and its length in `p_olength', or NULL
+ * on failure or when built without libunac support.
+ */
+gchar *
+tracker_parser_unaccent_UChar_word (const UChar *string,
+                                    gsize        ilength,
+                                    gsize        *p_olength)
+{
+#ifdef HAVE_UNAC
+	UErrorCode status = U_ZERO_ERROR;
+	UConverter *to_utf16be;
+	gchar *utf16_buffer;
+	gchar *result = NULL;
+	gsize utf16_bytes;
+
+	*p_olength = 0;
+
+	/* Open converter UChar to UTF-16BE */
+	to_utf16be = ucnv_open ("UTF-16BE", &status);
+	if (!to_utf16be) {
+		g_warning ("Cannot open UTF-16BE converter: '%s'",
+		           U_FAILURE (status) ? u_errorName (status) : "none");
+		return NULL;
+	}
+
+	/* Two bytes per input UChar, plus room for one extra unit.
+	 * Note that ilength specifies number of UChars not
+	 *  number of bytes */
+	utf16_buffer = g_malloc ((ilength + 1) * 2);
+
+	/* Convert from UChar to UTF-16BE */
+	utf16_bytes = ucnv_fromUChars (to_utf16be,
+	                               utf16_buffer,
+	                               (ilength + 1) * 2,
+	                               string,
+	                               ilength,
+	                               &status);
+	if (!U_FAILURE (status)) {
+		/* Conversion went fine, now strip the accents */
+		result = tracker_parser_unaccent_utf16be_word (utf16_buffer,
+		                                               utf16_bytes,
+		                                               p_olength);
+	} else {
+		g_warning ("Cannot convert from UChar to UTF-16BE: '%s' "
+		           "(ilength: %" G_GSIZE_FORMAT ")",
+		           u_errorName (status),
+		           ilength);
+	}
+
+	/* Common cleanup for both outcomes */
+	ucnv_close (to_utf16be);
+	g_free (utf16_buffer);
+	return result;
+#else
+	return NULL;
+#endif
+}
+#endif
+
+/*
+ * Strips accents from a UTF-8 encoded word.
+ *
+ * The word (`ilength' bytes of `str') is converted to UTF-16BE and handed
+ * to tracker_parser_unaccent_utf16be_word(), which produces UTF-8 again.
+ * Returns a newly allocated string and its length in `p_olength', or NULL
+ * on failure or when built without libunac support.
+ */
+gchar *
+tracker_parser_unaccent_utf8_word (const gchar *str,
+                                   gsize        ilength,
+                                   gsize        *p_olength)
+{
+#ifdef HAVE_UNAC
+	GError *convert_error = NULL;
+	gchar *utf16_word = NULL;
+	gchar *unaccented = NULL;
+	gsize utf16_bytes;
+
+	*p_olength = 0;
+
+	/* unac_string() does roughly the same than below, plus it
+	 * corrupts memory in 64bit systems, so avoid it for now.
+	 */
+	utf16_word = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_bytes, &convert_error);
+
+	/* Conversion failed: nothing to unaccent */
+	if (convert_error) {
+		g_warning ("Could not convert to UTF-16: %s", convert_error->message);
+		g_error_free (convert_error);
+		return NULL;
+	}
+
+	unaccented = tracker_parser_unaccent_utf16be_word (utf16_word,
+	                                                   utf16_bytes,
+	                                                   p_olength);
+
+	/* The intermediate UTF-16BE copy is no longer needed */
+	g_free (utf16_word);
+	return unaccented;
+#else
+	return NULL;
+#endif
+}
+
+
+/*
+ * Definition of the possible reserved words.
+ *  Length of word is explicitly given to avoid strlen() calls
+ */
+typedef struct {
+	/* Reserved word, as a NIL-terminated string */
+	const gchar *word;
+	/* Length of `word' in bytes, not counting the last NIL */
+	gsize        word_length;
+} TrackerParserReservedWord;
+
+/* Table of reserved words. The entry with a NULL word marks the end of
+ * the table and must always be the last one. */
+static const TrackerParserReservedWord reserved_words[] = {
+	{ "or", 2 },
+	{ NULL, 0 }
+};
+
+/*
+ * Tells whether the first `word_length' bytes of `word' are exactly one of
+ * the entries in the reserved_words table. `word' does not need to be
+ * NIL-terminated. The comparison is byte-wise, so it is case-sensitive.
+ */
+gboolean
+tracker_parser_is_reserved_word_utf8 (const gchar *word,
+                                      gsize word_length)
+{
+	const TrackerParserReservedWord *entry;
+
+	/* Walk the table up to its NULL terminator */
+	for (entry = reserved_words; entry->word != NULL; entry++) {
+		/* Cheap length check first, to skip most comparisons */
+		if (entry->word_length != word_length) {
+			continue;
+		}
+
+		if (strncmp (word, entry->word, word_length) == 0) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+
+#if TRACKER_PARSER_DEBUG_HEX
+/*
+ * Debugging helper: logs `message' followed by the first `str_length'
+ * bytes of `str', both as text and as ':'-separated hexadecimal values.
+ * Nothing is logged if `message' or `str' is NULL or `str_length' is 0.
+ */
+void
+tracker_parser_message_hex (const gchar  *message,
+                            const gchar  *str,
+                            gsize         str_length)
+{
+	gchar *printable;
+	gchar *hex;
+
+	g_return_if_fail (message);
+	g_return_if_fail (str);
+	g_return_if_fail (str_length != 0);
+
+	/* String may not come NIL-terminated, so print a terminated copy */
+	printable = g_strndup (str, str_length);
+
+	/* Hexadecimal representation of the original bytes */
+	hex = tracker_strhex (str, str_length, ':');
+
+	g_message ("%s: '%s' (%s)",
+	           message, printable, hex);
+
+	g_free (hex);
+	g_free (printable);
+}
+#endif
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
new file mode 100644
index 0000000..50805c1
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#ifndef __TRACKER_PARSER_UTILS_H__
+#define __TRACKER_PARSER_UTILS_H__
+
+#include "config.h"
+
+#include <glib.h>
+
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#endif
+
+G_BEGIN_DECLS
+
+/* Character classification helpers, applied to a UCS4 code point. */
+
+/* ASCII-7 is in range [0x00,0x7F] */
+/* Only the upper bound is checked, so the argument must not be a negative
+ * value (e.g. cast a plain char to an unsigned type first). */
+#define IS_ASCII_UCS4(c)      ((c) <= 0x7F)
+
+/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
+/* The argument is evaluated several times: do not pass an expression with
+ * side effects. */
+#define IS_CJK_UCS4(c)        (((c) >= 0x3400 && (c) <= 0x4DB5)  ||	\
+                               ((c) >= 0x4E00 && (c) <= 0x9FA5)  ||	\
+                               ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+/* U+005F LOW LINE ('_') */
+#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
+
+
+/* Accent stripping helpers.
+ * All of them return a newly allocated UTF-8 string and store its length
+ * in bytes in `p_olength', or return NULL on failure. They differ only in
+ * the encoding of the input word. */
+
+/* Input is UTF-16BE; `ilength' is given in bytes */
+gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
+                                             gsize        ilength,
+                                             gsize        *p_olength);
+
+/* Input is UTF-8; `ilength' is given in bytes */
+gchar *tracker_parser_unaccent_utf8_word (const gchar *string,
+                                          gsize        ilength,
+                                          gsize        *p_olength);
+
+#ifdef HAVE_LIBICU
+/* Input is an array of ICU UChars; `ilength' is the number of UChars */
+gchar *tracker_parser_unaccent_UChar_word (const UChar *string,
+                                           gsize        ilength,
+                                           gsize        *p_olength);
+#endif
+
+
+/* Returns TRUE if the first `word_length' bytes of `word' are a reserved
+ * word. `word' does not need to be NIL-terminated. */
+gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
+                                               gsize word_length);
+
+
+/* Define to 1 if you want to enable debugging logs showing HEX contents
+ * of the words being parsed */
+#define TRACKER_PARSER_DEBUG_HEX 0
+
+#if TRACKER_PARSER_DEBUG_HEX
+/* Logs `message' plus the first `str_length' bytes of `str', as text and
+ * as hexadecimal values */
+void    tracker_parser_message_hex (const gchar  *message,
+                                    const gchar  *str,
+                                    gsize         str_length);
+#else
+/* Debugging disabled: the call expands to nothing, so its arguments are
+ * NOT evaluated. Do not rely on side effects in them. */
+#define tracker_parser_message_hex(a,b,c)
+#endif
+
+G_END_DECLS
+
+#endif /* __TRACKER_PARSER_UTILS_H__ */
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 66535c9..cad4442 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -35,10 +35,11 @@ TrackerParser *tracker_parser_new             (TrackerLanguage *language,
 void           tracker_parser_reset           (TrackerParser   *parser,
                                                const gchar     *txt,
                                                gint             txt_size,
-                                               gboolean                 delimit_words,
-                                               gboolean                 enable_stemmer,
-                                               gboolean                 enable_stop_words,
-                                               gboolean                 parse_reserved_words);
+                                               gboolean         delimit_words,
+                                               gboolean         enable_stemmer,
+                                               gboolean         enable_stop_words,
+                                               gboolean         skip_reserved_words,
+                                               gboolean         skip_numbers);
 
 const gchar *  tracker_parser_next            (TrackerParser   *parser,
                                                gint            *position,
@@ -48,9 +49,10 @@ const gchar *  tracker_parser_next            (TrackerParser   *parser,
                                                gint            *word_length);
 
 gchar *        tracker_parser_process_word    (TrackerParser   *parser,
-                                               const char      *word,
+                                               const gchar     *word,
                                                gint             length,
-                                               gboolean                 do_strip);
+                                               gboolean         do_strip);
+
 void           tracker_parser_free            (TrackerParser   *parser);
 
 G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]