[tracker/parser-unicode-libs-review: 72/85] Some refactoring cleaning up the code of the parser

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/parser-unicode-libs-review: 72/85] Some refactoring cleaning up the code of the parser
Date: Tue, 4 May 2010 17:30:03 +0000 (UTC)
commit cdc54c5073bd5c2f94b29d29d727db2584635cfa
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Thu Apr 29 12:54:24 2010 +0200

    Some refactoring cleaning up the code of the parser

 src/libtracker-common/tracker-language.c         |   26 +-
 src/libtracker-common/tracker-language.h         |    2 +
 src/libtracker-common/tracker-utils.c            |   55 ++
 src/libtracker-common/tracker-utils.h            |   20 +-
 src/libtracker-fts/Makefile.am                   |    2 +
 src/libtracker-fts/tracker-parser-glib.c         |  502 +---------------
 src/libtracker-fts/tracker-parser-libunistring.c |  707 +---------------------
 src/libtracker-fts/tracker-parser-utils.c        |   80 +++
 src/libtracker-fts/tracker-parser-utils.h        |   33 +
 src/libtracker-fts/tracker-parser.h              |   10 +-
 10 files changed, 235 insertions(+), 1202 deletions(-)
---
diff --git a/src/libtracker-common/tracker-language.c b/src/libtracker-common/tracker-language.c
index 07df05a..d67d13d 100644
--- a/src/libtracker-common/tracker-language.c
+++ b/src/libtracker-common/tracker-language.c
@@ -330,7 +330,7 @@ tracker_language_new (const gchar *language_code)
 {
 	TrackerLanguage *language;
 
-	language = g_object_new (TRACKER_TYPE_LANGUAGE, 
+	language = g_object_new (TRACKER_TYPE_LANGUAGE,
 	                         "language-code", language_code,
 	                         NULL);
 
@@ -380,6 +380,30 @@ tracker_language_get_stop_words (TrackerLanguage *language)
 }
 
 /**
+ * tracker_language_is_stop_word:
+ * @language: a #TrackerLanguage
+ * @word: a string containing a word, or %NULL
+ *
+ * Checks whether the given @word is in the list of stop words of the
+ *  given @language. A %NULL @word is accepted and is never a stop word.
+ *
+ * Returns: %TRUE if @word is a stop word. %FALSE otherwise.
+ */
+gboolean
+tracker_language_is_stop_word (TrackerLanguage *language,
+                               const gchar     *word)
+{
+	TrackerLanguagePriv *priv;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), FALSE);
+
+	priv = GET_PRIV (language);
+
+	/* No warning on NULL: the parser may pass it when no word was found */
+	return word != NULL && g_hash_table_lookup (priv->stop_words, word) != NULL;
+}
+
+/**
  * tracker_language_get_language_code:
  * @language: a #TrackerLanguage
  *
diff --git a/src/libtracker-common/tracker-language.h b/src/libtracker-common/tracker-language.h
index f0ff3cd..71e00f1 100644
--- a/src/libtracker-common/tracker-language.h
+++ b/src/libtracker-common/tracker-language.h
@@ -52,6 +52,8 @@ TrackerLanguage *tracker_language_new                (const gchar     *language_
 
 gboolean         tracker_language_get_enable_stemmer (TrackerLanguage *language);
 GHashTable *     tracker_language_get_stop_words     (TrackerLanguage *language);
+gboolean         tracker_language_is_stop_word       (TrackerLanguage *language,
+                                                      const gchar     *word);
 const gchar *    tracker_language_get_language_code  (TrackerLanguage *language);
 
 void             tracker_language_set_enable_stemmer (TrackerLanguage *language,
diff --git a/src/libtracker-common/tracker-utils.c b/src/libtracker-common/tracker-utils.c
index c2154f1..80e574b 100644
--- a/src/libtracker-common/tracker-utils.c
+++ b/src/libtracker-common/tracker-utils.c
@@ -20,6 +20,7 @@
 
 #include "config.h"
 
+#include <stdio.h>
 #include <string.h>
 #include <locale.h>
 
@@ -151,3 +152,57 @@ tracker_seconds_to_string (gdouble  seconds_elapsed,
 
 
 
+/**
+ * tracker_strhex:
+ * @data: The input array of bytes
+ * @size: Number of bytes in the input array
+ * @delimiter: Character to use as separator between each printed byte
+ *
+ * Returns the contents of @data as a printable string in hexadecimal
+ *  representation (two uppercase digits per byte, e.g. "0A:FF"). If
+ *  @size is 0, an empty string is returned.
+ *
+ * Based on GNU PDF's pdf_text_test_get_hex()
+ *
+ * Returns: A newly allocated string which should be disposed with g_free()
+ **/
+gchar *
+tracker_strhex (const guint8 *data,
+                gsize         size,
+                gchar         delimiter)
+{
+	/* Lookup table mapping a 4-bit value to its uppercase hex digit */
+	static const gchar hex_digits[] = "0123456789ABCDEF";
+	gchar *new_str;
+	gsize i;
+	gsize j;
+
+	g_return_val_if_fail (data != NULL || size == 0, NULL);
+	/* Guard the 3 * size computation below against overflow */
+	g_return_val_if_fail (size <= G_MAXSIZE / 3, NULL);
+
+	/* Get new string length. If input has N bytes, we need:
+	 * - 2N bytes for hexadecimal char representation of each byte...
+	 * - N-1 bytes for the separators between bytes
+	 * - 1 byte for last NUL char
+	 * So... a total of (2N+N-1+1) = 3N bytes are needed. An empty input
+	 * still needs 1 byte for the NUL: g_malloc0 (0) would return NULL
+	 * instead of the promised newly allocated string.
+	 * Memory is initialized to NUL, so no explicit terminator is set */
+	new_str = g_malloc0 (size > 0 ? 3 * size : 1);
+
+	/* Print hexadecimal representation of each byte... */
+	for (i = 0, j = 0; i < size; i++, j += 3) {
+		new_str[j] = hex_digits[data[i] >> 4];
+		new_str[j + 1] = hex_digits[data[i] & 0x0F];
+		/* And if needed, add separator */
+		if (i != (size - 1)) {
+			new_str[j + 2] = delimiter;
+		}
+	}
+
+	return new_str;
+}
+
+
+
diff --git a/src/libtracker-common/tracker-utils.h b/src/libtracker-common/tracker-utils.h
index 7af38eb..6364eaf 100644
--- a/src/libtracker-common/tracker-utils.h
+++ b/src/libtracker-common/tracker-utils.h
@@ -29,15 +29,17 @@ G_BEGIN_DECLS
 #error "only <libtracker-common/tracker-common.h> must be included directly."
 #endif
 
-gboolean tracker_is_empty_string            (const char  *str);
-gboolean tracker_is_blank_string            (const char  *str);
-gchar *  tracker_seconds_estimate_to_string (gdouble      seconds_elapsed,
-                                             gboolean     short_string,
-                                             guint        items_done,
-                                             guint        items_remaining);
-gchar *  tracker_seconds_to_string          (gdouble      seconds_elapsed,
-                                             gboolean     short_string);
-
+gboolean tracker_is_empty_string            (const char   *str);
+gboolean tracker_is_blank_string            (const char   *str);
+gchar *  tracker_seconds_estimate_to_string (gdouble       seconds_elapsed,
+                                             gboolean      short_string,
+                                             guint         items_done,
+                                             guint         items_remaining);
+gchar *  tracker_seconds_to_string          (gdouble       seconds_elapsed,
+                                             gboolean      short_string);
+gchar *  tracker_strhex                     (const guint8 *data,
+                                             gsize         size,
+                                             gchar         delimiter);
 G_END_DECLS
 
 #endif /* __LIBTRACKER_COMMON_UTILS_H__ */
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 9b469c6..667cece 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -26,6 +26,8 @@ libtracker_fts_la_SOURCES = 						\
 	tracker-fts-config.h						\
 	tracker-fts-hash.c						\
 	tracker-fts-hash.h						\
+	tracker-parser-utils.c						\
+	tracker-parser-utils.h						\
 	tracker-parser.h
 
 if HAVE_LIBUNISTRING
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index c1f3a29..891d9f6 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -20,33 +20,22 @@
 
 #include "config.h"
 
-#include <stdio.h>
 #include <string.h>
 
 #ifdef HAVE_UNAC
 #include <unac.h>
 #endif
 
-#ifdef HAVE_LIBUNISTRING
-/* libunistring versions prior to 9.1.2 need this hack */
-#define _UNUSED_PARAMETER_
-#include <unistr.h>
-#include <uniwbrk.h>
-#include <unictype.h>
-#include <unicase.h>
-#else
 #include <pango/pango.h>
-#endif
 
+#include <libtracker-common/tracker-common.h>
 #include "tracker-parser.h"
+#include "tracker-parser-utils.h"
 
 /* Define to 1 if you want to enable debugging logs showing HEX contents
  * of the words being parsed */
 #define TRACKER_PARSER_DEBUG_HEX 0
 
-
-#ifndef HAVE_LIBUNISTRING
-
 /* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
  * 0x9FA5, 0x20000 - <= 0x2A6D6
  */
@@ -86,21 +75,6 @@ typedef enum {
 	TRACKER_PARSER_ENCODING_OTHER
 } TrackerParserEncoding;
 
-#else
-
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
-#define IS_CJK_UCS4(c)   (((c) >= 0x3400 && (c) <= 0x4DB5)  || \
-                          ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
-                          ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
-/* Max possible length of a UTF-8 encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
-#endif /* !HAVE_LIBUNISTRING */
-
 
 struct TrackerParser {
 	const gchar           *txt;
@@ -118,8 +92,6 @@ struct TrackerParser {
 	gchar                   *word;
 	gint                    word_length;
 	guint                   word_position;
-
-#ifndef HAVE_LIBUNISTRING
 	TrackerParserEncoding   encoding;
 	const gchar             *cursor;
 
@@ -127,61 +99,8 @@ struct TrackerParser {
 	PangoLogAttr          *attrs;
 	guint                  attr_length;
 	guint                  attr_pos;
-#else
-	/* Cursor, as index of the input array of bytes */
-	gsize                  cursor;
-	/* libunistring flags array */
-	gchar                 *word_break_flags;
-	/* general category of the  start character in words */
-	uc_general_category_t  allowed_start;
-#endif /* !HAVE_LIBUNISTRING */
 };
 
-
-#if TRACKER_PARSER_DEBUG_HEX
-/* Based on GNU PDF's pdf_text_test_get_hex() */
-static gchar *
-tracker_strhex (const gchar *data,
-                const gsize size,
-                gchar delimiter)
-{
-	gint i;
-	gint j;
-	guint new_str_length;
-	gchar *new_str;
-	gchar new_hex_char [3];
-
-	/* Get new string length. If input string has N bytes, we need:
-	 * - 1 byte for last NUL char
-	 * - 2N bytes for hexadecimal char representation of each byte...
-	 * - N-1 bytes for the separator ':'
-	 * So... a total of (1+2N+N-1) = 3N bytes are needed... */
-	new_str_length =  3 * size;
-
-	/* Allocate memory for new array and initialize contents to NUL */
-	new_str = g_malloc0 (new_str_length);
-
-	/* Print hexadecimal representation of each byte... */
-	for(i=0, j=0; i<size; i++, j+=3) {
-		memset (new_hex_char, 0, 3);
-		/* Print character in helper array... */
-		sprintf (new_hex_char, "%02X", (guint8)(data[i]));
-		/* Copy to output string... */
-		memcpy (&new_str[j],&new_hex_char[0],2);
-		/* And if needed, add separator */
-		if(i != (size-1) ) {
-			new_str[j+2] = delimiter;
-		}
-	}
-
-	/* Set output string */
-	return new_str;
-}
-
-#endif /* TRACKER_PARSER_DEBUG_HEX */
-
-
-#ifndef HAVE_LIBUNISTRING
 static inline TrackerParserWordType
 get_word_type (gunichar c)
 {
@@ -224,64 +143,7 @@ get_word_type (gunichar c)
 
 	return TRACKER_PARSER_WORD_IGNORE;
 }
-#endif /* !HAVE_LIBUNISTRING */
-
-
-
-static inline gchar *
-strip_word (const gchar *str,
-            gint         length,
-            guint32     *len)
-{
-#ifdef HAVE_UNAC
-	GError *error = NULL;
-	gchar *str_utf16;
-	gsize utf16_len, unaccented_len, final_len;
-	gchar *unaccented_str = NULL;
-	gchar *s = NULL;
-
-	*len = 0;
-
-	/* unac_string() does roughly the same than below, plus it
-	 * corrupts memory in 64bit systems, so avoid it for now.
-	 */
-	str_utf16 = g_convert (str, length, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
-	if (error) {
-		g_warning ("Could not convert to UTF-16: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	if (unac_string_utf16 (str_utf16, utf16_len,
-	                       &unaccented_str, &unaccented_len) != 0) {
-		g_warning ("UNAC failed to strip accents");
-		g_free (str_utf16);
-		return NULL;
-	}
-
-	g_free (str_utf16);
-
-	s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
-	g_free (unaccented_str);
-
-	if (error) {
-		g_warning ("Could not convert back to UTF-8: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	*len = (guint32) final_len;
-
-	return s;
-#else
-	*len = length;
-	return NULL;
-#endif
-}
 
-
-#ifndef HAVE_LIBUNISTRING
 static TrackerParserEncoding
 get_encoding (const gchar *txt)
 {
@@ -310,27 +172,6 @@ get_encoding (const gchar *txt)
 
 }
 
-#endif /* !HAVE_LIBUNISTRING */
-
-
-static gboolean
-is_stop_word (TrackerLanguage *language,
-              const gchar     *word)
-{
-	GHashTable *stop_words;
-
-	if (!word) {
-		return FALSE;
-	}
-
-	stop_words = tracker_language_get_stop_words (language);
-
-	return g_hash_table_lookup (stop_words, word) != NULL;
-}
-
-
-#ifndef HAVE_LIBUNISTRING
-
 static gboolean
 pango_next (TrackerParser *parser,
             gint          *byte_offset_start,
@@ -559,8 +400,6 @@ parser_next (TrackerParser *parser,
 		gchar       *utf8;
 		gchar       *processed_word;
 
-
-
 		utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
 
 		if (!utf8) {
@@ -588,146 +427,6 @@ parser_next (TrackerParser *parser,
 
 }
 
-#else
-
-
-/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
- *  UNAC stripping.
- * Just check byte per byte, and if any of the bytes is >127, then it's not
- *  ASCII-7 */
-static gboolean
-is_ascii_word (const gchar *word,
-               gsize        length)
-{
-	gsize i;
-
-	for (i = 0; i < length; i++) {
-		if (!IS_ASCII_BYTE ((guchar)word[i])) {
-			return FALSE;
-		}
-	}
-	return TRUE;
-}
-
-
-/* libunistring-based parser */
-static gboolean
-parser_next (TrackerParser *parser,
-             gint          *byte_offset_start,
-             gint          *byte_offset_end)
-{
-
-	gsize word_length = 0;
-	gchar *processed_word = NULL;
-
-	*byte_offset_start = 0;
-	*byte_offset_end = 0;
-
-	g_return_val_if_fail (parser, FALSE);
-
-	/* Loop to look for next valid word */
-	while (!processed_word &&
-	       parser->cursor < parser->txt_size) {
-		ucs4_t first_unichar;
-		gint first_unichar_len;
-		gsize i;
-		gsize truncated_length;
-		gboolean do_strip;
-
-		/* Get first character of the word as UCS4 */
-		first_unichar_len = u8_strmbtouc (&first_unichar,
-		                                  &(parser->txt[parser->cursor]));
-		if (first_unichar_len <= 0) {
-			/* This should only happen if NIL was passed to u8_strmbtouc,
-			 *  so better just force stop here */
-			parser->cursor = parser->txt_size;
-			break;
-		}
-
-		/* Find next word break */
-		i = parser->cursor + first_unichar_len;
-		while (i < parser->txt_size &&
-		       !parser->word_break_flags [i]) {
-			i++;
-		}
-
-		/* Word end is the first byte after the word, which is either the
-		 *  start of next word or the end of the string */
-		word_length = i - parser->cursor;
-
-		/* We only want the words where the first character
-		 *  in the word is either a letter, a number or a symbol.
-		 * This is needed because the word break algorithm also
-		 *  considers word breaks after for example commas or other
-		 *  punctuation marks.
-		 * Note that looking at the first character in the string
-		 *  should be compatible with all Unicode normalization
-		 *  methods.
-		 */
-		if (!uc_is_general_category (first_unichar,
-		                             parser->allowed_start)) {
-			/* Skip this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-
-		/* check if word is reserved */
-		if (parser->parse_reserved_words &&
-		    word_length == 2 &&
-		    parser->txt[parser->cursor] == 'o' &&
-		    parser->txt[parser->cursor + 1] == 'r') {
-			/* Skip this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-
-		/* compute truncated word length if needed (to avoid extremely
-		 *  long words)*/
-		truncated_length = (word_length < WORD_BUFFER_LENGTH ?
-		                    word_length :
-		                    WORD_BUFFER_LENGTH - 1);
-
-		/* Enable UNAC stripping only if no ASCII and no CJK */
-		do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
-		                            truncated_length) &&
-		            !IS_CJK_UCS4 (first_unichar));
-
-		/* Process the word here. If it fails, we can still go
-		 *  to the next one. Returns newly allocated string
-		 *  always */
-		processed_word = tracker_parser_process_word (parser,
-		                                              &(parser->txt[parser->cursor]),
-		                                              truncated_length,
-		                                              do_strip);
-		if (!processed_word) {
-			/* Skip this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
-		}
-	}
-
-	/* If we got a word here, set output */
-	if (processed_word) {
-		/* Set outputs */
-		*byte_offset_start = parser->cursor;
-		*byte_offset_end = parser->cursor + word_length;
-
-		/* Update cursor */
-		parser->cursor += word_length;
-
-		parser->word_length = strlen (processed_word);
-		parser->word = processed_word;
-
-		return TRUE;
-	}
-
-	/* No more words... */
-	return FALSE;
-}
-
-#endif /* !HAVE_LIBUNISTRING */
-
-
 TrackerParser *
 tracker_parser_new (TrackerLanguage *language,
                     gint             max_word_length)
@@ -743,12 +442,7 @@ tracker_parser_new (TrackerLanguage *language,
 
 	parser->max_word_length = max_word_length;
 	parser->word_length = 0;
-
-#ifndef HAVE_LIBUNISTRING
 	parser->attrs = NULL;
-#else
-	parser->word_break_flags = NULL;
-#endif /* !HAVE_LIBUNISTRING */
 
 	return parser;
 }
@@ -762,11 +456,7 @@ tracker_parser_free (TrackerParser *parser)
 		g_object_unref (parser->language);
 	}
 
-#ifndef HAVE_LIBUNISTRING
 	g_free (parser->attrs);
-#else
-	g_free (parser->word_break_flags);
-#endif /* !HAVE_LIBUNISTRING */
 
 	g_free (parser->word);
 
@@ -785,13 +475,12 @@ tracker_parser_reset (TrackerParser *parser,
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
 
-#ifndef HAVE_LIBUNISTRING
+
 	g_free (parser->attrs);
 	parser->attrs = NULL;
 
 	parser->cursor = txt;
 	parser->encoding = get_encoding (txt);
-#endif
 
 	parser->enable_stemmer = enable_stemmer;
 	parser->enable_stop_words = enable_stop_words;
@@ -806,27 +495,6 @@ tracker_parser_reset (TrackerParser *parser,
 
 	parser->word_position = 0;
 
-#ifdef HAVE_LIBUNISTRING
-
-	parser->cursor = 0;
-
-	g_free (parser->word_break_flags);
-
-	/* Create array of flags, same size as original text. */
-	parser->word_break_flags = g_malloc (txt_size);
-
-	/* Get wordbreak flags in the whole string */
-	u8_wordbreaks ((const uint8_t *)txt,
-	               (size_t) txt_size,
-	               (char *)parser->word_break_flags);
-
-	/* Prepare a custom category which is a combination of the
-	 * desired ones */
-	parser->allowed_start = UC_LETTER;
-	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
-	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
-
-#else
 	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
 		PangoLogAttr *attrs;
 
@@ -848,11 +516,8 @@ tracker_parser_reset (TrackerParser *parser,
 		parser->attrs = attrs;
 		parser->attr_pos = 0;
 	}
-#endif /* !HAVE_LIBUNISTRING */
 }
 
-
-#ifndef HAVE_LIBUNISTRING
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
                              const gchar    *word,
@@ -862,7 +527,7 @@ tracker_parser_process_word (TrackerParser *parser,
 	gchar *stem_word;
 	gchar *str;
 	gchar *stripped_word;
-	guint  bytes, len;
+	gsize  bytes, len;
 
 	g_return_val_if_fail (parser != NULL, NULL);
 	g_return_val_if_fail (word != NULL, NULL);
@@ -889,7 +554,7 @@ tracker_parser_process_word (TrackerParser *parser,
 #endif
 
 		if (do_strip) {
-			stripped_word = strip_word (word, bytes, &len);
+			stripped_word = tracker_parser_unaccent_string (word, bytes, &len);
 
 			/* Log after UNAC stripping */
 #if TRACKER_PARSER_DEBUG_HEX
@@ -951,152 +616,6 @@ tracker_parser_process_word (TrackerParser *parser,
 	return str;
 }
 
-#else
-
-/* libunistring version of the word processor. */
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
-                             const gchar    *word,
-                             gint           length,
-                             gboolean       do_strip)
-{
-	gchar word_buffer [WORD_BUFFER_LENGTH];
-	gchar *normalized = NULL;
-	gchar *stripped = NULL;
-	gchar *stemmed = NULL;
-	size_t new_word_length;
-
-	g_return_val_if_fail (parser != NULL, NULL);
-	g_return_val_if_fail (word != NULL, NULL);
-
-	/* If length is set as -1, the input word MUST be NIL-terminated.
-	 * Otherwise, this restriction is not needed as the length to process
-	 *  is given as input argument */
-	if (length < 0) {
-		length = strlen (word);
-	}
-
-	/* Log original word */
-#if TRACKER_PARSER_DEBUG_HEX
-	{
-		gchar *aux;
-		gchar *word_aux;
-
-		/* Word may not come NIL-terminated */
-		word_aux = g_malloc (length + 1);
-		memcpy (word_aux, word, length);
-		word_aux[length] = '\0';
-
-		aux = tracker_strhex (word, length, ':');
-		g_message ("ORIGINAL word: '%s' (%s)",
-		           word_aux, aux);
-		g_free (aux);
-		g_free (word_aux);
-	}
-#endif
-
-	/* Leave space for last NIL */
-	new_word_length = WORD_BUFFER_LENGTH - 1;
-
-	/* Casefold and NFC normalization in output.
-	 *  NOTE: if the output buffer is not big enough, u8_casefold will
-	 *  return a newly-allocated buffer. */
-	normalized = u8_casefold ((const uint8_t *)word,
-	                          length,
-	                          uc_locale_language (),
-	                          UNINORM_NFC,
-	                          word_buffer,
-	                          &new_word_length);
-
-	/* Case folding + Normalization failed, skip this word */
-	g_return_val_if_fail (normalized != NULL, NULL);
-
-	/* If output buffer is not the same as the one passed to
-	 *  u8_casefold, we know it was newly-allocated, so need
-	 *  to resize it in 1 byte to add last NIL */
-	if (normalized != word_buffer) {
-		normalized = g_realloc (normalized, new_word_length + 1);
-	}
-
-	/* Set output NIL */
-	normalized[new_word_length] = '\0';
-
-#if TRACKER_PARSER_DEBUG_HEX
-	{
-		gchar *aux;
-		aux = tracker_strhex (normalized, new_word_length, ':');
-		g_message (" After Casefolding and NFC normalization: '%s' (%s)",
-		           normalized, aux);
-		g_free (aux);
-	}
-#endif
-
-	/* UNAC stripping needed? */
-	if (do_strip) {
-		guint32 stripped_word_length;
-
-		stripped = strip_word (normalized,
-		                       new_word_length,
-		                       &stripped_word_length);
-
-		if (stripped) {
-			/* Log after UNAC stripping */
-#if TRACKER_PARSER_DEBUG_HEX
-			{
-				gchar *aux;
-				aux = tracker_strhex (stripped, stripped_word_length, ':');
-				g_message ("  After UNAC stripping: '%s' (%s)",
-				           stripped, aux);
-				g_free (aux);
-			}
-#endif
-			new_word_length = stripped_word_length;
-		}
-	}
-
-
-	/* Stemming needed? */
-	if (parser->enable_stemmer) {
-		stemmed = tracker_language_stem_word (parser->language,
-		                                      stripped ? stripped : normalized,
-		                                      new_word_length);
-#if TRACKER_PARSER_DEBUG_HEX
-		if (stemmed) {
-			gchar *aux;
-			aux = tracker_strhex (stemmed, strlen (stemmed), ':');
-			g_message ("   After stemming: '%s' (%s)",
-			           stemmed, aux);
-			g_free (aux);
-		}
-#endif
-	}
-
-	/* If stemmed wanted and succeeded, free previous and return it */
-	if (stemmed) {
-		g_free (stripped);
-		if (normalized != word_buffer) {
-			g_free (normalized);
-		}
-		return stemmed;
-	}
-
-	/* If stripped wanted and succeeded, free previous and return it */
-	if (stripped) {
-		if (normalized != word_buffer) {
-			g_free (normalized);
-		}
-		return stripped;
-	}
-
-	/* It may be the case that no stripping and no stemming was needed, and
-	 * that the output buffer in stack was enough for case-folding and
-	 * normalization. In this case, need to strdup() the string to return it */
-	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
-}
-
-#endif /* !HAVE_LIBUNISTRING */
-
-
 const gchar *
 tracker_parser_next (TrackerParser *parser,
                      gint          *position,
@@ -1113,8 +632,6 @@ tracker_parser_next (TrackerParser *parser,
 	g_free (parser->word);
 	parser->word = NULL;
 
-
-#ifndef HAVE_LIBUNISTRING
 	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
 		if (pango_next (parser, &byte_start, &byte_end)) {
 			str = parser->word;
@@ -1122,14 +639,13 @@ tracker_parser_next (TrackerParser *parser,
 		parser->word_position++;
 
 		*stop_word = FALSE;
-	} else
-#endif /* !HAVE_LIBUNISTRING */
-	{
+	} else {
 		if (parser_next (parser, &byte_start, &byte_end)) {
 			str = parser->word;
 		}
 
-		if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
+		if (parser->enable_stop_words &&
+		    tracker_language_is_stop_word (parser->language, str)) {
 			*stop_word = TRUE;
 		} else {
 			parser->word_position++;
@@ -1145,5 +661,3 @@ tracker_parser_next (TrackerParser *parser,
 	return str;
 }
 
-
-
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 5ffa73e..0052cb0 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
  * Copyright (C) 2008, Nokia <ivan frade nokia com>
@@ -27,67 +28,21 @@
 #include <unac.h>
 #endif
 
-#ifdef HAVE_LIBUNISTRING
 /* libunistring versions prior to 9.1.2 need this hack */
 #define _UNUSED_PARAMETER_
 #include <unistr.h>
 #include <uniwbrk.h>
 #include <unictype.h>
 #include <unicase.h>
-#else
-#include <pango/pango.h>
-#endif
 
+#include <libtracker-common/tracker-common.h>
 #include "tracker-parser.h"
+#include "tracker-parser-utils.h"
 
 /* Define to 1 if you want to enable debugging logs showing HEX contents
  * of the words being parsed */
 #define TRACKER_PARSER_DEBUG_HEX 0
 
-
-#ifndef HAVE_LIBUNISTRING
-
-/* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
- * 0x9FA5, 0x20000 - <= 0x2A6D6
- */
-#define NEED_PANGO(c)            (((c) >= 0x3400 && (c) <= 0x4DB5)  ||  \
-                                  ((c) >= 0x4E00 && (c) <= 0x9FA5)  ||  \
-                                  ((c) >= 0x20000 && (c) <= 0x2A6D6))
-#define IS_LATIN(c)              (((c) <= 0x02AF) ||	\
-                                  ((c) >= 0x1E00 && (c) <= 0x1EFF))
-#define IS_ASCII(c)              ((c) <= 0x007F)
-#define IS_ASCII_ALPHA_LOWER(c)  ((c) >= 0x0061 && (c) <= 0x007A)
-#define IS_ASCII_ALPHA_HIGHER(c) ((c) >= 0x0041 && (c) <= 0x005A)
-#define IS_ASCII_NUMERIC(c)      ((c) >= 0x0030 && (c) <= 0x0039)
-#define IS_ASCII_IGNORE(c)       ((c) <= 0x002C)
-#define IS_HYPHEN(c)             ((c) == 0x002D)
-#define IS_UNDERSCORE(c)         ((c) == 0x005F)
-#define IS_NEWLINE(c)            ((c) == 0x000D)
-#define IS_O(c)                          ((c) == 0x006F)
-#define IS_R(c)                          ((c) == 0x0072)
-
-typedef enum {
-	TRACKER_PARSER_WORD_ASCII_HIGHER,
-	TRACKER_PARSER_WORD_ASCII_LOWER,
-	TRACKER_PARSER_WORD_HYPHEN,
-	TRACKER_PARSER_WORD_UNDERSCORE,
-	TRACKER_PARSER_WORD_NUM,
-	TRACKER_PARSER_WORD_ALPHA_HIGHER,
-	TRACKER_PARSER_WORD_ALPHA_LOWER,
-	TRACKER_PARSER_WORD_ALPHA,
-	TRACKER_PARSER_WORD_ALPHA_NUM,
-	TRACKER_PARSER_WORD_IGNORE
-} TrackerParserWordType;
-
-typedef enum {
-	TRACKER_PARSER_ENCODING_ASCII,
-	TRACKER_PARSER_ENCODING_LATIN,
-	TRACKER_PARSER_ENCODING_CJK,
-	TRACKER_PARSER_ENCODING_OTHER
-} TrackerParserEncoding;
-
-#else
-
 /* ASCII-7 is in range [0x00,0x7F] */
 #define IS_ASCII_BYTE(c) ((c) <= 0x7F)
 
@@ -99,8 +54,6 @@ typedef enum {
 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
 #define WORD_BUFFER_LENGTH 512
 
-#endif /* !HAVE_LIBUNISTRING */
-
 
 struct TrackerParser {
 	const gchar           *txt;
@@ -119,480 +72,14 @@ struct TrackerParser {
 	gint                    word_length;
 	guint                   word_position;
 
-#ifndef HAVE_LIBUNISTRING
-	TrackerParserEncoding   encoding;
-	const gchar             *cursor;
-
-	/* Pango members for CJK text parsing */
-	PangoLogAttr          *attrs;
-	guint                  attr_length;
-	guint                  attr_pos;
-#else
 	/* Cursor, as index of the input array of bytes */
 	gsize                  cursor;
 	/* libunistring flags array */
 	gchar                 *word_break_flags;
 	/* general category of the  start character in words */
 	uc_general_category_t  allowed_start;
-#endif /* !HAVE_LIBUNISTRING */
 };
 
-
-#if TRACKER_PARSER_DEBUG_HEX
-/* Based on GNU PDF's pdf_text_test_get_hex() */
-static gchar *
-tracker_strhex (const gchar *data,
-                const gsize size,
-                gchar delimiter)
-{
-	gint i;
-	gint j;
-	guint new_str_length;
-	gchar *new_str;
-	gchar new_hex_char [3];
-
-	/* Get new string length. If input string has N bytes, we need:
-	 * - 1 byte for last NUL char
-	 * - 2N bytes for hexadecimal char representation of each byte...
-	 * - N-1 bytes for the separator ':'
-	 * So... a total of (1+2N+N-1) = 3N bytes are needed... */
-	new_str_length =  3 * size;
-
-	/* Allocate memory for new array and initialize contents to NUL */
-	new_str = g_malloc0 (new_str_length);
-
-	/* Print hexadecimal representation of each byte... */
-	for(i=0, j=0; i<size; i++, j+=3) {
-		memset (new_hex_char, 0, 3);
-		/* Print character in helper array... */
-		sprintf (new_hex_char, "%02X", (guint8)(data[i]));
-		/* Copy to output string... */
-		memcpy (&new_str[j],&new_hex_char[0],2);
-		/* And if needed, add separator */
-		if(i != (size-1) ) {
-			new_str[j+2] = delimiter;
-		}
-	}
-
-	/* Set output string */
-	return new_str;
-}
-
-#endif /* TRACKER_PARSER_DEBUG_HEX */
-
-
-#ifndef HAVE_LIBUNISTRING
-static inline TrackerParserWordType
-get_word_type (gunichar c)
-{
-	/* Fast ascii handling */
-	if (IS_ASCII (c)) {
-		if (IS_ASCII_ALPHA_LOWER (c)) {
-			return TRACKER_PARSER_WORD_ASCII_LOWER;
-		}
-
-		if (IS_ASCII_ALPHA_HIGHER (c)) {
-			return TRACKER_PARSER_WORD_ASCII_HIGHER;
-		}
-
-		if (IS_ASCII_IGNORE (c)) {
-			return TRACKER_PARSER_WORD_IGNORE;
-		}
-
-		if (IS_ASCII_NUMERIC (c)) {
-			return TRACKER_PARSER_WORD_NUM;
-		}
-
-		if (IS_HYPHEN (c)) {
-			return TRACKER_PARSER_WORD_HYPHEN;
-		}
-
-		if (IS_UNDERSCORE (c)) {
-			return TRACKER_PARSER_WORD_UNDERSCORE;
-		}
-	} else {
-		if (g_unichar_isalpha (c)) {
-			if (!g_unichar_isupper (c)) {
-				return TRACKER_PARSER_WORD_ALPHA_LOWER;
-			} else {
-				return TRACKER_PARSER_WORD_ALPHA_HIGHER;
-			}
-		} else if (g_unichar_isdigit (c)) {
-			return TRACKER_PARSER_WORD_NUM;
-		}
-	}
-
-	return TRACKER_PARSER_WORD_IGNORE;
-}
-#endif /* !HAVE_LIBUNISTRING */
-
-
-
-static inline gchar *
-strip_word (const gchar *str,
-            gint         length,
-            guint32     *len)
-{
-#ifdef HAVE_UNAC
-	GError *error = NULL;
-	gchar *str_utf16;
-	gsize utf16_len, unaccented_len, final_len;
-	gchar *unaccented_str = NULL;
-	gchar *s = NULL;
-
-	*len = 0;
-
-	/* unac_string() does roughly the same than below, plus it
-	 * corrupts memory in 64bit systems, so avoid it for now.
-	 */
-	str_utf16 = g_convert (str, length, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
-	if (error) {
-		g_warning ("Could not convert to UTF-16: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	if (unac_string_utf16 (str_utf16, utf16_len,
-	                       &unaccented_str, &unaccented_len) != 0) {
-		g_warning ("UNAC failed to strip accents");
-		g_free (str_utf16);
-		return NULL;
-	}
-
-	g_free (str_utf16);
-
-	s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
-	g_free (unaccented_str);
-
-	if (error) {
-		g_warning ("Could not convert back to UTF-8: %s", error->message);
-		g_error_free (error);
-		return NULL;
-	}
-
-	*len = (guint32) final_len;
-
-	return s;
-#else
-	*len = length;
-	return NULL;
-#endif
-}
-
-
-#ifndef HAVE_LIBUNISTRING
-static TrackerParserEncoding
-get_encoding (const gchar *txt)
-{
-	const gchar *p;
-	gunichar     c;
-	gint         i = 0;
-
-	/* Grab first 255 non-whitespace chars and test */
-	for (p = txt; *p && i < 255; p = g_utf8_next_char (p)) {
-		c = g_utf8_get_char (p);
-
-		if (!g_unichar_isspace (c)) {
-			i++;
-		}
-
-		if (IS_ASCII(c)) continue;
-
-		if (IS_LATIN(c)) return TRACKER_PARSER_ENCODING_LATIN;
-
-		if (NEED_PANGO(c)) return TRACKER_PARSER_ENCODING_CJK;
-
-		return TRACKER_PARSER_ENCODING_OTHER;
-	}
-
-	return TRACKER_PARSER_ENCODING_ASCII;
-
-}
-
-#endif /* !HAVE_LIBUNISTRING */
-
-
-static gboolean
-is_stop_word (TrackerLanguage *language,
-              const gchar     *word)
-{
-	GHashTable *stop_words;
-
-	if (!word) {
-		return FALSE;
-	}
-
-	stop_words = tracker_language_get_stop_words (language);
-
-	return g_hash_table_lookup (stop_words, word) != NULL;
-}
-
-
-#ifndef HAVE_LIBUNISTRING
-
-static gboolean
-pango_next (TrackerParser *parser,
-            gint          *byte_offset_start,
-            gint          *byte_offset_end)
-
-{
-	/* CJK text does not need stemming or other treatment */
-	gint    word_start = -1;
-	gint    old_word_start = -1;
-	guint   i;
-
-	for (i = parser->attr_pos; i < parser->attr_length; i++) {
-		if (parser->attrs[i].is_word_start) {
-			word_start = i;
-			continue;
-		}
-
-		if (parser->attrs[i].is_word_end && word_start != old_word_start) {
-			gchar *start_word, *end_word;
-
-			old_word_start = word_start;
-
-			start_word = g_utf8_offset_to_pointer (parser->txt, word_start);
-			end_word = g_utf8_offset_to_pointer (parser->txt, i);
-
-			if (start_word != end_word) {
-				gchar *str;
-				gchar *index_word;
-
-				/* Normalize word */
-				str = g_utf8_casefold (start_word, end_word - start_word);
-				if (!str) {
-					continue;
-				}
-
-				index_word = g_utf8_normalize (str, -1, G_NORMALIZE_NFC);
-				g_free (str);
-
-				if (!index_word) {
-					continue;
-				}
-
-				parser->word_length = strlen (index_word);
-				parser->word = index_word;
-
-				*byte_offset_start = (start_word - parser->txt);
-				*byte_offset_end = *byte_offset_start + (end_word - start_word);
-				parser->attr_pos = i;
-
-
-				return TRUE;
-
-			}
-
-			word_start = i;
-		}
-	}
-
-	parser->attr_pos = i;
-
-	return FALSE;
-}
-
-
-static gboolean
-parser_next (TrackerParser *parser,
-             gint          *byte_offset_start,
-             gint          *byte_offset_end)
-{
-	TrackerParserWordType word_type;
-	gunichar              word[64];
-	gboolean              is_valid;
-	guint                 length;
-	gint                  char_count = 0;
-	glong                 bytes;
-	const gchar          *p;
-	const gchar          *start;
-	const gchar          *end;
-	gboolean              do_strip = FALSE;
-
-	*byte_offset_start = 0;
-	*byte_offset_end = 0;
-
-	g_return_val_if_fail (parser, FALSE);
-
-	if (!parser->cursor) {
-		return FALSE;
-	}
-
-	word_type = TRACKER_PARSER_WORD_IGNORE;
-	is_valid = TRUE;
-	length = 0;
-	bytes = 0;
-
-	start = NULL;
-	end = NULL;
-
-	for (p = parser->cursor; *p && *p != '\0'; p = g_utf8_next_char (p)) {
-		TrackerParserWordType type;
-		gunichar              c;
-
-		char_count++;
-		c = g_utf8_get_char (p);
-		type = get_word_type (c);
-
-		if (type == TRACKER_PARSER_WORD_IGNORE ||
-		    (parser->delimit_words &&
-		     (type == TRACKER_PARSER_WORD_HYPHEN ||
-		      type == TRACKER_PARSER_WORD_UNDERSCORE))) {
-			if (!start) {
-				continue;
-			} else {
-				/* word break */
-
-				/* check if word is reserved */
-				if (is_valid && parser->parse_reserved_words) {
-					if (length == 2 && word[0] == 'o' && word[1] == 'r') {
-						break;
-					}
-				}
-
-				if (!is_valid ||
-				    word_type == TRACKER_PARSER_WORD_NUM) {
-					word_type = TRACKER_PARSER_WORD_IGNORE;
-					is_valid = TRUE;
-					length = 0;
-					bytes = 0;
-					start = NULL;
-					end = NULL;
-					do_strip = FALSE;
-
-					continue;
-				}
-
-				break;
-			}
-		}
-
-		if (!is_valid) {
-			continue;
-		}
-
-		if (!start) {
-			start = g_utf8_offset_to_pointer (parser->cursor, char_count-1);
-
-			/* Valid words must start with an alpha or
-			 * underscore if we are filtering.
-			 */
-
-			if (type == TRACKER_PARSER_WORD_NUM) {
-				is_valid = FALSE;
-				continue;
-			} else {
-				if (type == TRACKER_PARSER_WORD_HYPHEN) {
-					is_valid = parser->parse_reserved_words;
-					continue;
-				}
-			}
-		}
-
-		if (length >= parser->max_word_length) {
-			continue;
-		}
-
-		length++;
-
-		switch (type) {
-		case TRACKER_PARSER_WORD_ASCII_HIGHER:
-			c += 32;
-
-                        /* Fall through */
-		case TRACKER_PARSER_WORD_ASCII_LOWER:
-		case TRACKER_PARSER_WORD_HYPHEN:
-		case TRACKER_PARSER_WORD_UNDERSCORE:
-			if (word_type == TRACKER_PARSER_WORD_NUM ||
-			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
-				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
-			} else {
-				word_type = TRACKER_PARSER_WORD_ALPHA;
-			}
-
-			break;
-
-		case TRACKER_PARSER_WORD_NUM:
-			if (word_type == TRACKER_PARSER_WORD_ALPHA ||
-			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
-				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
-			} else {
-				word_type = TRACKER_PARSER_WORD_NUM;
-			}
-			break;
-
-		case TRACKER_PARSER_WORD_ALPHA_HIGHER:
-			c = g_unichar_tolower (c);
-
-			/* Fall through */
-		case TRACKER_PARSER_WORD_ALPHA_LOWER:
-			if (!do_strip) {
-				do_strip = TRUE;
-			}
-
-			if (word_type == TRACKER_PARSER_WORD_NUM ||
-			    word_type == TRACKER_PARSER_WORD_ALPHA_NUM) {
-				word_type = TRACKER_PARSER_WORD_ALPHA_NUM;
-			} else {
-				word_type = TRACKER_PARSER_WORD_ALPHA;
-			}
-
-			break;
-
-		case TRACKER_PARSER_WORD_ALPHA:
-		case TRACKER_PARSER_WORD_ALPHA_NUM:
-		case TRACKER_PARSER_WORD_IGNORE:
-		default:
-			break;
-		}
-
-		word[length -1] = c;
-	}
-
-	parser->cursor = NULL;
-
-	if (!is_valid) {
-		return FALSE;
-	}
-
-	if (word_type == TRACKER_PARSER_WORD_ALPHA_NUM || word_type == TRACKER_PARSER_WORD_ALPHA) {
-		gchar       *utf8;
-		gchar       *processed_word;
-
-
-
-		utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
-
-		if (!utf8) {
-			return FALSE;
-		}
-
-		*byte_offset_start = start-parser->txt;
-		*byte_offset_end = *byte_offset_start + bytes;
-
-		parser->cursor = parser->txt + *byte_offset_end;
-
-		processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
-		g_free (utf8);
-
-		if (processed_word) {
-			parser->word_length = strlen (processed_word);
-			parser->word = processed_word;
-
-			return TRUE;
-		}
-
-	}
-
-	return FALSE;
-
-}
-
-#else
-
-
 /* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
  *  UNAC stripping.
  * Just check byte per byte, and if any of the bytes is >127, then it's not
@@ -611,14 +98,12 @@ is_ascii_word (const gchar *word,
 	return TRUE;
 }
 
-
 /* libunistring-based parser */
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
              gint          *byte_offset_end)
 {
-
 	gsize word_length = 0;
 	gchar *processed_word = NULL;
 
@@ -727,9 +212,6 @@ parser_next (TrackerParser *parser,
 	return FALSE;
 }
 
-#endif /* !HAVE_LIBUNISTRING */
-
-
 TrackerParser *
 tracker_parser_new (TrackerLanguage *language,
                     gint             max_word_length)
@@ -746,11 +228,7 @@ tracker_parser_new (TrackerLanguage *language,
 	parser->max_word_length = max_word_length;
 	parser->word_length = 0;
 
-#ifndef HAVE_LIBUNISTRING
-	parser->attrs = NULL;
-#else
 	parser->word_break_flags = NULL;
-#endif /* !HAVE_LIBUNISTRING */
 
 	return parser;
 }
@@ -764,11 +242,7 @@ tracker_parser_free (TrackerParser *parser)
 		g_object_unref (parser->language);
 	}
 
-#ifndef HAVE_LIBUNISTRING
-	g_free (parser->attrs);
-#else
 	g_free (parser->word_break_flags);
-#endif /* !HAVE_LIBUNISTRING */
 
 	g_free (parser->word);
 
@@ -787,14 +261,6 @@ tracker_parser_reset (TrackerParser *parser,
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
 
-#ifndef HAVE_LIBUNISTRING
-	g_free (parser->attrs);
-	parser->attrs = NULL;
-
-	parser->cursor = txt;
-	parser->encoding = get_encoding (txt);
-#endif
-
 	parser->enable_stemmer = enable_stemmer;
 	parser->enable_stop_words = enable_stop_words;
 	parser->delimit_words = delimit_words;
@@ -808,8 +274,6 @@ tracker_parser_reset (TrackerParser *parser,
 
 	parser->word_position = 0;
 
-#ifdef HAVE_LIBUNISTRING
-
 	parser->cursor = 0;
 
 	g_free (parser->word_break_flags);
@@ -827,134 +291,8 @@ tracker_parser_reset (TrackerParser *parser,
 	parser->allowed_start = UC_LETTER;
 	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
 	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
-
-#else
-	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
-		PangoLogAttr *attrs;
-
-		if (parser->txt_size == -1) {
-			parser->txt_size = strlen (parser->txt);
-		}
-
-		parser->attr_length = g_utf8_strlen (parser->txt, parser->txt_size) + 1;
-
-		attrs = g_new0 (PangoLogAttr, parser->attr_length);
-
-		pango_get_log_attrs (parser->txt,
-		                     txt_size,
-		                     0,
-		                     pango_language_from_string ("C"),
-		                     attrs,
-		                     parser->attr_length);
-
-		parser->attrs = attrs;
-		parser->attr_pos = 0;
-	}
-#endif /* !HAVE_LIBUNISTRING */
 }
 
-
-#ifndef HAVE_LIBUNISTRING
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
-                             const gchar    *word,
-                             gint           length,
-                             gboolean       do_strip)
-{
-	gchar *stem_word;
-	gchar *str;
-	gchar *stripped_word;
-	guint  bytes, len;
-
-	g_return_val_if_fail (parser != NULL, NULL);
-	g_return_val_if_fail (word != NULL, NULL);
-
-	str = NULL;
-	stripped_word = NULL;
-
-	if (word) {
-		if (length == -1) {
-			bytes = strlen (word);
-		} else {
-			bytes = length;
-		}
-
-		/* Log original word */
-#if TRACKER_PARSER_DEBUG_HEX
-		{
-			gchar *aux;
-			aux = tracker_strhex (word, bytes, ':');
-			g_message ("ORIGINAL word: '%s' (%s)",
-			           word, aux);
-			g_free (aux);
-		}
-#endif
-
-		if (do_strip) {
-			stripped_word = strip_word (word, bytes, &len);
-
-			/* Log after UNAC stripping */
-#if TRACKER_PARSER_DEBUG_HEX
-			{
-				gchar *aux;
-				aux = tracker_strhex (stripped_word, len, ':');
-				g_message (" After UNAC stripping: '%s' (%s)",
-				           stripped_word, aux);
-				g_free (aux);
-			}
-#endif
-		} else {
-			stripped_word = NULL;
-		}
-
-
-		if (!stripped_word) {
-			str = g_utf8_normalize (word,
-			                        bytes,
-			                        G_NORMALIZE_NFC);
-		} else {
-			str = g_utf8_normalize (stripped_word,
-			                        len,
-			                        G_NORMALIZE_NFC);
-			g_free (stripped_word);
-		}
-
-		/* Log after normalization */
-#if TRACKER_PARSER_DEBUG_HEX
-		{
-			gchar *aux;
-			aux = tracker_strhex (str, strlen ((gchar *)str), ':');
-			g_message ("  After NFC normalization: '%s' (%s)",
-			           str, aux);
-			g_free (aux);
-		}
-#endif
-
-
-		if (!str) {
-			return NULL;
-		}
-
-		if (!parser->enable_stemmer) {
-			return str;
-		}
-
-		len = strlen (str);
-
-		stem_word = tracker_language_stem_word (parser->language, str, len);
-
-		if (stem_word) {
-			g_free (str);
-
-			return stem_word;
-		}
-	}
-
-	return str;
-}
-
-#else
-
 /* libunistring version of the word processor. */
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
@@ -1035,11 +373,11 @@ tracker_parser_process_word (TrackerParser *parser,
 
 	/* UNAC stripping needed? */
 	if (do_strip) {
-		guint32 stripped_word_length;
+		gsize stripped_word_length;
 
-		stripped = strip_word (normalized,
-		                       new_word_length,
-		                       &stripped_word_length);
+		stripped = tracker_parser_unaccent_string (normalized,
+		                                           new_word_length,
+		                                           &stripped_word_length);
 
 		if (stripped) {
 			/* Log after UNAC stripping */
@@ -1096,9 +434,6 @@ tracker_parser_process_word (TrackerParser *parser,
 	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
 }
 
-#endif /* !HAVE_LIBUNISTRING */
-
-
 const gchar *
 tracker_parser_next (TrackerParser *parser,
                      gint          *position,
@@ -1115,28 +450,16 @@ tracker_parser_next (TrackerParser *parser,
 	g_free (parser->word);
 	parser->word = NULL;
 
+	if (parser_next (parser, &byte_start, &byte_end)) {
+		str = parser->word;
+	}
 
-#ifndef HAVE_LIBUNISTRING
-	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
-		if (pango_next (parser, &byte_start, &byte_end)) {
-			str = parser->word;
-		}
+	if (parser->enable_stop_words &&
+	    tracker_language_is_stop_word (parser->language, str)) {
+		*stop_word = TRUE;
+	} else {
 		parser->word_position++;
-
 		*stop_word = FALSE;
-	} else
-#endif /* !HAVE_LIBUNISTRING */
-	{
-		if (parser_next (parser, &byte_start, &byte_end)) {
-			str = parser->word;
-		}
-
-		if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
-			*stop_word = TRUE;
-		} else {
-			parser->word_position++;
-			*stop_word = FALSE;
-		}
 	}
 
 	*word_length = parser->word_length;
@@ -1147,5 +470,3 @@ tracker_parser_next (TrackerParser *parser,
 	return str;
 }
 
-
-
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
new file mode 100644
index 0000000..503de1b
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#ifdef HAVE_UNAC
+#include <unac.h>
+#endif
+
+#include "tracker-parser-utils.h"
+
+/* Strips accents from the UTF-8 string @str (@ilength bytes) using UNAC.
+ * Returns a newly-allocated UTF-8 string (free with g_free()) and stores its
+ * length in bytes in @p_olength. Returns NULL with @p_olength set to 0 when
+ * a conversion or UNAC fails, or when built without HAVE_UNAC. */
+gchar *
+tracker_parser_unaccent_string (const gchar *str,
+                                gsize        ilength,
+                                gsize        *p_olength)
+{
+#ifdef HAVE_UNAC
+	GError *error = NULL;
+	gchar *str_utf16;
+	gsize utf16_len, unaccented_len, final_len;
+	gchar *unaccented_str = NULL;
+	gchar *s = NULL;
+
+	*p_olength = 0;
+	/* unac_string() does roughly the same as below, plus it
+	 * corrupts memory in 64bit systems, so avoid it for now.
+	 */
+	str_utf16 = g_convert (str, ilength, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
+	if (error) {
+		g_warning ("Could not convert to UTF-16: %s", error->message);
+		g_error_free (error);
+		return NULL;
+	}
+
+	if (unac_string_utf16 (str_utf16, utf16_len,
+	                       &unaccented_str, &unaccented_len) != 0) {
+		g_warning ("UNAC failed to strip accents");
+		g_free (str_utf16);
+		return NULL;
+	}
+	g_free (str_utf16);
+
+	s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
+	g_free (unaccented_str);
+	if (error) {
+		g_warning ("Could not convert back to UTF-8: %s", error->message);
+		g_error_free (error);
+		return NULL;
+	}
+	*p_olength = final_len;
+	return s;
+#else
+	/* No UNAC: still define the out-parameter so callers never read garbage */
+	*p_olength = 0;
+	return NULL;
+#endif
+}
+
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
new file mode 100644
index 0000000..4e869b1
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#ifndef __TRACKER_PARSER_UTILS_H__
+#define __TRACKER_PARSER_UTILS_H__
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+gchar *tracker_parser_unaccent_string (const gchar *str,        /* UTF-8 text to unaccent */
+                                       gsize        ilength,    /* length of @str, in bytes */
+                                       gsize        *p_olength); /* out: length in bytes of the result */
+
+G_END_DECLS
+
+#endif /* __TRACKER_PARSER_UTILS_H__ */
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 66535c9..e118ae7 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -35,10 +35,10 @@ TrackerParser *tracker_parser_new             (TrackerLanguage *language,
 void           tracker_parser_reset           (TrackerParser   *parser,
                                                const gchar     *txt,
                                                gint             txt_size,
-                                               gboolean                 delimit_words,
-                                               gboolean                 enable_stemmer,
-                                               gboolean                 enable_stop_words,
-                                               gboolean                 parse_reserved_words);
+                                               gboolean         delimit_words,
+                                               gboolean         enable_stemmer,
+                                               gboolean         enable_stop_words,
+                                               gboolean         parse_reserved_words);
 
 const gchar *  tracker_parser_next            (TrackerParser   *parser,
                                                gint            *position,
@@ -50,7 +50,7 @@ const gchar *  tracker_parser_next            (TrackerParser   *parser,
 gchar *        tracker_parser_process_word    (TrackerParser   *parser,
                                                const char      *word,
                                                gint             length,
-                                               gboolean                 do_strip);
+                                               gboolean         do_strip);
 void           tracker_parser_free            (TrackerParser   *parser);
 
 G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]