[tracker] FTS parsers: remove tracker_parser_process_word() from parser API



commit 9b1ffc04f3ba006a710babe46d95181ac8020492
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Fri May 28 10:00:10 2010 +0200

    FTS parsers: remove tracker_parser_process_word() from parser API

 src/libtracker-fts/tracker-parser-glib.c         |  156 +++++-----
 src/libtracker-fts/tracker-parser-libicu.c       |  387 +++++++++-------------
 src/libtracker-fts/tracker-parser-libunistring.c |  268 +++++++--------
 src/libtracker-fts/tracker-parser.h              |    5 -
 4 files changed, 364 insertions(+), 452 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index 9892829..670a46f 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -162,6 +162,83 @@ get_encoding (const gchar *txt)
 
 }
 
+static gchar *
+process_word_utf8 (TrackerParser *parser,
+		   const gchar   *word,
+		   gint           length,
+		   gboolean       do_strip)
+{
+	gchar *stem_word;
+	gchar *str;
+	gchar *stripped_word;
+	gsize  bytes, len;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	str = NULL;
+	stripped_word = NULL;
+
+	if (word) {
+		if (length == -1) {
+			bytes = strlen (word);
+		} else {
+			bytes = length;
+		}
+
+		/* Log original word */
+		tracker_parser_message_hex ("ORIGINAL word",
+		                            word, bytes);
+
+		if (parser->enable_unaccent && do_strip) {
+			stripped_word = tracker_parser_unaccent_utf8_word (word,
+			                                                   bytes,
+			                                                   &len);
+
+			/* Log after UNAC stripping */
+			tracker_parser_message_hex (" After UNAC stripping",
+			                            stripped_word, len);
+		} else {
+			stripped_word = NULL;
+		}
+
+		if (!stripped_word) {
+			str = g_utf8_normalize (word,
+			                        bytes,
+			                        G_NORMALIZE_NFC);
+		} else {
+			str = g_utf8_normalize (stripped_word,
+			                        len,
+			                        G_NORMALIZE_NFC);
+			g_free (stripped_word);
+		}
+
+		if (!str) { /* normalization failed: nothing to log or stem */
+			return NULL;
+		}
+
+		/* Log after normalization (only once str is known to be non-NULL) */
+		tracker_parser_message_hex ("  After NFC normalization",
+		                            str, strlen ((gchar *)str));
+
+		if (!parser->enable_stemmer) {
+			return str;
+		}
+
+		len = strlen (str);
+
+		stem_word = tracker_language_stem_word (parser->language, str, len);
+
+		if (stem_word) {
+			g_free (str);
+
+			return stem_word;
+		}
+	}
+
+	return str;
+}
+
 static gboolean
 pango_next (TrackerParser *parser,
             gint          *byte_offset_start,
@@ -400,7 +477,7 @@ parser_next (TrackerParser *parser,
 
 		parser->cursor = parser->txt + *byte_offset_end;
 
-		processed_word = tracker_parser_process_word (parser, utf8, bytes, do_strip);
+		processed_word = process_word_utf8 (parser, utf8, bytes, do_strip);
 		g_free (utf8);
 
 		if (processed_word) {
@@ -503,83 +580,6 @@ tracker_parser_reset (TrackerParser *parser,
 	}
 }
 
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
-                             const gchar   *word,
-                             gint           length,
-                             gboolean       do_strip)
-{
-	gchar *stem_word;
-	gchar *str;
-	gchar *stripped_word;
-	gsize  bytes, len;
-
-	g_return_val_if_fail (parser != NULL, NULL);
-	g_return_val_if_fail (word != NULL, NULL);
-
-	str = NULL;
-	stripped_word = NULL;
-
-	if (word) {
-		if (length == -1) {
-			bytes = strlen (word);
-		} else {
-			bytes = length;
-		}
-
-		/* Log original word */
-		tracker_parser_message_hex ("ORIGINAL word",
-		                            word, bytes);
-
-		if (parser->enable_unaccent && do_strip) {
-			stripped_word = tracker_parser_unaccent_utf8_word (word,
-			                                                   bytes,
-			                                                   &len);
-
-			/* Log after UNAC stripping */
-			tracker_parser_message_hex (" After UNAC stripping",
-			                            stripped_word, len);
-		} else {
-			stripped_word = NULL;
-		}
-
-		if (!stripped_word) {
-			str = g_utf8_normalize (word,
-			                        bytes,
-			                        G_NORMALIZE_NFC);
-		} else {
-			str = g_utf8_normalize (stripped_word,
-			                        len,
-			                        G_NORMALIZE_NFC);
-			g_free (stripped_word);
-		}
-
-		/* Log after normalization */
-		tracker_parser_message_hex ("  After NFC normalization",
-		                            str, strlen ((gchar *)str));
-
-		if (!str) {
-			return NULL;
-		}
-
-		if (!parser->enable_stemmer) {
-			return str;
-		}
-
-		len = strlen (str);
-
-		stem_word = tracker_language_stem_word (parser->language, str, len);
-
-		if (stem_word) {
-			g_free (str);
-
-			return stem_word;
-		}
-	}
-
-	return str;
-}
-
 const gchar *
 tracker_parser_next (TrackerParser *parser,
                      gint          *position,
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 0fdde7b..3e1ad98 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -44,11 +44,6 @@ typedef enum {
 /* Max possible length of a UChar encoded string (just a safety limit) */
 #define WORD_BUFFER_LENGTH 512
 
-static gchar *process_word_uchar (TrackerParser         *parser,
-                                  const UChar           *word,
-                                  gint                   length,
-                                  TrackerParserWordType  type);
-
 struct TrackerParser {
 	const gchar           *txt;
 	gint                   txt_size;
@@ -143,6 +138,168 @@ get_word_info (const UChar           *word,
 	return TRUE;
 }
 
+static gchar *
+process_word_uchar (TrackerParser         *parser,
+                    const UChar           *word,
+                    gint                   length,
+                    TrackerParserWordType  type)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UChar normalized_buffer [WORD_BUFFER_LENGTH];
+	gchar *utf8_str = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	/* Log original word */
+	tracker_parser_message_hex ("ORIGINAL word",
+	                            (guint8 *)word,
+				    length * sizeof (UChar));
+
+
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+
+		/* Casefold... */
+		new_word_length = u_strFoldCase (casefolded_buffer,
+		                                 WORD_BUFFER_LENGTH,
+		                                 word,
+		                                 length,
+		                                 U_FOLD_CASE_DEFAULT,
+		                                 &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error casefolding: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+		if (new_word_length > WORD_BUFFER_LENGTH)
+			new_word_length = WORD_BUFFER_LENGTH;
+
+		/* Log after casefolding */
+		tracker_parser_message_hex (" After Casefolding",
+		                            (guint8 *)casefolded_buffer,
+					    new_word_length * sizeof (UChar));
+
+		/* NFC normalization... */
+		new_word_length = unorm_normalize (casefolded_buffer,
+		                                   new_word_length,
+		                                   UNORM_NFC,
+		                                   0,
+		                                   normalized_buffer,
+		                                   WORD_BUFFER_LENGTH,
+		                                   &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error normalizing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+
+		if (new_word_length > WORD_BUFFER_LENGTH)
+			new_word_length = WORD_BUFFER_LENGTH;
+
+		/* Log after NFC normalization */
+		tracker_parser_message_hex (" After Normalization",
+		                            (guint8 *)normalized_buffer,
+					    new_word_length * sizeof (UChar));
+	} else {
+		/* For ASCII-only, just tolower() each character */
+		new_word_length = u_strToLower (normalized_buffer,
+		                                WORD_BUFFER_LENGTH,
+		                                word,
+		                                length,
+		                                NULL,
+		                                &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error lowercasing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+
+		/* Log after lowercasing */
+		tracker_parser_message_hex (" After lowercase",
+		                            (guint8 *)normalized_buffer,
+					    new_word_length * sizeof (UChar));
+	}
+
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+		gsize stripped_word_length;
+
+		/* Get unaccented string in UTF-8 */
+		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
+		                                               new_word_length,
+		                                               &stripped_word_length);
+		if (utf8_str) {
+			new_word_length = stripped_word_length;
+
+			/* Log after unaccenting */
+			tracker_parser_message_hex ("   After UNAC",
+						    utf8_str,
+						    new_word_length);
+		}
+	}
+
+	/* If stripping failed or not needed, convert to UTF-8 */
+	if (!utf8_str) {
+		UErrorCode icu_error = U_ZERO_ERROR;
+		UConverter *converter;
+		gsize utf8_len;
+
+		/* Open converter UChar to UTF-8 */
+		converter = ucnv_open ("UTF-8", &icu_error);
+		if (!converter) {
+			g_warning ("Cannot open UTF-8 converter: '%s'",
+			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+			return NULL;
+		}
+		/* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
+		 *  in UTF-8. */
+		utf8_str = g_malloc (2 * new_word_length * sizeof (UChar) + 1);
+
+		/* Convert from UChar to UTF-8 (NIL-terminated) */
+		utf8_len = ucnv_fromUChars (converter,
+		                            utf8_str,
+		                            2 * new_word_length * sizeof (UChar) + 1,
+		                            normalized_buffer,
+		                            new_word_length,
+		                            &icu_error);
+		if (U_FAILURE (icu_error)) {
+			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+			           u_errorName (icu_error));
+			g_free (utf8_str);
+			ucnv_close (converter);
+			return NULL;
+		}
+
+		new_word_length = utf8_len;
+		ucnv_close (converter);
+
+		/* Log after UTF-8 conversion */
+		tracker_parser_message_hex ("   After UTF8 conversion",
+		                            utf8_str,
+					    new_word_length);
+	}
+
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		/* Input for stemmer ALWAYS in UTF-8, as well as output */
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      utf8_str,
+		                                      new_word_length);
+
+		/* Log after stemming (stemmed may be NULL, see check below) */
+		tracker_parser_message_hex ("   After stemming",
+		                            stemmed, stemmed ? strlen (stemmed) : 0);
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (utf8_str);
+		return stemmed;
+	}
+
+	return utf8_str;
+}
+
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
@@ -397,226 +554,6 @@ tracker_parser_reset (TrackerParser *parser,
 	ucnv_close (converter);
 }
 
-static gchar *
-process_word_uchar (TrackerParser         *parser,
-                    const UChar           *word,
-                    gint                   length,
-                    TrackerParserWordType  type)
-{
-	UErrorCode error = U_ZERO_ERROR;
-	UChar normalized_buffer [WORD_BUFFER_LENGTH];
-	gchar *utf8_str = NULL;
-	gchar *stemmed = NULL;
-	size_t new_word_length;
-
-	/* Log original word */
-	tracker_parser_message_hex ("ORIGINAL word",
-	                            (guint8 *)word,
-				    length * sizeof (UChar));
-
-
-	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
-		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
-
-		/* Casefold... */
-		new_word_length = u_strFoldCase (casefolded_buffer,
-		                                 WORD_BUFFER_LENGTH,
-		                                 word,
-		                                 length,
-		                                 U_FOLD_CASE_DEFAULT,
-		                                 &error);
-		if (U_FAILURE (error)) {
-			g_warning ("Error casefolding: '%s'",
-			           u_errorName (error));
-			return NULL;
-		}
-		if (new_word_length > WORD_BUFFER_LENGTH)
-			new_word_length = WORD_BUFFER_LENGTH;
-
-		/* Log after casefolding */
-		tracker_parser_message_hex (" After Casefolding",
-		                            (guint8 *)casefolded_buffer,
-					    new_word_length * sizeof (UChar));
-
-		/* NFC normalization... */
-		new_word_length = unorm_normalize (casefolded_buffer,
-		                                   new_word_length,
-		                                   UNORM_NFC,
-		                                   0,
-		                                   normalized_buffer,
-		                                   WORD_BUFFER_LENGTH,
-		                                   &error);
-		if (U_FAILURE (error)) {
-			g_warning ("Error normalizing: '%s'",
-			           u_errorName (error));
-			return NULL;
-		}
-
-		if (new_word_length > WORD_BUFFER_LENGTH)
-			new_word_length = WORD_BUFFER_LENGTH;
-
-		/* Log after casefolding */
-		tracker_parser_message_hex (" After Normalization",
-		                            (guint8 *)normalized_buffer,
-					    new_word_length * sizeof (UChar));
-	} else {
-		/* For ASCII-only, just tolower() each character */
-		new_word_length = u_strToLower (normalized_buffer,
-		                                WORD_BUFFER_LENGTH,
-		                                word,
-		                                length,
-		                                NULL,
-		                                &error);
-		if (U_FAILURE (error)) {
-			g_warning ("Error lowercasing: '%s'",
-			           u_errorName (error));
-			return NULL;
-		}
-
-		/* Log after casefolding */
-		tracker_parser_message_hex (" After lowercase",
-		                            (guint8 *)normalized_buffer,
-					    new_word_length * sizeof (UChar));
-	}
-
-	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
-	if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
-		gsize stripped_word_length;
-
-		/* Get unaccented string in UTF-8 */
-		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
-		                                               new_word_length,
-		                                               &stripped_word_length);
-		if (utf8_str) {
-			new_word_length = stripped_word_length;
-
-			/* Log after unaccenting */
-			tracker_parser_message_hex ("   After UNAC",
-						    utf8_str,
-						    new_word_length);
-		}
-	}
-
-	/* If stripping failed or not needed, convert to UTF-8 */
-	if (!utf8_str) {
-		UErrorCode icu_error = U_ZERO_ERROR;
-		UConverter *converter;
-		gsize utf8_len;
-
-		/* Open converter UChar to UTF-16BE */
-		converter = ucnv_open ("UTF-8", &icu_error);
-		if (!converter) {
-			g_warning ("Cannot open UTF-8 converter: '%s'",
-			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
-			return NULL;
-		}
-		/* A character encoded in 2 bytes in UTF-16 may get expanded to 3 or 4 bytes
-		 *  in UTF-8. */
-		utf8_str = g_malloc (2 * new_word_length * sizeof (UChar) + 1);
-
-		/* Convert from UChar to UTF-8 (NIL-terminated) */
-		utf8_len = ucnv_fromUChars (converter,
-		                            utf8_str,
-		                            2 * new_word_length * sizeof (UChar) + 1,
-		                            normalized_buffer,
-		                            new_word_length,
-		                            &icu_error);
-		if (U_FAILURE (icu_error)) {
-			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
-			           u_errorName (icu_error));
-			g_free (utf8_str);
-			ucnv_close (converter);
-			return NULL;
-		}
-
-		new_word_length = utf8_len;
-		ucnv_close (converter);
-
-		/* Log after unaccenting */
-		tracker_parser_message_hex ("   After UTF8 conversion",
-		                            utf8_str,
-					    new_word_length);
-	}
-
-	/* Stemming needed? */
-	if (parser->enable_stemmer) {
-		/* Input for stemmer ALWAYS in UTF-8, as well as output */
-		stemmed = tracker_language_stem_word (parser->language,
-		                                      utf8_str,
-		                                      new_word_length);
-
-		/* Log after stemming */
-		tracker_parser_message_hex ("   After stemming",
-		                            stemmed, strlen (stemmed));
-	}
-
-	/* If stemmed wanted and succeeded, free previous and return it */
-	if (stemmed) {
-		g_free (utf8_str);
-		return stemmed;
-	}
-
-	return utf8_str;
-}
-
-
-/* Both Input and Output are always UTF-8 */
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
-                             const gchar   *word,
-                             gint           length,
-                             gboolean       do_strip)
-{
-	UErrorCode icu_error = U_ZERO_ERROR;
-	UConverter *converter;
-	UChar *uchar_word;
-	gsize uchar_len;
-	gchar *processed;
-
-	/* Open converter UTF-8 to UChar */
-	converter = ucnv_open ("UTF-8", &icu_error);
-	if (!converter) {
-		g_warning ("Cannot open UTF-8 converter: '%s'",
-		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
-		return NULL;
-	}
-
-	/* Compute length if not already as input */
-	if (length < 0) {
-		length = strlen (word);
-	}
-
-	/* Twice the size of the UTF-8 string for UChars */
-	uchar_word = g_malloc (2 * length);
-
-	/* Convert from UTF-8 to UChars*/
-	uchar_len = ucnv_toUChars (converter,
-	                           uchar_word,
-	                           2 * length,
-	                           word,
-	                           length,
-	                           &icu_error);
-	if (U_FAILURE (icu_error)) {
-		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
-		           u_errorName (icu_error));
-		g_free (uchar_word);
-		ucnv_close (converter);
-		return NULL;
-	}
-
-	ucnv_close (converter);
-
-	/* Process UChar based word */
-	processed = process_word_uchar (parser,
-	                                uchar_word,
-	                                uchar_len,
-	                                (do_strip ?
-	                                 TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
-	                                 TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC));
-	g_free (uchar_word);
-	return processed;
-}
-
 const gchar *
 tracker_parser_next (TrackerParser *parser,
                      gint          *position,
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 7b21947..67dda5f 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -43,11 +43,6 @@ typedef enum {
 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
 #define WORD_BUFFER_LENGTH 512
 
-static gchar *process_word_utf8 (TrackerParser         *parser,
-                                 const gchar           *word,
-                                 gint                  length,
-                                 TrackerParserWordType type);
-
 struct TrackerParser {
 	const gchar           *txt;
 	gint                   txt_size;
@@ -144,6 +139,130 @@ get_word_info (TrackerParser         *parser,
 	return TRUE;
 }
 
+static gchar *
+process_word_utf8 (TrackerParser         *parser,
+                   const gchar           *word,
+                   gint                  length,
+                   TrackerParserWordType type)
+{
+	gchar word_buffer [WORD_BUFFER_LENGTH];
+	gchar *normalized = NULL;
+	gchar *stripped = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	/* If length is set as -1, the input word MUST be NIL-terminated.
+	 * Otherwise, this restriction is not needed as the length to process
+	 *  is given as input argument */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* Log original word */
+	tracker_parser_message_hex ("ORIGINAL word",
+	                            word, length);
+
+	/* Normalization and case-folding ONLY for non-ASCII */
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		/* Leave space for last NIL */
+		new_word_length = WORD_BUFFER_LENGTH - 1;
+
+		/* Casefold and NFC normalization in output.
+		 *  NOTE: if the output buffer is not big enough, u8_casefold will
+		 *  return a newly-allocated buffer. */
+		normalized = u8_casefold ((const uint8_t *)word,
+		                          length,
+		                          uc_locale_language (),
+		                          UNINORM_NFC,
+		                          word_buffer,
+		                          &new_word_length);
+
+		/* Case folding + Normalization failed, ignore this word */
+		g_return_val_if_fail (normalized != NULL, NULL);
+
+		/* If output buffer is not the same as the one passed to
+		 *  u8_casefold, we know it was newly-allocated, so need
+		 *  to resize it in 1 byte to add last NIL */
+		if (normalized != word_buffer) {
+			normalized = g_realloc (normalized, new_word_length + 1);
+		}
+
+		/* Log after Normalization */
+		tracker_parser_message_hex (" After Casefolding and NFC normalization",
+		                            normalized, new_word_length);
+	} else {
+		/* For ASCII-only, just tolower() each character */
+		gsize i;
+
+		/* Stack buffer must hold the word plus its NIL, hence >= */
+		normalized = length >= WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
+		for (i = 0; i < length; i++) {
+			normalized[i] = g_ascii_tolower (word[i]);
+		}
+
+		new_word_length = length;
+
+		/* Log after tolower */
+		tracker_parser_message_hex (" After Lowercasing",
+		                            normalized, new_word_length);
+	}
+
+	/* Set output NIL */
+	normalized[new_word_length] = '\0';
+
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+		gsize stripped_word_length;
+
+		stripped = tracker_parser_unaccent_utf8_word (normalized,
+		                                              new_word_length,
+		                                              &stripped_word_length);
+
+		if (stripped) {
+			/* Log after UNAC stripping */
+			tracker_parser_message_hex ("  After UNAC stripping",
+			                            stripped, stripped_word_length);
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      stripped ? stripped : normalized,
+		                                      new_word_length);
+
+		/* Log after stemming (stemmed may be NULL, see check below) */
+		tracker_parser_message_hex ("   After stemming",
+		                            stemmed, stemmed ? strlen (stemmed) : 0);
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (stripped);
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stemmed;
+	}
+
+	/* If stripped wanted and succeeded, free previous and return it */
+	if (stripped) {
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stripped;
+	}
+
+	/* It may be the case that no stripping and no stemming was needed, and
+	 * that the output buffer in stack was enough for case-folding and
+	 * normalization. In this case, need to strdup() the string to return it */
+	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
+}
+
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
@@ -315,145 +434,6 @@ tracker_parser_reset (TrackerParser *parser,
 	}
 }
 
-gchar *
-tracker_parser_process_word (TrackerParser *parser,
-                             const gchar    *word,
-                             gint           length,
-                             gboolean       do_strip)
-{
-
-	return process_word_utf8 (parser,
-	                          word,
-	                          length,
-	                          (do_strip ?
-	                           TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
-	                           TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC));
-}
-
-static gchar *
-process_word_utf8 (TrackerParser         *parser,
-                   const gchar           *word,
-                   gint                  length,
-                   TrackerParserWordType type)
-{
-	gchar word_buffer [WORD_BUFFER_LENGTH];
-	gchar *normalized = NULL;
-	gchar *stripped = NULL;
-	gchar *stemmed = NULL;
-	size_t new_word_length;
-
-	g_return_val_if_fail (parser != NULL, NULL);
-	g_return_val_if_fail (word != NULL, NULL);
-
-	/* If length is set as -1, the input word MUST be NIL-terminated.
-	 * Otherwise, this restriction is not needed as the length to process
-	 *  is given as input argument */
-	if (length < 0) {
-		length = strlen (word);
-	}
-
-	/* Log original word */
-	tracker_parser_message_hex ("ORIGINAL word",
-	                            word, length);
-
-	/* Normalization and case-folding ONLY for non-ASCII */
-	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
-		/* Leave space for last NIL */
-		new_word_length = WORD_BUFFER_LENGTH - 1;
-
-		/* Casefold and NFC normalization in output.
-		 *  NOTE: if the output buffer is not big enough, u8_casefold will
-		 *  return a newly-allocated buffer. */
-		normalized = u8_casefold ((const uint8_t *)word,
-		                          length,
-		                          uc_locale_language (),
-		                          UNINORM_NFC,
-		                          word_buffer,
-		                          &new_word_length);
-
-		/* Case folding + Normalization failed, ignore this word */
-		g_return_val_if_fail (normalized != NULL, NULL);
-
-		/* If output buffer is not the same as the one passed to
-		 *  u8_casefold, we know it was newly-allocated, so need
-		 *  to resize it in 1 byte to add last NIL */
-		if (normalized != word_buffer) {
-			normalized = g_realloc (normalized, new_word_length + 1);
-		}
-
-		/* Log after Normalization */
-		tracker_parser_message_hex (" After Casefolding and NFC normalization",
-		                            normalized, new_word_length);
-	} else {
-		/* For ASCII-only, just tolower() each character */
-		gsize i;
-
-		normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
-
-		for (i = 0; i < length; i++) {
-			normalized[i] = g_ascii_tolower (word[i]);
-		}
-
-		new_word_length = length;
-
-		/* Log after tolower */
-		tracker_parser_message_hex (" After Lowercasing",
-		                            normalized, new_word_length);
-	}
-
-	/* Set output NIL */
-	normalized[new_word_length] = '\0';
-
-	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
-	if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
-		gsize stripped_word_length;
-
-		stripped = tracker_parser_unaccent_utf8_word (normalized,
-		                                              new_word_length,
-		                                              &stripped_word_length);
-
-		if (stripped) {
-			/* Log after UNAC stripping */
-			tracker_parser_message_hex ("  After UNAC stripping",
-			                            stripped, stripped_word_length);
-			new_word_length = stripped_word_length;
-		}
-	}
-
-	/* Stemming needed? */
-	if (parser->enable_stemmer) {
-		stemmed = tracker_language_stem_word (parser->language,
-		                                      stripped ? stripped : normalized,
-		                                      new_word_length);
-
-		/* Log after stemming */
-		tracker_parser_message_hex ("   After stemming",
-		                            stemmed, strlen (stemmed));
-	}
-
-	/* If stemmed wanted and succeeded, free previous and return it */
-	if (stemmed) {
-		g_free (stripped);
-		if (normalized != word_buffer) {
-			g_free (normalized);
-		}
-		return stemmed;
-	}
-
-	/* If stripped wanted and succeeded, free previous and return it */
-	if (stripped) {
-		if (normalized != word_buffer) {
-			g_free (normalized);
-		}
-		return stripped;
-	}
-
-	/* It may be the case that no stripping and no stemming was needed, and
-	 * that the output buffer in stack was enough for case-folding and
-	 * normalization. In this case, need to strdup() the string to return it */
-	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
-}
-
 const gchar *
 tracker_parser_next (TrackerParser *parser,
                      gint          *position,
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 57426c3..cc12398 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -48,11 +48,6 @@ const gchar *  tracker_parser_next            (TrackerParser   *parser,
                                                gboolean        *stop_word,
                                                gint            *word_length);
 
-gchar *        tracker_parser_process_word    (TrackerParser   *parser,
-                                               const gchar     *word,
-                                               gint             length,
-                                               gboolean         do_strip);
-
 void           tracker_parser_free            (TrackerParser   *parser);
 
 G_END_DECLS



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]