[tracker/parser-unicode-libs-review: 66/85] Added normalization and case folding



commit f31b3ff06608057266149026c8ed415947857940
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Wed Apr 28 12:15:09 2010 +0200

    Added normalization and case folding

 src/libtracker-fts/tracker-parser.c |  230 ++++++++++++++++++++++++++---------
 1 files changed, 172 insertions(+), 58 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index 3d37874..47b2071 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -27,15 +27,21 @@
 #endif
 
 #ifdef HAVE_LIBUNISTRING
-/* # include <unistr.h> */
-# include <uniwbrk.h>
-# include <unictype.h>
+/* libunistring versions prior to 9.1.2 need this hack */
+#define _UNUSED_PARAMETER_
+#include <unistr.h>
+#include <uniwbrk.h>
+#include <unictype.h>
+#include <unicase.h>
 #else
 #include <pango/pango.h>
 #endif
 
 #include "tracker-parser.h"
 
+/* Max possible length of a UTF-8 encoded string (just a safety limit) */
+#define WORD_BUFFER_LENGTH 512
+
 
 #ifndef HAVE_LIBUNISTRING
 
@@ -534,14 +540,16 @@ parser_next (TrackerParser *parser,
 /* Use libunistring
  *  void u8_wordbreaks (const uint8_t *s, size_t n, char *p)
  *  int u8_strmbtouc (ucs4_t *puc, const uint8_t *s)
+ *  uint8_t * u8_casefold (const uint8_t *s, size_t n, const char *iso639_language, uninorm_t nf, uint8_t *resultbuf, size_t *lengthp)
  */
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
              gint          *byte_offset_end)
 {
-	gchar *word = NULL;
-	gsize word_length;
+
+	gsize word_length = 0;
+	gchar *processed_word = NULL;
 
 	*byte_offset_start = 0;
 	*byte_offset_end = 0;
@@ -549,7 +557,7 @@ parser_next (TrackerParser *parser,
 	g_return_val_if_fail (parser, FALSE);
 
 	/* Loop to look for next valid word */
-	while (!word &&
+	while (!processed_word &&
 	       parser->cursor < parser->txt_size) {
 		ucs4_t first_unichar;
 		gint first_unichar_len;
@@ -571,7 +579,6 @@ parser_next (TrackerParser *parser,
 		word_length = i - parser->cursor;
 
 		if (first_unichar_len > 0) {
-
 			/* We only want the words where the first character
 			 *  in the word is either a letter, a number or a symbol.
 			 * This is needed because the word break algorithm also
@@ -581,10 +588,31 @@ parser_next (TrackerParser *parser,
 			 *  should be compatible with all Unicode normalization
 			 *  methods.
 			 */
-			if (uc_is_general_category (first_unichar, parser->allowed_start)) {
-				word = g_malloc (word_length + 1);
-				memcpy (word, &(parser->txt[parser->cursor]), word_length);
-				word[word_length] = '\0';
+			if (uc_is_general_category (first_unichar,
+			                            parser->allowed_start)) {
+				gchar word_buffer [WORD_BUFFER_LENGTH];
+				gsize new_length;
+
+				/* compute truncated word length if needed */
+				new_length = (word_length < WORD_BUFFER_LENGTH ?
+				              word_length :
+				              WORD_BUFFER_LENGTH - 1);
+
+				/* Word here needs always to be NIL-terminated */
+				memcpy (word_buffer, &(parser->txt[parser->cursor]), new_length);
+				word_buffer[new_length] = '\0';
+
+				/* Process the word here. If it fails, we can still go
+				 *  to the next one. Returns newly allocated string
+				 *  always */
+				processed_word = tracker_parser_process_word (parser,
+				                                              word_buffer,
+				                                              new_length,
+				                                              TRUE);
+				if (!processed_word) {
+					/* Skip this word and keep on looping */
+					parser->cursor += word_length;
+				}
 			} else {
 				/* Skip this word and keep on looping */
 				parser->cursor += word_length;
@@ -596,10 +624,8 @@ parser_next (TrackerParser *parser,
 		}
 	}
 
-	/* If we got a word here, process it */
-	if (word) {
-		gchar *processed_word;
-
+	/* If we got a word here, set output */
+	if (processed_word) {
 		/* Set outputs */
 		*byte_offset_start = parser->cursor;
 		*byte_offset_end = parser->cursor + word_length;
@@ -607,20 +633,10 @@ parser_next (TrackerParser *parser,
 		/* Update cursor */
 		parser->cursor += word_length;
 
-		/* g_debug ("start: '%d', end: '%d', new cursor at: '%d'", */
-		/*          *byte_offset_start, *byte_offset_end, (gint)parser->cursor); */
-
-		/* TODO: tolower, do_strip */
-
-		processed_word = tracker_parser_process_word (parser, word, word_length, TRUE);
-		g_free (word);
-
-		if (processed_word) {
-			parser->word_length = strlen (processed_word);
-			parser->word = processed_word;
+		parser->word_length = strlen (processed_word);
+		parser->word = processed_word;
 
-			return TRUE;
-		}
+		return TRUE;
 	}
 
 	/* No more words... */
@@ -753,6 +769,8 @@ tracker_parser_reset (TrackerParser *parser,
 #endif /* !HAVE_LIBUNISTRING */
 }
 
+
+#ifndef HAVE_LIBUNISTRING
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
                              const gchar    *word,
@@ -768,51 +786,147 @@ tracker_parser_process_word (TrackerParser *parser,
 
 	str = NULL;
 
-	if (word) {
-		if (length == -1) {
-			bytes = strlen (word);
-		} else {
-			bytes = length;
-		}
+	bytes = length == -1 ? strlen (word) : length;
 
-		g_debug ("ORIGINAL word: '%s'", word);
+	g_debug ("ORIGINAL word: '%s'", word);
 
-		str = g_utf8_normalize (word,
-		                        bytes,
-		                        G_NORMALIZE_NFC);
-		if (!str) {
-			return NULL;
-		}
+	str = g_utf8_normalize (word,
+	                        bytes,
+	                        G_NORMALIZE_NFC);
+	if (!str) {
+		return NULL;
+	}
 
-		len = strlen (str);
-		g_debug ("  After NFC normalization: '%s'", str);
+	len = strlen (str);
 
-		if (do_strip) {
-			gchar *stripped_word;
+	g_debug ("  After NFC normalization: '%s'", str);
 
-			stripped_word = strip_word (str, len, &len);
-			g_debug (" After UNAC stripping: '%s'", stripped_word);
-			g_free (str);
-			str = stripped_word;
-		}
+	if (do_strip) {
+		gchar *stripped_word;
+
+		stripped_word = strip_word (str, len, &len);
+		g_debug (" After UNAC stripping: '%s'", stripped_word);
+		g_free (str);
+		str = stripped_word;
+	}
 
-		if (!parser->enable_stemmer) {
-			return str;
+	if (!parser->enable_stemmer) {
+		return str;
+	}
+
+	stem_word = tracker_language_stem_word (parser->language, str, len);
+	g_debug ("  After Stemming: '%s'", stem_word);
+
+	if (stem_word) {
+		g_free (str);
+
+		return stem_word;
+	}
+
+	return str;
+}
+
+#else
+
+/* libunistring version of the word processor. */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+                             const gchar    *word,
+                             gint           length,
+                             gboolean       do_strip)
+{
+	gchar word_buffer [WORD_BUFFER_LENGTH];
+	gchar *normalized = NULL;
+	gchar *stripped = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	g_debug ("Original word: '%s'", word);
+
+	/* Leave space for last NIL */
+	new_word_length = WORD_BUFFER_LENGTH - 1;
+
+	/* Casefold and NFC normalization in output.
+	 *  NOTE: if the output buffer is not big enough, u8_casefold will
+	 *  return a newly-allocated buffer. */
+	normalized = u8_casefold ((const uint8_t *)word,
+	                          length,
+	                          uc_locale_language (),
+	                          UNINORM_NFC,
+	                          word_buffer,
+	                          &new_word_length);
+
+	/* Case folding + Normalization failed, skip this word */
+	g_return_val_if_fail (normalized != NULL, NULL);
+
+	/* If output buffer is not the same as the one passed to
+	 *  u8_casefold, we know it was newly-allocated, so need
+	 *  to resize it in 1 byte to add last NIL */
+	if (normalized != word_buffer) {
+		normalized = g_realloc (normalized, new_word_length + 1);
+	}
+
+	/* Set output NIL */
+	normalized[new_word_length] = '\0';
+
+	g_debug (" After Casefolding and NFC normalization: '%s'", normalized);
+
+	/* UNAC stripping needed? */
+	if (do_strip) {
+		guint32 stripped_word_length;
+
+		stripped = strip_word (normalized,
+		                       new_word_length,
+		                       &stripped_word_length);
+
+		if (stripped) {
+			g_debug ("  After UNAC stripping: '%s'", stripped);
+			new_word_length = stripped_word_length;
 		}
+	}
 
-		stem_word = tracker_language_stem_word (parser->language, str, len);
-		g_debug ("  After Stemming: '%s'", stem_word);
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      stripped ? stripped : normalized,
+		                                      new_word_length);
 
-		if (stem_word) {
-			g_free (str);
+		g_debug ("   After Stemming: '%s'", stemmed);
+	}
 
-			return stem_word;
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (stripped);
+		if (normalized != word_buffer) {
+			g_free (normalized);
 		}
+		return stemmed;
 	}
 
-	return str;
+	/* If stripped wanted and succeeded, free previous and return it */
+	if (stripped) {
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stripped;
+	}
+
+	/* It may be the case that no stripping and no stemming was needed, and
+	 * that the output buffer in stack was enough for case-folding and
+	 * normalization. In this case, need to strdup() the string to return it */
+	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
 }
 
+#endif /* !HAVE_LIBUNISTRING */
+
+
 const gchar *
 tracker_parser_next (TrackerParser *parser,
                      gint          *position,



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]