[tracker/parser-unicode-libs-review] Improve ASCII-only parsing

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/parser-unicode-libs-review] Improve ASCII-only parsing
Date: Wed, 5 May 2010 10:42:41 +0000 (UTC)
commit b16612e50feb5cb944d83b374e2ab3d0898c3be6
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Wed May 5 12:41:21 2010 +0200

    Improve ASCII-only parsing
    
     * Skip Unicode normalization for ASCII-only words
     * Skip full case-folding for ASCII-only words (plain lowercasing is enough)

 src/libtracker-fts/tracker-parser-libicu.c       |  125 +++++++-----
 src/libtracker-fts/tracker-parser-libunistring.c |  227 ++++++++++++++--------
 2 files changed, 220 insertions(+), 132 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 40e740b..5a4f1e3 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -42,6 +42,13 @@
                           ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
                           ((c) >= 0x20000 && (c) <= 0x2A6D6))
 
+/* Type of words detected */
+typedef enum {
+	TRACKER_PARSER_WORD_TYPE_ASCII,
+	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
+	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
+} TrackerParserWordType;
+
 /* Max possible length of a UChar encoded string (just a safety limit) */
 #define WORD_BUFFER_LENGTH 512
 
@@ -49,7 +56,7 @@
 static gchar *process_word_uchar (TrackerParser *parser,
                                   const UChar   *word,
                                   gint           length,
-                                  gboolean       do_strip);
+                                  TrackerParserWordType type);
 
 
 struct TrackerParser {
@@ -84,19 +91,15 @@ struct TrackerParser {
 
 
 static gboolean
-get_word_info (const UChar *word,
-               gsize        word_length,
-               gboolean    *p_is_allowed_word_start,
-               gboolean    *p_is_ascii_or_cjk)
+get_word_info (const UChar           *word,
+               gsize                  word_length,
+               gboolean              *p_is_allowed_word_start,
+               TrackerParserWordType *p_word_type)
 {
 	UCharIterator iter;
 	UChar32 unichar;
 	guint8 unichar_gc;
 
-	/* Defaults... */
-	*p_is_allowed_word_start = TRUE;
-	*p_is_ascii_or_cjk = TRUE;
-
 	/* Get first character of the word as UCS4 */
 	uiter_setString (&iter, word, word_length);
 	unichar = uiter_current32 (&iter);
@@ -128,10 +131,13 @@ get_word_info (const UChar *word,
 	    unichar_gc != U_OTHER_SYMBOL) {
 		*p_is_allowed_word_start = FALSE;
 		return TRUE;
+	} else {
+		*p_is_allowed_word_start = TRUE;
 	}
 
 	/* Word starts with a CJK character? */
 	if (IS_CJK_UCS4 ((guint32)unichar)) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
 		return TRUE;
 	}
 
@@ -139,17 +145,16 @@ get_word_info (const UChar *word,
 	while (unichar != U_SENTINEL)
 	{
 		if (!IS_ASCII_UCS4 ((guint32)unichar)) {
-			*p_is_ascii_or_cjk = FALSE;
+			*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
 			return TRUE;
 		}
 		unichar = uiter_next32 (&iter);
 	}
 
+	*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
 	return TRUE;
 }
 
-
-/* libunistring-based parser */
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
@@ -168,7 +173,7 @@ parser_next (TrackerParser *parser,
 	/* Loop to look for next valid word */
 	while (!processed_word &&
 	       parser->cursor < parser->utxt_size) {
-		gboolean is_ascii_or_cjk;
+		TrackerParserWordType type;
 		gboolean is_allowed;
 		gsize next_word_offset_uchar;
 		gsize next_word_offset_utf8;
@@ -204,7 +209,7 @@ parser_next (TrackerParser *parser,
 		if (!get_word_info (&parser->utxt[parser->cursor],
 		                    word_length_uchar,
 		                    &is_allowed,
-		                    &is_ascii_or_cjk)) {
+		                    &type)) {
 			/* Quit loop just in case */
 			parser->cursor = parser->utxt_size;
 			break;
@@ -243,7 +248,7 @@ parser_next (TrackerParser *parser,
 		processed_word = process_word_uchar (parser,
 		                                     &(parser->utxt[parser->cursor]),
 		                                     truncated_length,
-		                                     !is_ascii_or_cjk);
+		                                     type);
 		if (!processed_word) {
 			/* Skip this word and keep on looping */
 			parser->cursor = next_word_offset_uchar;
@@ -407,53 +412,69 @@ tracker_parser_reset (TrackerParser *parser,
 }
 
 static gchar *
-process_word_uchar (TrackerParser *parser,
-                    const UChar   *word,
-                    gint           length,
-                    gboolean       do_strip)
+process_word_uchar (TrackerParser         *parser,
+                    const UChar           *word,
+                    gint                   length,
+                    TrackerParserWordType  type)
 {
 	UErrorCode error = U_ZERO_ERROR;
-	UChar casefolded_buffer [WORD_BUFFER_LENGTH];
 	UChar normalized_buffer [WORD_BUFFER_LENGTH];
 	gchar *utf8_str = NULL;
 	gchar *stemmed = NULL;
 	size_t new_word_length;
 
-	/* Casefold... */
-	new_word_length = u_strFoldCase (casefolded_buffer,
-	                                 WORD_BUFFER_LENGTH,
-	                                 word,
-	                                 length,
-	                                 U_FOLD_CASE_DEFAULT,
-	                                 &error);
-	if (U_FAILURE (error)) {
-		g_warning ("Error casefolding: '%s'",
-		           u_errorName (error));
-		return NULL;
-	}
 
-	if (new_word_length > WORD_BUFFER_LENGTH)
-		new_word_length = WORD_BUFFER_LENGTH;
-
-	/* NFC normalization... */
-	new_word_length = unorm_normalize (casefolded_buffer,
-	                                   new_word_length,
-	                                   UNORM_NFC,
-	                                   0,
-	                                   normalized_buffer,
-	                                   WORD_BUFFER_LENGTH,
-	                                   &error);
-	if (U_FAILURE (error)) {
-		g_warning ("Error normalizing: '%s'",
-		           u_errorName (error));
-		return NULL;
-	}
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+
+		/* Casefold... */
+		new_word_length = u_strFoldCase (casefolded_buffer,
+		                                 WORD_BUFFER_LENGTH,
+		                                 word,
+		                                 length,
+		                                 U_FOLD_CASE_DEFAULT,
+		                                 &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error casefolding: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+		if (new_word_length > WORD_BUFFER_LENGTH)
+			new_word_length = WORD_BUFFER_LENGTH;
+
+		/* NFC normalization... */
+		new_word_length = unorm_normalize (casefolded_buffer,
+		                                   new_word_length,
+		                                   UNORM_NFC,
+		                                   0,
+		                                   normalized_buffer,
+		                                   WORD_BUFFER_LENGTH,
+		                                   &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error normalizing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
 
-	if (new_word_length > WORD_BUFFER_LENGTH)
-		new_word_length = WORD_BUFFER_LENGTH;
+		if (new_word_length > WORD_BUFFER_LENGTH)
+			new_word_length = WORD_BUFFER_LENGTH;
+	} else {
+		/* For ASCII-only, just tolower() each character */
+		new_word_length = u_strToLower (normalized_buffer,
+		                                WORD_BUFFER_LENGTH,
+		                                word,
+		                                length,
+		                                NULL,
+		                                &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error lowercasing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+	}
 
-	/* UNAC stripping needed? */
-	if (do_strip) {
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
 		gsize stripped_word_length;
 
 		/* Get unaccented string in UTF-8 */
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 6fec131..f022cbb 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -33,6 +33,7 @@
 #include "tracker-parser.h"
 #include "tracker-parser-utils.h"
 
+
 /* ASCII-7 is in range [0x00,0x7F] */
 #define IS_ASCII_BYTE(c) ((c) <= 0x7F)
 
@@ -41,9 +42,20 @@
                           ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
                           ((c) >= 0x20000 && (c) <= 0x2A6D6))
 
+/* Type of words detected */
+typedef enum {
+	TRACKER_PARSER_WORD_TYPE_ASCII,
+	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
+	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
+} TrackerParserWordType;
+
 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
 #define WORD_BUFFER_LENGTH 512
 
+static gchar *process_word_utf8 (TrackerParser         *parser,
+                                 const gchar           *word,
+                                 gint                  length,
+                                 TrackerParserWordType type);
 
 struct TrackerParser {
 	const gchar           *txt;
@@ -70,25 +82,76 @@ struct TrackerParser {
 	uc_general_category_t  allowed_start;
 };
 
-/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
- *  UNAC stripping.
- * Just check byte per byte, and if any of the bytes is >127, then it's not
- *  ASCII-7 */
 static gboolean
-is_ascii_word (const gchar *word,
-               gsize        length)
+get_word_info (TrackerParser         *parser,
+               gsize                 *p_word_length,
+               gboolean              *p_is_allowed_word_start,
+               TrackerParserWordType *p_word_type)
 {
+	ucs4_t first_unichar;
+	gint first_unichar_len;
 	gsize i;
+	gboolean ascii_only;
+
+	/* Defaults */
+	*p_is_allowed_word_start = TRUE;
+
+	/* Get first character of the word as UCS4 */
+	first_unichar_len = u8_strmbtouc (&first_unichar,
+	                                  &(parser->txt[parser->cursor]));
+	if (first_unichar_len <= 0) {
+		/* This should only happen if NIL was passed to u8_strmbtouc,
+		 *  so better just force stop here */
+		return FALSE;
+	} else  {
+		/* If first character has length 1, it's ASCII-7 */
+		ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
+	}
+
+	/* Find the next word break and, in the same loop, check whether the
+	 *  word contains only ASCII characters */
+	i = parser->cursor + first_unichar_len;
+	while (i < parser->txt_size &&
+	       !parser->word_break_flags [i]) {
 
-	for (i = 0; i < length; i++) {
-		if (!IS_ASCII_BYTE ((guchar)word[i])) {
-			return FALSE;
+		if (ascii_only &&
+		    !IS_ASCII_BYTE ((guchar)parser->txt[i])) {
+			ascii_only = FALSE;
 		}
+
+		i++;
+	}
+
+	/* Word end is the first byte after the word, which is either the
+	 *  start of next word or the end of the string */
+	*p_word_length = i - parser->cursor;
+
+	/* We only want the words where the first character
+	 *  in the word is either a letter, a number or a symbol.
+	 * This is needed because the word break algorithm also
+	 *  considers word breaks after for example commas or other
+	 *  punctuation marks.
+	 * Note that looking at the first character in the string
+	 *  should be compatible with all Unicode normalization
+	 *  methods.
+	 */
+	if (!uc_is_general_category (first_unichar,
+	                             parser->allowed_start)) {
+		*p_is_allowed_word_start = FALSE;
+		return TRUE;
+	}
+
+	/* Decide word type */
+	if (ascii_only) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
+	} else if (IS_CJK_UCS4 (first_unichar)) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+	} else {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
 	}
 	return TRUE;
 }
 
-/* libunistring-based parser */
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
@@ -105,44 +168,22 @@ parser_next (TrackerParser *parser,
 	/* Loop to look for next valid word */
 	while (!processed_word &&
 	       parser->cursor < parser->txt_size) {
-		ucs4_t first_unichar;
-		gint first_unichar_len;
-		gsize i;
+		TrackerParserWordType type;
 		gsize truncated_length;
-		gboolean do_strip;
-
-		/* Get first character of the word as UCS4 */
-		first_unichar_len = u8_strmbtouc (&first_unichar,
-		                                  &(parser->txt[parser->cursor]));
-		if (first_unichar_len <= 0) {
-			/* This should only happen if NIL was passed to u8_strmbtouc,
-			 *  so better just force stop here */
+		gboolean is_allowed;
+
+		/* Get word info */
+		if (!get_word_info (parser,
+		                    &word_length,
+		                    &is_allowed,
+		                    &type)) {
+			/* Quit loop just in case */
 			parser->cursor = parser->txt_size;
 			break;
 		}
 
-		/* Find next word break */
-		i = parser->cursor + first_unichar_len;
-		while (i < parser->txt_size &&
-		       !parser->word_break_flags [i]) {
-			i++;
-		}
-
-		/* Word end is the first byte after the word, which is either the
-		 *  start of next word or the end of the string */
-		word_length = i - parser->cursor;
-
-		/* We only want the words where the first character
-		 *  in the word is either a letter, a number or a symbol.
-		 * This is needed because the word break algorithm also
-		 *  considers word breaks after for example commas or other
-		 *  punctuation marks.
-		 * Note that looking at the first character in the string
-		 *  should be compatible with all Unicode normalization
-		 *  methods.
-		 */
-		if (!uc_is_general_category (first_unichar,
-		                             parser->allowed_start)) {
+		/* Skip the word if not an allowed word start */
+		if (!is_allowed) {
 			/* Skip this word and keep on looping */
 			parser->cursor += word_length;
 			continue;
@@ -164,18 +205,13 @@ parser_next (TrackerParser *parser,
 		                    word_length :
 		                    WORD_BUFFER_LENGTH - 1);
 
-		/* Enable UNAC stripping only if no ASCII and no CJK */
-		do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
-		                            truncated_length) &&
-		            !IS_CJK_UCS4 (first_unichar));
-
 		/* Process the word here. If it fails, we can still go
 		 *  to the next one. Returns newly allocated string
 		 *  always */
-		processed_word = tracker_parser_process_word (parser,
-		                                              &(parser->txt[parser->cursor]),
-		                                              truncated_length,
-		                                              do_strip);
+		processed_word = process_word_utf8 (parser,
+		                                    &(parser->txt[parser->cursor]),
+		                                    truncated_length,
+		                                    type);
 		if (!processed_word) {
 			/* Skip this word and keep on looping */
 			parser->cursor += word_length;
@@ -283,13 +319,26 @@ tracker_parser_reset (TrackerParser *parser,
 	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
 }
 
-/* libunistring version of the word processor. */
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
                              const gchar    *word,
                              gint           length,
                              gboolean       do_strip)
 {
+	return process_word_utf8 (parser,
+	                          word,
+	                          length,
+	                          (do_strip ?
+	                           TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
+	                           TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC));
+}
+
+static gchar *
+process_word_utf8 (TrackerParser         *parser,
+                   const gchar           *word,
+                   gint                  length,
+                   TrackerParserWordType type)
+{
 	gchar word_buffer [WORD_BUFFER_LENGTH];
 	gchar *normalized = NULL;
 	gchar *stripped = NULL;
@@ -310,38 +359,57 @@ tracker_parser_process_word (TrackerParser *parser,
 	tracker_parser_message_hex ("ORIGINAL word",
 	                            word, length);
 
-	/* Leave space for last NIL */
-	new_word_length = WORD_BUFFER_LENGTH - 1;
+	/* Normalization and case-folding ONLY for non-ASCII */
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		/* Leave space for last NIL */
+		new_word_length = WORD_BUFFER_LENGTH - 1;
+
+		/* Casefold and NFC normalization in output.
+		 *  NOTE: if the output buffer is not big enough, u8_casefold will
+		 *  return a newly-allocated buffer. */
+		normalized = u8_casefold ((const uint8_t *)word,
+		                          length,
+		                          uc_locale_language (),
+		                          UNINORM_NFC,
+		                          word_buffer,
+		                          &new_word_length);
+
+		/* Case folding + Normalization failed, skip this word */
+		g_return_val_if_fail (normalized != NULL, NULL);
+
+		/* If the output buffer is not the one passed to u8_casefold,
+		 *  it was newly allocated, so it must be grown by 1 byte to
+		 *  make room for the trailing NIL */
+		if (normalized != word_buffer) {
+			normalized = g_realloc (normalized, new_word_length + 1);
+		}
 
-	/* Casefold and NFC normalization in output.
-	 *  NOTE: if the output buffer is not big enough, u8_casefold will
-	 *  return a newly-allocated buffer. */
-	normalized = u8_casefold ((const uint8_t *)word,
-	                          length,
-	                          uc_locale_language (),
-	                          UNINORM_NFC,
-	                          word_buffer,
-	                          &new_word_length);
-
-	/* Case folding + Normalization failed, skip this word */
-	g_return_val_if_fail (normalized != NULL, NULL);
-
-	/* If output buffer is not the same as the one passed to
-	 *  u8_casefold, we know it was newly-allocated, so need
-	 *  to resize it in 1 byte to add last NIL */
-	if (normalized != word_buffer) {
-		normalized = g_realloc (normalized, new_word_length + 1);
+		/* Log after Normalization */
+		tracker_parser_message_hex (" After Casefolding and NFC normalization",
+		                            normalized, new_word_length);
+	}
+	else {
+		/* For ASCII-only, just tolower() each character */
+		gsize i;
+
+		normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length) : word_buffer;
+
+		for (i = 0; i < length; i++) {
+			normalized[i] = g_ascii_tolower (word[i]);
+		}
+
+		new_word_length = length;
+
+		/* Log after tolower */
+		tracker_parser_message_hex (" After Lowercasing",
+		                            normalized, new_word_length);
 	}
 
 	/* Set output NIL */
 	normalized[new_word_length] = '\0';
 
-	/* Log after Normalization */
-	tracker_parser_message_hex (" After Casefolding and NFC normalization",
-	                            normalized, new_word_length);
-
-	/* UNAC stripping needed? */
-	if (do_strip) {
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
 		gsize stripped_word_length;
 
 		stripped = tracker_parser_unaccent_utf8_word (normalized,
@@ -356,7 +424,6 @@ tracker_parser_process_word (TrackerParser *parser,
 		}
 	}
 
-
 	/* Stemming needed? */
 	if (parser->enable_stemmer) {
 		stemmed = tracker_language_stem_word (parser->language,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]