[tracker/parser-unicode-libs-review: 68/85] Minor fixes in the libunistring-based parser

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/parser-unicode-libs-review: 68/85] Minor fixes in the libunistring-based parser
Date: Tue, 4 May 2010 17:29:43 +0000 (UTC)
commit b91dcc83adfbd165030bb5b7a9b388b02e4209be
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Wed Apr 28 19:01:31 2010 +0200

    Minor fixes in the libunistring-based parser
    
     * Don't perform UNAC stripping if ASCII or CJK
     * Reformat the main word-walker loop

 src/libtracker-fts/tracker-parser.c |  148 ++++++++++++++++++++++-------------
 1 files changed, 95 insertions(+), 53 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index 3ab8017..13ffe27 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -44,9 +44,6 @@
  * of the words being parsed */
 #define TRACKER_PARSER_DEBUG_HEX 0
 
-/* Max possible length of a UTF-8 encoded string (just a safety limit) */
-#define WORD_BUFFER_LENGTH 512
-
 
 #ifndef HAVE_LIBUNISTRING
 
@@ -89,6 +86,18 @@ typedef enum {
 	TRACKER_PARSER_ENCODING_OTHER
 } TrackerParserEncoding;
 
+#else
+
+/* ASCII-7 is in range [0x00,0x7F] */
+#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
+
+/* CJK ranges are: [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
+#define IS_CJK_UCS4(c)   (((c) >= 0x3400 && (c) <= 0x4DB5)  || \
+                          ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
+                          ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+/* Max bytes (NIL included) of one UTF-8 encoded word (just a safety limit) */
+#define WORD_BUFFER_LENGTH 512
 
 #endif /* !HAVE_LIBUNISTRING */
 
@@ -109,12 +118,11 @@ struct TrackerParser {
 	gchar                   *word;
 	gint                    word_length;
 	guint                   word_position;
+
 #ifndef HAVE_LIBUNISTRING
 	TrackerParserEncoding   encoding;
 	const gchar             *cursor;
-#endif /* !HAVE_LIBUNISTRING */
 
-#ifndef HAVE_LIBUNISTRING
 	/* Pango members for CJK text parsing */
 	PangoLogAttr          *attrs;
 	guint                  attr_length;
@@ -584,11 +592,28 @@ parser_next (TrackerParser *parser,
 
 #else
 
-/* Use libunistring
- *  void u8_wordbreaks (const uint8_t *s, size_t n, char *p)
- *  int u8_strmbtouc (ucs4_t *puc, const uint8_t *s)
- *  uint8_t * u8_casefold (const uint8_t *s, size_t n, const char *iso639_language, uninorm_t nf, uint8_t *resultbuf, size_t *lengthp)
- */
+
+/* Return TRUE if the NIL-terminated UTF-8 string 'word' is pure ASCII-7, so
+ *  that the caller can skip UNAC stripping for it; FALSE otherwise.
+ * The string is checked byte per byte: as soon as one byte is >127 (0x7F)
+ *  the word is not ASCII-7. An empty string yields TRUE. */
+static gboolean
+is_ascii_word (const gchar *word)
+{
+	guchar *i;
+
+	i = (guchar *)word; /* guchar view of the bytes tested by IS_ASCII_BYTE */
+	while (*i != '\0') {
+		if (!IS_ASCII_BYTE (*i)) {
+			return FALSE;
+		}
+		i++;
+	}
+	return TRUE;
+}
+
+
+/* libunistring-based parser */
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
@@ -609,10 +634,19 @@ parser_next (TrackerParser *parser,
 		ucs4_t first_unichar;
 		gint first_unichar_len;
 		gsize i;
+		gsize new_length;
+		gchar word_buffer [WORD_BUFFER_LENGTH];
+		gboolean do_strip;
 
 		/* Get first character of the word as UCS4 */
 		first_unichar_len = u8_strmbtouc (&first_unichar,
 		                                  &(parser->txt[parser->cursor]));
+		if (first_unichar_len <= 0) {
+			/* Returns 0 if a NIL byte was found and <0 if the UTF-8 sequence is
+			 *  invalid; in both cases better just force stop here */
+			parser->cursor = parser->txt_size;
+			break;
+		}
 
 		/* Find next word break */
 		i = parser->cursor + first_unichar_len;
@@ -625,49 +659,56 @@ parser_next (TrackerParser *parser,
 		 *  start of next word or the end of the string */
 		word_length = i - parser->cursor;
 
-		if (first_unichar_len > 0) {
-			/* We only want the words where the first character
-			 *  in the word is either a letter, a number or a symbol.
-			 * This is needed because the word break algorithm also
-			 *  considers word breaks after for example commas or other
-			 *  punctuation marks.
-			 * Note that looking at the first character in the string
-			 *  should be compatible with all Unicode normalization
-			 *  methods.
-			 */
-			if (uc_is_general_category (first_unichar,
-			                            parser->allowed_start)) {
-				gchar word_buffer [WORD_BUFFER_LENGTH];
-				gsize new_length;
-
-				/* compute truncated word length if needed */
-				new_length = (word_length < WORD_BUFFER_LENGTH ?
-				              word_length :
-				              WORD_BUFFER_LENGTH - 1);
-
-				/* Word here needs always to be NIL-terminated */
-				memcpy (word_buffer, &(parser->txt[parser->cursor]), new_length);
-				word_buffer[new_length] = '\0';
-
-				/* Process the word here. If it fails, we can still go
-				 *  to the next one. Returns newly allocated string
-				 *  always */
-				processed_word = tracker_parser_process_word (parser,
-				                                              word_buffer,
-				                                              new_length,
-				                                              TRUE);
-				if (!processed_word) {
-					/* Skip this word and keep on looping */
-					parser->cursor += word_length;
-				}
-			} else {
-				/* Skip this word and keep on looping */
-				parser->cursor += word_length;
-			}
-		} else {
-			/* This should only happen if NIL was passed to u8_strmbtouc,
-			 *  so better just force stop here */
-			parser->cursor = parser->txt_size;
+		/* We only want the words where the first character
+		 *  in the word is either a letter, a number or a symbol.
+		 * This is needed because the word break algorithm also
+		 *  considers word breaks after for example commas or other
+		 *  punctuation marks.
+		 * Note that looking at the first character in the string
+		 *  should be compatible with all Unicode normalization
+		 *  methods.
+		 */
+		if (!uc_is_general_category (first_unichar,
+		                             parser->allowed_start)) {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+			continue;
+		}
+
+		/* check if word is reserved */
+		if (parser->parse_reserved_words &&
+		    word_length == 2 &&
+		    parser->txt[parser->cursor] == 'o' &&
+		    parser->txt[parser->cursor + 1] == 'r') {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+			continue;
+		}
+
+		/* compute truncated word length if needed */
+		new_length = (word_length < WORD_BUFFER_LENGTH ?
+		              word_length :
+		              WORD_BUFFER_LENGTH - 1);
+
+		/* Word here needs always to be NIL-terminated */
+		memcpy (word_buffer, &(parser->txt[parser->cursor]), new_length);
+		word_buffer[new_length] = '\0';
+
+		/* Enable UNAC stripping only if the word is neither ASCII nor CJK */
+		do_strip = (!is_ascii_word (word_buffer) &&
+		            !IS_CJK_UCS4 (first_unichar));
+
+		/* Process the word here. If it fails, we can still go
+		 *  to the next one. Returns newly allocated string
+		 *  always */
+		processed_word = tracker_parser_process_word (parser,
+		                                              word_buffer,
+		                                              new_length,
+		                                              do_strip);
+		if (!processed_word) {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+			continue;
 		}
 	}
 
@@ -1008,6 +1049,7 @@ tracker_parser_process_word (TrackerParser *parser,
 		}
 	}
 
+
 	/* Stemming needed? */
 	if (parser->enable_stemmer) {
 		stemmed = tracker_language_stem_word (parser->language,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]