[tracker/parser-libunistring-review] Bugfixing libicu-based parser

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/parser-libunistring-review] Bugfixing libicu-based parser
Date: Tue, 4 May 2010 16:31:55 +0000 (UTC)
commit 931ed821146321db3a6b119cd3fb9a4678bc6aec
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 4 13:06:36 2010 +0200

    Bugfixing libicu-based parser

 src/libtracker-fts/tracker-parser-libicu.c |   40 +++++++++++++++++++---------
 src/libtracker-fts/tracker-parser-utils.c  |   14 ++++++---
 tests/libtracker-fts/tracker-parser-test.c |   29 +++++++++++++++++++-
 3 files changed, 63 insertions(+), 20 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 9089dca..6414d40 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -93,8 +93,9 @@ get_word_info (const UChar *word,
 	UChar32 unichar;
 	guint8 unichar_gc;
 
-	*p_is_allowed_word_start = FALSE;
-	*p_is_ascii_or_cjk = FALSE;
+	/* Defaults: optimistic values; later checks may reset them to FALSE */
+	*p_is_allowed_word_start = TRUE;
+	*p_is_ascii_or_cjk = TRUE;
 
 	/* Get first character of the word as UCS4 */
 	uiter_setString (&iter, word, word_length);
@@ -131,7 +132,6 @@ get_word_info (const UChar *word,
 
 	/* Word starts with a CJK character? */
 	if (IS_CJK_UCS4 ((guint32)unichar)) {
-		*p_is_ascii_or_cjk = TRUE;
 		return TRUE;
 	}
 
@@ -139,7 +139,7 @@ get_word_info (const UChar *word,
 	while (unichar != U_SENTINEL)
 	{
 		if (!IS_ASCII_UCS4 ((guint32)unichar)) {
-			*p_is_ascii_or_cjk = TRUE;
+			*p_is_ascii_or_cjk = FALSE;
 			return TRUE;
 		}
 		unichar = uiter_next32 (&iter);
@@ -179,7 +179,7 @@ parser_next (TrackerParser *parser,
 
 		/* Find next word break. */
 		next_word_offset_uchar = ubrk_next (parser->bi);
-		if (next_word_offset_uchar == UBRK_DONE) {
+		if (next_word_offset_uchar >= parser->utxt_size) {
 			/* Last word support... */
 			next_word_offset_uchar = parser->utxt_size;
 			next_word_offset_utf8 = parser->txt_size;
@@ -193,6 +193,13 @@ parser_next (TrackerParser *parser,
 		word_length_uchar = next_word_offset_uchar - parser->cursor;
 		word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
 
+		/* g_debug ("word_length_uchar: %" G_GSIZE_FORMAT, word_length_uchar); */
+		/* g_debug ("next_word_offset_uchar: %" G_GSIZE_FORMAT, next_word_offset_uchar); */
+		/* g_debug ("current_word_offset_uchar: %" G_GSIZE_FORMAT, parser->cursor); */
+		/* g_debug ("word_length_utf8: %" G_GSIZE_FORMAT, word_length_utf8); */
+		/* g_debug ("next_word_offset_utf8: %" G_GSIZE_FORMAT, next_word_offset_utf8); */
+		/* g_debug ("current_word_offset_utf8: %" G_GSIZE_FORMAT, current_word_offset_utf8); */
+
 		/* Get word info... */
 		if (!get_word_info (&parser->utxt[parser->cursor],
 		                    word_length_uchar,
@@ -203,6 +210,13 @@ parser_next (TrackerParser *parser,
 			break;
 		}
 
+		/* Skip the word if not an allowed word start */
+		if (!is_allowed) {
+			/* Skip this word and keep on looping */
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
 		/* check if word is reserved (looking at ORIGINAL UTF-8 buffer
 		 *  here! */
 		if (parser->parse_reserved_words &&
@@ -343,9 +357,9 @@ tracker_parser_reset (TrackerParser *parser,
 	}
 
 	/* Allocate UChars and offsets buffers */
-	parser->utxt_size = txt_size * sizeof (UChar) + 1;
-	parser->utxt = g_malloc (parser->utxt_size);
-	parser->offsets = g_malloc (parser->utxt_size);
+	parser->utxt_size = txt_size + 1;
+	parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
+	parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));
 
 	/* last_uchar and last_utf8 will be also an output parameter! */
 	last_uchar = parser->utxt;
@@ -354,9 +368,9 @@ tracker_parser_reset (TrackerParser *parser,
 	/* Convert to UChars storing offsets */
 	ucnv_toUnicode (converter,
 	                &last_uchar,
-	                &parser->utxt[parser->utxt_size],
+	                &parser->utxt[txt_size],
 	                &last_utf8,
-	                &parser->txt[parser->txt_size],
+	                &parser->txt[txt_size],
 	                parser->offsets,
 	                FALSE,
 	                &error);
@@ -464,13 +478,13 @@ process_word_uchar (TrackerParser *parser,
 			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
 			return NULL;
 		}
-		/* Using same  buffer size as for UTF-16 should always work. */
-		utf8_str = g_malloc (new_word_length + 1);
+		/* Buffer holds sizeof (UChar) bytes per UChar plus NUL. NOTE(review): UTF-8 can need 3 bytes per UChar -- confirm this is enough */
+		utf8_str = g_malloc (new_word_length * sizeof (UChar) + 1);
 
 		/* Convert from UChar to UTF-8 */
 		utf8_len = ucnv_fromUChars (converter,
 		                            utf8_str,
-		                            new_word_length,
+		                            new_word_length * sizeof (UChar) + 1,
 		                            normalized_buffer,
 		                            new_word_length,
 		                            &icu_error);
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
index 0a37440..222b4a1 100644
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -101,19 +101,23 @@ tracker_parser_unaccent_UChar_word (const UChar *string,
                return NULL;
 	}
 
-	/* Allocate buffer, same size as input string */
-	str_utf16 = g_malloc (ilength);
+	/* Allocate a buffer with room for the input string plus one
+	 * extra 2-byte unit. Note that ilength is a count of UChars,
+	 * not a count of bytes */
+	str_utf16 = g_malloc ((ilength + 1) * 2);
 
 	/* Convert from UChar to UTF-16BE */
 	utf16_len = ucnv_fromUChars (converter,
 	                             str_utf16,
-	                             ilength,
+	                             (ilength + 1) * 2,
 	                             string,
 	                             ilength,
 	                             &icu_error);
 	if (U_FAILURE (icu_error)) {
-		g_warning ("Cannot convert from UChar to UTF-16BE: '%s'",
-		           u_errorName (icu_error));
+		g_warning ("Cannot convert from UChar to UTF-16BE: '%s' "
+		           "(ilength: %" G_GSIZE_FORMAT ")",
+		           u_errorName (icu_error),
+		           ilength);
 	} else {
 		str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
 		                                                 utf16_len,
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index b2ec05d..41d7cc5 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -20,6 +20,7 @@
 #include "config.h"
 
 #include <string.h>
+#include <locale.h>
 
 #include <glib.h>
 #include <gio/gio.h>
@@ -124,9 +125,9 @@ run_parsing (void)
 	/* Create the parser */
 	parser = tracker_parser_new (language,
 	                             max_word_length);
-	g_object_unref (language);
 	if (!parser) {
 		g_printerr ("Parser creation failed!\n");
+		g_object_unref (language);
 		return FALSE;
 	}
 
@@ -144,11 +145,14 @@ run_parsing (void)
 	while (1) {
 		const gchar *word;
 		gchar *word_hex;
+		gchar *original_word;
+		gchar *original_word_hex;
 		gint position;
 		gint byte_offset_start;
 		gint byte_offset_end;
 		gboolean stop_word;
 		gint word_length;
+		gint original_word_length;
 
 		/* Process next word */
 		word = tracker_parser_next (parser,
@@ -163,20 +167,38 @@ run_parsing (void)
 			break;
 		}
 
+		/* Get original word */
+		original_word_length = byte_offset_end - byte_offset_start;
+		original_word = g_malloc (original_word_length + 1);
+		memcpy (original_word,
+		        &text[byte_offset_start],
+		        original_word_length);
+		original_word[original_word_length] = '\0';
+
+		/* Get hex strings */
 		word_hex = tracker_strhex (word, word_length, ':');
+		original_word_hex = tracker_strhex (original_word,
+		                                    original_word_length,
+		                                    ':');
 
-		g_print ("WORD at %d [%d,%d]: '%s' (%s) (stop? %s)\n",
+		g_print ("WORD at %d [%d,%d] Original: '%s' (%s), "
+		         "Processed: '%s' (%s) (stop? %s)\n",
 		         position,
 		         byte_offset_start,
 		         byte_offset_end,
+		         original_word,
+		         original_word_hex,
 		         word,
 		         word_hex,
 		         stop_word ? "yes" : "no");
 
 		g_free (word_hex);
+		g_free (original_word_hex);
+		g_free (original_word);
 	}
 
 	tracker_parser_free (parser);
+	g_object_unref (language);
 	return TRUE;
 }
 
@@ -189,6 +211,9 @@ main (int argc, char **argv)
 		g_thread_init (NULL);
 	}
 
+	/* Setup locale */
+	setlocale (LC_ALL, "");
+
 	/* Setup context */
 	if (!setup_context (argc, argv)) {
 		g_printerr ("Context setup failed... exiting\n");
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]