[tracker/parser-unicode-libs-review: 82/85] Bugfixing libicu-based parser
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 82/85] Bugfixing libicu-based parser
- Date: Tue, 4 May 2010 17:30:54 +0000 (UTC)
commit 8f1cea89f53a05fab6b1a40768ed7cd5cb1f480b
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue May 4 13:06:36 2010 +0200
Bugfixing libicu-based parser
src/libtracker-fts/tracker-parser-libicu.c | 40 +++++++++++++++++++---------
src/libtracker-fts/tracker-parser-utils.c | 14 ++++++---
tests/libtracker-fts/tracker-parser-test.c | 29 +++++++++++++++++++-
3 files changed, 63 insertions(+), 20 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 9089dca..6414d40 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -93,8 +93,9 @@ get_word_info (const UChar *word,
UChar32 unichar;
guint8 unichar_gc;
- *p_is_allowed_word_start = FALSE;
- *p_is_ascii_or_cjk = FALSE;
+ /* Defaults... */
+ *p_is_allowed_word_start = TRUE;
+ *p_is_ascii_or_cjk = TRUE;
/* Get first character of the word as UCS4 */
uiter_setString (&iter, word, word_length);
@@ -131,7 +132,6 @@ get_word_info (const UChar *word,
/* Word starts with a CJK character? */
if (IS_CJK_UCS4 ((guint32)unichar)) {
- *p_is_ascii_or_cjk = TRUE;
return TRUE;
}
@@ -139,7 +139,7 @@ get_word_info (const UChar *word,
while (unichar != U_SENTINEL)
{
if (!IS_ASCII_UCS4 ((guint32)unichar)) {
- *p_is_ascii_or_cjk = TRUE;
+ *p_is_ascii_or_cjk = FALSE;
return TRUE;
}
unichar = uiter_next32 (&iter);
@@ -179,7 +179,7 @@ parser_next (TrackerParser *parser,
/* Find next word break. */
next_word_offset_uchar = ubrk_next (parser->bi);
- if (next_word_offset_uchar == UBRK_DONE) {
+ if (next_word_offset_uchar >= parser->utxt_size) {
/* Last word support... */
next_word_offset_uchar = parser->utxt_size;
next_word_offset_utf8 = parser->txt_size;
@@ -193,6 +193,13 @@ parser_next (TrackerParser *parser,
word_length_uchar = next_word_offset_uchar - parser->cursor;
word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
+ /* g_debug ("word_length_uchar: %" G_GSIZE_FORMAT, word_length_uchar); */
+ /* g_debug ("next_word_offset_uchar: %" G_GSIZE_FORMAT, next_word_offset_uchar); */
+ /* g_debug ("current_word_offset_uchar: %" G_GSIZE_FORMAT, parser->cursor); */
+ /* g_debug ("word_length_utf8: %" G_GSIZE_FORMAT, word_length_utf8); */
+ /* g_debug ("next_word_offset_utf8: %" G_GSIZE_FORMAT, next_word_offset_utf8); */
+ /* g_debug ("current_word_offset_utf8: %" G_GSIZE_FORMAT, current_word_offset_utf8); */
+
/* Get word info... */
if (!get_word_info (&parser->utxt[parser->cursor],
word_length_uchar,
@@ -203,6 +210,13 @@ parser_next (TrackerParser *parser,
break;
}
+ /* Skip the word if not an allowed word start */
+ if (!is_allowed) {
+ /* Skip this word and keep on looping */
+ parser->cursor = next_word_offset_uchar;
+ continue;
+ }
+
/* check if word is reserved (looking at ORIGINAL UTF-8 buffer
* here! */
if (parser->parse_reserved_words &&
@@ -343,9 +357,9 @@ tracker_parser_reset (TrackerParser *parser,
}
/* Allocate UChars and offsets buffers */
- parser->utxt_size = txt_size * sizeof (UChar) + 1;
- parser->utxt = g_malloc (parser->utxt_size);
- parser->offsets = g_malloc (parser->utxt_size);
+ parser->utxt_size = txt_size + 1;
+ parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
+ parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));
/* last_uchar and last_utf8 will be also an output parameter! */
last_uchar = parser->utxt;
@@ -354,9 +368,9 @@ tracker_parser_reset (TrackerParser *parser,
/* Convert to UChars storing offsets */
ucnv_toUnicode (converter,
&last_uchar,
- &parser->utxt[parser->utxt_size],
+ &parser->utxt[txt_size],
&last_utf8,
- &parser->txt[parser->txt_size],
+ &parser->txt[txt_size],
parser->offsets,
FALSE,
&error);
@@ -464,13 +478,13 @@ process_word_uchar (TrackerParser *parser,
U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
return NULL;
}
- /* Using same buffer size as for UTF-16 should always work. */
- utf8_str = g_malloc (new_word_length + 1);
+ /* Using same buffer size as for UTF-16 should always work. */
+ utf8_str = g_malloc (new_word_length * sizeof (UChar) + 1);
/* Convert from UChar to UTF-8 */
utf8_len = ucnv_fromUChars (converter,
utf8_str,
- new_word_length,
+ new_word_length * sizeof (UChar) + 1,
normalized_buffer,
new_word_length,
&icu_error);
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
index 0a37440..222b4a1 100644
--- a/src/libtracker-fts/tracker-parser-utils.c
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -101,19 +101,23 @@ tracker_parser_unaccent_UChar_word (const UChar *string,
return NULL;
}
- /* Allocate buffer, same size as input string */
- str_utf16 = g_malloc (ilength);
+ /* Allocate buffer, same size as input string.
+ * Note that ilength specifies number of UChars not
+ * number of bytes */
+ str_utf16 = g_malloc ((ilength + 1) * 2);
/* Convert from UChar to UTF-16BE */
utf16_len = ucnv_fromUChars (converter,
str_utf16,
- ilength,
+ (ilength + 1) * 2,
string,
ilength,
&icu_error);
if (U_FAILURE (icu_error)) {
- g_warning ("Cannot convert from UChar to UTF-16BE: '%s'",
- u_errorName (icu_error));
+ g_warning ("Cannot convert from UChar to UTF-16BE: '%s' "
+ "(ilength: %" G_GSIZE_FORMAT ")",
+ u_errorName (icu_error),
+ ilength);
} else {
str_utf8 = tracker_parser_unaccent_utf16be_word (str_utf16,
utf16_len,
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index b2ec05d..41d7cc5 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -20,6 +20,7 @@
#include "config.h"
#include <string.h>
+#include <locale.h>
#include <glib.h>
#include <gio/gio.h>
@@ -124,9 +125,9 @@ run_parsing (void)
/* Create the parser */
parser = tracker_parser_new (language,
max_word_length);
- g_object_unref (language);
if (!parser) {
g_printerr ("Parser creation failed!\n");
+ g_object_unref (language);
return FALSE;
}
@@ -144,11 +145,14 @@ run_parsing (void)
while (1) {
const gchar *word;
gchar *word_hex;
+ gchar *original_word;
+ gchar *original_word_hex;
gint position;
gint byte_offset_start;
gint byte_offset_end;
gboolean stop_word;
gint word_length;
+ gint original_word_length;
/* Process next word */
word = tracker_parser_next (parser,
@@ -163,20 +167,38 @@ run_parsing (void)
break;
}
+ /* Get original word */
+ original_word_length = byte_offset_end - byte_offset_start;
+ original_word = g_malloc (original_word_length + 1);
+ memcpy (original_word,
+ &text[byte_offset_start],
+ original_word_length);
+ original_word[original_word_length] = '\0';
+
+ /* Get hex strings */
word_hex = tracker_strhex (word, word_length, ':');
+ original_word_hex = tracker_strhex (original_word,
+ original_word_length,
+ ':');
- g_print ("WORD at %d [%d,%d]: '%s' (%s) (stop? %s)\n",
+ g_print ("WORD at %d [%d,%d] Original: '%s' (%s), "
+ "Processed: '%s' (%s) (stop? %s)\n",
position,
byte_offset_start,
byte_offset_end,
+ original_word,
+ original_word_hex,
word,
word_hex,
stop_word ? "yes" : "no");
g_free (word_hex);
+ g_free (original_word_hex);
+ g_free (original_word);
}
tracker_parser_free (parser);
+ g_object_unref (language);
return TRUE;
}
@@ -189,6 +211,9 @@ main (int argc, char **argv)
g_thread_init (NULL);
}
+ /* Setup locale */
+ setlocale (LC_ALL, "");
+
/* Setup context */
if (!setup_context (argc, argv)) {
g_printerr ("Context setup failed... exiting\n");
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]