[tracker/parser-unicode-libs-review: 80/85] Added first non-tested implementation of the libicu based word breaking and processing
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 80/85] Added first non-tested implementation of the libicu based word breaking and processing
- Date: Tue, 4 May 2010 17:30:44 +0000 (UTC)
commit 8ae4a0bec0c9d53fe4feddb4e64941ae6caffe95
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue May 4 09:42:00 2010 +0200
Added first non-tested implementation of the libicu based word breaking and processing
src/libtracker-fts/tracker-parser-libicu.c | 539 +++++++++++++++++-----------
1 files changed, 334 insertions(+), 205 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index e29e0ea..9089dca 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -24,13 +24,18 @@
#include <string.h>
#include <locale.h>
-#include <ubrk.h>
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/ubrk.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+#include <unicode/unorm.h>
#include "tracker-parser.h"
#include "tracker-parser-utils.h"
/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
+#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6] */
#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
@@ -41,6 +46,12 @@
#define WORD_BUFFER_LENGTH 512
+static gchar *process_word_uchar (TrackerParser *parser,
+ const UChar *word,
+ gint length,
+ gboolean do_strip);
+
+
struct TrackerParser {
const gchar *txt;
gint txt_size;
@@ -61,6 +72,8 @@ struct TrackerParser {
/* Text as UChars */
UChar *utxt;
gint utxt_size;
+ /* Original offset of each UChar in the input txt string */
+ gint32 *offsets;
/* The word-break iterator */
UBreakIterator *bi;
@@ -69,32 +82,83 @@ struct TrackerParser {
gsize cursor;
};
-/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
- * UNAC stripping.
- * Just check byte per byte, and if any of the bytes is >127, then it's not
- * ASCII-7 */
+
static gboolean
-is_ascii_word (const gchar *word,
- gsize length)
+get_word_info (const UChar *word,
+ gsize word_length,
+ gboolean *p_is_allowed_word_start,
+ gboolean *p_is_ascii_or_cjk)
{
- gsize i;
+ UCharIterator iter;
+ UChar32 unichar;
+ guint8 unichar_gc;
+
+ *p_is_allowed_word_start = FALSE;
+ *p_is_ascii_or_cjk = FALSE;
+
+ /* Get first character of the word as UCS4 */
+ uiter_setString (&iter, word, word_length);
+ unichar = uiter_current32 (&iter);
+ if (unichar == U_SENTINEL) {
+ return FALSE;
+ }
+
+ /* We only want the words where the first character
+ * in the word is either a letter, a number or a symbol.
+ * This is needed because the word break algorithm also
+ * considers word breaks after for example commas or other
+ * punctuation marks.
+ * Note that looking at the first character in the string
+ * should be compatible with all Unicode normalization
+ * methods.
+ */
+ unichar_gc = u_charType (unichar);
+ if (unichar_gc != U_UPPERCASE_LETTER &&
+ unichar_gc != U_LOWERCASE_LETTER &&
+ unichar_gc != U_TITLECASE_LETTER &&
+ unichar_gc != U_MODIFIER_LETTER &&
+ unichar_gc != U_OTHER_LETTER &&
+ unichar_gc != U_DECIMAL_DIGIT_NUMBER &&
+ unichar_gc != U_LETTER_NUMBER &&
+ unichar_gc != U_OTHER_NUMBER &&
+ unichar_gc != U_MATH_SYMBOL &&
+ unichar_gc != U_CURRENCY_SYMBOL &&
+ unichar_gc != U_MODIFIER_SYMBOL &&
+ unichar_gc != U_OTHER_SYMBOL) {
+ *p_is_allowed_word_start = FALSE;
+ return TRUE;
+ }
+
+ /* Word starts with a CJK character? */
+ if (IS_CJK_UCS4 ((guint32)unichar)) {
+ *p_is_ascii_or_cjk = TRUE;
+ return TRUE;
+ }
- for (i = 0; i < length; i++) {
- if (!IS_ASCII_BYTE ((guchar)word[i])) {
- return FALSE;
+ /* Is ASCII-only string? */
+ while (unichar != U_SENTINEL)
+ {
+ if (!IS_ASCII_UCS4 ((guint32)unichar)) {
+ *p_is_ascii_or_cjk = TRUE;
+ return TRUE;
}
+ unichar = uiter_next32 (&iter);
}
+
return TRUE;
}
+
/* libicu-based parser */
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
gint *byte_offset_end)
{
- gsize word_length = 0;
+ gsize word_length_uchar = 0;
+ gsize word_length_utf8 = 0;
gchar *processed_word = NULL;
+ gsize current_word_offset_utf8;
*byte_offset_start = 0;
*byte_offset_end = 0;
@@ -103,81 +167,72 @@ parser_next (TrackerParser *parser,
/* Loop to look for next valid word */
while (!processed_word &&
- parser->cursor < parser->txt_size) {
- ucs4_t first_unichar;
- gint first_unichar_len;
- gsize i;
+ parser->cursor < parser->utxt_size) {
+ gboolean is_ascii_or_cjk;
+ gboolean is_allowed;
+ gsize next_word_offset_uchar;
+ gsize next_word_offset_utf8;
gsize truncated_length;
- gboolean do_strip;
-
- /* Get first character of the word as UCS4 */
- first_unichar_len = u8_strmbtouc (&first_unichar,
- &(parser->txt[parser->cursor]));
- if (first_unichar_len <= 0) {
- /* This should only happen if NIL was passed to u8_strmbtouc,
- * so better just force stop here */
- parser->cursor = parser->txt_size;
- break;
- }
- /* Find next word break */
- i = parser->cursor + first_unichar_len;
- while (i < parser->txt_size &&
- !parser->word_break_flags [i]) {
- i++;
+ /* Set current word offset in the original UTF-8 string */
+ current_word_offset_utf8 = parser->offsets[parser->cursor];
+
+ /* Find next word break. */
+ next_word_offset_uchar = ubrk_next (parser->bi);
+ if (next_word_offset_uchar == UBRK_DONE) {
+ /* Last word support... */
+ next_word_offset_uchar = parser->utxt_size;
+ next_word_offset_utf8 = parser->txt_size;
+ }
+ else {
+ next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
}
/* Word end is the first byte after the word, which is either the
* start of next word or the end of the string */
- word_length = i - parser->cursor;
-
- /* We only want the words where the first character
- * in the word is either a letter, a number or a symbol.
- * This is needed because the word break algorithm also
- * considers word breaks after for example commas or other
- * punctuation marks.
- * Note that looking at the first character in the string
- * should be compatible with all Unicode normalization
- * methods.
- */
- if (!uc_is_general_category (first_unichar,
- parser->allowed_start)) {
- /* Skip this word and keep on looping */
- parser->cursor += word_length;
- continue;
+ word_length_uchar = next_word_offset_uchar - parser->cursor;
+ word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
+
+ /* Get word info... */
+ if (!get_word_info (&parser->utxt[parser->cursor],
+ word_length_uchar,
+ &is_allowed,
+ &is_ascii_or_cjk)) {
+ /* Quit loop just in case */
+ parser->cursor = parser->utxt_size;
+ break;
}
- /* check if word is reserved */
+ /* check if word is reserved (looking at ORIGINAL UTF-8 buffer
+ * here! */
if (parser->parse_reserved_words &&
- word_length == 2 &&
- parser->txt[parser->cursor] == 'o' &&
- parser->txt[parser->cursor + 1] == 'r') {
+ word_length_utf8 == 2 &&
+ parser->txt[current_word_offset_utf8] == 'o' &&
+ parser->txt[current_word_offset_utf8 + 1] == 'r') {
/* Skip this word and keep on looping */
- parser->cursor += word_length;
+ parser->cursor = next_word_offset_uchar;
continue;
}
- /* compute truncated word length if needed (to avoid extremely
- * long words)*/
- truncated_length = (word_length < WORD_BUFFER_LENGTH ?
- word_length :
- WORD_BUFFER_LENGTH - 1);
-
- /* Enable UNAC stripping only if no ASCII and no CJK */
- do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
- truncated_length) &&
- !IS_CJK_UCS4 (first_unichar));
+ /* compute truncated word length (in UChar bytes) if needed (to
+ * avoid extremely long words) */
+ truncated_length = (word_length_uchar < 2 * WORD_BUFFER_LENGTH ?
+ word_length_uchar :
+ 2 * WORD_BUFFER_LENGTH);
/* Process the word here. If it fails, we can still go
- * to the next one. Returns newly allocated string
- * always */
- processed_word = tracker_parser_process_word (parser,
- &(parser->txt[parser->cursor]),
- truncated_length,
- do_strip);
+ * to the next one. Returns newly allocated UTF-8
+ * string always.
+ * Enable UNAC stripping only if no ASCII and no CJK
+ * Note we are passing UChar encoded string here!
+ */
+ processed_word = process_word_uchar (parser,
+ &(parser->utxt[parser->cursor]),
+ truncated_length,
+ !is_ascii_or_cjk);
if (!processed_word) {
/* Skip this word and keep on looping */
- parser->cursor += word_length;
+ parser->cursor = next_word_offset_uchar;
continue;
}
}
@@ -185,11 +240,11 @@ parser_next (TrackerParser *parser,
/* If we got a word here, set output */
if (processed_word) {
/* Set outputs */
- *byte_offset_start = parser->cursor;
- *byte_offset_end = parser->cursor + word_length;
+ *byte_offset_start = current_word_offset_utf8;
+ *byte_offset_end = current_word_offset_utf8 + word_length_utf8;
/* Update cursor */
- parser->cursor += word_length;
+ parser->cursor += word_length_uchar;
parser->word_length = strlen (processed_word);
parser->word = processed_word;
@@ -218,6 +273,7 @@ tracker_parser_new (TrackerLanguage *language,
parser->word_length = 0;
parser->utxt = NULL;
+ parser->offsets = NULL;
parser->utxt_size = 0;
parser->bi = NULL;
parser->cursor = 0;
@@ -239,6 +295,7 @@ tracker_parser_free (TrackerParser *parser)
}
g_free (parser->utxt);
+ g_free (parser->offsets);
g_free (parser->word);
@@ -254,8 +311,10 @@ tracker_parser_reset (TrackerParser *parser,
gboolean enable_stop_words,
gboolean parse_reserved_words)
{
- UErrorCode error;
+ UErrorCode error = U_ZERO_ERROR;
UConverter *converter;
+ UChar *last_uchar;
+ const gchar *last_utf8;
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
@@ -280,159 +339,229 @@ tracker_parser_reset (TrackerParser *parser,
if (!converter) {
g_warning ("Cannot open UTF-8 converter: '%s'",
U_FAILURE (error) ? u_errorName (error) : "none");
- return;
+ return;
}
- /* Allocate UChars buffer */
- parser->utxt_size = txt_size * 2 + 1;
+ /* Allocate UChars and offsets buffers */
+ parser->utxt_size = txt_size * sizeof (UChar) + 1;
parser->utxt = g_malloc (parser->utxt_size);
+ parser->offsets = g_malloc (parser->utxt_size);
+
+ /* last_uchar and last_utf8 will be also an output parameter! */
+ last_uchar = parser->utxt;
+ last_utf8 = parser->txt;
+
+ /* Convert to UChars storing offsets */
+ ucnv_toUnicode (converter,
+ &last_uchar,
+ &parser->utxt[parser->utxt_size],
+ &last_utf8,
+ &parser->txt[parser->txt_size],
+ parser->offsets,
+ FALSE,
+ &error);
+ if (U_SUCCESS (error)) {
+ /* Proper UChar array size is now given by 'last_uchar' */
+ parser->utxt_size = last_uchar - parser->utxt;
+
+ /* Open word-break iterator */
+ parser->bi = ubrk_open(UBRK_WORD,
+ setlocale (LC_ALL, NULL),
+ parser->utxt,
+ parser->utxt_size,
+ &error);
+ if (U_SUCCESS (error)) {
+ /* Find FIRST word in the UChar array */
+ parser->cursor = ubrk_first (parser->bi);
+ }
+ }
- /* Convert to UChars */
- parser->utxt_size = ucnv_toUChars (converter,
- parser->utxt,
- parser->utxt_size,
- parser->txt,
- parser->txt_size,
- &error);
+ /* If any error happened, reset buffers */
if (U_FAILURE (error)) {
- g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+ g_warning ("Error initializing libicu support: '%s'",
u_errorName (error));
- /* Error converting to UChars... reset buffer */
+ /* Reset buffers */
g_free (parser->utxt);
+ g_free (parser->offsets);
parser->utxt = NULL;
+ parser->offsets = NULL;
parser->utxt_size = 0;
- ucnv_close (converter);
- return;
}
- /* Open word-break iterator */
- parser->bi = ubrk_open(UBRK_WORD,
- setlocale (LC_ALL, NULL),
- parser->utxt,
- parser->utxt_size,
- &error);
+ /* Close converter */
+ ucnv_close (converter);
+}
+
+/* Process a single word given as UChars (UTF-16 code units).
+ *
+ * The word is casefolded, NFC-normalized, optionally unaccented,
+ * converted to UTF-8 and, if parser->enable_stemmer is set, stemmed.
+ *
+ * parser:   parser whose enable_stemmer and language fields are read.
+ * word:     first UChar of the word.
+ * length:   number of UChars in the word.
+ * do_strip: TRUE to try UNAC stripping on the normalized word.
+ *
+ * Words whose casefolded or normalized form does not fit in
+ * WORD_BUFFER_LENGTH UChars make ICU report a failure and are skipped.
+ *
+ * Returns a newly allocated UTF-8 string, or NULL if casefolding,
+ * normalization or the UTF-8 conversion failed.
+ */
+static gchar *
+process_word_uchar (TrackerParser *parser,
+                    const UChar   *word,
+                    gint           length,
+                    gboolean       do_strip)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+	UChar normalized_buffer [WORD_BUFFER_LENGTH];
+	gchar *utf8_str = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	/* Casefold... */
+	new_word_length = u_strFoldCase (casefolded_buffer,
+	                                 WORD_BUFFER_LENGTH,
+	                                 word,
+	                                 length,
+	                                 U_FOLD_CASE_DEFAULT,
+	                                 &error);
	if (U_FAILURE (error)) {
- g_warning ("Cannot open word-breaker: '%s'",
+		g_warning ("Error casefolding: '%s'",
		           u_errorName (error));
- g_free (parser->utxt);
- parser->utxt = NULL;
- parser->utxt_size = 0;
+		return NULL;
+	}
+
+	/* Defensive only: a result longer than the buffer is reported by
+	 * ICU as a failure and already handled above */
+	if (new_word_length > WORD_BUFFER_LENGTH) {
+		new_word_length = WORD_BUFFER_LENGTH;
+	}
+
+	/* NFC normalization... */
+	new_word_length = unorm_normalize (casefolded_buffer,
+	                                   new_word_length,
+	                                   UNORM_NFC,
+	                                   0,
+	                                   normalized_buffer,
+	                                   WORD_BUFFER_LENGTH,
+	                                   &error);
+	if (U_FAILURE (error)) {
+		g_warning ("Error normalizing: '%s'",
+		           u_errorName (error));
+		return NULL;
+	}
+
+	/* Defensive only, see above */
+	if (new_word_length > WORD_BUFFER_LENGTH) {
+		new_word_length = WORD_BUFFER_LENGTH;
+	}
+
+	/* UNAC stripping needed? */
+	if (do_strip) {
+		gsize stripped_word_length;
+
+		/* Get unaccented string in UTF-8 */
+		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
+		                                               new_word_length,
+		                                               &stripped_word_length);
+		if (utf8_str) {
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* If stripping failed or not needed, convert to UTF-8 */
+	if (!utf8_str) {
+		UErrorCode icu_error = U_ZERO_ERROR;
+		UConverter *converter;
+		gsize utf8_capacity;
+		gsize utf8_len;
+
+		/* Open converter UChar to UTF-8 */
+		converter = ucnv_open ("UTF-8", &icu_error);
+		if (!converter) {
+			g_warning ("Cannot open UTF-8 converter: '%s'",
+			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+			return NULL;
+		}
+
+		/* new_word_length counts UChars, not bytes: one UChar may
+		 * need up to 3 bytes in UTF-8 (a surrogate pair, i.e. two
+		 * UChars, needs 4). Reserve one more byte for the NUL. */
+		utf8_capacity = 3 * new_word_length + 1;
+		utf8_str = g_malloc (utf8_capacity);
+
+		/* Convert from UChar to UTF-8 */
+		utf8_len = ucnv_fromUChars (converter,
+		                            utf8_str,
+		                            utf8_capacity,
+		                            normalized_buffer,
+		                            new_word_length,
+		                            &icu_error);
+		if (U_FAILURE (icu_error)) {
+			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+			           u_errorName (icu_error));
+			g_free (utf8_str);
+			ucnv_close (converter);
+			return NULL;
+		}
+
+		utf8_str[utf8_len] = '\0';
+		new_word_length = utf8_len;
		ucnv_close (converter);
- return;
	}
- /* Find FIRST word in the UChar array */
- parser->cursor = ubrk_first (parser->bi);
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		/* Input for stemmer ALWAYS in UTF-8, as well as output */
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      utf8_str,
+		                                      new_word_length);
+
+		/* Log after stemming; skipped when the stemmer gave no
+		 * result, as strlen() must not be called on NULL */
+		if (stemmed) {
+			tracker_parser_message_hex (" After stemming",
+			                            stemmed, strlen (stemmed));
+		}
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (utf8_str);
+		return stemmed;
+	}
+
+	return utf8_str;
}
-/* libunistring version of the word processor. */
+
+/* Process a single UTF-8 word; both input and output are always UTF-8.
+ *
+ * The word is converted to UChars and handed to process_word_uchar().
+ *
+ * parser:   parser to use; must not be NULL.
+ * word:     UTF-8 word; must not be NULL.
+ * length:   length of the word in bytes, or a negative value if the word
+ *           is NUL-terminated and its length must be computed here.
+ * do_strip: passed through to process_word_uchar().
+ *
+ * Returns the string returned by process_word_uchar(), or NULL if the
+ * converter could not be opened or the conversion to UChars failed.
+ */
gchar *
tracker_parser_process_word (TrackerParser *parser,
- const gchar *word,
+                             const gchar   *word,
                             gint           length,
                             gboolean       do_strip)
{
- /* gchar word_buffer [WORD_BUFFER_LENGTH]; */
- /* gchar *normalized = NULL; */
- /* gchar *stripped = NULL; */
- /* gchar *stemmed = NULL; */
- /* size_t new_word_length; */
-
- /* g_return_val_if_fail (parser != NULL, NULL); */
- /* g_return_val_if_fail (word != NULL, NULL); */
-
-
- /* /\* If length is set as -1, the input word MUST be NIL-terminated. */
- /* * Otherwise, this restriction is not needed as the length to process */
- /* * is given as input argument *\/ */
- /* if (length < 0) { */
- /* length = strlen (word); */
- /* } */
-
- /* /\* Log original word *\/ */
- /* tracker_parser_message_hex ("ORIGINAL word", */
- /* word, length); */
-
- /* /\* Leave space for last NIL *\/ */
- /* new_word_length = WORD_BUFFER_LENGTH - 1; */
-
- /* /\* Casefold and NFC normalization in output. */
- /* * NOTE: if the output buffer is not big enough, u8_casefold will */
- /* * return a newly-allocated buffer. *\/ */
- /* normalized = u8_casefold ((const uint8_t *)word, */
- /* length, */
- /* uc_locale_language (), */
- /* UNINORM_NFC, */
- /* word_buffer, */
- /* &new_word_length); */
-
- /* /\* Case folding + Normalization failed, skip this word *\/ */
- /* g_return_val_if_fail (normalized != NULL, NULL); */
-
- /* /\* If output buffer is not the same as the one passed to */
- /* * u8_casefold, we know it was newly-allocated, so need */
- /* * to resize it in 1 byte to add last NIL *\/ */
- /* if (normalized != word_buffer) { */
- /* normalized = g_realloc (normalized, new_word_length + 1); */
- /* } */
-
- /* /\* Set output NIL *\/ */
- /* normalized[new_word_length] = '\0'; */
-
- /* /\* Log after Normalization *\/ */
- /* tracker_parser_message_hex (" After Casefolding and NFC normalization", */
- /* normalized, new_word_length); */
-
- /* /\* UNAC stripping needed? *\/ */
- /* if (do_strip) { */
- /* gsize stripped_word_length; */
-
- /* stripped = tracker_parser_unaccent_string (normalized, */
- /* new_word_length, */
- /* &stripped_word_length); */
-
- /* if (stripped) { */
- /* /\* Log after UNAC stripping *\/ */
- /* tracker_parser_message_hex (" After UNAC stripping", */
- /* stripped, stripped_word_length); */
- /* new_word_length = stripped_word_length; */
- /* } */
- /* } */
-
-
- /* /\* Stemming needed? *\/ */
- /* if (parser->enable_stemmer) { */
- /* stemmed = tracker_language_stem_word (parser->language, */
- /* stripped ? stripped : normalized, */
- /* new_word_length); */
-
- /* /\* Log after stemming *\/ */
- /* tracker_parser_message_hex (" After stemming", */
- /* stemmed, strlen (stemmed)); */
- /* } */
-
- /* /\* If stemmed wanted and succeeded, free previous and return it *\/ */
- /* if (stemmed) { */
- /* g_free (stripped); */
- /* if (normalized != word_buffer) { */
- /* g_free (normalized); */
- /* } */
- /* return stemmed; */
- /* } */
-
- /* /\* If stripped wanted and succeeded, free previous and return it *\/ */
- /* if (stripped) { */
- /* if (normalized != word_buffer) { */
- /* g_free (normalized); */
- /* } */
- /* return stripped; */
- /* } */
-
- /* /\* It may be the case that no stripping and no stemming was needed, and */
- /* * that the output buffer in stack was enough for case-folding and */
- /* * normalization. In this case, need to strdup() the string to return it *\/ */
- /* return normalized == word_buffer ? g_strdup (word_buffer) : normalized; */
- return NULL;
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *converter;
+	UChar *uchar_word;
+	gint32 uchar_capacity;
+	gsize uchar_len;
+	gchar *processed;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	/* Compute length if not already as input */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &icu_error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+		return NULL;
+	}
+
+	/* Converting UTF-8 to UTF-16 yields at most one UChar per input
+	 * byte. Reserve one more UChar for the NUL terminator that
+	 * ucnv_toUChars() writes when there is room for it. The capacity
+	 * is counted in UChars, not in bytes. */
+	uchar_capacity = length + 1;
+	uchar_word = g_new (UChar, uchar_capacity);
+
+	/* Convert from UTF-8 to UChars */
+	uchar_len = ucnv_toUChars (converter,
+	                           uchar_word,
+	                           uchar_capacity,
+	                           word,
+	                           length,
+	                           &icu_error);
+	if (U_FAILURE (icu_error)) {
+		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		           u_errorName (icu_error));
+		g_free (uchar_word);
+		ucnv_close (converter);
+		return NULL;
+	}
+
+	ucnv_close (converter);
+
+	/* Process UChar based word */
+	processed = process_word_uchar (parser,
+	                                uchar_word,
+	                                uchar_len,
+	                                do_strip);
+	g_free (uchar_word);
+	return processed;
}
const gchar *
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]