[tracker/parser-libunistring-review] Added first non-tested implementation of the libicu based word breaking and processing

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker/parser-libunistring-review] Added first non-tested implementation of the libicu based word breaking and processing
Date: Tue, 4 May 2010 16:31:45 +0000 (UTC)
commit 06e11f3211c2a5a9c581cae82ff5176004b66a69
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 4 09:42:00 2010 +0200

    Added first non-tested implementation of the libicu based word breaking and processing

 src/libtracker-fts/tracker-parser-libicu.c |  539 +++++++++++++++++-----------
 1 files changed, 334 insertions(+), 205 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index e29e0ea..9089dca 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -24,13 +24,18 @@
 #include <string.h>
 #include <locale.h>
 
-#include <ubrk.h>
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/ubrk.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+#include <unicode/unorm.h>
 
 #include "tracker-parser.h"
 #include "tracker-parser-utils.h"
 
 /* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
+#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
 
 /* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
 #define IS_CJK_UCS4(c)   (((c) >= 0x3400 && (c) <= 0x4DB5)  || \
@@ -41,6 +46,12 @@
 #define WORD_BUFFER_LENGTH 512
 
 
+static gchar *process_word_uchar (TrackerParser *parser,
+                                  const UChar   *word,
+                                  gint           length,
+                                  gboolean       do_strip);
+
+
 struct TrackerParser {
 	const gchar           *txt;
 	gint                   txt_size;
@@ -61,6 +72,8 @@ struct TrackerParser {
 	/* Text as UChars */
 	UChar                 *utxt;
 	gint                   utxt_size;
+	/* Original offset of each UChar in the input txt string */
+	gint32                *offsets;
 
 	/* The word-break iterator */
 	UBreakIterator        *bi;
@@ -69,32 +82,83 @@ struct TrackerParser {
 	gsize                  cursor;
 };
 
-/* Detect if a UTF-8 word is pure ASCII-7, so that there is no need to apply
- *  UNAC stripping.
- * Just check byte per byte, and if any of the bytes is >127, then it's not
- *  ASCII-7 */
+
 static gboolean
-is_ascii_word (const gchar *word,
-               gsize        length)
+get_word_info (const UChar *word,
+               gsize        word_length,
+               gboolean    *p_is_allowed_word_start,
+               gboolean    *p_is_ascii_or_cjk)
 {
-	gsize i;
+	UCharIterator iter;
+	UChar32 unichar;
+	guint8 unichar_gc;
+
+	*p_is_allowed_word_start = FALSE;
+	*p_is_ascii_or_cjk = FALSE;
+
+	/* Get first character of the word as UCS4 */
+	uiter_setString (&iter, word, word_length);
+	unichar = uiter_current32 (&iter);
+	if (unichar == U_SENTINEL) {
+		return FALSE;
+	}
+
+	/* We only want the words where the first character
+	 *  in the word is either a letter, a number or a symbol.
+	 * This is needed because the word break algorithm also
+	 *  considers word breaks after for example commas or other
+	 *  punctuation marks.
+	 * Note that looking at the first character in the string
+	 *  should be compatible with all Unicode normalization
+	 *  methods.
+	 */
+	unichar_gc = u_charType (unichar);
+	if (unichar_gc != U_UPPERCASE_LETTER &&
+	    unichar_gc != U_LOWERCASE_LETTER &&
+	    unichar_gc != U_TITLECASE_LETTER &&
+	    unichar_gc != U_MODIFIER_LETTER &&
+	    unichar_gc != U_OTHER_LETTER &&
+	    unichar_gc != U_DECIMAL_DIGIT_NUMBER &&
+	    unichar_gc != U_LETTER_NUMBER &&
+	    unichar_gc != U_OTHER_NUMBER &&
+	    unichar_gc != U_MATH_SYMBOL &&
+	    unichar_gc != U_CURRENCY_SYMBOL &&
+	    unichar_gc != U_MODIFIER_SYMBOL &&
+	    unichar_gc != U_OTHER_SYMBOL) {
+		*p_is_allowed_word_start = FALSE;
+		return TRUE;
+	}
+
+	/* Word starts with a CJK character? */
+	if (IS_CJK_UCS4 ((guint32)unichar)) {
+		*p_is_ascii_or_cjk = TRUE;
+		return TRUE;
+	}
 
-	for (i = 0; i < length; i++) {
-		if (!IS_ASCII_BYTE ((guchar)word[i])) {
-			return FALSE;
+	/* Is ASCII-only string? */
+	while (unichar != U_SENTINEL)
+	{
+		if (!IS_ASCII_UCS4 ((guint32)unichar)) {
+			*p_is_ascii_or_cjk = TRUE;
+			return TRUE;
 		}
+		unichar = uiter_next32 (&iter);
 	}
+
 	return TRUE;
 }
 
+
 /* libunistring-based parser */
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
              gint          *byte_offset_end)
 {
-	gsize word_length = 0;
+	gsize word_length_uchar = 0;
+	gsize word_length_utf8 = 0;
 	gchar *processed_word = NULL;
+	gsize current_word_offset_utf8;
 
 	*byte_offset_start = 0;
 	*byte_offset_end = 0;
@@ -103,81 +167,72 @@ parser_next (TrackerParser *parser,
 
 	/* Loop to look for next valid word */
 	while (!processed_word &&
-	       parser->cursor < parser->txt_size) {
-		ucs4_t first_unichar;
-		gint first_unichar_len;
-		gsize i;
+	       parser->cursor < parser->utxt_size) {
+		gboolean is_ascii_or_cjk;
+		gboolean is_allowed;
+		gsize next_word_offset_uchar;
+		gsize next_word_offset_utf8;
 		gsize truncated_length;
-		gboolean do_strip;
-
-		/* Get first character of the word as UCS4 */
-		first_unichar_len = u8_strmbtouc (&first_unichar,
-		                                  &(parser->txt[parser->cursor]));
-		if (first_unichar_len <= 0) {
-			/* This should only happen if NIL was passed to u8_strmbtouc,
-			 *  so better just force stop here */
-			parser->cursor = parser->txt_size;
-			break;
-		}
 
-		/* Find next word break */
-		i = parser->cursor + first_unichar_len;
-		while (i < parser->txt_size &&
-		       !parser->word_break_flags [i]) {
-			i++;
+		/* Set current word offset in the original UTF-8 string */
+		current_word_offset_utf8 = parser->offsets[parser->cursor];
+
+		/* Find next word break. */
+		next_word_offset_uchar = ubrk_next (parser->bi);
+		if (next_word_offset_uchar == UBRK_DONE) {
+			/* Last word support... */
+			next_word_offset_uchar = parser->utxt_size;
+			next_word_offset_utf8 = parser->txt_size;
+		}
+		else {
+			next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
 		}
 
 		/* Word end is the first byte after the word, which is either the
 		 *  start of next word or the end of the string */
-		word_length = i - parser->cursor;
-
-		/* We only want the words where the first character
-		 *  in the word is either a letter, a number or a symbol.
-		 * This is needed because the word break algorithm also
-		 *  considers word breaks after for example commas or other
-		 *  punctuation marks.
-		 * Note that looking at the first character in the string
-		 *  should be compatible with all Unicode normalization
-		 *  methods.
-		 */
-		if (!uc_is_general_category (first_unichar,
-		                             parser->allowed_start)) {
-			/* Skip this word and keep on looping */
-			parser->cursor += word_length;
-			continue;
+		word_length_uchar = next_word_offset_uchar - parser->cursor;
+		word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
+
+		/* Get word info... */
+		if (!get_word_info (&parser->utxt[parser->cursor],
+		                    word_length_uchar,
+		                    &is_allowed,
+		                    &is_ascii_or_cjk)) {
+			/* Quit loop just in case */
+			parser->cursor = parser->utxt_size;
+			break;
 		}
 
-		/* check if word is reserved */
+		/* check if word is reserved (looking at ORIGINAL UTF-8 buffer
+		 *  here!) */
 		if (parser->parse_reserved_words &&
-		    word_length == 2 &&
-		    parser->txt[parser->cursor] == 'o' &&
-		    parser->txt[parser->cursor + 1] == 'r') {
+		    word_length_utf8 == 2 &&
+		    parser->txt[current_word_offset_utf8] == 'o' &&
+		    parser->txt[current_word_offset_utf8 + 1] == 'r') {
 			/* Skip this word and keep on looping */
-			parser->cursor += word_length;
+			parser->cursor = next_word_offset_uchar;
 			continue;
 		}
 
-		/* compute truncated word length if needed (to avoid extremely
-		 *  long words)*/
-		truncated_length = (word_length < WORD_BUFFER_LENGTH ?
-		                    word_length :
-		                    WORD_BUFFER_LENGTH - 1);
-
-		/* Enable UNAC stripping only if no ASCII and no CJK */
-		do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
-		                            truncated_length) &&
-		            !IS_CJK_UCS4 (first_unichar));
+		/* compute truncated word length (counted in UChars) if needed (to
+		 * avoid extremely long words) */
+		truncated_length = (word_length_uchar < 2 * WORD_BUFFER_LENGTH ?
+		                    word_length_uchar :
+		                    2 * WORD_BUFFER_LENGTH);
 
 		/* Process the word here. If it fails, we can still go
-		 *  to the next one. Returns newly allocated string
-		 *  always */
-		processed_word = tracker_parser_process_word (parser,
-		                                              &(parser->txt[parser->cursor]),
-		                                              truncated_length,
-		                                              do_strip);
+		 *  to the next one. Returns newly allocated UTF-8
+		 *  string always.
+		 * Enable UNAC stripping only if no ASCII and no CJK
+		 * Note we are passing UChar encoded string here!
+		 */
+		processed_word = process_word_uchar (parser,
+		                                     &(parser->utxt[parser->cursor]),
+		                                     truncated_length,
+		                                     !is_ascii_or_cjk);
 		if (!processed_word) {
 			/* Skip this word and keep on looping */
-			parser->cursor += word_length;
+			parser->cursor = next_word_offset_uchar;
 			continue;
 		}
 	}
@@ -185,11 +240,11 @@ parser_next (TrackerParser *parser,
 	/* If we got a word here, set output */
 	if (processed_word) {
 		/* Set outputs */
-		*byte_offset_start = parser->cursor;
-		*byte_offset_end = parser->cursor + word_length;
+		*byte_offset_start = current_word_offset_utf8;
+		*byte_offset_end = current_word_offset_utf8 + word_length_utf8;
 
 		/* Update cursor */
-		parser->cursor += word_length;
+		parser->cursor += word_length_uchar;
 
 		parser->word_length = strlen (processed_word);
 		parser->word = processed_word;
@@ -218,6 +273,7 @@ tracker_parser_new (TrackerLanguage *language,
 	parser->word_length = 0;
 
 	parser->utxt = NULL;
+	parser->offsets = NULL;
 	parser->utxt_size = 0;
 	parser->bi = NULL;
 	parser->cursor = 0;
@@ -239,6 +295,7 @@ tracker_parser_free (TrackerParser *parser)
 	}
 
 	g_free (parser->utxt);
+	g_free (parser->offsets);
 
 	g_free (parser->word);
 
@@ -254,8 +311,10 @@ tracker_parser_reset (TrackerParser *parser,
                       gboolean       enable_stop_words,
                       gboolean       parse_reserved_words)
 {
-	UErrorCode error;
+	UErrorCode error = U_ZERO_ERROR;
 	UConverter *converter;
+	UChar *last_uchar;
+	const gchar *last_utf8;
 
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
@@ -280,159 +339,229 @@ tracker_parser_reset (TrackerParser *parser,
 	if (!converter) {
 		g_warning ("Cannot open UTF-8 converter: '%s'",
 		           U_FAILURE (error) ? u_errorName (error) : "none");
-		return;
+               return;
 	}
 
-	/* Allocate UChars buffer */
-	parser->utxt_size = txt_size * 2 + 1;
+	/* Allocate UChars and offsets buffers */
+	parser->utxt_size = txt_size * sizeof (UChar) + 1;
 	parser->utxt = g_malloc (parser->utxt_size);
+	parser->offsets = g_malloc (parser->utxt_size);
+
+	/* last_uchar and last_utf8 will also be output parameters! */
+	last_uchar = parser->utxt;
+	last_utf8 = parser->txt;
+
+	/* Convert to UChars storing offsets */
+	ucnv_toUnicode (converter,
+	                &last_uchar,
+	                &parser->utxt[parser->utxt_size],
+	                &last_utf8,
+	                &parser->txt[parser->txt_size],
+	                parser->offsets,
+	                FALSE,
+	                &error);
+	if (U_SUCCESS (error)) {
+		/* Proper UChar array size is now given by 'last_uchar' */
+		parser->utxt_size = last_uchar - parser->utxt;
+
+		/* Open word-break iterator */
+		parser->bi = ubrk_open(UBRK_WORD,
+		                       setlocale (LC_ALL, NULL),
+		                       parser->utxt,
+		                       parser->utxt_size,
+		                       &error);
+		if (U_SUCCESS (error)) {
+			/* Find FIRST word in the UChar array */
+			parser->cursor = ubrk_first (parser->bi);
+		}
+	}
 
-	/* Convert to UChars */
-	parser->utxt_size = ucnv_toUChars (converter,
-	                                   parser->utxt,
-	                                   parser->utxt_size,
-	                                   parser->txt,
-	                                   parser->txt_size,
-	                                   &error);
+	/* If any error happened, reset buffers */
 	if (U_FAILURE (error)) {
-		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		g_warning ("Error initializing libicu support: '%s'",
 		           u_errorName (error));
-		/* Error converting to UChars... reset buffer */
+		/* Reset buffers */
 		g_free (parser->utxt);
+		g_free (parser->offsets);
 		parser->utxt = NULL;
+		parser->offsets = NULL;
 		parser->utxt_size = 0;
-		ucnv_close (converter);
-		return;
 	}
 
-	/* Open word-break iterator */
-	parser->bi = ubrk_open(UBRK_WORD,
-	                       setlocale (LC_ALL, NULL),
-	                       parser->utxt,
-	                       parser->utxt_size,
-	                       &error);
+	/* Close converter */
+	ucnv_close (converter);
+}
+
+static gchar *
+process_word_uchar (TrackerParser *parser,
+                    const UChar   *word,
+                    gint           length,
+                    gboolean       do_strip)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+	UChar normalized_buffer [WORD_BUFFER_LENGTH];
+	gchar *utf8_str = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	/* Casefold... */
+	new_word_length = u_strFoldCase (casefolded_buffer,
+	                                 WORD_BUFFER_LENGTH,
+	                                 word,
+	                                 length,
+	                                 U_FOLD_CASE_DEFAULT,
+	                                 &error);
 	if (U_FAILURE (error)) {
-		g_warning ("Cannot open word-breaker: '%s'",
+		g_warning ("Error casefolding: '%s'",
 		           u_errorName (error));
-		g_free (parser->utxt);
-		parser->utxt = NULL;
-		parser->utxt_size = 0;
+		return NULL;
+	}
+
+	if (new_word_length > WORD_BUFFER_LENGTH)
+		new_word_length = WORD_BUFFER_LENGTH;
+
+	/* NFC normalization... */
+	new_word_length = unorm_normalize (casefolded_buffer,
+	                                   new_word_length,
+	                                   UNORM_NFC,
+	                                   0,
+	                                   normalized_buffer,
+	                                   WORD_BUFFER_LENGTH,
+	                                   &error);
+	if (U_FAILURE (error)) {
+		g_warning ("Error normalizing: '%s'",
+		           u_errorName (error));
+		return NULL;
+	}
+
+	if (new_word_length > WORD_BUFFER_LENGTH)
+		new_word_length = WORD_BUFFER_LENGTH;
+
+	/* UNAC stripping needed? */
+	if (do_strip) {
+		gsize stripped_word_length;
+
+		/* Get unaccented string in UTF-8 */
+		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
+		                                               new_word_length,
+		                                               &stripped_word_length);
+		if (utf8_str) {
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* If stripping failed or not needed, convert to UTF-8 */
+	if (!utf8_str) {
+		UErrorCode icu_error = U_ZERO_ERROR;
+		UConverter *converter;
+		gsize utf8_len;
+
+		/* Open converter UChar to UTF-8 */
+		converter = ucnv_open ("UTF-8", &icu_error);
+		if (!converter) {
+			g_warning ("Cannot open UTF-8 converter: '%s'",
+			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+			return NULL;
+		}
+		/* Using same buffer size as for UTF-16 is assumed to work (NOTE(review): UTF-8 may need up to 3 bytes per UChar - confirm) */
+		utf8_str = g_malloc (new_word_length + 1);
+
+		/* Convert from UChar to UTF-8 */
+		utf8_len = ucnv_fromUChars (converter,
+		                            utf8_str,
+		                            new_word_length,
+		                            normalized_buffer,
+		                            new_word_length,
+		                            &icu_error);
+		if (U_FAILURE (icu_error)) {
+			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+			           u_errorName (icu_error));
+			g_free (utf8_str);
+			ucnv_close (converter);
+			return NULL;
+		}
+
+		utf8_str[utf8_len] = '\0';
+		new_word_length = utf8_len;
 		ucnv_close (converter);
-		return;
 	}
 
-	/* Find FIRST word in the UChar array */
-	parser->cursor = ubrk_first (parser->bi);
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		/* Input for stemmer ALWAYS in UTF-8, as well as output */
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      utf8_str,
+		                                      new_word_length);
+
+		/* Log after stemming */
+		tracker_parser_message_hex ("   After stemming",
+		                            stemmed, strlen (stemmed));
+	}
+
+	/* If stemming was wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (utf8_str);
+		return stemmed;
+	}
+
+	return utf8_str;
 }
 
-/* libunistring version of the word processor. */
+
+/* Both Input and Output are always UTF-8 */
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
-                             const gchar    *word,
+                             const gchar   *word,
                              gint           length,
                              gboolean       do_strip)
 {
-	/* gchar word_buffer [WORD_BUFFER_LENGTH]; */
-	/* gchar *normalized = NULL; */
-	/* gchar *stripped = NULL; */
-	/* gchar *stemmed = NULL; */
-	/* size_t new_word_length; */
-
-	/* g_return_val_if_fail (parser != NULL, NULL); */
-	/* g_return_val_if_fail (word != NULL, NULL); */
-
-
-	/* /\* If length is set as -1, the input word MUST be NIL-terminated. */
-	/*  * Otherwise, this restriction is not needed as the length to process */
-	/*  *  is given as input argument *\/ */
-	/* if (length < 0) { */
-	/* 	length = strlen (word); */
-	/* } */
-
-	/* /\* Log original word *\/ */
-	/* tracker_parser_message_hex ("ORIGINAL word", */
-	/*                             word, length); */
-
-	/* /\* Leave space for last NIL *\/ */
-	/* new_word_length = WORD_BUFFER_LENGTH - 1; */
-
-	/* /\* Casefold and NFC normalization in output. */
-	/*  *  NOTE: if the output buffer is not big enough, u8_casefold will */
-	/*  *  return a newly-allocated buffer. *\/ */
-	/* normalized = u8_casefold ((const uint8_t *)word, */
-	/*                           length, */
-	/*                           uc_locale_language (), */
-	/*                           UNINORM_NFC, */
-	/*                           word_buffer, */
-	/*                           &new_word_length); */
-
-	/* /\* Case folding + Normalization failed, skip this word *\/ */
-	/* g_return_val_if_fail (normalized != NULL, NULL); */
-
-	/* /\* If output buffer is not the same as the one passed to */
-	/*  *  u8_casefold, we know it was newly-allocated, so need */
-	/*  *  to resize it in 1 byte to add last NIL *\/ */
-	/* if (normalized != word_buffer) { */
-	/* 	normalized = g_realloc (normalized, new_word_length + 1); */
-	/* } */
-
-	/* /\* Set output NIL *\/ */
-	/* normalized[new_word_length] = '\0'; */
-
-	/* /\* Log after Normalization *\/ */
-	/* tracker_parser_message_hex (" After Casefolding and NFC normalization", */
-	/*                             normalized, new_word_length); */
-
-	/* /\* UNAC stripping needed? *\/ */
-	/* if (do_strip) { */
-	/* 	gsize stripped_word_length; */
-
-	/* 	stripped = tracker_parser_unaccent_string (normalized, */
-	/* 	                                           new_word_length, */
-	/* 	                                           &stripped_word_length); */
-
-	/* 	if (stripped) { */
-	/* 		/\* Log after UNAC stripping *\/ */
-	/* 		tracker_parser_message_hex ("  After UNAC stripping", */
-	/* 		                            stripped, stripped_word_length); */
-	/* 		new_word_length = stripped_word_length; */
-	/* 	} */
-	/* } */
-
-
-	/* /\* Stemming needed? *\/ */
-	/* if (parser->enable_stemmer) { */
-	/* 	stemmed = tracker_language_stem_word (parser->language, */
-	/* 	                                      stripped ? stripped : normalized, */
-	/* 	                                      new_word_length); */
-
-	/* 	/\* Log after stemming *\/ */
-	/* 	tracker_parser_message_hex ("   After stemming", */
-	/* 	                            stemmed, strlen (stemmed)); */
-	/* } */
-
-	/* /\* If stemmed wanted and succeeded, free previous and return it *\/ */
-	/* if (stemmed) { */
-	/* 	g_free (stripped); */
-	/* 	if (normalized != word_buffer) { */
-	/* 		g_free (normalized); */
-	/* 	} */
-	/* 	return stemmed; */
-	/* } */
-
-	/* /\* If stripped wanted and succeeded, free previous and return it *\/ */
-	/* if (stripped) { */
-	/* 	if (normalized != word_buffer) { */
-	/* 		g_free (normalized); */
-	/* 	} */
-	/* 	return stripped; */
-	/* } */
-
-	/* /\* It may be the case that no stripping and no stemming was needed, and */
-	/*  * that the output buffer in stack was enough for case-folding and */
-	/*  * normalization. In this case, need to strdup() the string to return it *\/ */
-	/* return normalized == word_buffer ? g_strdup (word_buffer) : normalized; */
-	return NULL;
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *converter;
+	UChar *uchar_word;
+	gsize uchar_len;
+	gchar *processed;
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &icu_error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+		return NULL;
+	}
+
+	/* Compute length if not already given as input */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* Twice the size of the UTF-8 string for UChars */
+	uchar_word = g_malloc (2 * length);
+
+	/* Convert from UTF-8 to UChars */
+	uchar_len = ucnv_toUChars (converter,
+	                           uchar_word,
+	                           2 * length,
+	                           word,
+	                           length,
+	                           &icu_error);
+	if (U_FAILURE (icu_error)) {
+		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		           u_errorName (icu_error));
+		g_free (uchar_word);
+		ucnv_close (converter);
+		return NULL;
+	}
+
+	ucnv_close (converter);
+
+	/* Process UChar based word */
+	processed = process_word_uchar (parser,
+	                                uchar_word,
+	                                uchar_len,
+	                                do_strip);
+	g_free (uchar_word);
+	return processed;
 }
 
 const gchar *
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]