[tracker/parser-libunistring-review] WIP with libicu support



commit c47d2bc10f1130dad7e516a806a17fb261756606
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Mon May 3 13:24:35 2010 +0200

    WIP with libicu support

 src/libtracker-fts/Makefile.am             |   24 +-
 src/libtracker-fts/tracker-parser-libicu.c |  474 ++++++++++++++++++++++++++++
 2 files changed, 492 insertions(+), 6 deletions(-)
---
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 667cece..62c6d7a 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -12,9 +12,13 @@ INCLUDES =								\
 	$(SQLITE3_CFLAGS)
 
 if HAVE_LIBUNISTRING
-INCLUDES += $(LIBUNISTRING_CFLAGS)
+  INCLUDES += $(LIBUNISTRING_CFLAGS)
 else
-INCLUDES += $(PANGO_CFLAGS)
+if HAVE_LIBICU
+  INCLUDES += $(LIBICU_CFLAGS)
+else
+  INCLUDES += $(PANGO_CFLAGS)
+endif
 endif
 
 noinst_LTLIBRARIES = libtracker-fts.la
@@ -31,9 +35,13 @@ libtracker_fts_la_SOURCES = 						\
 	tracker-parser.h
 
 if HAVE_LIBUNISTRING
-libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
+  libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
+else
+if HAVE_LIBICU
+  libtracker_fts_la_SOURCES += tracker-parser-libicu.c
 else
-libtracker_fts_la_SOURCES += tracker-parser-glib.c
+  libtracker_fts_la_SOURCES += tracker-parser-glib.c
+endif
 endif
 
 libtracker_fts_la_LIBADD =						\
@@ -46,7 +54,11 @@ libtracker_fts_la_LIBADD =						\
 	$(GLIB2_LIBS)
 
 if HAVE_LIBUNISTRING
-libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+  libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+else
+if HAVE_LIBICU
+  libtracker_fts_la_LIBADD += $(LIBICU_LIBS)
 else
-libtracker_fts_la_LIBADD += $(PANGO_LIBS)
+  libtracker_fts_la_LIBADD += $(PANGO_LIBS)
+endif
 endif
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
new file mode 100644
index 0000000..e29e0ea
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+
+#include <ubrk.h>
+
+#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
+
+/* ASCII-7 is in range [0x00,0x7F].
+ * Only the upper bound is tested, so the argument has to be an unsigned
+ *  value: a negative (signed) char would also compare <= 0x7F. */
+#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
+
+/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6].
+ * The argument is expanded several times, so it must not have side
+ *  effects. */
+#define IS_CJK_UCS4(c)   (((c) >= 0x3400 && (c) <= 0x4DB5)  || \
+                          ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
+                          ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+/* Max possible length of a UTF-8 encoded string (just a safety limit).
+ * Words longer than this are truncated to WORD_BUFFER_LENGTH - 1 bytes
+ *  before being processed. */
+#define WORD_BUFFER_LENGTH 512
+
+
+/* State of one parser instance: the text being parsed, the options
+ *  given in the last reset, and the iteration state. */
+struct TrackerParser {
+	/* Input text as given to tracker_parser_reset(); the pointer is
+	 *  stored as-is, the text is not copied */
+	const gchar           *txt;
+	/* Number of gchar items in txt */
+	gint                   txt_size;
+
+	TrackerLanguage       *language;
+	gboolean               enable_stemmer;
+	gboolean               enable_stop_words;
+	guint                  max_words_to_index;
+	guint                  max_word_length;
+	gboolean               delimit_words;
+	gboolean               parse_reserved_words;
+
+	/* Private members */
+	/* Last word returned by the parser; owned by the parser and freed
+	 *  on the next iteration, on reset and on free */
+	gchar                 *word;
+	gint                   word_length;
+	guint                  word_position;
+
+	/* Text as UChars */
+	UChar                 *utxt;
+	/* Length of utxt, as reported by the UTF-8 to UChar conversion */
+	gint                   utxt_size;
+
+	/* The word-break iterator */
+	UBreakIterator        *bi;
+
+	/* Cursor, as index of the utxt array.
+	 * NOTE(review): tracker_parser_reset() sets it from ubrk_first(),
+	 *  while parser_next() still uses it as a byte index into txt;
+	 *  the unit needs to be settled when the ICU port is finished. */
+	gsize                  cursor;
+};
+
+/* Tell whether the first @length bytes of @word are all ASCII-7.
+ * Such words do not need UNAC stripping, so callers use this to skip it.
+ * The scan stops at the first byte above 127; an empty range counts as
+ *  ASCII. */
+static gboolean
+is_ascii_word (const gchar *word,
+               gsize        length)
+{
+	const guchar *byte = (const guchar *) word;
+	gsize remaining;
+
+	for (remaining = length; remaining > 0; remaining--, byte++) {
+		if (IS_ASCII_BYTE (*byte)) {
+			continue;
+		}
+		/* Found a byte outside [0x00,0x7F] */
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* Core word iterator: look for the next indexable word starting at
+ *  parser->cursor.
+ *
+ * On success the processed word is stored in parser->word (with its
+ *  length in parser->word_length), the byte range of the original word
+ *  is written to @byte_offset_start / @byte_offset_end, the cursor is
+ *  moved past the word and TRUE is returned.
+ * When no further word is available FALSE is returned and both offsets
+ *  are left at 0.
+ *
+ * NOTE(review): despite living in the libicu backend, this body is still
+ *  the libunistring implementation. It relies on ucs4_t, u8_strmbtouc(),
+ *  uc_is_general_category(), parser->word_break_flags and
+ *  parser->allowed_start, none of which are declared in this file, and it
+ *  walks the UTF-8 input (txt) instead of the UChar buffer and the
+ *  UBreakIterator prepared by tracker_parser_reset(). It has to be ported
+ *  before this file can build.
+ */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	gsize word_length = 0;
+	gchar *processed_word = NULL;
+
+	/* Outputs are cleared up-front, even before validating the parser */
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	/* Loop to look for next valid word */
+	while (!processed_word &&
+	       parser->cursor < parser->txt_size) {
+		ucs4_t first_unichar;
+		gint first_unichar_len;
+		gsize i;
+		gsize truncated_length;
+		gboolean do_strip;
+
+		/* Get first character of the word as UCS4 */
+		first_unichar_len = u8_strmbtouc (&first_unichar,
+		                                  &(parser->txt[parser->cursor]));
+		if (first_unichar_len <= 0) {
+			/* This should only happen if NIL was passed to u8_strmbtouc,
+			 *  so better just force stop here */
+			parser->cursor = parser->txt_size;
+			break;
+		}
+
+		/* Find next word break */
+		i = parser->cursor + first_unichar_len;
+		while (i < parser->txt_size &&
+		       !parser->word_break_flags [i]) {
+			i++;
+		}
+
+		/* Word end is the first byte after the word, which is either the
+		 *  start of next word or the end of the string */
+		word_length = i - parser->cursor;
+
+		/* We only want the words where the first character
+		 *  in the word is either a letter, a number or a symbol.
+		 * This is needed because the word break algorithm also
+		 *  considers word breaks after for example commas or other
+		 *  punctuation marks.
+		 * Note that looking at the first character in the string
+		 *  should be compatible with all Unicode normalization
+		 *  methods.
+		 */
+		if (!uc_is_general_category (first_unichar,
+		                             parser->allowed_start)) {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+			continue;
+		}
+
+		/* check if word is reserved: only a lowercase "or" is treated
+		 *  as such here */
+		if (parser->parse_reserved_words &&
+		    word_length == 2 &&
+		    parser->txt[parser->cursor] == 'o' &&
+		    parser->txt[parser->cursor + 1] == 'r') {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+			continue;
+		}
+
+		/* compute truncated word length if needed (to avoid extremely
+		 *  long words)*/
+		truncated_length = (word_length < WORD_BUFFER_LENGTH ?
+		                    word_length :
+		                    WORD_BUFFER_LENGTH - 1);
+
+		/* Enable UNAC stripping only if no ASCII and no CJK */
+		do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
+		                            truncated_length) &&
+		            !IS_CJK_UCS4 (first_unichar));
+
+		/* Process the word here. If it fails, we can still go
+		 *  to the next one. Returns newly allocated string
+		 *  always */
+		processed_word = tracker_parser_process_word (parser,
+		                                              &(parser->txt[parser->cursor]),
+		                                              truncated_length,
+		                                              do_strip);
+		if (!processed_word) {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+			continue;
+		}
+	}
+
+	/* If we got a word here, set output */
+	if (processed_word) {
+		/* Set outputs: the offsets cover the whole original word, even
+		 *  if only a truncated part of it was processed */
+		*byte_offset_start = parser->cursor;
+		*byte_offset_end = parser->cursor + word_length;
+
+		/* Update cursor */
+		parser->cursor += word_length;
+
+		/* The parser takes ownership of the processed word */
+		parser->word_length = strlen (processed_word);
+		parser->word = processed_word;
+
+		return TRUE;
+	}
+
+	/* No more words... */
+	return FALSE;
+}
+
+/* Create a parser bound to @language, which gets an extra reference
+ *  that is dropped again in tracker_parser_free().
+ * @max_word_length must be greater than zero.
+ * Returns a newly allocated parser with no text attached yet, or NULL if
+ *  any of the arguments is not valid. */
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+                    gint             max_word_length)
+{
+	TrackerParser *self;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
+
+	self = g_new0 (TrackerParser, 1);
+
+	/* Nothing to iterate until tracker_parser_reset() is called */
+	self->cursor = 0;
+	self->bi = NULL;
+	self->utxt_size = 0;
+	self->utxt = NULL;
+
+	/* Word limits and language */
+	self->word_length = 0;
+	self->max_word_length = max_word_length;
+	self->language = g_object_ref (language);
+
+	return self;
+}
+
+/* Release @parser and everything it owns: the last returned word, the
+ *  word-break iterator, the UChar copy of the text and the reference to
+ *  the language object. */
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	g_return_if_fail (parser != NULL);
+
+	g_free (parser->word);
+
+	/* The iterator is closed before the buffer it was opened on goes
+	 *  away */
+	if (parser->bi != NULL) {
+		ubrk_close (parser->bi);
+	}
+	g_free (parser->utxt);
+
+	if (parser->language != NULL) {
+		g_object_unref (parser->language);
+	}
+
+	g_free (parser);
+}
+
+/* Attach a new text to @parser and restart the iteration.
+ *
+ * @txt is the UTF-8 input; the pointer is stored, not copied, so it has
+ *  to stay valid while the parser is iterating over it.
+ * @txt_size is the length of @txt in bytes; a negative value is taken to
+ *  mean that @txt is NUL-terminated.
+ * The remaining arguments are stored as the options used for the words
+ *  of this text.
+ *
+ * The text is converted to UChars and a word-break iterator is opened on
+ *  the result. Any buffer or iterator left from a previous reset is
+ *  released first. If the conversion or the iterator cannot be set up, a
+ *  warning is logged and the parser is left without UChar buffer and
+ *  without iterator.
+ */
+void
+tracker_parser_reset (TrackerParser *parser,
+                      const gchar   *txt,
+                      gint           txt_size,
+                      gboolean       delimit_words,
+                      gboolean       enable_stemmer,
+                      gboolean       enable_stop_words,
+                      gboolean       parse_reserved_words)
+{
+	/* ICU functions do nothing when the status they receive already
+	 *  holds a failure code, so it must start as U_ZERO_ERROR */
+	UErrorCode error = U_ZERO_ERROR;
+	UConverter *converter;
+	gint utxt_capacity;
+
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+
+	/* Negative length: measure the NUL-terminated input, so that the
+	 *  buffer size computed below is meaningful */
+	if (txt_size < 0) {
+		txt_size = (gint) strlen (txt);
+	}
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->parse_reserved_words = parse_reserved_words;
+
+	g_free (parser->word);
+	parser->word = NULL;
+
+	parser->word_position = 0;
+
+	parser->cursor = 0;
+
+	/* Release the state of a previous reset. The iterator goes first,
+	 *  as it was opened on the UChar buffer. */
+	if (parser->bi) {
+		ubrk_close (parser->bi);
+		parser->bi = NULL;
+	}
+	g_free (parser->utxt);
+	parser->utxt = NULL;
+	parser->utxt_size = 0;
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &error);
+	if (U_FAILURE (error) || !converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (error) ? u_errorName (error) : "none");
+		if (converter) {
+			ucnv_close (converter);
+		}
+		return;
+	}
+
+	/* Allocate UChars buffer. The capacity is counted in UChars (not in
+	 *  bytes), which is the unit ucnv_toUChars() expects. */
+	utxt_capacity = txt_size * 2 + 1;
+	parser->utxt = g_new (UChar, utxt_capacity);
+
+	/* Convert to UChars */
+	parser->utxt_size = ucnv_toUChars (converter,
+	                                   parser->utxt,
+	                                   utxt_capacity,
+	                                   parser->txt,
+	                                   parser->txt_size,
+	                                   &error);
+
+	/* The converter is only needed for the call above */
+	ucnv_close (converter);
+
+	if (U_FAILURE (error)) {
+		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		           u_errorName (error));
+		/* Error converting to UChars... reset buffer */
+		g_free (parser->utxt);
+		parser->utxt = NULL;
+		parser->utxt_size = 0;
+		return;
+	}
+
+	/* Open word-break iterator.
+	 * NOTE(review): setlocale() returns the C library locale string,
+	 *  which may not be a valid ICU locale ID - confirm this is the
+	 *  intended way to select the break rules. */
+	parser->bi = ubrk_open (UBRK_WORD,
+	                        setlocale (LC_ALL, NULL),
+	                        parser->utxt,
+	                        parser->utxt_size,
+	                        &error);
+	if (U_FAILURE (error) || !parser->bi) {
+		g_warning ("Cannot open word-breaker: '%s'",
+		           u_errorName (error));
+		if (parser->bi) {
+			ubrk_close (parser->bi);
+			parser->bi = NULL;
+		}
+		g_free (parser->utxt);
+		parser->utxt = NULL;
+		parser->utxt_size = 0;
+		return;
+	}
+
+	/* Find FIRST word in the UChar array */
+	parser->cursor = ubrk_first (parser->bi);
+}
+
+/* Word processor of the libicu backend.
+ *
+ * This is currently a stub: whatever the input, it returns NULL. As
+ *  parser_next() skips every word for which no processed word is
+ *  returned, no words are produced by this backend yet.
+ *
+ * The body kept below, commented out, is the libunistring implementation
+ *  (casefold + NFC normalization, optional UNAC stripping, optional
+ *  stemming), left as reference for the port to ICU. In that reference a
+ *  negative @length means that @word is NIL-terminated, and the result is
+ *  a newly allocated string.
+ */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+                             const gchar    *word,
+                             gint           length,
+                             gboolean       do_strip)
+{
+	/* gchar word_buffer [WORD_BUFFER_LENGTH]; */
+	/* gchar *normalized = NULL; */
+	/* gchar *stripped = NULL; */
+	/* gchar *stemmed = NULL; */
+	/* size_t new_word_length; */
+
+	/* g_return_val_if_fail (parser != NULL, NULL); */
+	/* g_return_val_if_fail (word != NULL, NULL); */
+
+
+	/* /\* If length is set as -1, the input word MUST be NIL-terminated. */
+	/*  * Otherwise, this restriction is not needed as the length to process */
+	/*  *  is given as input argument *\/ */
+	/* if (length < 0) { */
+	/* 	length = strlen (word); */
+	/* } */
+
+	/* /\* Log original word *\/ */
+	/* tracker_parser_message_hex ("ORIGINAL word", */
+	/*                             word, length); */
+
+	/* /\* Leave space for last NIL *\/ */
+	/* new_word_length = WORD_BUFFER_LENGTH - 1; */
+
+	/* /\* Casefold and NFC normalization in output. */
+	/*  *  NOTE: if the output buffer is not big enough, u8_casefold will */
+	/*  *  return a newly-allocated buffer. *\/ */
+	/* normalized = u8_casefold ((const uint8_t *)word, */
+	/*                           length, */
+	/*                           uc_locale_language (), */
+	/*                           UNINORM_NFC, */
+	/*                           word_buffer, */
+	/*                           &new_word_length); */
+
+	/* /\* Case folding + Normalization failed, skip this word *\/ */
+	/* g_return_val_if_fail (normalized != NULL, NULL); */
+
+	/* /\* If output buffer is not the same as the one passed to */
+	/*  *  u8_casefold, we know it was newly-allocated, so need */
+	/*  *  to resize it in 1 byte to add last NIL *\/ */
+	/* if (normalized != word_buffer) { */
+	/* 	normalized = g_realloc (normalized, new_word_length + 1); */
+	/* } */
+
+	/* /\* Set output NIL *\/ */
+	/* normalized[new_word_length] = '\0'; */
+
+	/* /\* Log after Normalization *\/ */
+	/* tracker_parser_message_hex (" After Casefolding and NFC normalization", */
+	/*                             normalized, new_word_length); */
+
+	/* /\* UNAC stripping needed? *\/ */
+	/* if (do_strip) { */
+	/* 	gsize stripped_word_length; */
+
+	/* 	stripped = tracker_parser_unaccent_string (normalized, */
+	/* 	                                           new_word_length, */
+	/* 	                                           &stripped_word_length); */
+
+	/* 	if (stripped) { */
+	/* 		/\* Log after UNAC stripping *\/ */
+	/* 		tracker_parser_message_hex ("  After UNAC stripping", */
+	/* 		                            stripped, stripped_word_length); */
+	/* 		new_word_length = stripped_word_length; */
+	/* 	} */
+	/* } */
+
+
+	/* /\* Stemming needed? *\/ */
+	/* if (parser->enable_stemmer) { */
+	/* 	stemmed = tracker_language_stem_word (parser->language, */
+	/* 	                                      stripped ? stripped : normalized, */
+	/* 	                                      new_word_length); */
+
+	/* 	/\* Log after stemming *\/ */
+	/* 	tracker_parser_message_hex ("   After stemming", */
+	/* 	                            stemmed, strlen (stemmed)); */
+	/* } */
+
+	/* /\* If stemmed wanted and succeeded, free previous and return it *\/ */
+	/* if (stemmed) { */
+	/* 	g_free (stripped); */
+	/* 	if (normalized != word_buffer) { */
+	/* 		g_free (normalized); */
+	/* 	} */
+	/* 	return stemmed; */
+	/* } */
+
+	/* /\* If stripped wanted and succeeded, free previous and return it *\/ */
+	/* if (stripped) { */
+	/* 	if (normalized != word_buffer) { */
+	/* 		g_free (normalized); */
+	/* 	} */
+	/* 	return stripped; */
+	/* } */
+
+	/* /\* It may be the case that no stripping and no stemming was needed, and */
+	/*  * that the output buffer in stack was enough for case-folding and */
+	/*  * normalization. In this case, need to strdup() the string to return it *\/ */
+	/* return normalized == word_buffer ? g_strdup (word_buffer) : normalized; */
+
+	/* Nothing is ported to ICU yet: report "no processed word" */
+	return NULL;
+}
+
+/* Public iterator: advance to the next word of the current text.
+ *
+ * Returns the processed word, or NULL when there are no more words. The
+ *  string belongs to the parser and is freed by the next call to this
+ *  function, by a reset or when the parser is freed.
+ *
+ * Outputs:
+ *  @position: word counter of the parser after this call. The counter is
+ *   increased for every call that does not yield a stop word, which
+ *   includes calls where no word was found.
+ *  @byte_offset_start, @byte_offset_end: byte range of the word in the
+ *   original text; both 0 if no word was found.
+ *  @stop_word: TRUE only if a word was found, stop words are enabled and
+ *   the language reports the word as a stop word.
+ *  @word_length: the parser's current word_length value.
+ */
+const gchar *
+tracker_parser_next (TrackerParser *parser,
+                     gint          *position,
+                     gint          *byte_offset_start,
+                     gint          *byte_offset_end,
+                     gboolean      *stop_word,
+                     gint          *word_length)
+{
+	gint start_offset = 0;
+	gint end_offset = 0;
+	const gchar *found;
+	gboolean is_stop;
+
+	/* The previously returned word is no longer valid from here on */
+	g_free (parser->word);
+	parser->word = NULL;
+
+	found = parser_next (parser, &start_offset, &end_offset) ?
+	        parser->word :
+	        NULL;
+
+	is_stop = (found != NULL &&
+	           parser->enable_stop_words &&
+	           tracker_language_is_stop_word (parser->language, found));
+
+	/* Stop words do not take a position of their own */
+	if (!is_stop) {
+		parser->word_position++;
+	}
+
+	*stop_word = is_stop;
+	*word_length = parser->word_length;
+	*position = parser->word_position;
+	*byte_offset_start = start_offset;
+	*byte_offset_end = end_offset;
+
+	return found;
+}
+



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]