[tracker/parser-unicode-libs-review: 77/85] WIP with libicu support
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 77/85] WIP with libicu support
- Date: Tue, 4 May 2010 17:30:28 +0000 (UTC)
commit c99e666201ee642877ef080a8488e87dbe5401d5
Author: Aleksander Morgado <aleksander lanedo com>
Date: Mon May 3 13:24:35 2010 +0200
WIP with libicu support
src/libtracker-fts/Makefile.am | 24 +-
src/libtracker-fts/tracker-parser-libicu.c | 474 ++++++++++++++++++++++++++++
2 files changed, 492 insertions(+), 6 deletions(-)
---
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 667cece..62c6d7a 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -12,9 +12,13 @@ INCLUDES = \
$(SQLITE3_CFLAGS)
if HAVE_LIBUNISTRING
-INCLUDES += $(LIBUNISTRING_CFLAGS)
+ INCLUDES += $(LIBUNISTRING_CFLAGS)
else
-INCLUDES += $(PANGO_CFLAGS)
+if HAVE_LIBICU
+ INCLUDES += $(LIBICU_CFLAGS)
+else
+ INCLUDES += $(PANGO_CFLAGS)
+endif
endif
noinst_LTLIBRARIES = libtracker-fts.la
@@ -31,9 +35,13 @@ libtracker_fts_la_SOURCES = \
tracker-parser.h
if HAVE_LIBUNISTRING
-libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
+ libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
+else
+if HAVE_LIBICU
+ libtracker_fts_la_SOURCES += tracker-parser-libicu.c
else
-libtracker_fts_la_SOURCES += tracker-parser-glib.c
+ libtracker_fts_la_SOURCES += tracker-parser-glib.c
+endif
endif
libtracker_fts_la_LIBADD = \
@@ -46,7 +54,11 @@ libtracker_fts_la_LIBADD = \
$(GLIB2_LIBS)
if HAVE_LIBUNISTRING
-libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+ libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+else
+if HAVE_LIBICU
+ libtracker_fts_la_LIBADD += $(LIBICU_LIBS)
else
-libtracker_fts_la_LIBADD += $(PANGO_LIBS)
+ libtracker_fts_la_LIBADD += $(PANGO_LIBS)
+endif
endif
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
new file mode 100644
index 0000000..e29e0ea
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+
+#include <ubrk.h>
+#include <ucnv.h>
+
+#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
+
+/* ASCII-7 is in range [0x00,0x7F].
+ * The argument is compared as given, so callers pass an unsigned byte
+ * (is_ascii_word() casts to guchar before using this macro). */
+#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
+
+/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6].
+ * The argument is evaluated several times, so it must be free of side
+ * effects. */
+#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
+ ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
+ ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+/* Safety limit, in bytes, on the length of a single word: parser_next()
+ * hands at most WORD_BUFFER_LENGTH - 1 bytes of any word over to
+ * tracker_parser_process_word() */
+#define WORD_BUFFER_LENGTH 512
+
+
+struct TrackerParser {
+ const gchar *txt; /* Text to parse; pointer stored by tracker_parser_reset(), never freed by this file */
+ gint txt_size; /* Size of txt exactly as given to tracker_parser_reset() */
+
+ TrackerLanguage *language; /* Reference taken in tracker_parser_new(), dropped in tracker_parser_free() */
+ gboolean enable_stemmer; /* Copied from the tracker_parser_reset() argument */
+ gboolean enable_stop_words; /* When TRUE, tracker_parser_next() reports stop words */
+ guint max_words_to_index; /* Not written by any function visible in this file */
+ guint max_word_length; /* Set once by tracker_parser_new() */
+ gboolean delimit_words; /* Copied from the tracker_parser_reset() argument */
+ gboolean parse_reserved_words; /* When TRUE, parser_next() skips the two-byte word "or" */
+
+ /* Private members */
+ gchar *word; /* Current word; owned by the parser and freed on the next tracker_parser_next(), on reset and on free */
+ gint word_length; /* strlen() of word, stored by parser_next() */
+ guint word_position; /* Counter advanced by tracker_parser_next(), zeroed by tracker_parser_reset() */
+
+ /* Text as UChars: filled in by tracker_parser_reset(), released by
+ * tracker_parser_free() */
+ UChar *utxt;
+ gint utxt_size;
+
+ /* The word-break iterator, opened over utxt by tracker_parser_reset() */
+ UBreakIterator *bi;
+
+ /* Cursor, as index of the utxt array of bytes.
+ * NOTE(review): tracker_parser_reset() initialises it from ubrk_first(),
+ * while parser_next() uses it as a byte index into txt - confirm which
+ * unit is intended once the parser is ported. */
+ gsize cursor;
+};
+
+/* Tell whether a UTF-8 word is made only of ASCII-7 bytes, in which case
+ * UNAC stripping can be skipped.
+ * Exactly `length` bytes are examined; the first byte above 0x7F makes the
+ * word non-ASCII. A zero-length word counts as ASCII. */
+static gboolean
+is_ascii_word (const gchar *word,
+               gsize        length)
+{
+	const gchar *byte = word;
+	gsize remaining = length;
+
+	while (remaining > 0) {
+		if (!IS_ASCII_BYTE ((guchar) *byte)) {
+			return FALSE;
+		}
+		byte++;
+		remaining--;
+	}
+
+	return TRUE;
+}
+
+/* libunistring-based parser, carried over unchanged into this
+ * work-in-progress libicu file.
+ *
+ * Looks for the next indexable word starting at parser->cursor. On success
+ * it stores the newly allocated processed word in parser->word (and its
+ * strlen() in parser->word_length), writes the start and end offsets of the
+ * word to byte_offset_start / byte_offset_end, moves the cursor past the
+ * word and returns TRUE. When no further word is found it returns FALSE and
+ * both offsets are left at 0.
+ *
+ * NOTE(review): the body still uses ucs4_t, u8_strmbtouc() and
+ * uc_is_general_category(), which do not appear to come from the ICU header
+ * included by this file (presumably libunistring), and it reads
+ * parser->word_break_flags and parser->allowed_start, which are not members
+ * of struct TrackerParser as declared in this file. It still has to be
+ * ported to the UBreakIterator kept in parser->bi.
+ * NOTE(review): both output pointers are written before parser is
+ * validated, and they are not NULL-checked themselves. */
+static gboolean
+parser_next (TrackerParser *parser,
+ gint *byte_offset_start,
+ gint *byte_offset_end)
+{
+ gsize word_length = 0;
+ gchar *processed_word = NULL;
+
+ *byte_offset_start = 0;
+ *byte_offset_end = 0;
+
+ g_return_val_if_fail (parser, FALSE);
+
+ /* Loop to look for next valid word */
+ while (!processed_word &&
+ parser->cursor < parser->txt_size) {
+ ucs4_t first_unichar;
+ gint first_unichar_len;
+ gsize i;
+ gsize truncated_length;
+ gboolean do_strip;
+
+ /* Get first character of the word as UCS4 */
+ first_unichar_len = u8_strmbtouc (&first_unichar,
+ &(parser->txt[parser->cursor]));
+ if (first_unichar_len <= 0) {
+ /* This should only happen if NIL was passed to u8_strmbtouc,
+ * so better just force stop here */
+ parser->cursor = parser->txt_size;
+ break;
+ }
+
+ /* Find next word break */
+ i = parser->cursor + first_unichar_len;
+ while (i < parser->txt_size &&
+ !parser->word_break_flags [i]) {
+ i++;
+ }
+
+ /* Word end is the first byte after the word, which is either the
+ * start of next word or the end of the string */
+ word_length = i - parser->cursor;
+
+ /* We only want the words where the first character
+ * in the word is either a letter, a number or a symbol.
+ * This is needed because the word break algorithm also
+ * considers word breaks after for example commas or other
+ * punctuation marks.
+ * Note that looking at the first character in the string
+ * should be compatible with all Unicode normalization
+ * methods.
+ */
+ if (!uc_is_general_category (first_unichar,
+ parser->allowed_start)) {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ continue;
+ }
+
+ /* check if word is reserved: only the exact two-byte lowercase
+ * word "or" is treated as reserved here */
+ if (parser->parse_reserved_words &&
+ word_length == 2 &&
+ parser->txt[parser->cursor] == 'o' &&
+ parser->txt[parser->cursor + 1] == 'r') {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ continue;
+ }
+
+ /* compute truncated word length if needed (to avoid extremely
+ * long words)*/
+ truncated_length = (word_length < WORD_BUFFER_LENGTH ?
+ word_length :
+ WORD_BUFFER_LENGTH - 1);
+
+ /* Enable UNAC stripping only if no ASCII and no CJK */
+ do_strip = (!is_ascii_word (&(parser->txt[parser->cursor]),
+ truncated_length) &&
+ !IS_CJK_UCS4 (first_unichar));
+
+ /* Process the word here. If it fails, we can still go
+ * to the next one. Returns newly allocated string
+ * always */
+ processed_word = tracker_parser_process_word (parser,
+ &(parser->txt[parser->cursor]),
+ truncated_length,
+ do_strip);
+ if (!processed_word) {
+ /* Skip this word and keep on looping */
+ parser->cursor += word_length;
+ continue;
+ }
+ }
+
+ /* If we got a word here, set output */
+ if (processed_word) {
+ /* Set outputs */
+ *byte_offset_start = parser->cursor;
+ *byte_offset_end = parser->cursor + word_length;
+
+ /* Update cursor */
+ parser->cursor += word_length;
+
+ parser->word_length = strlen (processed_word);
+ parser->word = processed_word;
+
+ return TRUE;
+ }
+
+ /* No more words... */
+ return FALSE;
+}
+
+/* Allocate a parser bound to `language`.
+ *
+ * `language` must be a TrackerLanguage and `max_word_length` must be
+ * positive, otherwise NULL is returned. The parser takes its own reference
+ * on `language`. No text is attached yet: tracker_parser_reset() has to be
+ * called before iterating. Release the result with tracker_parser_free(). */
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+                    gint             max_word_length)
+{
+	TrackerParser *self;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
+
+	self = g_new0 (TrackerParser, 1);
+
+	/* ICU-side state: nothing converted, no iterator, cursor at start */
+	self->cursor = 0;
+	self->bi = NULL;
+	self->utxt_size = 0;
+	self->utxt = NULL;
+
+	/* Word limits */
+	self->word_length = 0;
+	self->max_word_length = max_word_length;
+
+	/* Keep the language alive for as long as the parser exists */
+	self->language = g_object_ref (language);
+
+	return self;
+}
+
+/* Destroy a parser created with tracker_parser_new().
+ *
+ * Drops the language reference, closes the word-break iterator if one is
+ * open, and frees the UChar buffer, the current word and the parser itself.
+ * A NULL `parser` is rejected with a warning. */
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	g_return_if_fail (parser != NULL);
+
+	/* g_free() copes with NULL on its own */
+	g_free (parser->word);
+
+	/* The iterator is closed before the buffer it was opened on goes away */
+	if (parser->bi != NULL) {
+		ubrk_close (parser->bi);
+	}
+	g_free (parser->utxt);
+
+	if (parser->language != NULL) {
+		g_object_unref (parser->language);
+	}
+
+	g_free (parser);
+}
+
+/* Attach a new text to `parser` and rewind it.
+ *
+ * parser:               parser to reset; must not be NULL.
+ * txt:                  UTF-8 text to parse; must not be NULL. Only the
+ *                       pointer is stored, so the text has to outlive the
+ *                       parsing.
+ * txt_size:             size of `txt` in bytes; must not be negative.
+ * delimit_words, enable_stemmer, enable_stop_words, parse_reserved_words:
+ *                       stored as-is in the parser.
+ *
+ * The text is converted to UChars and a word-break iterator is opened over
+ * the converted buffer. Whatever buffer and iterator a previous reset left
+ * behind are released first. If the converter cannot be opened, the
+ * conversion fails or the iterator cannot be opened, a warning is logged
+ * and the parser is left with no UChar buffer and no iterator. */
+void
+tracker_parser_reset (TrackerParser *parser,
+                      const gchar   *txt,
+                      gint           txt_size,
+                      gboolean       delimit_words,
+                      gboolean       enable_stemmer,
+                      gboolean       enable_stop_words,
+                      gboolean       parse_reserved_words)
+{
+	/* ICU functions return immediately when the status they receive
+	 * already holds a failure, so it has to start out as U_ZERO_ERROR */
+	UErrorCode error = U_ZERO_ERROR;
+	UConverter *converter;
+
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+	/* The size feeds an allocation below: refuse negative values and the
+	 * one value for which txt_size + 1 would overflow */
+	g_return_if_fail (txt_size >= 0 && txt_size < G_MAXINT);
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->parse_reserved_words = parse_reserved_words;
+
+	g_free (parser->word);
+	parser->word = NULL;
+
+	parser->word_position = 0;
+
+	parser->cursor = 0;
+
+	/* Release the state of a previous reset. The iterator goes first
+	 * because it was opened on the old buffer. */
+	if (parser->bi) {
+		ubrk_close (parser->bi);
+		parser->bi = NULL;
+	}
+	g_free (parser->utxt);
+	parser->utxt = NULL;
+	parser->utxt_size = 0;
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (error) ? u_errorName (error) : "none");
+		return;
+	}
+
+	/* Allocate UChars buffer. The capacity given to ucnv_toUChars() is
+	 * counted in UChars, so the allocation must be that many UChars, not
+	 * that many bytes. UTF-8 input never yields more UTF-16 code units
+	 * than it has bytes; one extra slot is kept for the terminating NUL.
+	 * Should the buffer still be too small, ICU reports an overflow error
+	 * that is handled below. */
+	parser->utxt_size = txt_size + 1;
+	parser->utxt = g_new (UChar, parser->utxt_size);
+
+	/* Convert to UChars */
+	parser->utxt_size = ucnv_toUChars (converter,
+	                                   parser->utxt,
+	                                   parser->utxt_size,
+	                                   parser->txt,
+	                                   parser->txt_size,
+	                                   &error);
+
+	/* The converter is only needed for this one conversion; closing it
+	 * here releases it on the success path as well as on failures */
+	ucnv_close (converter);
+
+	if (U_FAILURE (error)) {
+		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		           u_errorName (error));
+		/* Error converting to UChars... reset buffer */
+		g_free (parser->utxt);
+		parser->utxt = NULL;
+		parser->utxt_size = 0;
+		return;
+	}
+
+	/* Open word-break iterator */
+	parser->bi = ubrk_open (UBRK_WORD,
+	                        setlocale (LC_ALL, NULL),
+	                        parser->utxt,
+	                        parser->utxt_size,
+	                        &error);
+	if (U_FAILURE (error)) {
+		g_warning ("Cannot open word-breaker: '%s'",
+		           u_errorName (error));
+		/* Do not keep a half-initialised iterator around */
+		if (parser->bi) {
+			ubrk_close (parser->bi);
+			parser->bi = NULL;
+		}
+		g_free (parser->utxt);
+		parser->utxt = NULL;
+		parser->utxt_size = 0;
+		return;
+	}
+
+	/* Find FIRST word in the UChar array */
+	parser->cursor = ubrk_first (parser->bi);
+}
+
+/* libunistring version of the word processor, not yet ported to libicu.
+ *
+ * The whole implementation below is commented out, so in this
+ * work-in-progress state the function ignores all of its arguments and
+ * always returns NULL. parser_next() treats a NULL result as "skip this
+ * word", so no word is ever produced while this stub is in place. */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ gboolean do_strip)
+{
+ /* gchar word_buffer [WORD_BUFFER_LENGTH]; */
+ /* gchar *normalized = NULL; */
+ /* gchar *stripped = NULL; */
+ /* gchar *stemmed = NULL; */
+ /* size_t new_word_length; */
+
+ /* g_return_val_if_fail (parser != NULL, NULL); */
+ /* g_return_val_if_fail (word != NULL, NULL); */
+
+
+ /* /\* If length is set as -1, the input word MUST be NIL-terminated. */
+ /* * Otherwise, this restriction is not needed as the length to process */
+ /* * is given as input argument *\/ */
+ /* if (length < 0) { */
+ /* length = strlen (word); */
+ /* } */
+
+ /* /\* Log original word *\/ */
+ /* tracker_parser_message_hex ("ORIGINAL word", */
+ /* word, length); */
+
+ /* /\* Leave space for last NIL *\/ */
+ /* new_word_length = WORD_BUFFER_LENGTH - 1; */
+
+ /* /\* Casefold and NFC normalization in output. */
+ /* * NOTE: if the output buffer is not big enough, u8_casefold will */
+ /* * return a newly-allocated buffer. *\/ */
+ /* normalized = u8_casefold ((const uint8_t *)word, */
+ /* length, */
+ /* uc_locale_language (), */
+ /* UNINORM_NFC, */
+ /* word_buffer, */
+ /* &new_word_length); */
+
+ /* /\* Case folding + Normalization failed, skip this word *\/ */
+ /* g_return_val_if_fail (normalized != NULL, NULL); */
+
+ /* /\* If output buffer is not the same as the one passed to */
+ /* * u8_casefold, we know it was newly-allocated, so need */
+ /* * to resize it in 1 byte to add last NIL *\/ */
+ /* if (normalized != word_buffer) { */
+ /* normalized = g_realloc (normalized, new_word_length + 1); */
+ /* } */
+
+ /* /\* Set output NIL *\/ */
+ /* normalized[new_word_length] = '\0'; */
+
+ /* /\* Log after Normalization *\/ */
+ /* tracker_parser_message_hex (" After Casefolding and NFC normalization", */
+ /* normalized, new_word_length); */
+
+ /* /\* UNAC stripping needed? *\/ */
+ /* if (do_strip) { */
+ /* gsize stripped_word_length; */
+
+ /* stripped = tracker_parser_unaccent_string (normalized, */
+ /* new_word_length, */
+ /* &stripped_word_length); */
+
+ /* if (stripped) { */
+ /* /\* Log after UNAC stripping *\/ */
+ /* tracker_parser_message_hex (" After UNAC stripping", */
+ /* stripped, stripped_word_length); */
+ /* new_word_length = stripped_word_length; */
+ /* } */
+ /* } */
+
+
+ /* /\* Stemming needed? *\/ */
+ /* if (parser->enable_stemmer) { */
+ /* stemmed = tracker_language_stem_word (parser->language, */
+ /* stripped ? stripped : normalized, */
+ /* new_word_length); */
+
+ /* /\* Log after stemming *\/ */
+ /* tracker_parser_message_hex (" After stemming", */
+ /* stemmed, strlen (stemmed)); */
+ /* } */
+
+ /* /\* If stemmed wanted and succeeded, free previous and return it *\/ */
+ /* if (stemmed) { */
+ /* g_free (stripped); */
+ /* if (normalized != word_buffer) { */
+ /* g_free (normalized); */
+ /* } */
+ /* return stemmed; */
+ /* } */
+
+ /* /\* If stripped wanted and succeeded, free previous and return it *\/ */
+ /* if (stripped) { */
+ /* if (normalized != word_buffer) { */
+ /* g_free (normalized); */
+ /* } */
+ /* return stripped; */
+ /* } */
+
+ /* /\* It may be the case that no stripping and no stemming was needed, and */
+ /* * that the output buffer in stack was enough for case-folding and */
+ /* * normalization. In this case, need to strdup() the string to return it *\/ */
+ /* return normalized == word_buffer ? g_strdup (word_buffer) : normalized; */
+ return NULL;
+}
+
+/* Advance the parser to the next word of the text given to
+ * tracker_parser_reset().
+ *
+ * parser:            parser to advance; must not be NULL.
+ * position:          receives the parser's word position counter.
+ * byte_offset_start: receives the start offset reported by parser_next().
+ * byte_offset_end:   receives the end offset reported by parser_next().
+ * stop_word:         receives TRUE only when a word was found, stop words
+ *                    are enabled and the language reports the word as a
+ *                    stop word.
+ * word_length:       receives the length of the returned word, or 0 when
+ *                    no word is returned.
+ *
+ * Returns the word, which stays owned by the parser and is freed by the next
+ * call, by tracker_parser_reset() or by tracker_parser_free(); or NULL when
+ * there are no more words. The position counter is advanced on every call
+ * that does not yield a stop word, including the final one returning NULL. */
+const gchar *
+tracker_parser_next (TrackerParser *parser,
+                     gint          *position,
+                     gint          *byte_offset_start,
+                     gint          *byte_offset_end,
+                     gboolean      *stop_word,
+                     gint          *word_length)
+{
+	const gchar *str;
+	gint byte_start = 0, byte_end = 0;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+
+	str = NULL;
+
+	/* Drop the previous word together with its length, so that a call
+	 * finding no further word does not report the length of a word that
+	 * has already been freed */
+	g_free (parser->word);
+	parser->word = NULL;
+	parser->word_length = 0;
+
+	if (parser_next (parser, &byte_start, &byte_end)) {
+		str = parser->word;
+	}
+
+	if (str &&
+	    parser->enable_stop_words &&
+	    tracker_language_is_stop_word (parser->language, str)) {
+		*stop_word = TRUE;
+	} else {
+		parser->word_position++;
+		*stop_word = FALSE;
+	}
+
+	*word_length = parser->word_length;
+	*position = parser->word_position;
+	*byte_offset_start = byte_start;
+	*byte_offset_end = byte_end;
+
+	return str;
+}
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]