[tracker/parser-unicode-libs-review] FTS parser: Added new GNU libunistring and libicu based parsers
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review] FTS parser: Added new GNU libunistring and libicu based parsers
- Date: Fri, 14 May 2010 17:21:32 +0000 (UTC)
commit b497d3a1842d376869258b2ff09d93715d073fb9
Author: Aleksander Morgado <aleksander lanedo com>
Date: Fri May 14 18:45:18 2010 +0200
FTS parser: Added new GNU libunistring and libicu based parsers
* configure.ac: New --with-unicode-support option added to configure,
which enables choosing between three parser implementations (glib,
libunistring and libicu). If none is specified, the library is
auto-detected, preferring libunistring, then libicu, then glib.
* src/libtracker-fts/tracker-parser-utils.[h|c]: New files including
the common utilities for all parsers.
* src/libtracker-fts/tracker-parser-glib.c: Original custom/pango
parser, file renamed from `src/libtracker-fts/tracker-parser.c'
* src/libtracker-fts/tracker-parser-libicu.c: libicu-based parser
implementation.
* src/libtracker-fts/tracker-parser-libunistring.c: libunistring-based
parser implementation.
* src/libtracker-fts/tracker-parser.h: API of the parser implementation
extended to enable/disable indexing numbers.
* src/libtracker-fts/Makefile.am: Setup the compilation of the new
parsers.
configure.ac | 72 +++
src/libtracker-fts/Makefile.am | 37 +-
src/libtracker-fts/tracker-fts.c | 6 +-
.../{tracker-parser.c => tracker-parser-glib.c} | 129 ++---
src/libtracker-fts/tracker-parser-libicu.c | 637 ++++++++++++++++++++
src/libtracker-fts/tracker-parser-libunistring.c | 499 +++++++++++++++
src/libtracker-fts/tracker-parser-utils.c | 235 +++++++
src/libtracker-fts/tracker-parser-utils.h | 77 +++
src/libtracker-fts/tracker-parser.h | 14 +-
9 files changed, 1602 insertions(+), 104 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 9e48269..16c5eda 100644
--- a/configure.ac
+++ b/configure.ac
@@ -857,6 +857,77 @@ fi
AM_CONDITIONAL(HAVE_MINER_FLICKR, test "x$have_miner_flickr" = "xyes")
+##################################################################
+# GLib, libunistring or libicu?
+# By default, AUTO with this order of preference:
+# 1) libunistring
+# 2) libicu
+# 3) glib
+##################################################################
+
+# Check for libunistring...
+AC_CHECK_HEADER(uniwbrk.h,
+ AC_CHECK_LIB(unistring, u8_wordbreaks))
+have_libunistring=${ac_cv_lib_unistring_u8_wordbreaks:-no}
+LIBUNISTRING_CFLAGS="$CFLAGS"
+LIBUNISTRING_LIBS="$LIBS"
+
+# Check for libicu... Note that AC_CHECK_LIB cannot be used as
+# symbol name includes libicu library version... don't want to
+# look for ubrk_next_4_2
+AC_CHECK_HEADER(unicode/ubrk.h, [have_libicu=yes],[have_libicu=no])
+LIBICU_CFLAGS=""
+LIBICU_LIBS="-licuuc"
+
+# Configure option
+AC_ARG_WITH([unicode-support],
+ AS_HELP_STRING([--with-unicode-support],
+ [Unicode support library? (libunistring|libicu|glib) [[default=auto]]]),,
+ [with_unicode_support=auto])
+
+# If auto, decide ourselves
+if test "x$with_unicode_support" = "xauto"; then
+ if test "x$have_libunistring" = "xyes"; then
+ with_unicode_support=libunistring
+ else
+ if test "x$have_libicu" = "xyes"; then
+ with_unicode_support=libicu
+ else
+ with_unicode_support=glib
+ fi
+ fi
+fi
+
+case "x$with_unicode_support" in
+ # Use libunistring
+ "xlibunistring")
+ AC_SUBST(LIBUNISTRING_CFLAGS)
+ AC_SUBST(LIBUNISTRING_LIBS)
+ if test "x$have_libunistring" = "xyes"; then
+ AC_DEFINE(HAVE_LIBUNISTRING, [], [libunistring Unicode support library])
+ else
+ AC_MSG_ERROR([***libunistring requested but not found - exiting!])
+ fi
+ ;;
+ # Use libicu
+ "xlibicu")
+ AC_SUBST(LIBICU_CFLAGS)
+ AC_SUBST(LIBICU_LIBS)
+ if test "x$have_libicu" = "xyes"; then
+ AC_DEFINE(HAVE_LIBICU, [], [libicu Unicode support library])
+ else
+ AC_MSG_ERROR([***libicu requested but not found - exiting!])
+ fi
+ ;;
+ # Use glib
+ "xglib") ;;
+ # Invalid option value
+ *) AC_MSG_ERROR([***wrong value for --with-unicode-support: $with_unicode_support - exiting!]) ;;
+esac
+
+# The build must follow the library actually selected above (explicitly
+# or through auto-detection), not merely which libraries are installed:
+# otherwise --with-unicode-support=glib on a system that has libunistring
+# would still compile and link the libunistring parser.
+AM_CONDITIONAL(HAVE_LIBUNISTRING, test "x$with_unicode_support" = "xlibunistring")
+AM_CONDITIONAL(HAVE_LIBICU, test "x$with_unicode_support" = "xlibicu")
+
####################################################################
# Miner Evolution
####################################################################
@@ -1852,6 +1923,7 @@ Build Configuration:
Support for accent stripping (unac): $have_unac
Support for Cyrillic languages (enca): $have_enca
Support for network status detection: $have_network_manager
+ Unicode support library: $with_unicode_support
Applications:
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 4938097..62c6d7a 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -7,11 +7,20 @@ INCLUDES = \
$(WARN_CFLAGS) \
$(GLIB2_CFLAGS) \
$(GCOV_CFLAGS) \
- $(PANGO_CFLAGS) \
$(DBUS_CFLAGS) \
$(UNAC_CFLAGS) \
$(SQLITE3_CFLAGS)
+if HAVE_LIBUNISTRING
+ INCLUDES += $(LIBUNISTRING_CFLAGS)
+else
+if HAVE_LIBICU
+ INCLUDES += $(LIBICU_CFLAGS)
+else
+ INCLUDES += $(PANGO_CFLAGS)
+endif
+endif
+
noinst_LTLIBRARIES = libtracker-fts.la
libtracker_fts_la_SOURCES = \
@@ -21,15 +30,35 @@ libtracker_fts_la_SOURCES = \
tracker-fts-config.h \
tracker-fts-hash.c \
tracker-fts-hash.h \
- tracker-parser.c \
+ tracker-parser-utils.c \
+ tracker-parser-utils.h \
tracker-parser.h
+if HAVE_LIBUNISTRING
+ libtracker_fts_la_SOURCES += tracker-parser-libunistring.c
+else
+if HAVE_LIBICU
+ libtracker_fts_la_SOURCES += tracker-parser-libicu.c
+else
+ libtracker_fts_la_SOURCES += tracker-parser-glib.c
+endif
+endif
+
libtracker_fts_la_LIBADD = \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
$(SQLITE3_LIBS) \
$(DBUS_LIBS) \
$(GTHREAD_LIBS) \
$(GCOV_LIBS) \
- $(PANGO_LIBS) \
$(UNAC_LIBS) \
- $(GLIB2_LIBS)
+ $(GLIB2_LIBS)
+
+if HAVE_LIBUNISTRING
+ libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+else
+if HAVE_LIBICU
+ libtracker_fts_la_LIBADD += $(LIBICU_LIBS)
+else
+ libtracker_fts_la_LIBADD += $(PANGO_LIBS)
+endif
+endif
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index c2f6f60..ab2b329 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -3666,7 +3666,7 @@ static void snippetOffsetsOfColumn(
pVtab = pQuery->pFts;
nColumn = pVtab->nColumn;
- tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, FALSE);
+ tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, TRUE, TRUE);
aTerm = pQuery->pTerms;
nTerm = pQuery->nTerms;
@@ -4363,7 +4363,7 @@ static int tokenizeSegment(
int firstIndex = pQuery->nTerms;
int nTerm = 1;
- tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, TRUE);
+ tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, FALSE, TRUE);
while( 1 ){
const char *pToken;
@@ -4816,7 +4816,7 @@ int Catid,
if (!zText) return SQLITE_OK;
- tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, FALSE);
+ tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, TRUE, TRUE);
while( 1 ){
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser-glib.c
similarity index 86%
rename from src/libtracker-fts/tracker-parser.c
rename to src/libtracker-fts/tracker-parser-glib.c
index bd9326f..a2144af 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
- * Copyright (C) 2008, Nokia <ivan frade nokia com>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -21,15 +21,11 @@
#include "config.h"
#include <string.h>
-#include <pango/pango.h>
-#ifdef HAVE_UNAC
-#include <unac.h>
-#endif
+#include <pango/pango.h>
#include "tracker-parser.h"
-
-#define INDEX_NUMBER_MIN_LENGTH 6
+#include "tracker-parser-utils.h"
/* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
* 0x9FA5, 0x20000 - <= 0x2A6D6
@@ -80,7 +76,8 @@ struct TrackerParser {
guint max_words_to_index;
guint max_word_length;
gboolean delimit_words;
- gboolean parse_reserved_words;
+ gboolean skip_reserved_words;
+ gboolean skip_numbers;
/* Private members */
gchar *word;
@@ -138,58 +135,6 @@ get_word_type (gunichar c)
return TRACKER_PARSER_WORD_IGNORE;
}
-static inline gchar *
-strip_word (const gchar *str,
- gint length,
- guint32 *len)
-{
-#ifdef HAVE_UNAC
- GError *error = NULL;
- gchar *str_utf16;
- gsize utf16_len, unaccented_len, final_len;
- gchar *unaccented_str = NULL;
- gchar *s = NULL;
-
- *len = 0;
-
- /* unac_string() does roughly the same than below, plus it
- * corrupts memory in 64bit systems, so avoid it for now.
- */
- str_utf16 = g_convert (str, length, "UTF-16BE", "UTF-8", NULL, &utf16_len, &error);
-
- if (error) {
- g_warning ("Could not convert to UTF-16: %s", error->message);
- g_error_free (error);
- return NULL;
- }
-
- if (unac_string_utf16 (str_utf16, utf16_len,
- &unaccented_str, &unaccented_len) != 0) {
- g_warning ("UNAC failed to strip accents");
- g_free (str_utf16);
- return NULL;
- }
-
- g_free (str_utf16);
-
- s = g_convert (unaccented_str, unaccented_len, "UTF-8", "UTF-16BE", NULL, &final_len, &error);
- g_free (unaccented_str);
-
- if (error) {
- g_warning ("Could not convert back to UTF-8: %s", error->message);
- g_error_free (error);
- return NULL;
- }
-
- *len = (guint32) final_len;
-
- return s;
-#else
- *len = length;
- return NULL;
-#endif
-}
-
static TrackerParserEncoding
get_encoding (const gchar *txt)
{
@@ -219,21 +164,6 @@ get_encoding (const gchar *txt)
}
static gboolean
-is_stop_word (TrackerLanguage *language,
- const gchar *word)
-{
- GHashTable *stop_words;
-
- if (!word) {
- return FALSE;
- }
-
- stop_words = tracker_language_get_stop_words (language);
-
- return g_hash_table_lookup (stop_words, word) != NULL;
-}
-
-static gboolean
pango_next (TrackerParser *parser,
gint *byte_offset_start,
gint *byte_offset_end)
@@ -348,14 +278,14 @@ parser_next (TrackerParser *parser,
/* word break */
/* check if word is reserved */
- if (is_valid && parser->parse_reserved_words) {
+ if (is_valid && parser->skip_reserved_words) {
if (length == 2 && word[0] == 'o' && word[1] == 'r') {
- break;
+ is_valid = FALSE;
}
}
if (!is_valid ||
- word_type == TRACKER_PARSER_WORD_NUM) {
+ (parser->skip_numbers && word_type == TRACKER_PARSER_WORD_NUM)) {
word_type = TRACKER_PARSER_WORD_IGNORE;
is_valid = TRUE;
length = 0;
@@ -382,12 +312,12 @@ parser_next (TrackerParser *parser,
* underscore if we are filtering.
*/
- if (type == TRACKER_PARSER_WORD_NUM) {
+ if (parser->skip_numbers && type == TRACKER_PARSER_WORD_NUM) {
is_valid = FALSE;
continue;
} else {
if (type == TRACKER_PARSER_WORD_HYPHEN) {
- is_valid = parser->parse_reserved_words;
+ is_valid = !parser->skip_reserved_words;
continue;
}
}
@@ -463,8 +393,6 @@ parser_next (TrackerParser *parser,
gchar *utf8;
gchar *processed_word;
-
-
utf8 = g_ucs4_to_utf8 (word, length, NULL, &bytes, NULL);
if (!utf8) {
@@ -535,7 +463,8 @@ tracker_parser_reset (TrackerParser *parser,
gboolean delimit_words,
gboolean enable_stemmer,
gboolean enable_stop_words,
- gboolean parse_reserved_words)
+ gboolean skip_reserved_words,
+ gboolean skip_numbers)
{
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
@@ -543,21 +472,23 @@ tracker_parser_reset (TrackerParser *parser,
g_free (parser->attrs);
parser->attrs = NULL;
+ parser->cursor = txt;
+ parser->encoding = get_encoding (txt);
+
parser->enable_stemmer = enable_stemmer;
parser->enable_stop_words = enable_stop_words;
parser->delimit_words = delimit_words;
- parser->encoding = get_encoding (txt);
+
parser->txt_size = txt_size;
parser->txt = txt;
- parser->parse_reserved_words = parse_reserved_words;
+ parser->skip_reserved_words = skip_reserved_words;
+ parser->skip_numbers = skip_numbers;
g_free (parser->word);
parser->word = NULL;
parser->word_position = 0;
- parser->cursor = txt;
-
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
PangoLogAttr *attrs;
@@ -583,14 +514,14 @@ tracker_parser_reset (TrackerParser *parser,
gchar *
tracker_parser_process_word (TrackerParser *parser,
- const char *word,
+ const gchar *word,
gint length,
gboolean do_strip)
{
gchar *stem_word;
gchar *str;
gchar *stripped_word;
- guint bytes, len;
+ gsize bytes, len;
g_return_val_if_fail (parser != NULL, NULL);
g_return_val_if_fail (word != NULL, NULL);
@@ -605,8 +536,18 @@ tracker_parser_process_word (TrackerParser *parser,
bytes = length;
}
+ /* Log original word */
+ tracker_parser_message_hex ("ORIGINAL word",
+ word, bytes);
+
if (do_strip) {
- stripped_word = strip_word (word, bytes, &len);
+ stripped_word = tracker_parser_unaccent_utf8_word (word,
+ bytes,
+ &len);
+
+ /* Log after UNAC stripping */
+ tracker_parser_message_hex (" After UNAC stripping",
+ stripped_word, len);
} else {
stripped_word = NULL;
}
@@ -622,6 +563,10 @@ tracker_parser_process_word (TrackerParser *parser,
g_free (stripped_word);
}
+ /* Log after normalization */
+ tracker_parser_message_hex (" After NFC normalization",
+ str, strlen ((gchar *)str));
+
if (!str) {
return NULL;
}
@@ -672,7 +617,9 @@ tracker_parser_next (TrackerParser *parser,
str = parser->word;
}
- if (parser->enable_stop_words && is_stop_word (parser->language, str)) {
+ if (str &&
+ parser->enable_stop_words &&
+ tracker_language_is_stop_word (parser->language, str)) {
*stop_word = TRUE;
} else {
parser->word_position++;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
new file mode 100644
index 0000000..190931c
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <locale.h>
+
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#include <unicode/ubrk.h>
+#include <unicode/ustring.h>
+#include <unicode/uchar.h>
+#include <unicode/unorm.h>
+
+#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
+
+/* Type of words detected by get_word_info().
+ * The type decides how process_word_uchar() treats the word: ASCII
+ * words are only lowercased, the other two are casefolded and NFC
+ * normalized, and only OTHER_UNAC words additionally get their
+ * accents stripped.
+ */
+typedef enum {
+ TRACKER_PARSER_WORD_TYPE_ASCII, /* every character is ASCII */
+ TRACKER_PARSER_WORD_TYPE_OTHER_UNAC, /* has a non-ASCII character, does not start with CJK */
+ TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC, /* starts with a CJK character */
+} TrackerParserWordType;
+
+/* Capacity, in UChars, of the on-stack buffers that hold the casefolded
+ * and normalized forms of a word (just a safety limit) */
+#define WORD_BUFFER_LENGTH 512
+
+
+static gchar *process_word_uchar (TrackerParser *parser,
+ const UChar *word,
+ gint length,
+ TrackerParserWordType type);
+
+/* State of one libicu-based parser: the options given to
+ * tracker_parser_reset(), the UTF-8 input, its UChar conversion and
+ * the word-break iterator walking over that conversion.
+ */
+struct TrackerParser {
+ const gchar *txt; /* UTF-8 input text; pointer stored as given, not copied */
+ gint txt_size; /* size of txt, in bytes */
+
+ TrackerLanguage *language; /* reference taken in tracker_parser_new() */
+ gboolean enable_stemmer;
+ gboolean enable_stop_words;
+ guint max_words_to_index;
+ guint max_word_length; /* compared against the UTF-8 byte length of each word */
+ gboolean delimit_words;
+ gboolean skip_reserved_words;
+ gboolean skip_numbers;
+
+ /* Private members */
+ gchar *word; /* last word found; freed by the parser */
+ gint word_length; /* strlen() of word */
+ guint word_position;
+
+ /* Text as UChars */
+ UChar *utxt;
+ gint utxt_size; /* number of UChars in utxt */
+ /* Original offset of each UChar in the input txt string */
+ gint32 *offsets;
+
+ /* The word-break iterator */
+ UBreakIterator *bi;
+
+ /* Cursor: index (in UChars) into utxt where the next word starts */
+ gsize cursor;
+};
+
+
+/* Classifies the word made of the first @word_length UChars of @word.
+ *
+ * @p_is_allowed_word_start is set to TRUE when the first code point is
+ * a letter, an underscore or (unless @skip_numbers) a number; the word
+ * breaker also reports "words" starting with punctuation, which get
+ * FALSE here. @p_word_type is only written for allowed words.
+ *
+ * Returns FALSE only when no first code point could be read.
+ */
+static gboolean
+get_word_info (const UChar           *word,
+               gsize                  word_length,
+               gboolean               skip_numbers,
+               gboolean              *p_is_allowed_word_start,
+               TrackerParserWordType *p_word_type)
+{
+	UCharIterator it;
+	UChar32 ch;
+	gboolean allowed_start;
+
+	/* Read the first character of the word as UCS4 */
+	uiter_setString (&it, word, word_length);
+	ch = uiter_current32 (&it);
+	if (ch == U_SENTINEL) {
+		return FALSE;
+	}
+
+	/* Looking only at the general category of the first character
+	 * is meant to be compatible with all Unicode normalization
+	 * methods.
+	 */
+	switch (u_charType (ch)) {
+	case U_UPPERCASE_LETTER:
+	case U_LOWERCASE_LETTER:
+	case U_TITLECASE_LETTER:
+	case U_MODIFIER_LETTER:
+	case U_OTHER_LETTER:
+		allowed_start = TRUE;
+		break;
+	case U_DECIMAL_DIGIT_NUMBER:
+	case U_LETTER_NUMBER:
+	case U_OTHER_NUMBER:
+		allowed_start = !skip_numbers;
+		break;
+	default:
+		allowed_start = FALSE;
+		break;
+	}
+
+	/* An underscore is accepted whatever its category is */
+	if (!allowed_start && IS_UNDERSCORE_UCS4 ((guint32)ch)) {
+		allowed_start = TRUE;
+	}
+
+	*p_is_allowed_word_start = allowed_start;
+	if (!allowed_start) {
+		return TRUE;
+	}
+
+	/* Words starting with a CJK character never get unaccented */
+	if (IS_CJK_UCS4 ((guint32)ch)) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+		return TRUE;
+	}
+
+	/* The first non-ASCII character found makes the word a
+	 * candidate for unaccenting */
+	for (; ch != U_SENTINEL; ch = uiter_next32 (&it)) {
+		if (!IS_ASCII_UCS4 ((guint32)ch)) {
+			*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
+			return TRUE;
+		}
+	}
+
+	*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
+	return TRUE;
+}
+
+/* Advances @parser to the next indexable word.
+ *
+ * Walks the word breaks reported by the break iterator, skipping
+ * chunks that are too long (in UTF-8 bytes), that do not start with an
+ * allowed character, that are reserved words (when requested) or that
+ * fail processing.
+ *
+ * On success returns TRUE, stores the newly allocated processed word in
+ * parser->word (with parser->word_length) and sets @byte_offset_start /
+ * @byte_offset_end to the byte range of the word in the original UTF-8
+ * text. Returns FALSE, with both offsets set to 0, when no more words
+ * are available.
+ */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	gsize word_length_uchar = 0;
+	gsize word_length_utf8 = 0;
+	gchar *processed_word = NULL;
+	gsize current_word_offset_utf8 = 0;
+
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	/* Loop to look for next valid word */
+	while (!processed_word &&
+	       parser->cursor < parser->utxt_size) {
+		TrackerParserWordType type;
+		gboolean is_allowed;
+		gsize next_word_offset_uchar;
+		gsize next_word_offset_utf8;
+		gsize truncated_length;
+
+		/* Set current word offset in the original UTF-8 string */
+		current_word_offset_utf8 = parser->offsets[parser->cursor];
+
+		/* Find next word break. The end-of-text marker returned by
+		 * ubrk_next() is negative, so once stored in a gsize it is
+		 * also caught by the comparison below.
+		 */
+		next_word_offset_uchar = ubrk_next (parser->bi);
+		if (next_word_offset_uchar >= parser->utxt_size) {
+			/* Last word support... */
+			next_word_offset_uchar = parser->utxt_size;
+			next_word_offset_utf8 = parser->txt_size;
+		} else {
+			next_word_offset_utf8 = parser->offsets[next_word_offset_uchar];
+		}
+
+		/* Word end is the first position after the word, which is
+		 * either the start of next word or the end of the string */
+		word_length_uchar = next_word_offset_uchar - parser->cursor;
+		word_length_utf8 = next_word_offset_utf8 - current_word_offset_utf8;
+
+		/* Skip the word if longer than the maximum allowed */
+		if (word_length_utf8 >= parser->max_word_length) {
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
+		/* Get word info... */
+		if (!get_word_info (&parser->utxt[parser->cursor],
+		                    word_length_uchar,
+		                    parser->skip_numbers,
+		                    &is_allowed,
+		                    &type)) {
+			/* Quit loop just in case */
+			parser->cursor = parser->utxt_size;
+			break;
+		}
+
+		/* Skip the word if not an allowed word start */
+		if (!is_allowed) {
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
+		/* Check if word is reserved (looking at ORIGINAL UTF-8
+		 * buffer here!) */
+		if (parser->skip_reserved_words &&
+		    tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
+		                                          word_length_utf8)) {
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+
+		/* Truncate extremely long words. The limit is counted in
+		 * UChars (not bytes) and must match the capacity of the
+		 * buffers used by process_word_uchar(): a larger limit
+		 * makes over-long words fail there with a buffer overflow
+		 * error and get dropped instead of being truncated.
+		 */
+		truncated_length = (word_length_uchar < WORD_BUFFER_LENGTH ?
+		                    word_length_uchar :
+		                    WORD_BUFFER_LENGTH);
+
+		/* Process the word here. If it fails, we can still go
+		 * to the next one. Returns newly allocated UTF-8
+		 * string always.
+		 * Note we are passing UChar encoded string here!
+		 */
+		processed_word = process_word_uchar (parser,
+		                                     &(parser->utxt[parser->cursor]),
+		                                     truncated_length,
+		                                     type);
+		if (!processed_word) {
+			parser->cursor = next_word_offset_uchar;
+			continue;
+		}
+	}
+
+	/* No more words... */
+	if (!processed_word) {
+		return FALSE;
+	}
+
+	/* Set outputs */
+	*byte_offset_start = current_word_offset_utf8;
+	*byte_offset_end = current_word_offset_utf8 + word_length_utf8;
+
+	/* Move the cursor past the (untruncated) word */
+	parser->cursor += word_length_uchar;
+
+	parser->word_length = strlen (processed_word);
+	parser->word = processed_word;
+
+	return TRUE;
+}
+
+/* Creates a parser bound to @language (a new reference is taken) that
+ * skips words of @max_word_length or more UTF-8 bytes. No text is
+ * attached yet: call tracker_parser_reset() before asking for words.
+ * Returns NULL if the arguments are invalid; free the result with
+ * tracker_parser_free().
+ */
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+                    gint             max_word_length)
+{
+	TrackerParser *self;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
+
+	/* g_new0() zero-fills the structure, so the UChar text, the
+	 * offsets, the break iterator, the cursor and the word length
+	 * all start out empty without further assignments. */
+	self = g_new0 (TrackerParser, 1);
+
+	self->max_word_length = max_word_length;
+	self->language = g_object_ref (language);
+
+	return self;
+}
+
+/* Releases @parser together with everything it owns: the last word,
+ * the UChar copy of the text and its offsets, the break iterator and
+ * the reference on the language. The input text itself is not freed.
+ */
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	g_return_if_fail (parser != NULL);
+
+	g_free (parser->word);
+	g_free (parser->offsets);
+	g_free (parser->utxt);
+
+	if (parser->bi != NULL) {
+		ubrk_close (parser->bi);
+	}
+
+	if (parser->language != NULL) {
+		g_object_unref (parser->language);
+	}
+
+	g_free (parser);
+}
+
+/* Prepares @parser to iterate over the words of a new text.
+ *
+ * @txt is the UTF-8 input of @txt_size bytes; only the pointer is
+ * stored, so the text must outlive the parsing. The remaining
+ * arguments are stored as the options used while parsing.
+ *
+ * The text is converted to UChars (recording, for each UChar, its
+ * offset in @txt) and a word-break iterator is opened over it. Any
+ * state left by a previous reset is released first. If the conversion
+ * or the iterator fail, a warning is logged and the parser is left
+ * with no text, so that no words are returned.
+ */
+void
+tracker_parser_reset (TrackerParser *parser,
+                      const gchar   *txt,
+                      gint           txt_size,
+                      gboolean       delimit_words,
+                      gboolean       enable_stemmer,
+                      gboolean       enable_stop_words,
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UConverter *converter;
+	UChar *last_uchar;
+	const gchar *last_utf8;
+
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
+
+	g_free (parser->word);
+	parser->word = NULL;
+
+	parser->word_position = 0;
+
+	parser->cursor = 0;
+
+	/* Release whatever a previous reset left behind. Without this,
+	 * every reset leaks the old buffers and break iterator, and an
+	 * early return below would leave the old text in place.
+	 */
+	if (parser->bi) {
+		ubrk_close (parser->bi);
+		parser->bi = NULL;
+	}
+	g_free (parser->utxt);
+	g_free (parser->offsets);
+	parser->utxt = NULL;
+	parser->offsets = NULL;
+	parser->utxt_size = 0;
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (error) ? u_errorName (error) : "none");
+		return;
+	}
+
+	/* Allocate UChars and offsets buffers */
+	parser->utxt_size = txt_size + 1;
+	parser->utxt = g_malloc (parser->utxt_size * sizeof (UChar));
+	parser->offsets = g_malloc (parser->utxt_size * sizeof (gint32));
+
+	/* last_uchar and last_utf8 will be also an output parameter! */
+	last_uchar = parser->utxt;
+	last_utf8 = parser->txt;
+
+	/* Convert to UChars storing offsets */
+	ucnv_toUnicode (converter,
+	                &last_uchar,
+	                &parser->utxt[txt_size],
+	                &last_utf8,
+	                &parser->txt[txt_size],
+	                parser->offsets,
+	                FALSE,
+	                &error);
+	if (U_SUCCESS (error)) {
+		/* Proper UChar array size is now given by 'last_uchar' */
+		parser->utxt_size = last_uchar - parser->utxt;
+
+		/* Open word-break iterator */
+		parser->bi = ubrk_open (UBRK_WORD,
+		                        setlocale (LC_ALL, NULL),
+		                        parser->utxt,
+		                        parser->utxt_size,
+		                        &error);
+		if (U_SUCCESS (error)) {
+			/* Find FIRST word in the UChar array */
+			parser->cursor = ubrk_first (parser->bi);
+		}
+	}
+
+	/* If any error happened, reset buffers */
+	if (U_FAILURE (error)) {
+		g_warning ("Error initializing libicu support: '%s'",
+		           u_errorName (error));
+		if (parser->bi) {
+			ubrk_close (parser->bi);
+			parser->bi = NULL;
+		}
+		g_free (parser->utxt);
+		g_free (parser->offsets);
+		parser->utxt = NULL;
+		parser->offsets = NULL;
+		parser->utxt_size = 0;
+	}
+
+	/* Close converter */
+	ucnv_close (converter);
+}
+
+/* Turns the @length UChars at @word into the UTF-8 form to be indexed.
+ *
+ * ASCII words are just lowercased; any other word is casefolded and
+ * NFC normalized. Words of type OTHER_UNAC additionally get their
+ * accents stripped. If stemming is enabled in @parser and the stemmer
+ * returns a word, that one is returned instead.
+ *
+ * Returns a newly allocated UTF-8 string (to be released with
+ * g_free()), or NULL if any conversion step failed, e.g. because the
+ * word does not fit in WORD_BUFFER_LENGTH UChars.
+ */
+static gchar *
+process_word_uchar (TrackerParser         *parser,
+                    const UChar           *word,
+                    gint                   length,
+                    TrackerParserWordType  type)
+{
+	UErrorCode error = U_ZERO_ERROR;
+	UChar normalized_buffer [WORD_BUFFER_LENGTH];
+	gchar *utf8_str = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		UChar casefolded_buffer [WORD_BUFFER_LENGTH];
+
+		/* Casefold... */
+		new_word_length = u_strFoldCase (casefolded_buffer,
+		                                 WORD_BUFFER_LENGTH,
+		                                 word,
+		                                 length,
+		                                 U_FOLD_CASE_DEFAULT,
+		                                 &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error casefolding: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+		if (new_word_length > WORD_BUFFER_LENGTH) {
+			new_word_length = WORD_BUFFER_LENGTH;
+		}
+
+		/* NFC normalization... */
+		new_word_length = unorm_normalize (casefolded_buffer,
+		                                   new_word_length,
+		                                   UNORM_NFC,
+		                                   0,
+		                                   normalized_buffer,
+		                                   WORD_BUFFER_LENGTH,
+		                                   &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error normalizing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+		if (new_word_length > WORD_BUFFER_LENGTH) {
+			new_word_length = WORD_BUFFER_LENGTH;
+		}
+	} else {
+		/* For ASCII-only, just tolower() each character */
+		new_word_length = u_strToLower (normalized_buffer,
+		                                WORD_BUFFER_LENGTH,
+		                                word,
+		                                length,
+		                                NULL,
+		                                &error);
+		if (U_FAILURE (error)) {
+			g_warning ("Error lowercasing: '%s'",
+			           u_errorName (error));
+			return NULL;
+		}
+	}
+
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+		gsize stripped_word_length;
+
+		/* Get unaccented string in UTF-8 */
+		utf8_str = tracker_parser_unaccent_UChar_word (normalized_buffer,
+		                                               new_word_length,
+		                                               &stripped_word_length);
+		if (utf8_str) {
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* If stripping failed or not needed, convert to UTF-8 */
+	if (!utf8_str) {
+		UErrorCode icu_error = U_ZERO_ERROR;
+		UConverter *converter;
+		gsize utf8_capacity;
+		gsize utf8_len;
+
+		/* Open converter UChar to UTF-8 */
+		converter = ucnv_open ("UTF-8", &icu_error);
+		if (!converter) {
+			g_warning ("Cannot open UTF-8 converter: '%s'",
+			           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+			return NULL;
+		}
+
+		/* A single UChar may need up to 3 bytes in UTF-8 (e.g. CJK
+		 * characters), so a buffer of the UTF-16 size is too small
+		 * and makes the conversion fail. Ask ICU for the worst case,
+		 * which also leaves room for the NUL terminator.
+		 */
+		utf8_capacity = UCNV_GET_MAX_BYTES_FOR_STRING (new_word_length,
+		                                               ucnv_getMaxCharSize (converter));
+		utf8_str = g_malloc (utf8_capacity);
+
+		/* Convert from UChar to UTF-8 (NUL-terminated) */
+		utf8_len = ucnv_fromUChars (converter,
+		                            utf8_str,
+		                            utf8_capacity,
+		                            normalized_buffer,
+		                            new_word_length,
+		                            &icu_error);
+		ucnv_close (converter);
+
+		if (U_FAILURE (icu_error)) {
+			g_warning ("Cannot convert from UChar to UTF-8: '%s'",
+			           u_errorName (icu_error));
+			g_free (utf8_str);
+			return NULL;
+		}
+
+		new_word_length = utf8_len;
+	}
+
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		/* Input for stemmer ALWAYS in UTF-8, as well as output */
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      utf8_str,
+		                                      new_word_length);
+
+		/* Log after stemming; the stemmer result is treated as
+		 * optional below, so never strlen() a NULL result */
+		if (stemmed) {
+			tracker_parser_message_hex ("  After stemming",
+			                            stemmed, strlen (stemmed));
+		}
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (utf8_str);
+		return stemmed;
+	}
+
+	return utf8_str;
+}
+
+
+/* Processes a single UTF-8 word the same way words found in a text are
+ * processed (casefolding, NFC normalization, optional stemming).
+ *
+ * @word is the UTF-8 input and @length its size in bytes, or a
+ * negative value if @word is NUL-terminated. Accents are stripped only
+ * if @do_strip is TRUE.
+ *
+ * Both input and output are always UTF-8. Returns a newly allocated
+ * string (to be released with g_free()), or NULL on invalid arguments
+ * or conversion errors.
+ */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+                             const gchar   *word,
+                             gint           length,
+                             gboolean       do_strip)
+{
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *converter;
+	UChar *uchar_word;
+	gint32 uchar_capacity;
+	gsize uchar_len;
+	gchar *processed;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	/* Compute length if not already as input */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* Open converter UTF-8 to UChar */
+	converter = ucnv_open ("UTF-8", &icu_error);
+	if (!converter) {
+		g_warning ("Cannot open UTF-8 converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+		return NULL;
+	}
+
+	/* UTF-8 never yields more UChars than it has bytes; one more is
+	 * reserved for the NUL terminator ICU appends when it fits. The
+	 * capacity handed to ICU is counted in UChars and must match the
+	 * allocation, otherwise the terminator is written out of bounds.
+	 */
+	uchar_capacity = length + 1;
+	uchar_word = g_new (UChar, uchar_capacity);
+
+	/* Convert from UTF-8 to UChars */
+	uchar_len = ucnv_toUChars (converter,
+	                           uchar_word,
+	                           uchar_capacity,
+	                           word,
+	                           length,
+	                           &icu_error);
+	ucnv_close (converter);
+
+	if (U_FAILURE (icu_error)) {
+		g_warning ("Cannot convert from UTF-8 to UChar: '%s'",
+		           u_errorName (icu_error));
+		g_free (uchar_word);
+		return NULL;
+	}
+
+	/* Process UChar based word. The word type is not a boolean: map
+	 * @do_strip explicitly, so that words which must not be stripped
+	 * are still casefolded and normalized.
+	 */
+	processed = process_word_uchar (parser,
+	                                uchar_word,
+	                                uchar_len,
+	                                do_strip ?
+	                                TRACKER_PARSER_WORD_TYPE_OTHER_UNAC :
+	                                TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC);
+	g_free (uchar_word);
+
+	return processed;
+}
+
+/* Returns the next word of the text given to tracker_parser_reset(),
+ * or NULL when there are no more words. The returned string belongs to
+ * @parser and is freed by the next call.
+ *
+ * @position receives the running word position, which is not advanced
+ * for stop words; @byte_offset_start and @byte_offset_end receive the
+ * byte range of the word in the original text; @stop_word tells
+ * whether the word is a stop word (only checked when stop words are
+ * enabled); @word_length receives the length of the returned word.
+ */
+const gchar *
+tracker_parser_next (TrackerParser *parser,
+                     gint          *position,
+                     gint          *byte_offset_start,
+                     gint          *byte_offset_end,
+                     gboolean      *stop_word,
+                     gint          *word_length)
+{
+	const gchar *found = NULL;
+	gint start = 0;
+	gint end = 0;
+	gboolean is_stop;
+
+	/* Drop the word handed out by the previous call */
+	g_free (parser->word);
+	parser->word = NULL;
+
+	if (parser_next (parser, &start, &end)) {
+		found = parser->word;
+	}
+
+	is_stop = (found != NULL &&
+	           parser->enable_stop_words &&
+	           tracker_language_is_stop_word (parser->language, found));
+
+	if (is_stop) {
+		*stop_word = TRUE;
+	} else {
+		parser->word_position++;
+		*stop_word = FALSE;
+	}
+
+	*word_length = parser->word_length;
+	*position = parser->word_position;
+	*byte_offset_start = start;
+	*byte_offset_end = end;
+
+	return found;
+}
+
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
new file mode 100644
index 0000000..4a6ff35
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008,2009,2010 Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <string.h>
+
+/* libunistring versions prior to 9.1.2 need this hack */
+#define _UNUSED_PARAMETER_
+#include <unistr.h>
+#include <uniwbrk.h>
+#include <unictype.h>
+#include <unicase.h>
+
+#include "tracker-parser.h"
+#include "tracker-parser-utils.h"
+
+/* Type of words detected. It selects the processing applied to the word
+ * in process_word_utf8(). */
+typedef enum {
+ /* Only ASCII bytes: the word is just lowercased */
+ TRACKER_PARSER_WORD_TYPE_ASCII,
+ /* Non-ASCII word: casefolded, NFC-normalized and then unaccented */
+ TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
+ /* Non-ASCII word (e.g. starting with a CJK character): casefolded and
+  * NFC-normalized, but NOT unaccented */
+ TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
+} TrackerParserWordType;
+
+/* Size in bytes (including the trailing NIL) of the stack buffer used while
+ * processing a word. parser_next() truncates longer words to
+ * WORD_BUFFER_LENGTH - 1 bytes before processing them (just a safety
+ * limit). */
+#define WORD_BUFFER_LENGTH 512
+
+/* Forward declaration: parser_next() uses it before its definition.
+ * Returns a newly-allocated processed copy of the given UTF-8 word, or
+ * NULL on failure. A negative length means the word is NIL-terminated. */
+static gchar *process_word_utf8 (TrackerParser *parser,
+ const gchar *word,
+ gint length,
+ TrackerParserWordType type);
+
+/* State of the libunistring-based parser. Configured in
+ * tracker_parser_new() and tracker_parser_reset(). */
+struct TrackerParser {
+ /* Text being parsed. The pointer is stored as given in
+  * tracker_parser_reset(): it is neither copied nor freed by the parser */
+ const gchar *txt;
+ /* Size of txt, in bytes */
+ gint txt_size;
+
+ /* Language object (referenced), used for stop words and stemming */
+ TrackerLanguage *language;
+ /* Whether processed words get stemmed */
+ gboolean enable_stemmer;
+ /* Whether tracker_parser_next() flags stop words */
+ gboolean enable_stop_words;
+ /* NOTE(review): not read by the code in this file -- confirm usage */
+ guint max_words_to_index;
+ /* Words of this many bytes or more are skipped */
+ guint max_word_length;
+ /* NOTE(review): stored in reset but not read in this file -- confirm */
+ gboolean delimit_words;
+ /* Whether reserved words (see tracker-parser-utils) are skipped */
+ gboolean skip_reserved_words;
+ /* Whether words starting with a number are skipped */
+ gboolean skip_numbers;
+
+ /* Private members */
+ /* Last word returned, owned by the parser */
+ gchar *word;
+ /* Length in bytes of the last word returned */
+ gint word_length;
+ /* Word counter reported as 'position' by tracker_parser_next() */
+ guint word_position;
+
+ /* Cursor, as index of the input array of bytes */
+ gsize cursor;
+ /* libunistring flags array */
+ gchar *word_break_flags;
+ /* general category of the start character in words */
+ uc_general_category_t allowed_start;
+};
+
+/*
+ * Inspects the word starting at the current cursor position.
+ *
+ * Outputs:
+ *  - p_word_length: bytes from the cursor up to the next word break (or
+ *    the end of the text).
+ *  - p_is_allowed_word_start: FALSE if the first character is neither an
+ *    underscore nor in the allowed general categories.
+ *  - p_word_type: type of the word; only set when the word start is
+ *    allowed.
+ *
+ * Returns FALSE (with only p_is_allowed_word_start set) when the first
+ * character can not be decoded, TRUE otherwise.
+ */
+static gboolean
+get_word_info (TrackerParser         *parser,
+               gsize                 *p_word_length,
+               gboolean              *p_is_allowed_word_start,
+               TrackerParserWordType *p_word_type)
+{
+	ucs4_t start_char;
+	gint start_char_len;
+	gsize end;
+	gboolean only_ascii;
+
+	/* Assume the word is valid until proven otherwise */
+	*p_is_allowed_word_start = TRUE;
+
+	/* Decode the first character of the word as UCS4 */
+	start_char_len = u8_strmbtouc (&start_char,
+	                               &(parser->txt[parser->cursor]));
+	if (start_char_len <= 0) {
+		/* This should only happen if NIL was passed to u8_strmbtouc,
+		 * so better just force stop here */
+		return FALSE;
+	}
+
+	/* A first character encoded in a single byte is ASCII-7 */
+	only_ascii = (start_char_len == 1);
+
+	/* Walk up to the next word break, checking on the way whether all
+	 * the bytes of the word are ASCII */
+	for (end = parser->cursor + start_char_len;
+	     end < parser->txt_size && !parser->word_break_flags [end];
+	     end++) {
+		if (only_ascii &&
+		    !IS_ASCII_UCS4 ((guint32)parser->txt[end])) {
+			only_ascii = FALSE;
+		}
+	}
+
+	/* 'end' is the first byte after the word: either the start of the
+	 * next word or the end of the string */
+	*p_word_length = end - parser->cursor;
+
+	/* Only words starting with an underscore or with a character of
+	 * the allowed categories are wanted, as the word break algorithm
+	 * also reports breaks after, for example, commas or other
+	 * punctuation marks. Looking only at the first character should be
+	 * compatible with all Unicode normalization methods. */
+	if (!IS_UNDERSCORE_UCS4 ((guint32)start_char) &&
+	    !uc_is_general_category (start_char,
+	                             parser->allowed_start)) {
+		*p_is_allowed_word_start = FALSE;
+		return TRUE;
+	}
+
+	/* Classify the word */
+	if (only_ascii) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
+		return TRUE;
+	}
+
+	if (IS_CJK_UCS4 (start_char)) {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+	} else {
+		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
+	}
+
+	return TRUE;
+}
+
+/*
+ * Looks for the next valid word starting at the cursor.
+ *
+ * On success returns TRUE, stores the processed word and its length in
+ * parser->word / parser->word_length, moves the cursor past the word and
+ * sets byte_offset_start / byte_offset_end to the byte range of the
+ * original word in the input text (end is the first byte after the word).
+ *
+ * Returns FALSE, with both offsets set to 0, when no more words are found.
+ */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	while (parser->cursor < parser->txt_size) {
+		TrackerParserWordType word_type;
+		gsize n_bytes = 0;
+		gsize n_process;
+		gboolean allowed_start;
+		gchar *result;
+
+		if (!get_word_info (parser,
+		                    &n_bytes,
+		                    &allowed_start,
+		                    &word_type)) {
+			/* Nothing else can be parsed, force the end */
+			parser->cursor = parser->txt_size;
+			break;
+		}
+
+		/* Words are skipped when they do not start with an allowed
+		 * character, when they are too long, or when they are
+		 * reserved words and those must be skipped */
+		if (!allowed_start ||
+		    n_bytes >= parser->max_word_length ||
+		    (parser->skip_reserved_words &&
+		     tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
+		                                           n_bytes))) {
+			parser->cursor += n_bytes;
+			continue;
+		}
+
+		/* Never process more bytes than what fits in the word
+		 * buffer (to avoid extremely long words) */
+		n_process = n_bytes;
+		if (n_process >= WORD_BUFFER_LENGTH) {
+			n_process = WORD_BUFFER_LENGTH - 1;
+		}
+
+		/* Returns a newly allocated string, or NULL on failure, in
+		 * which case the next word is tried */
+		result = process_word_utf8 (parser,
+		                            &(parser->txt[parser->cursor]),
+		                            n_process,
+		                            word_type);
+		if (!result) {
+			parser->cursor += n_bytes;
+			continue;
+		}
+
+		/* Got a word: report its position in the input text */
+		*byte_offset_start = parser->cursor;
+		*byte_offset_end = parser->cursor + n_bytes;
+
+		parser->cursor += n_bytes;
+
+		parser->word_length = strlen (result);
+		parser->word = result;
+
+		return TRUE;
+	}
+
+	/* No more words... */
+	return FALSE;
+}
+
+/*
+ * Creates a new parser for the given language. The parser takes its own
+ * reference on the language object.
+ *
+ * Returns NULL if language is not a TrackerLanguage or if
+ * max_word_length is not greater than zero. The result must be released
+ * with tracker_parser_free().
+ */
+TrackerParser *
+tracker_parser_new (TrackerLanguage *language,
+                    gint             max_word_length)
+{
+	TrackerParser *new_parser;
+
+	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
+	g_return_val_if_fail (max_word_length > 0, NULL);
+
+	/* All members start zeroed */
+	new_parser = g_new0 (TrackerParser, 1);
+
+	new_parser->word_break_flags = NULL;
+	new_parser->word_length = 0;
+	new_parser->max_word_length = max_word_length;
+	new_parser->language = g_object_ref (language);
+
+	return new_parser;
+}
+
+/*
+ * Releases the parser: the last word, the word break flags, the reference
+ * on the language object and the parser structure itself.
+ */
+void
+tracker_parser_free (TrackerParser *parser)
+{
+	g_return_if_fail (parser != NULL);
+
+	/* Buffers owned by the parser */
+	g_free (parser->word);
+	g_free (parser->word_break_flags);
+
+	/* Drop the reference taken in tracker_parser_new() */
+	if (parser->language != NULL) {
+		g_object_unref (parser->language);
+	}
+
+	g_free (parser);
+}
+
+/*
+ * Prepares the parser to iterate the words of a new text.
+ *
+ *  - txt: UTF-8 text to parse. The pointer is stored, not copied, so the
+ *    text must stay valid while the parser iterates it.
+ *  - txt_size: size of txt in bytes. A negative value means that txt is
+ *    NIL-terminated and its size is computed here.
+ *  - delimit_words, enable_stemmer, enable_stop_words,
+ *    skip_reserved_words, skip_numbers: stored as parser options.
+ *
+ * The word counter and the cursor are reset, and the word break flags of
+ * the whole text are computed.
+ */
+void
+tracker_parser_reset (TrackerParser *parser,
+                      const gchar   *txt,
+                      gint           txt_size,
+                      gboolean       delimit_words,
+                      gboolean       enable_stemmer,
+                      gboolean       enable_stop_words,
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
+{
+	g_return_if_fail (parser != NULL);
+	g_return_if_fail (txt != NULL);
+
+	/* A negative size would otherwise be converted to a huge unsigned
+	 * value when allocating the flags array and when computing the
+	 * word breaks, so treat it as "NIL-terminated input", the same
+	 * way as it is done when processing single words */
+	if (txt_size < 0) {
+		txt_size = strlen (txt);
+	}
+
+	parser->enable_stemmer = enable_stemmer;
+	parser->enable_stop_words = enable_stop_words;
+	parser->delimit_words = delimit_words;
+
+	parser->txt_size = txt_size;
+	parser->txt = txt;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
+
+	g_free (parser->word);
+	parser->word = NULL;
+
+	parser->word_position = 0;
+
+	parser->cursor = 0;
+
+	g_free (parser->word_break_flags);
+
+	/* Create array of flags, same size as original text. */
+	parser->word_break_flags = g_malloc (txt_size);
+
+	/* Get wordbreak flags in the whole string */
+	u8_wordbreaks ((const uint8_t *)txt,
+	               (size_t) txt_size,
+	               (char *)parser->word_break_flags);
+
+	/* Prepare a custom category which is a combination of the
+	 * desired ones: letters always, numbers only if not skipped */
+	parser->allowed_start = UC_LETTER;
+	if (!parser->skip_numbers) {
+		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
+	}
+}
+
+/*
+ * Processes a single UTF-8 word the same way as the words found while
+ * parsing. The word is always treated as non-ASCII; do_strip selects
+ * whether it is also unaccented.
+ *
+ * Returns whatever process_word_utf8() returns for the selected type.
+ */
+gchar *
+tracker_parser_process_word (TrackerParser *parser,
+                             const gchar   *word,
+                             gint           length,
+                             gboolean       do_strip)
+{
+	TrackerParserWordType type;
+
+	if (do_strip) {
+		type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
+	} else {
+		type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
+	}
+
+	return process_word_utf8 (parser, word, length, type);
+}
+
+/*
+ * Converts a UTF-8 word into the form used for indexing.
+ *
+ * Steps, depending on the word type:
+ *  - ASCII: each byte is lowercased.
+ *  - OTHER_UNAC / OTHER_NO_UNAC: casefolding plus NFC normalization.
+ *  - OTHER_UNAC only: accents are stripped afterwards.
+ *  - If the stemmer is enabled in the parser, the result is stemmed.
+ *
+ *  - word: input word; needs to be NIL-terminated only if length < 0.
+ *  - length: number of bytes of word to process, or negative to use the
+ *    whole NIL-terminated string.
+ *
+ * Returns a newly-allocated NIL-terminated string, or NULL if parser or
+ * word are NULL or if casefolding/normalization fails.
+ */
+static gchar *
+process_word_utf8 (TrackerParser         *parser,
+                   const gchar           *word,
+                   gint                   length,
+                   TrackerParserWordType  type)
+{
+	gchar word_buffer [WORD_BUFFER_LENGTH];
+	gchar *normalized = NULL;
+	gchar *stripped = NULL;
+	gchar *stemmed = NULL;
+	size_t new_word_length;
+
+	g_return_val_if_fail (parser != NULL, NULL);
+	g_return_val_if_fail (word != NULL, NULL);
+
+	/* If length is set as -1, the input word MUST be NIL-terminated.
+	 * Otherwise, this restriction is not needed as the length to process
+	 * is given as input argument */
+	if (length < 0) {
+		length = strlen (word);
+	}
+
+	/* Log original word */
+	tracker_parser_message_hex ("ORIGINAL word",
+	                            word, length);
+
+	/* Normalization and case-folding ONLY for non-ASCII */
+	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
+		/* Leave space for last NIL */
+		new_word_length = WORD_BUFFER_LENGTH - 1;
+
+		/* Casefold and NFC normalization in output.
+		 * NOTE: if the output buffer is not big enough, u8_casefold will
+		 * return a newly-allocated buffer.
+		 * libunistring works on uint8_t strings, so cast explicitly
+		 * in both directions. */
+		normalized = (gchar *) u8_casefold ((const uint8_t *)word,
+		                                    length,
+		                                    uc_locale_language (),
+		                                    UNINORM_NFC,
+		                                    (uint8_t *)word_buffer,
+		                                    &new_word_length);
+
+		/* Case folding + Normalization failed, skip this word */
+		g_return_val_if_fail (normalized != NULL, NULL);
+
+		/* If output buffer is not the same as the one passed to
+		 * u8_casefold, we know it was newly-allocated, so need
+		 * to resize it in 1 byte to add last NIL */
+		if (normalized != word_buffer) {
+			normalized = g_realloc (normalized, new_word_length + 1);
+		}
+
+		/* Log after Normalization */
+		tracker_parser_message_hex (" After Casefolding and NFC normalization",
+		                            normalized, new_word_length);
+	} else {
+		/* For ASCII-only, just tolower() each character */
+		gsize i;
+
+		/* The stack buffer must hold 'length' bytes plus the last NIL,
+		 * so it can only be used if length < WORD_BUFFER_LENGTH */
+		normalized = (length >= WORD_BUFFER_LENGTH ?
+		              g_malloc (length + 1) :
+		              word_buffer);
+
+		for (i = 0; i < (gsize) length; i++) {
+			normalized[i] = g_ascii_tolower (word[i]);
+		}
+
+		new_word_length = length;
+
+		/* Log after tolower */
+		tracker_parser_message_hex (" After Lowercasing",
+		                            normalized, new_word_length);
+	}
+
+	/* Set output NIL */
+	normalized[new_word_length] = '\0';
+
+	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
+	if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+		gsize stripped_word_length;
+
+		stripped = tracker_parser_unaccent_utf8_word (normalized,
+		                                              new_word_length,
+		                                              &stripped_word_length);
+
+		if (stripped) {
+			/* Log after UNAC stripping */
+			tracker_parser_message_hex (" After UNAC stripping",
+			                            stripped, stripped_word_length);
+			new_word_length = stripped_word_length;
+		}
+	}
+
+	/* Stemming needed? */
+	if (parser->enable_stemmer) {
+		stemmed = tracker_language_stem_word (parser->language,
+		                                      stripped ? stripped : normalized,
+		                                      new_word_length);
+
+		/* Log after stemming; a NULL result is handled below, so do
+		 * not compute its length in that case */
+		if (stemmed) {
+			tracker_parser_message_hex (" After stemming",
+			                            stemmed, strlen (stemmed));
+		}
+	}
+
+	/* If stemmed wanted and succeeded, free previous and return it */
+	if (stemmed) {
+		g_free (stripped);
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stemmed;
+	}
+
+	/* If stripped wanted and succeeded, free previous and return it */
+	if (stripped) {
+		if (normalized != word_buffer) {
+			g_free (normalized);
+		}
+		return stripped;
+	}
+
+	/* It may be the case that no stripping and no stemming was needed, and
+	 * that the output buffer in stack was enough for case-folding and
+	 * normalization. In this case, need to strdup() the string to return it */
+	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
+}
+
+/*
+ * Moves the parser to the next word of the text and returns it, or NULL
+ * if parser_next() finds no more words.
+ *
+ * The returned string belongs to the parser and is only valid until the
+ * next call to this function.
+ *
+ * Outputs:
+ *  - position: parser word counter after this call.
+ *  - byte_offset_start / byte_offset_end: offsets given by parser_next()
+ *    (both 0 when there is no word).
+ *  - stop_word: TRUE only when a word was found, stop words are enabled
+ *    and the language considers the word a stop word.
+ *  - word_length: value of parser->word_length.
+ */
+const gchar *
+tracker_parser_next (TrackerParser *parser,
+                     gint          *position,
+                     gint          *byte_offset_start,
+                     gint          *byte_offset_end,
+                     gboolean      *stop_word,
+                     gint          *word_length)
+{
+	const gchar *found = NULL;
+	gint offset_start = 0;
+	gint offset_end = 0;
+
+	/* Release the word returned by the previous call */
+	g_free (parser->word);
+	parser->word = NULL;
+
+	if (parser_next (parser, &offset_start, &offset_end)) {
+		found = parser->word;
+	}
+
+	/* Anything which is not a detected stop word advances the counter */
+	if (found == NULL ||
+	    !parser->enable_stop_words ||
+	    !tracker_language_is_stop_word (parser->language, found)) {
+		parser->word_position++;
+		*stop_word = FALSE;
+	} else {
+		*stop_word = TRUE;
+	}
+
+	*word_length = parser->word_length;
+	*position = parser->word_position;
+	*byte_offset_start = offset_start;
+	*byte_offset_end = offset_end;
+
+	return found;
+}
+
diff --git a/src/libtracker-fts/tracker-parser-utils.c b/src/libtracker-fts/tracker-parser-utils.c
new file mode 100644
index 0000000..e6c8521
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#ifdef HAVE_UNAC
+#include <unac.h>
+#endif
+
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#include <unicode/ucnv.h>
+#endif
+
+#include <libtracker-common/tracker-common.h>
+#include "tracker-parser-utils.h"
+
+
+/* Output is always UTF-8. */
+gchar *
+tracker_parser_unaccent_utf16be_word (const gchar *string,
+ gsize ilength,
+ gsize *p_olength)
+{
+ GError *error = NULL;
+ gchar *unaccented_str = NULL;
+ gchar *str_utf8 = NULL;
+ gsize unaccented_len;
+ gsize utf8_len;
+
+ *p_olength = 0;
+
+ if (unac_string_utf16 (string, ilength,
+ &unaccented_str, &unaccented_len) != 0) {
+ g_warning ("UNAC failed to strip accents");
+ return NULL;
+ }
+
+ /* Convert from UTF-16BE to UTF-8 */
+ str_utf8 = g_convert (unaccented_str,
+ unaccented_len,
+ "UTF-8",
+ "UTF-16BE",
+ NULL,
+ &utf8_len,
+ &error);
+ g_free (unaccented_str);
+
+ if (error) {
+ g_warning ("Could not convert back to UTF-8: %s",
+ error->message);
+ g_error_free (error);
+ return NULL;
+ }
+
+ *p_olength = utf8_len;
+ return str_utf8;
+}
+
+
+#ifdef HAVE_LIBICU
+/*
+ * Strips accents from a word given as an array of ICU UChars.
+ *
+ *  - string: input word.
+ *  - ilength: length of the input, in UChars (not in bytes).
+ *  - p_olength: when libunac is available, set to 0 and then updated by
+ *    tracker_parser_unaccent_utf16be_word() if the conversion succeeds.
+ *
+ * NOTE: Internally, UChars are UTF-16, but conversion needed just in case,
+ * as libunac needs UTF-16BE. Output is always UTF-8.
+ *
+ * Returns a newly-allocated string, or NULL on failure or when libunac
+ * support is not compiled in.
+ */
+gchar *
+tracker_parser_unaccent_UChar_word (const UChar *string,
+                                    gsize        ilength,
+                                    gsize       *p_olength)
+{
+#ifdef HAVE_UNAC
+	UErrorCode icu_error = U_ZERO_ERROR;
+	UConverter *to_utf16be;
+	gchar *utf16_buffer;
+	gchar *result = NULL;
+	gsize utf16_buffer_size;
+	gsize utf16_bytes;
+
+	*p_olength = 0;
+
+	/* Open converter UChar to UTF-16BE */
+	to_utf16be = ucnv_open ("UTF-16BE", &icu_error);
+	if (!to_utf16be) {
+		g_warning ("Cannot open UTF-16BE converter: '%s'",
+		           U_FAILURE (icu_error) ? u_errorName (icu_error) : "none");
+		return NULL;
+	}
+
+	/* Two bytes per input UChar, plus room for one extra UChar.
+	 * Note that ilength specifies number of UChars not
+	 * number of bytes */
+	utf16_buffer_size = (ilength + 1) * 2;
+	utf16_buffer = g_malloc (utf16_buffer_size);
+
+	/* Convert from UChar to UTF-16BE */
+	utf16_bytes = ucnv_fromUChars (to_utf16be,
+	                               utf16_buffer,
+	                               utf16_buffer_size,
+	                               string,
+	                               ilength,
+	                               &icu_error);
+	if (!U_FAILURE (icu_error)) {
+		result = tracker_parser_unaccent_utf16be_word (utf16_buffer,
+		                                               utf16_bytes,
+		                                               p_olength);
+	} else {
+		g_warning ("Cannot convert from UChar to UTF-16BE: '%s' "
+		           "(ilength: %" G_GSIZE_FORMAT ")",
+		           u_errorName (icu_error),
+		           ilength);
+	}
+
+	ucnv_close (to_utf16be);
+	g_free (utf16_buffer);
+
+	return result;
+#else
+	return NULL;
+#endif
+}
+#endif
+
+/*
+ * Strips accents from a UTF-8 encoded word.
+ *
+ *  - str: input word, encoded in UTF-8.
+ *  - ilength: length of the input, in bytes.
+ *  - p_olength: when libunac is available, set to 0 and then updated by
+ *    tracker_parser_unaccent_utf16be_word() if the conversion to
+ *    UTF-16BE succeeds.
+ *
+ * Returns a newly-allocated UTF-8 string, or NULL on failure or when
+ * libunac support is not compiled in.
+ */
+gchar *
+tracker_parser_unaccent_utf8_word (const gchar *str,
+                                   gsize        ilength,
+                                   gsize       *p_olength)
+{
+#ifdef HAVE_UNAC
+	GError *convert_error = NULL;
+	gchar *utf16be;
+	gchar *unaccented;
+	gsize utf16be_len;
+
+	*p_olength = 0;
+
+	/* unac_string() does roughly the same than below, plus it
+	 * corrupts memory in 64bit systems, so avoid it for now.
+	 */
+	utf16be = g_convert (str, ilength,
+	                     "UTF-16BE", "UTF-8",
+	                     NULL, &utf16be_len,
+	                     &convert_error);
+	if (convert_error) {
+		g_warning ("Could not convert to UTF-16: %s", convert_error->message);
+		g_error_free (convert_error);
+		return NULL;
+	}
+
+	unaccented = tracker_parser_unaccent_utf16be_word (utf16be,
+	                                                   utf16be_len,
+	                                                   p_olength);
+	g_free (utf16be);
+
+	return unaccented;
+#else
+	return NULL;
+#endif
+}
+
+
+/*
+ * Definition of the possible reserved words.
+ * Length of word is explicitly given to avoid strlen() calls
+ */
+typedef struct {
+ /* Reserved word; NULL marks the end of the table */
+ const gchar *word;
+ /* Length of the word in bytes, without the trailing NIL */
+ gsize word_length;
+} TrackerParserReservedWord;
+
+/* Table of reserved words, terminated by an entry with a NULL word.
+ * Entries are compared byte by byte against the candidate word, so the
+ * comparison is case-sensitive. */
+static const TrackerParserReservedWord reserved_words[] = {
+ { "or", 2 },
+ { NULL, 0 }
+};
+
+/*
+ * Checks whether the given word is one of the predefined reserved words.
+ *
+ *  - word: candidate word; does not need to be NIL-terminated.
+ *  - word_length: length of the candidate word, in bytes.
+ *
+ * Returns TRUE if a reserved word has the same length and the same bytes
+ * as the candidate, FALSE otherwise.
+ */
+gboolean
+tracker_parser_is_reserved_word_utf8 (const gchar *word,
+                                      gsize        word_length)
+{
+	const TrackerParserReservedWord *candidate;
+
+	/* The table ends with an entry whose word is NULL */
+	for (candidate = reserved_words; candidate->word != NULL; candidate++) {
+		if (candidate->word_length != word_length) {
+			continue;
+		}
+
+		if (strncmp (word, candidate->word, word_length) == 0) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+
+#if TRACKER_PARSER_DEBUG_HEX
+/*
+ * Debugging helper: logs the given string both as text and as a
+ * ':'-separated hexadecimal dump, prefixed by message.
+ *
+ *  - message: text to show before the string.
+ *  - str: string to log; does not need to be NIL-terminated.
+ *  - str_length: length of str in bytes; must not be 0.
+ */
+void
+tracker_parser_message_hex (const gchar *message,
+                            const gchar *str,
+                            gsize        str_length)
+{
+	gchar *printable;
+	gchar *hex;
+
+	g_return_if_fail (message);
+	g_return_if_fail (str);
+	g_return_if_fail (str_length != 0);
+
+	/* Get hexadecimal representation of the input string */
+	hex = tracker_strhex (str, str_length, ':');
+
+	/* String may not come NIL-terminated, so print a terminated copy */
+	printable = g_malloc (str_length + 1);
+	memcpy (printable, str, str_length);
+	printable[str_length] = '\0';
+
+	/* Log it */
+	g_message ("%s: '%s' (%s)",
+	           message, printable, hex);
+
+	g_free (hex);
+	g_free (printable);
+}
+#endif
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
new file mode 100644
index 0000000..50805c1
--- /dev/null
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef __TRACKER_PARSER_UTILS_H__
+#define __TRACKER_PARSER_UTILS_H__
+
+#include "config.h"
+
+#include <glib.h>
+
+#ifdef HAVE_LIBICU
+#include <unicode/utypes.h>
+#endif
+
+G_BEGIN_DECLS
+
+/* ASCII-7 is in range [0x00,0x7F].
+ * The argument is expected to be unsigned: a negative signed value would
+ * also compare as <= 0x7F. */
+#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
+
+/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6].
+ * Note that the argument is evaluated several times. */
+#define IS_CJK_UCS4(c) (((c) >= 0x3400 && (c) <= 0x4DB5) || \
+ ((c) >= 0x4E00 && (c) <= 0x9FA5) || \
+ ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+/* Underscore is U+005F (LOW LINE) */
+#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
+
+
+/* Strips accents from a UTF-16BE word of ilength bytes. Returns a
+ * newly-allocated UTF-8 string (its length in bytes is stored in
+ * p_olength), or NULL on failure. */
+gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
+ gsize ilength,
+ gsize *p_olength);
+
+/* Strips accents from a UTF-8 word of ilength bytes. Returns a
+ * newly-allocated UTF-8 string, or NULL on failure or when built without
+ * libunac. */
+gchar *tracker_parser_unaccent_utf8_word (const gchar *string,
+ gsize ilength,
+ gsize *p_olength);
+
+#ifdef HAVE_LIBICU
+/* Strips accents from a word of ilength UChars (not bytes). Returns a
+ * newly-allocated UTF-8 string, or NULL on failure or when built without
+ * libunac. */
+gchar *tracker_parser_unaccent_UChar_word (const UChar *string,
+ gsize ilength,
+ gsize *p_olength);
+#endif
+
+
+/* Returns TRUE if the word_length bytes of word are exactly one of the
+ * predefined reserved words. The word does not need to be
+ * NIL-terminated. */
+gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
+ gsize word_length);
+
+
+/* Define to 1 if you want to enable debugging logs showing HEX contents
+ * of the words being parsed */
+#define TRACKER_PARSER_DEBUG_HEX 0
+
+/* When debugging is disabled the call expands to nothing, so its
+ * arguments are not evaluated at all. */
+#if TRACKER_PARSER_DEBUG_HEX
+void tracker_parser_message_hex (const gchar *message,
+ const gchar *str,
+ gsize str_length);
+#else
+#define tracker_parser_message_hex(a,b,c)
+#endif
+
+G_END_DECLS
+
+#endif /* __TRACKER_PARSER_UTILS_H__ */
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 66535c9..cad4442 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -35,10 +35,11 @@ TrackerParser *tracker_parser_new (TrackerLanguage *language,
void tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
- gboolean delimit_words,
- gboolean enable_stemmer,
- gboolean enable_stop_words,
- gboolean parse_reserved_words);
+ gboolean delimit_words,
+ gboolean enable_stemmer,
+ gboolean enable_stop_words,
+ gboolean skip_reserved_words,
+ gboolean skip_numbers);
const gchar * tracker_parser_next (TrackerParser *parser,
gint *position,
@@ -48,9 +49,10 @@ const gchar * tracker_parser_next (TrackerParser *parser,
gint *word_length);
gchar * tracker_parser_process_word (TrackerParser *parser,
- const char *word,
+ const gchar *word,
gint length,
- gboolean do_strip);
+ gboolean do_strip);
+
void tracker_parser_free (TrackerParser *parser);
G_END_DECLS
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]