[tracker/parser-unicode-libs-review: 65/85] Added libunistring-based word-break
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/parser-unicode-libs-review: 65/85] Added libunistring-based word-break
- Date: Tue, 4 May 2010 17:29:28 +0000 (UTC)
commit 6c804624b648b4dae076617ed3c622e69aa6e542
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue Apr 27 20:51:32 2010 +0200
Added libunistring-based word-break
src/libtracker-fts/Makefile.am | 10 ++-
src/libtracker-fts/tracker-parser.c | 227 +++++++++++++++++++++++++++++++----
2 files changed, 210 insertions(+), 27 deletions(-)
---
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 4938097..c174566 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -12,6 +12,10 @@ INCLUDES = \
$(UNAC_CFLAGS) \
$(SQLITE3_CFLAGS)
+if HAVE_LIBUNISTRING
+INCLUDES += $(LIBUNISTRING_CFLAGS)
+endif
+
noinst_LTLIBRARIES = libtracker-fts.la
libtracker_fts_la_SOURCES = \
@@ -32,4 +36,8 @@ libtracker_fts_la_LIBADD = \
$(GCOV_LIBS) \
$(PANGO_LIBS) \
$(UNAC_LIBS) \
- $(GLIB2_LIBS)
+ $(GLIB2_LIBS)
+
+if HAVE_LIBUNISTRING
+libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+endif
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index bd9326f..3d37874 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -21,15 +21,23 @@
#include "config.h"
#include <string.h>
-#include <pango/pango.h>
#ifdef HAVE_UNAC
#include <unac.h>
#endif
+#ifdef HAVE_LIBUNISTRING
+/* # include <unistr.h> */
+# include <uniwbrk.h>
+# include <unictype.h>
+#else
+#include <pango/pango.h>
+#endif
+
#include "tracker-parser.h"
-#define INDEX_NUMBER_MIN_LENGTH 6
+
+#ifndef HAVE_LIBUNISTRING
/* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
* 0x9FA5, 0x20000 - <= 0x2A6D6
@@ -70,6 +78,10 @@ typedef enum {
TRACKER_PARSER_ENCODING_OTHER
} TrackerParserEncoding;
+
+#endif /* !HAVE_LIBUNISTRING */
+
+
struct TrackerParser {
const gchar *txt;
gint txt_size;
@@ -86,15 +98,29 @@ struct TrackerParser {
gchar *word;
gint word_length;
guint word_position;
+#ifndef HAVE_LIBUNISTRING
TrackerParserEncoding encoding;
const gchar *cursor;
+#endif /* !HAVE_LIBUNISTRING */
+#ifndef HAVE_LIBUNISTRING
/* Pango members for CJK text parsing */
PangoLogAttr *attrs;
guint attr_length;
guint attr_pos;
+#else
+ /* Cursor, as index of the input array of bytes */
+ gsize cursor;
+ /* libunistring flags array */
+ gchar *word_break_flags;
+ /* general category of the start character in words */
+ uc_general_category_t allowed_start;
+#endif /* !HAVE_LIBUNISTRING */
};
+
+
+#ifndef HAVE_LIBUNISTRING
static inline TrackerParserWordType
get_word_type (gunichar c)
{
@@ -137,6 +163,9 @@ get_word_type (gunichar c)
return TRACKER_PARSER_WORD_IGNORE;
}
+#endif /* !HAVE_LIBUNISTRING */
+
+
static inline gchar *
strip_word (const gchar *str,
@@ -190,6 +219,8 @@ strip_word (const gchar *str,
#endif
}
+
+#ifndef HAVE_LIBUNISTRING
static TrackerParserEncoding
get_encoding (const gchar *txt)
{
@@ -218,6 +249,9 @@ get_encoding (const gchar *txt)
}
+#endif /* !HAVE_LIBUNISTRING */
+
+
static gboolean
is_stop_word (TrackerLanguage *language,
const gchar *word)
@@ -233,6 +267,9 @@ is_stop_word (TrackerLanguage *language,
return g_hash_table_lookup (stop_words, word) != NULL;
}
+
+#ifndef HAVE_LIBUNISTRING
+
static gboolean
pango_next (TrackerParser *parser,
gint *byte_offset_start,
@@ -492,6 +529,107 @@ parser_next (TrackerParser *parser,
}
+#else
+
+/*
+ * parser_next (libunistring variant):
+ * @parser: parser previously prepared with tracker_parser_reset()
+ * @byte_offset_start: (out) byte offset in parser->txt where the word starts
+ * @byte_offset_end: (out) byte offset of the first byte after the word
+ *
+ * Looks for the next word in parser->txt, starting at parser->cursor and
+ * using the per-byte flags stored in parser->word_break_flags (filled in by
+ * u8_wordbreaks() during tracker_parser_reset()). A candidate is only
+ * accepted when its first character belongs to parser->allowed_start;
+ * other candidates are skipped.
+ *
+ * On success the processed word is stored in parser->word (length in
+ * parser->word_length), the cursor is moved past the word and TRUE is
+ * returned. Both output offsets are reset to 0 on entry, so they are 0
+ * whenever FALSE is returned.
+ *
+ * libunistring calls used here:
+ *   void u8_wordbreaks (const uint8_t *s, size_t n, char *p)
+ *   int  u8_strmbtouc  (ucs4_t *puc, const uint8_t *s)
+ */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	gchar *word = NULL;
+	gsize  word_length = 0;
+	gsize  text_size;
+
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	/* txt_size is a signed gint while the cursor is a gsize; convert it
+	 * once, explicitly, so the comparisons below are not subject to a
+	 * silent signed-to-unsigned conversion of a non-positive size. */
+	if (parser->txt_size <= 0) {
+		return FALSE;
+	}
+	text_size = (gsize) parser->txt_size;
+
+	/* Loop to look for next valid word */
+	while (!word && parser->cursor < text_size) {
+		ucs4_t first_unichar;
+		gint   first_unichar_len;
+		gsize  word_end;
+
+		/* Get first character of the word as UCS4.
+		 * NOTE(review): u8_strmbtouc() takes no length, so this
+		 * presumably relies on parser->txt being NUL-terminated;
+		 * confirm against the callers of tracker_parser_reset(). */
+		first_unichar_len = u8_strmbtouc (&first_unichar,
+		                                  (const uint8_t *) &parser->txt[parser->cursor]);
+
+		if (first_unichar_len <= 0) {
+			/* 0 means a NUL byte was found, a negative value means
+			 * an invalid sequence. Check this BEFORE using the
+			 * length as an offset (a negative value would wrap the
+			 * unsigned index) and just force stop here. */
+			parser->cursor = text_size;
+			break;
+		}
+
+		/* Find next word break. The bounds check must come first:
+		 * word_break_flags only holds text_size entries. */
+		word_end = parser->cursor + (gsize) first_unichar_len;
+		if (word_end > text_size) {
+			/* Character decoded past the declared size; clamp */
+			word_end = text_size;
+		}
+		while (word_end < text_size &&
+		       !parser->word_break_flags[word_end]) {
+			word_end++;
+		}
+
+		/* Word end is the first byte after the word, which is either the
+		 * start of next word or the end of the string */
+		word_length = word_end - parser->cursor;
+
+		/* We only want the words where the first character
+		 * in the word is either a letter, a number or a symbol.
+		 * This is needed because the word break algorithm also
+		 * considers word breaks after for example commas or other
+		 * punctuation marks.
+		 * Note that looking at the first character in the string
+		 * should be compatible with all Unicode normalization
+		 * methods.
+		 */
+		if (uc_is_general_category (first_unichar, parser->allowed_start)) {
+			word = g_malloc (word_length + 1);
+			memcpy (word, &parser->txt[parser->cursor], word_length);
+			word[word_length] = '\0';
+		} else {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+		}
+	}
+
+	/* If we got a word here, process it */
+	if (word) {
+		gchar *processed_word;
+
+		/* Set outputs */
+		*byte_offset_start = (gint) parser->cursor;
+		*byte_offset_end = (gint) (parser->cursor + word_length);
+
+		/* Update cursor */
+		parser->cursor += word_length;
+
+		/* TODO: tolower, do_strip */
+
+		processed_word = tracker_parser_process_word (parser, word, (gint) word_length, TRUE);
+		g_free (word);
+
+		if (processed_word) {
+			parser->word_length = strlen (processed_word);
+			parser->word = processed_word;
+
+			return TRUE;
+		}
+
+		/* NOTE(review): when processing fails we return FALSE even
+		 * though more text may follow the cursor; confirm that callers
+		 * do not treat this as "end of input". */
+	}
+
+	/* No more words... */
+	return FALSE;
+}
+
+#endif /* !HAVE_LIBUNISTRING */
+
+
TrackerParser *
tracker_parser_new (TrackerLanguage *language,
gint max_word_length)
@@ -507,7 +645,12 @@ tracker_parser_new (TrackerLanguage *language,
parser->max_word_length = max_word_length;
parser->word_length = 0;
+
+#ifndef HAVE_LIBUNISTRING
parser->attrs = NULL;
+#else
+ parser->word_break_flags = NULL;
+#endif /* !HAVE_LIBUNISTRING */
return parser;
}
@@ -521,7 +664,11 @@ tracker_parser_free (TrackerParser *parser)
g_object_unref (parser->language);
}
+#ifndef HAVE_LIBUNISTRING
g_free (parser->attrs);
+#else
+ g_free (parser->word_break_flags);
+#endif /* !HAVE_LIBUNISTRING */
g_free (parser->word);
@@ -540,13 +687,18 @@ tracker_parser_reset (TrackerParser *parser,
g_return_if_fail (parser != NULL);
g_return_if_fail (txt != NULL);
+#ifndef HAVE_LIBUNISTRING
g_free (parser->attrs);
parser->attrs = NULL;
+ parser->cursor = txt;
+ parser->encoding = get_encoding (txt);
+#endif
+
parser->enable_stemmer = enable_stemmer;
parser->enable_stop_words = enable_stop_words;
parser->delimit_words = delimit_words;
- parser->encoding = get_encoding (txt);
+
parser->txt_size = txt_size;
parser->txt = txt;
parser->parse_reserved_words = parse_reserved_words;
@@ -556,8 +708,27 @@ tracker_parser_reset (TrackerParser *parser,
parser->word_position = 0;
- parser->cursor = txt;
+#ifdef HAVE_LIBUNISTRING
+ parser->cursor = 0;
+
+ g_free (parser->word_break_flags);
+
+ /* Create array of flags, same size as original text. */
+ parser->word_break_flags = g_malloc (txt_size);
+
+ /* Get wordbreak flags in the whole string */
+ u8_wordbreaks ((const uint8_t *)txt,
+ (size_t) txt_size,
+ (char *)parser->word_break_flags);
+
+ /* Prepare a custom category which is a combination of the
+ * desired ones */
+ parser->allowed_start = UC_LETTER;
+ parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
+ parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
+
+#else
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
PangoLogAttr *attrs;
@@ -579,24 +750,23 @@ tracker_parser_reset (TrackerParser *parser,
parser->attrs = attrs;
parser->attr_pos = 0;
}
+#endif /* !HAVE_LIBUNISTRING */
}
gchar *
tracker_parser_process_word (TrackerParser *parser,
- const char *word,
+ const gchar *word,
gint length,
gboolean do_strip)
{
gchar *stem_word;
gchar *str;
- gchar *stripped_word;
guint bytes, len;
g_return_val_if_fail (parser != NULL, NULL);
g_return_val_if_fail (word != NULL, NULL);
str = NULL;
- stripped_word = NULL;
if (word) {
if (length == -1) {
@@ -605,34 +775,33 @@ tracker_parser_process_word (TrackerParser *parser,
bytes = length;
}
- if (do_strip) {
- stripped_word = strip_word (word, bytes, &len);
- } else {
- stripped_word = NULL;
- }
-
- if (!stripped_word) {
- str = g_utf8_normalize (word,
- bytes,
- G_NORMALIZE_NFC);
- } else {
- str = g_utf8_normalize (stripped_word,
- len,
- G_NORMALIZE_NFC);
- g_free (stripped_word);
- }
+ g_debug ("ORIGINAL word: '%s'", word);
+ str = g_utf8_normalize (word,
+ bytes,
+ G_NORMALIZE_NFC);
if (!str) {
return NULL;
}
+ len = strlen (str);
+ g_debug (" After NFC normalization: '%s'", str);
+
+ if (do_strip) {
+ gchar *stripped_word;
+
+ stripped_word = strip_word (str, len, &len);
+ g_debug (" After UNAC stripping: '%s'", stripped_word);
+ g_free (str);
+ str = stripped_word;
+ }
+
if (!parser->enable_stemmer) {
return str;
}
- len = strlen (str);
-
stem_word = tracker_language_stem_word (parser->language, str, len);
+ g_debug (" After Stemming: '%s'", stem_word);
if (stem_word) {
g_free (str);
@@ -660,6 +829,8 @@ tracker_parser_next (TrackerParser *parser,
g_free (parser->word);
parser->word = NULL;
+
+#ifndef HAVE_LIBUNISTRING
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
if (pango_next (parser, &byte_start, &byte_end)) {
str = parser->word;
@@ -667,7 +838,9 @@ tracker_parser_next (TrackerParser *parser,
parser->word_position++;
*stop_word = FALSE;
- } else {
+ } else
+#endif /* !HAVE_LIBUNISTRING */
+ {
if (parser_next (parser, &byte_start, &byte_end)) {
str = parser->word;
}
@@ -688,3 +861,5 @@ tracker_parser_next (TrackerParser *parser,
return str;
}
+
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]