[tracker/parser-unicode-libs-review: 65/85] Added libunistring-based word-break



commit 6c804624b648b4dae076617ed3c622e69aa6e542
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue Apr 27 20:51:32 2010 +0200

    Added libunistring-based word-break

 src/libtracker-fts/Makefile.am      |   10 ++-
 src/libtracker-fts/tracker-parser.c |  227 +++++++++++++++++++++++++++++++----
 2 files changed, 210 insertions(+), 27 deletions(-)
---
diff --git a/src/libtracker-fts/Makefile.am b/src/libtracker-fts/Makefile.am
index 4938097..c174566 100644
--- a/src/libtracker-fts/Makefile.am
+++ b/src/libtracker-fts/Makefile.am
@@ -12,6 +12,10 @@ INCLUDES =								\
 	$(UNAC_CFLAGS)							\
 	$(SQLITE3_CFLAGS)
 
+if HAVE_LIBUNISTRING
+INCLUDES += $(LIBUNISTRING_CFLAGS)
+endif
+
 noinst_LTLIBRARIES = libtracker-fts.la
 
 libtracker_fts_la_SOURCES = 						\
@@ -32,4 +36,8 @@ libtracker_fts_la_LIBADD =						\
 	$(GCOV_LIBS)							\
 	$(PANGO_LIBS)							\
 	$(UNAC_LIBS)							\
-	$(GLIB2_LIBS)							
+	$(GLIB2_LIBS)
+
+if HAVE_LIBUNISTRING
+libtracker_fts_la_LIBADD += $(LIBUNISTRING_LIBS)
+endif
diff --git a/src/libtracker-fts/tracker-parser.c b/src/libtracker-fts/tracker-parser.c
index bd9326f..3d37874 100644
--- a/src/libtracker-fts/tracker-parser.c
+++ b/src/libtracker-fts/tracker-parser.c
@@ -21,15 +21,23 @@
 #include "config.h"
 
 #include <string.h>
-#include <pango/pango.h>
 
 #ifdef HAVE_UNAC
 #include <unac.h>
 #endif
 
+#ifdef HAVE_LIBUNISTRING
+/* # include <unistr.h> */
+# include <uniwbrk.h>
+# include <unictype.h>
+#else
+#include <pango/pango.h>
+#endif
+
 #include "tracker-parser.h"
 
-#define INDEX_NUMBER_MIN_LENGTH 6
+
+#ifndef HAVE_LIBUNISTRING
 
 /* Need pango for CJK ranges which are : 0x3400 - 0x4DB5, 0x4E00 -
  * 0x9FA5, 0x20000 - <= 0x2A6D6
@@ -70,6 +78,10 @@ typedef enum {
 	TRACKER_PARSER_ENCODING_OTHER
 } TrackerParserEncoding;
 
+
+#endif /* !HAVE_LIBUNISTRING */
+
+
 struct TrackerParser {
 	const gchar           *txt;
 	gint                   txt_size;
@@ -86,15 +98,29 @@ struct TrackerParser {
 	gchar                   *word;
 	gint                    word_length;
 	guint                   word_position;
+#ifndef HAVE_LIBUNISTRING
 	TrackerParserEncoding   encoding;
 	const gchar             *cursor;
+#endif /* !HAVE_LIBUNISTRING */
 
+#ifndef HAVE_LIBUNISTRING
 	/* Pango members for CJK text parsing */
 	PangoLogAttr          *attrs;
 	guint                  attr_length;
 	guint                  attr_pos;
+#else
+	/* Cursor, as index of the input array of bytes */
+	gsize                  cursor;
+	/* libunistring flags array */
+	gchar                 *word_break_flags;
+	/* General categories allowed for the first character of a word */
+	uc_general_category_t  allowed_start;
+#endif /* !HAVE_LIBUNISTRING */
 };
 
+
+
+#ifndef HAVE_LIBUNISTRING
 static inline TrackerParserWordType
 get_word_type (gunichar c)
 {
@@ -137,6 +163,9 @@ get_word_type (gunichar c)
 
 	return TRACKER_PARSER_WORD_IGNORE;
 }
+#endif /* !HAVE_LIBUNISTRING */
+
+
 
 static inline gchar *
 strip_word (const gchar *str,
@@ -190,6 +219,8 @@ strip_word (const gchar *str,
 #endif
 }
 
+
+#ifndef HAVE_LIBUNISTRING
 static TrackerParserEncoding
 get_encoding (const gchar *txt)
 {
@@ -218,6 +249,9 @@ get_encoding (const gchar *txt)
 
 }
 
+#endif /* !HAVE_LIBUNISTRING */
+
+
 static gboolean
 is_stop_word (TrackerLanguage *language,
               const gchar     *word)
@@ -233,6 +267,9 @@ is_stop_word (TrackerLanguage *language,
 	return g_hash_table_lookup (stop_words, word) != NULL;
 }
 
+
+#ifndef HAVE_LIBUNISTRING
+
 static gboolean
 pango_next (TrackerParser *parser,
             gint          *byte_offset_start,
@@ -492,6 +529,107 @@ parser_next (TrackerParser *parser,
 
 }
 
+#else
+
+/* Extracts the next word using the libunistring word-break flags computed
+ *  in tracker_parser_reset(). On success stores the processed word in
+ *  parser->word / parser->word_length, sets the byte offsets and returns
+ *  TRUE; returns FALSE when no further word could be produced. */
+static gboolean
+parser_next (TrackerParser *parser,
+             gint          *byte_offset_start,
+             gint          *byte_offset_end)
+{
+	gchar *word = NULL;
+	gsize word_length = 0;
+
+	*byte_offset_start = 0;
+	*byte_offset_end = 0;
+
+	g_return_val_if_fail (parser, FALSE);
+
+	/* Loop to look for next valid word */
+	while (!word &&
+	       parser->cursor < (gsize) parser->txt_size) {
+		ucs4_t first_unichar;
+		gint first_unichar_len;
+		gsize i;
+
+		/* Get first character of the word as UCS4.
+		 * NOTE(review): u8_strmbtouc() is declared in <unistr.h>, whose
+		 *  include is commented out at the top of this file -- confirm. */
+		first_unichar_len = u8_strmbtouc (&first_unichar,
+		                                  (const uint8_t *) &(parser->txt[parser->cursor]));
+
+		/* 0 means NUL and a negative value means invalid input: never
+		 *  use such a length as an offset, just force stop here */
+		if (first_unichar_len <= 0) {
+			parser->cursor = parser->txt_size;
+			break;
+		}
+
+		/* Find next word break. The bounds check must come first, as
+		 *  the flags array has exactly txt_size entries. */
+		i = parser->cursor + first_unichar_len;
+		while (i < (gsize) parser->txt_size &&
+		       !parser->word_break_flags [i]) {
+			i++;
+		}
+
+		/* Word end is the first byte after the word, which is either the
+		 *  start of next word or the end of the string */
+		word_length = i - parser->cursor;
+
+		/* We only want the words where the first character
+		 *  in the word is either a letter, a number or a symbol.
+		 * This is needed because the word break algorithm also
+		 *  considers word breaks after for example commas or other
+		 *  punctuation marks.
+		 * Note that looking at the first character in the string
+		 *  should be compatible with all Unicode normalization
+		 *  methods.
+		 */
+		if (uc_is_general_category (first_unichar, parser->allowed_start)) {
+			word = g_malloc (word_length + 1);
+			memcpy (word, &(parser->txt[parser->cursor]), word_length);
+			word[word_length] = '\0';
+		} else {
+			/* Skip this word and keep on looping */
+			parser->cursor += word_length;
+		}
+	}
+
+	/* If we got a word here, process it */
+	if (word) {
+		gchar *processed_word;
+
+		/* Set outputs */
+		*byte_offset_start = parser->cursor;
+		*byte_offset_end = parser->cursor + word_length;
+
+		/* Update cursor */
+		parser->cursor += word_length;
+
+		/* TODO: tolower, do_strip */
+
+		processed_word = tracker_parser_process_word (parser, word, word_length, TRUE);
+		g_free (word);
+
+		if (processed_word) {
+			parser->word_length = strlen (processed_word);
+			parser->word = processed_word;
+
+			return TRUE;
+		}
+	}
+
+	/* No more words... */
+	return FALSE;
+}
+
+#endif /* !HAVE_LIBUNISTRING */
+
+
 TrackerParser *
 tracker_parser_new (TrackerLanguage *language,
                     gint             max_word_length)
@@ -507,7 +645,12 @@ tracker_parser_new (TrackerLanguage *language,
 
 	parser->max_word_length = max_word_length;
 	parser->word_length = 0;
+
+#ifndef HAVE_LIBUNISTRING
 	parser->attrs = NULL;
+#else
+	parser->word_break_flags = NULL;
+#endif /* !HAVE_LIBUNISTRING */
 
 	return parser;
 }
@@ -521,7 +664,11 @@ tracker_parser_free (TrackerParser *parser)
 		g_object_unref (parser->language);
 	}
 
+#ifndef HAVE_LIBUNISTRING
 	g_free (parser->attrs);
+#else
+	g_free (parser->word_break_flags);
+#endif /* !HAVE_LIBUNISTRING */
 
 	g_free (parser->word);
 
@@ -540,13 +687,18 @@ tracker_parser_reset (TrackerParser *parser,
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
 
+#ifndef HAVE_LIBUNISTRING
 	g_free (parser->attrs);
 	parser->attrs = NULL;
 
+	parser->cursor = txt;
+	parser->encoding = get_encoding (txt);
+#endif
+
 	parser->enable_stemmer = enable_stemmer;
 	parser->enable_stop_words = enable_stop_words;
 	parser->delimit_words = delimit_words;
-	parser->encoding = get_encoding (txt);
+
 	parser->txt_size = txt_size;
 	parser->txt = txt;
 	parser->parse_reserved_words = parse_reserved_words;
@@ -556,8 +708,27 @@ tracker_parser_reset (TrackerParser *parser,
 
 	parser->word_position = 0;
 
-	parser->cursor = txt;
+#ifdef HAVE_LIBUNISTRING
 
+	parser->cursor = 0;
+
+	g_free (parser->word_break_flags);
+
+	/* Create array of flags, same size as original text. */
+	parser->word_break_flags = g_malloc (txt_size);
+
+	/* Get wordbreak flags in the whole string */
+	u8_wordbreaks ((const uint8_t *)txt,
+	               (size_t) txt_size,
+	               (char *)parser->word_break_flags);
+
+	/* Prepare a custom category which is a combination of the
+	 * desired ones */
+	parser->allowed_start = UC_LETTER;
+	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
+	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
+
+#else
 	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
 		PangoLogAttr *attrs;
 
@@ -579,24 +750,23 @@ tracker_parser_reset (TrackerParser *parser,
 		parser->attrs = attrs;
 		parser->attr_pos = 0;
 	}
+#endif /* !HAVE_LIBUNISTRING */
 }
 
 gchar *
 tracker_parser_process_word (TrackerParser *parser,
-                             const char    *word,
+                             const gchar    *word,
                              gint           length,
                              gboolean       do_strip)
 {
 	gchar *stem_word;
 	gchar *str;
-	gchar *stripped_word;
 	guint  bytes, len;
 
 	g_return_val_if_fail (parser != NULL, NULL);
 	g_return_val_if_fail (word != NULL, NULL);
 
 	str = NULL;
-	stripped_word = NULL;
 
 	if (word) {
 		if (length == -1) {
@@ -605,34 +775,33 @@ tracker_parser_process_word (TrackerParser *parser,
 			bytes = length;
 		}
 
-		if (do_strip) {
-			stripped_word = strip_word (word, bytes, &len);
-		} else {
-			stripped_word = NULL;
-		}
-
-		if (!stripped_word) {
-			str = g_utf8_normalize (word,
-			                        bytes,
-			                        G_NORMALIZE_NFC);
-		} else {
-			str = g_utf8_normalize (stripped_word,
-			                        len,
-			                        G_NORMALIZE_NFC);
-			g_free (stripped_word);
-		}
+		g_debug ("ORIGINAL word: '%s'", word);
 
+		str = g_utf8_normalize (word,
+		                        bytes,
+		                        G_NORMALIZE_NFC);
 		if (!str) {
 			return NULL;
 		}
 
+		len = strlen (str);
+		g_debug ("  After NFC normalization: '%s'", str);
+
+		if (do_strip) {
+			gchar *stripped_word;
+
+			stripped_word = strip_word (str, len, &len);
+			g_debug (" After UNAC stripping: '%s'", stripped_word);
+			g_free (str);
+			str = stripped_word;
+		}
+
 		if (!parser->enable_stemmer) {
 			return str;
 		}
 
-		len = strlen (str);
-
 		stem_word = tracker_language_stem_word (parser->language, str, len);
+		g_debug ("  After Stemming: '%s'", stem_word);
 
 		if (stem_word) {
 			g_free (str);
@@ -660,6 +829,8 @@ tracker_parser_next (TrackerParser *parser,
 	g_free (parser->word);
 	parser->word = NULL;
 
+
+#ifndef HAVE_LIBUNISTRING
 	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
 		if (pango_next (parser, &byte_start, &byte_end)) {
 			str = parser->word;
@@ -667,7 +838,9 @@ tracker_parser_next (TrackerParser *parser,
 		parser->word_position++;
 
 		*stop_word = FALSE;
-	} else {
+	} else
+#endif /* !HAVE_LIBUNISTRING */
+	{
 		if (parser_next (parser, &byte_start, &byte_end)) {
 			str = parser->word;
 		}
@@ -688,3 +861,5 @@ tracker_parser_next (TrackerParser *parser,
 	return str;
 }
 
+
+



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]