[tracker/rss-enclosures] Fixes NB#179570: FTS search of file extension doesn't always work



commit c5cdd688d37f8ee085a4808580b41cc5b4544e5c
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Mon Aug 2 19:30:05 2010 +0200

    Fixes NB#179570: FTS search of file extension doesn't always work
    
      * Added a new set of 'forced wordbreakers', which is just a list
        of Unicode characters which should always act as word separators.
        The fix is just adding the '.' as a forced wordbreaker, so that
        filename.ext is always split into "filename" and "ext".

 src/libtracker-fts/README.parsers                |    7 +++
 src/libtracker-fts/tracker-parser-libicu.c       |   60 ++++++++++++++++++++++
 src/libtracker-fts/tracker-parser-libunistring.c |   50 +++++++++++++-----
 src/libtracker-fts/tracker-parser-utils.h        |    8 +++
 tests/libtracker-fts/tracker-parser-test.c       |   11 +++-
 5 files changed, 119 insertions(+), 17 deletions(-)
---
diff --git a/src/libtracker-fts/README.parsers b/src/libtracker-fts/README.parsers
index 54b4ede..f67d535 100644
--- a/src/libtracker-fts/README.parsers
+++ b/src/libtracker-fts/README.parsers
@@ -43,6 +43,13 @@ Parser based on glib/pango:
  * Performs NFC normalization in non-CJK strings.
 
 
+Notes:
+  * As of tracker 0.9.15, the libunistring and libicu parsers have a list of
+     Unicode characters which will always act as word breakers. This hack works
+     on top of the Unicode word-breaking algorithm, and was mainly done so that
+     FTS searches can match on a file extension (e.g. "filename.ext" is split
+     into "filename" and "ext").
+
 References:
  [1] UAX#29, Unicode Standard Annex #29: TEXT BOUNDARIES
       http://unicode.org/reports/tr29
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 74b2fed..7388f69 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -55,6 +55,7 @@ struct TrackerParser {
 	gboolean               ignore_stop_words;
 	gboolean               ignore_reserved_words;
 	gboolean               ignore_numbers;
+	gboolean               enable_forced_wordbreaks;
 
 	/* Private members */
 	gchar                 *word;
@@ -378,6 +379,47 @@ process_word_uchar (TrackerParser         *parser,
 }
 
 static gboolean
+parser_check_forced_wordbreaks (const UChar *buffer,
+                                gsize        current,
+                                gsize       *next)
+{
+	gsize unicode_word_length = *next - current; /* span proposed by the caller */
+	gsize word_length = 0; /* characters seen before a forced wordbreak */
+	UCharIterator iter;
+	UChar32 unichar;
+
+	uiter_setString (&iter, &buffer[current], unicode_word_length);
+
+	/* Scan the word in buffer[current, *next) for the first forced wordbreak */
+	while ((unichar = uiter_next32 (&iter)) != U_SENTINEL &&
+	       word_length < unicode_word_length) {
+
+		if (IS_FORCED_WORDBREAK_UCS4 ((guint32) unichar)) {
+			/* A leading forced wordbreak becomes a one-character word */
+			if (word_length == 0) {
+				word_length = 1;
+			}
+			break;
+		}
+
+		word_length ++;
+	}
+
+	/* Returns TRUE, with *next set to current + word_length, when the */
+	/* scan stopped before the end of the word; otherwise returns FALSE */
+	/* and leaves *next untouched. */
+	/* NOTE(review): word_length advances once per UChar32 from */
+	/* uiter_next32 but is added to an index into a UChar buffer; confirm */
+	/* this is correct for characters outside the BMP (surrogate pairs). */
+
+	if (word_length != unicode_word_length) {
+		*next = current + word_length;
+		return TRUE;
+	}
+	return FALSE;
+}
+
+static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
              gint          *byte_offset_end,
@@ -407,6 +449,19 @@ parser_next (TrackerParser *parser,
 
 		/* Find next word break. */
 		next_word_offset_uchar = ubrk_next (parser->bi);
+
+		/* Check if any forced wordbreaks here... */
+		if (parser->enable_forced_wordbreaks) {
+			/* Returns TRUE if next word offset changed */
+			if (parser_check_forced_wordbreaks (parser->utxt,
+			                                    parser->cursor,
+			                                    &next_word_offset_uchar)) {
+				/* We need to reset the iterator so that next word
+				 * actually returns the same result */
+				ubrk_previous (parser->bi);
+			}
+		}
+
 		if (next_word_offset_uchar >= parser->utxt_size) {
 			/* Last word support... */
 			next_word_offset_uchar = parser->utxt_size;
@@ -565,6 +620,11 @@ tracker_parser_reset (TrackerParser *parser,
 	parser->ignore_reserved_words = ignore_reserved_words;
 	parser->ignore_numbers = ignore_numbers;
 
+	/* Note: We're forcing some unicode characters to behave
+	 * as wordbreakers: e.g, the '.' The main reason for this
+	 * is to enable FTS searches matching file extension. */
+	parser->enable_forced_wordbreaks = TRUE;
+
 	parser->txt_size = txt_size;
 	parser->txt = txt;
 
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 0ef3db5..1824528 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -54,6 +54,7 @@ struct TrackerParser {
 	gboolean               ignore_stop_words;
 	gboolean               ignore_reserved_words;
 	gboolean               ignore_numbers;
+	gboolean               enable_forced_wordbreaks;
 
 	/* Private members */
 	gchar                 *word;
@@ -76,7 +77,6 @@ get_word_info (TrackerParser         *parser,
 {
 	ucs4_t first_unichar;
 	gint first_unichar_len;
-	gsize i;
 	gboolean ascii_only;
 
 	/* Defaults */
@@ -94,24 +94,41 @@ get_word_info (TrackerParser         *parser,
 		ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
 	}
 
-	/* Find next word break, and in the same loop checking if only ASCII
-	 *  characters */
-	i = parser->cursor + first_unichar_len;
-	while (i < parser->txt_size &&
-	       !parser->word_break_flags [i]) {
+	/* Consider word starts with a forced wordbreak */
+	if (parser->enable_forced_wordbreaks &&
+	    IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
+		*p_word_length = first_unichar_len;
+	} else {
+		gsize i;
 
-		if (ascii_only &&
-		    !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
-			ascii_only = FALSE;
+		/* Find next word break, and in the same loop checking if only ASCII
+		 *  characters */
+		i = parser->cursor + first_unichar_len;
+		while (1) {
+			/* Text bounds reached? */
+			if (i >= parser->txt_size)
+				break;
+			/* Proper unicode word break detected? */
+			if (parser->word_break_flags[i])
+				break;
+			/* Forced word break detected? */
+			if (parser->enable_forced_wordbreaks &&
+			    IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
+				break;
+
+			if (ascii_only &&
+			    !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
+				ascii_only = FALSE;
+			}
+
+			i++;
 		}
 
-		i++;
+		/* Word end is the first byte after the word, which is either the
+		 *  start of next word or the end of the string */
+		*p_word_length = i - parser->cursor;
 	}
 
-	/* Word end is the first byte after the word, which is either the
-	 *  start of next word or the end of the string */
-	*p_word_length = i - parser->cursor;
-
 	/* We only want the words where the first character
 	 *  in the word is either a letter, a number or a symbol.
 	 * This is needed because the word break algorithm also
@@ -454,6 +471,11 @@ tracker_parser_reset (TrackerParser *parser,
 	parser->ignore_reserved_words = ignore_reserved_words;
 	parser->ignore_numbers = ignore_numbers;
 
+	/* Note: We're forcing some unicode characters to behave
+	 * as wordbreakers: e.g, the '.' The main reason for this
+	 * is to enable FTS searches matching file extension. */
+	parser->enable_forced_wordbreaks = TRUE;
+
 	parser->txt_size = txt_size;
 	parser->txt = txt;
 
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index f88a9d1..614740f 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -52,6 +52,14 @@ G_BEGIN_DECLS
                                ((c) >= 0x20D0 && (c) <= 0x20FF)  ||	\
                                ((c) >= 0xFE20 && (c) <= 0xFE2F))
 
+/* Forced word breaks in Unicode parsers.
+ * If any of these is found INSIDE a properly delimited Unicode word, a new word
+ * break is forced and the Unicode word is split in two words.
+ * Evaluates to non-zero when the value (c) is one of the forced wordbreaks:
+ *   - 0x002E: FULL STOP, the dot ('.')
+ */
+#define IS_FORCED_WORDBREAK_UCS4(c) ((c) == 0x002E)
+
 
 gboolean tracker_parser_is_reserved_word_utf8 (const gchar *word,
                                                gsize word_length);
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index b9b8ceb..032cba2 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -313,17 +313,22 @@ static const TestDataExpectedWord test_data_casefolding[] = {
 
 /* Number of expected words tests */
 static const TestDataExpectedNWords test_data_nwords[] = {
-#ifdef FULL_UNICODE_TESTS /* glib/pango thinks 32.3 are 2 words */
+#ifdef FULL_UNICODE_TESTS /* glib/pango assumes ' is a word breaker */
 	{ "The quick (\"brown\") fox can’t jump 32.3 feet, right?", TRUE,   8 },
-	{ "The quick (\"brown\") fox can’t jump 32.3 feet, right?", FALSE,  9 },
+	{ "The quick (\"brown\") fox can’t jump 32.3 feet, right?", FALSE, 10 },
 #endif
+	/* Note: as of 0.9.15, the dot is always a word breaker, even between
+	 *  numbers. */
+	{ "filename.txt",                                           TRUE,   2 },
+	{ ".hidden.txt",                                            TRUE,   2 },
+	{ "noextension.",                                           TRUE,   1 },
 	{ "ã??ã?¢ã?»ã?µã??ã?¨ã?³ã?¹",                                          TRUE,   2 }, /* katakana */
 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't work properly with chinese */
 	{ "æ?¬å·?æ??主æµ?ç??é£?å?³",                                          TRUE,   8 }, /* chinese */
 #endif
 	{ "Американские суда находятся в международных водах.",     TRUE,   6 }, /* russian */
-	{ "Bần chỉ là một anh nghèo xác",                            TRUE,   7 }, /* vietnamese */
 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't work properly with chinese */
+	{ "Bần chỉ là một anh nghèo xác",                            TRUE,   7 }, /* vietnamese */
 	{ "ホモ・サピエンス 本州最主流的风味 katakana, chinese, english", TRUE,  13 }, /* mixed */
 #endif
 	{ NULL,                                                     FALSE,  0 }



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]