[tracker/parser-unicode-libs-review] Filtering rules updated



commit a88554a8a5d66030ab4819efe73103e170eaadef
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Fri May 7 13:46:43 2010 +0200

    Filtering rules updated
    
     * By default, skip numbers
     * Enable underscore as word starter
     * Remove symbols from valid word starters

 src/libtracker-fts/tracker-fts.c                 |    6 +-
 src/libtracker-fts/tracker-parser-glib.c         |   19 +++++----
 src/libtracker-fts/tracker-parser-libicu.c       |   47 ++++++++++------------
 src/libtracker-fts/tracker-parser-libunistring.c |   30 ++++++--------
 src/libtracker-fts/tracker-parser-utils.h        |   11 +++++
 src/libtracker-fts/tracker-parser.h              |    3 +-
 tests/libtracker-fts/tracker-parser-test.c       |    1 +
 7 files changed, 62 insertions(+), 55 deletions(-)
---
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index 857b3f8..f6c5b8b 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -3658,7 +3658,7 @@ static void snippetOffsetsOfColumn(
   pVtab = pQuery->pFts;
   nColumn = pVtab->nColumn;
 
-  tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, FALSE);
+  tracker_parser_reset (pVtab->parser, zDoc, nDoc, FALSE, TRUE, pVtab->stop_words, TRUE, TRUE);
 
   aTerm = pQuery->pTerms;
   nTerm = pQuery->nTerms;
@@ -4355,7 +4355,7 @@ static int tokenizeSegment(
   int firstIndex = pQuery->nTerms;
   int nTerm = 1;
 
-  tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, TRUE);
+  tracker_parser_reset (parser, pSegment, nSegment, FALSE, TRUE, v->stop_words, FALSE, TRUE);
 
   while( 1 ){
     const char *pToken;
@@ -4808,7 +4808,7 @@ int Catid,
 
   if (!zText) return SQLITE_OK;
 
-  tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, FALSE);
+  tracker_parser_reset (parser, zText, strlen (zText), FALSE, TRUE, v->stop_words, TRUE, TRUE);
 
   while( 1 ){
 
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index 83a969b..555331f 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -77,7 +77,8 @@ struct TrackerParser {
 	guint                  max_words_to_index;
 	guint                  max_word_length;
 	gboolean               delimit_words;
-	gboolean               parse_reserved_words;
+	gboolean               skip_reserved_words;
+	gboolean               skip_numbers;
 
 	/* Private members */
 	gchar                   *word;
@@ -278,14 +279,14 @@ parser_next (TrackerParser *parser,
 				/* word break */
 
 				/* check if word is reserved */
-				if (is_valid && parser->parse_reserved_words) {
+				if (is_valid && parser->skip_reserved_words) {
 					if (length == 2 && word[0] == 'o' && word[1] == 'r') {
-						break;
+						is_valid = FALSE;
 					}
 				}
 
 				if (!is_valid ||
-				    word_type == TRACKER_PARSER_WORD_NUM) {
+				    (parser->skip_numbers && word_type == TRACKER_PARSER_WORD_NUM)) {
 					word_type = TRACKER_PARSER_WORD_IGNORE;
 					is_valid = TRUE;
 					length = 0;
@@ -312,12 +313,12 @@ parser_next (TrackerParser *parser,
 			 * underscore if we are filtering.
 			 */
 
-			if (type == TRACKER_PARSER_WORD_NUM) {
+			if (parser->skip_numbers && type == TRACKER_PARSER_WORD_NUM) {
 				is_valid = FALSE;
 				continue;
 			} else {
 				if (type == TRACKER_PARSER_WORD_HYPHEN) {
-					is_valid = parser->parse_reserved_words;
+					is_valid = !parser->skip_reserved_words;
 					continue;
 				}
 			}
@@ -461,7 +462,8 @@ tracker_parser_reset (TrackerParser *parser,
                       gboolean       delimit_words,
                       gboolean       enable_stemmer,
                       gboolean       enable_stop_words,
-                      gboolean       parse_reserved_words)
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
 {
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
@@ -479,7 +481,8 @@ tracker_parser_reset (TrackerParser *parser,
 
 	parser->txt_size = txt_size;
 	parser->txt = txt;
-	parser->parse_reserved_words = parse_reserved_words;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
 
 	g_free (parser->word);
 	parser->word = NULL;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 33c062c..190931c 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -34,14 +34,6 @@
 #include "tracker-parser.h"
 #include "tracker-parser-utils.h"
 
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_UCS4(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
-#define IS_CJK_UCS4(c)   (((c) >= 0x3400 && (c) <= 0x4DB5)  || \
-                          ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
-                          ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
 /* Type of words detected */
 typedef enum {
 	TRACKER_PARSER_WORD_TYPE_ASCII,
@@ -69,7 +61,8 @@ struct TrackerParser {
 	guint                  max_words_to_index;
 	guint                  max_word_length;
 	gboolean               delimit_words;
-	gboolean               parse_reserved_words;
+	gboolean               skip_reserved_words;
+	gboolean               skip_numbers;
 
 	/* Private members */
 	gchar                 *word;
@@ -93,6 +86,7 @@ struct TrackerParser {
 static gboolean
 get_word_info (const UChar           *word,
                gsize                  word_length,
+               gboolean               skip_numbers,
                gboolean              *p_is_allowed_word_start,
                TrackerParserWordType *p_word_type)
 {
@@ -117,22 +111,20 @@ get_word_info (const UChar           *word,
 	 *  methods.
 	 */
 	unichar_gc = u_charType (unichar);
-	if (unichar_gc != U_UPPERCASE_LETTER &&
-	    unichar_gc != U_LOWERCASE_LETTER &&
-	    unichar_gc != U_TITLECASE_LETTER &&
-	    unichar_gc != U_MODIFIER_LETTER &&
-	    unichar_gc != U_OTHER_LETTER &&
-	    unichar_gc != U_DECIMAL_DIGIT_NUMBER &&
-	    unichar_gc != U_LETTER_NUMBER &&
-	    unichar_gc != U_OTHER_NUMBER &&
-	    unichar_gc != U_MATH_SYMBOL &&
-	    unichar_gc != U_CURRENCY_SYMBOL &&
-	    unichar_gc != U_MODIFIER_SYMBOL &&
-	    unichar_gc != U_OTHER_SYMBOL) {
+	if (unichar_gc == U_UPPERCASE_LETTER ||
+	    unichar_gc == U_LOWERCASE_LETTER ||
+	    unichar_gc == U_TITLECASE_LETTER ||
+	    unichar_gc == U_MODIFIER_LETTER ||
+	    unichar_gc == U_OTHER_LETTER ||
+	    IS_UNDERSCORE_UCS4 ((guint32)unichar) ||
+	    (!skip_numbers &&
+	     (unichar_gc == U_DECIMAL_DIGIT_NUMBER ||
+	      unichar_gc == U_LETTER_NUMBER ||
+	      unichar_gc == U_OTHER_NUMBER))) {
+		*p_is_allowed_word_start = TRUE;
+	} else {
 		*p_is_allowed_word_start = FALSE;
 		return TRUE;
-	} else {
-		*p_is_allowed_word_start = TRUE;
 	}
 
 	/* Word starts with a CJK character? */
@@ -215,6 +207,7 @@ parser_next (TrackerParser *parser,
 		/* Get word info... */
 		if (!get_word_info (&parser->utxt[parser->cursor],
 		                    word_length_uchar,
+		                    parser->skip_numbers,
 		                    &is_allowed,
 		                    &type)) {
 			/* Quit loop just in case */
@@ -230,7 +223,7 @@ parser_next (TrackerParser *parser,
 		}
 
 		/* check if word is reserved (looking at ORIGINAL UTF-8 buffer here! */
-		if (parser->parse_reserved_words &&
+		if (parser->skip_reserved_words &&
 		    tracker_parser_is_reserved_word_utf8 (&parser->txt[current_word_offset_utf8],
 		                                          word_length_utf8)) {
 			/* Skip this word and keep on looping */
@@ -333,7 +326,8 @@ tracker_parser_reset (TrackerParser *parser,
                       gboolean       delimit_words,
                       gboolean       enable_stemmer,
                       gboolean       enable_stop_words,
-                      gboolean       parse_reserved_words)
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
 {
 	UErrorCode error = U_ZERO_ERROR;
 	UConverter *converter;
@@ -349,7 +343,8 @@ tracker_parser_reset (TrackerParser *parser,
 
 	parser->txt_size = txt_size;
 	parser->txt = txt;
-	parser->parse_reserved_words = parse_reserved_words;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
 
 	g_free (parser->word);
 	parser->word = NULL;
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index bad3cea..4a6ff35 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -33,15 +33,6 @@
 #include "tracker-parser.h"
 #include "tracker-parser-utils.h"
 
-
-/* ASCII-7 is in range [0x00,0x7F] */
-#define IS_ASCII_BYTE(c) ((c) <= 0x7F)
-
-/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
-#define IS_CJK_UCS4(c)   (((c) >= 0x3400 && (c) <= 0x4DB5)  || \
-                          ((c) >= 0x4E00 && (c) <= 0x9FA5)  || \
-                          ((c) >= 0x20000 && (c) <= 0x2A6D6))
-
 /* Type of words detected */
 typedef enum {
 	TRACKER_PARSER_WORD_TYPE_ASCII,
@@ -67,7 +58,8 @@ struct TrackerParser {
 	guint                  max_words_to_index;
 	guint                  max_word_length;
 	gboolean               delimit_words;
-	gboolean               parse_reserved_words;
+	gboolean               skip_reserved_words;
+	gboolean               skip_numbers;
 
 	/* Private members */
 	gchar                   *word;
@@ -115,7 +107,7 @@ get_word_info (TrackerParser         *parser,
 	       !parser->word_break_flags [i]) {
 
 		if (ascii_only &&
-		    !IS_ASCII_BYTE ((guchar)parser->txt[i])) {
+		    !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
 			ascii_only = FALSE;
 		}
 
@@ -135,7 +127,8 @@ get_word_info (TrackerParser         *parser,
 	 *  should be compatible with all Unicode normalization
 	 *  methods.
 	 */
-	if (!uc_is_general_category (first_unichar,
+	if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
+	    !uc_is_general_category (first_unichar,
 	                             parser->allowed_start)) {
 		*p_is_allowed_word_start = FALSE;
 		return TRUE;
@@ -197,7 +190,7 @@ parser_next (TrackerParser *parser,
 		}
 
 		/* check if word is reserved and skip it if so */
-		if (parser->parse_reserved_words &&
+		if (parser->skip_reserved_words &&
 		    tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
 		                                          word_length)) {
 			/* Skip this word and keep on looping */
@@ -288,7 +281,8 @@ tracker_parser_reset (TrackerParser *parser,
                       gboolean       delimit_words,
                       gboolean       enable_stemmer,
                       gboolean       enable_stop_words,
-                      gboolean       parse_reserved_words)
+                      gboolean       skip_reserved_words,
+                      gboolean       skip_numbers)
 {
 	g_return_if_fail (parser != NULL);
 	g_return_if_fail (txt != NULL);
@@ -299,7 +293,8 @@ tracker_parser_reset (TrackerParser *parser,
 
 	parser->txt_size = txt_size;
 	parser->txt = txt;
-	parser->parse_reserved_words = parse_reserved_words;
+	parser->skip_reserved_words = skip_reserved_words;
+	parser->skip_numbers = skip_numbers;
 
 	g_free (parser->word);
 	parser->word = NULL;
@@ -321,8 +316,9 @@ tracker_parser_reset (TrackerParser *parser,
 	/* Prepare a custom category which is a combination of the
 	 * desired ones */
 	parser->allowed_start = UC_LETTER;
-	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
-	parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_SYMBOL);
+	if (!parser->skip_numbers) {
+		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
+	}
 }
 
 gchar *
diff --git a/src/libtracker-fts/tracker-parser-utils.h b/src/libtracker-fts/tracker-parser-utils.h
index 9c007bd..50805c1 100644
--- a/src/libtracker-fts/tracker-parser-utils.h
+++ b/src/libtracker-fts/tracker-parser-utils.h
@@ -30,6 +30,17 @@
 
 G_BEGIN_DECLS
 
+/* ASCII-7 is in range [0x00,0x7F] */
+#define IS_ASCII_UCS4(c)      ((c) <= 0x7F)
+
+/* CJK ranges are : [0x3400,0x4DB5], [0x4E00,0x9FA5], [0x20000,0x2A6D6]  */
+#define IS_CJK_UCS4(c)        (((c) >= 0x3400 && (c) <= 0x4DB5)  ||	\
+                               ((c) >= 0x4E00 && (c) <= 0x9FA5)  ||	\
+                               ((c) >= 0x20000 && (c) <= 0x2A6D6))
+
+#define IS_UNDERSCORE_UCS4(c) ((c) == 0x005F)
+
+
 gchar *tracker_parser_unaccent_utf16be_word (const gchar *string,
                                              gsize        ilength,
                                              gsize        *p_olength);
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 3175b22..cad4442 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -38,7 +38,8 @@ void           tracker_parser_reset           (TrackerParser   *parser,
                                                gboolean         delimit_words,
                                                gboolean         enable_stemmer,
                                                gboolean         enable_stop_words,
-                                               gboolean         parse_reserved_words);
+                                               gboolean         skip_reserved_words,
+                                               gboolean         skip_numbers);
 
 const gchar *  tracker_parser_next            (TrackerParser   *parser,
                                                gint            *position,
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 1c38215..970c8cc 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -149,6 +149,7 @@ run_parsing (void)
 	                      TRUE,
 	                      TRUE,
 	                      TRUE,
+	                      TRUE,
 	                      TRUE);
 
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]