tracker r1954 - in branches/indexer-split: . src/libtracker-common tests/libtracker-common



Author: ifrade
Date: Mon Jul 28 13:34:44 2008
New Revision: 1954
URL: http://svn.gnome.org/viewvc/tracker?rev=1954&view=rev

Log:
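Added stop word filter to the parser: when word filtering is enabled,
words found in the language's stop word list (checked after stemming)
are dropped from the result. tracker_language_stem_word() now returns a
copy of the word instead of NULL when the stemmer is disabled. Parser
tests updated to cover stop words.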

Modified:
   branches/indexer-split/ChangeLog
   branches/indexer-split/src/libtracker-common/tracker-language.c
   branches/indexer-split/src/libtracker-common/tracker-parser.c
   branches/indexer-split/tests/libtracker-common/tracker-parser-test.c

Modified: branches/indexer-split/src/libtracker-common/tracker-language.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-language.c	(original)
+++ branches/indexer-split/src/libtracker-common/tracker-language.c	Mon Jul 28 13:34:44 2008
@@ -425,7 +425,7 @@
 	priv = GET_PRIV (language);
 
 	if (!tracker_config_get_enable_stemmer (priv->config)) {
-		return NULL;
+		return g_strdup (word);
 	}
 
 	g_mutex_lock (priv->stemmer_mutex);

Modified: branches/indexer-split/src/libtracker-common/tracker-parser.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.c	(original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.c	Mon Jul 28 13:34:44 2008
@@ -144,6 +144,18 @@
 	return FALSE;
 }
 
+static gboolean
+is_stop_word (TrackerLanguage *language,
+              const gchar     *word)
+{
+        GHashTable *stop_words;
+        
+        stop_words = tracker_language_get_stop_words (language);
+
+        return (g_hash_table_lookup (stop_words, word) != NULL);
+}
+
+
 static const gchar *
 analyze_text (const gchar      *text, 
               TrackerLanguage  *language,
@@ -306,11 +318,13 @@
                                 *index_word = tracker_language_stem_word (language, 
                                                                           tmp, 
                                                                           strlen (tmp));
-                                if (*index_word) {
-                                        g_free (tmp);
-                                } else {
-                                        *index_word = tmp;			
+                                g_free (tmp);
+                                
+                                if (filter_words && is_stop_word (language, *index_word)) {
+                                        g_free (*index_word);
+                                        *index_word = NULL;
                                 }
+
                         }
                 }
         } 

Modified: branches/indexer-split/tests/libtracker-common/tracker-parser-test.c
==============================================================================
--- branches/indexer-split/tests/libtracker-common/tracker-parser-test.c	(original)
+++ branches/indexer-split/tests/libtracker-common/tracker-parser-test.c	Mon Jul 28 13:34:44 2008
@@ -9,8 +9,9 @@
 /* 
  * len(word) > 3 : 6 words  
  * longest word: 10 chars
+ * stop words ("here", "a", "of", "various", "to", "after")
  */
-#define SAMPLE_TEXT "Here a good collection of various words to parse 12345678"
+#define SAMPLE_TEXT "Here a good collection of various words to parse 12345678 after"
 
 TrackerConfig *config;
 TrackerLanguage *language;
@@ -72,7 +73,7 @@
                                       3, /* min length of the word */
                                       FALSE, FALSE); /* Filter / Delimit */
         g_hash_table_foreach (result, assert_key_length, GINT_TO_POINTER (max_length));
-        g_assert_cmpint (g_hash_table_size (result), ==, 7);
+        g_assert_cmpint (g_hash_table_size (result), ==, 8);
 
         tracker_parser_text_free (result);        
 }
@@ -81,7 +82,7 @@
  * Filter numbers 
  */
 static void
-test_parser_text_filter_numbers (void)
+test_parser_text_filter_numbers_stop_words (void)
 {
         GHashTable *result = NULL;
 
@@ -92,12 +93,12 @@
                                       language,
                                       100, /* max words to index */
                                       100, /* max length of the word */
-                                      1, /* min length of the word */
+                                      0, /* min length of the word */
                                       TRUE, FALSE); /* Filter / Delimit */
 
         g_assert (!g_hash_table_lookup (result, "12345678"));
 
-        g_assert_cmpint (g_hash_table_size (result), ==, 9);
+        g_assert_cmpint (g_hash_table_size (result), ==, 4);
 
         tracker_parser_text_free (result);        
         result = NULL;
@@ -109,10 +110,10 @@
                                       language,
                                       100, /* max words to index */
                                       100, /* max length of the word */
-                                      1, /* min length of the word */
+                                      0, /* min length of the word */
                                       FALSE, FALSE); /* Filter / Delimit */
 
-        g_assert_cmpint (g_hash_table_size (result), ==, 10);
+        g_assert_cmpint (g_hash_table_size (result), ==, 11);
 
         g_assert (g_hash_table_lookup (result, "12345678"));
 
@@ -123,7 +124,7 @@
 static void
 test_parser_stop_words (void)
 {
-        GHashTable *stop_words;
+        GHashTable *stop_words, *result = NULL;
         
         /* Check we have the default stop words */
         stop_words = tracker_language_get_stop_words (language);
@@ -134,6 +135,14 @@
         tracker_config_set_language (config, "en");
         g_assert (g_hash_table_lookup (stop_words, "after"));
 
+        result = tracker_parser_text (result,
+                                      SAMPLE_TEXT,
+                                      1,
+                                      language,
+                                      100, /* max words to index */
+                                      100, /* max length of the word */
+                                      1, /* min length of the word */
+                                      TRUE, FALSE); /* Filter / Delimit */        
 }
 
 int
@@ -156,7 +165,7 @@
                          test_parser_text_max_length);
 
         g_test_add_func ("/libtracker-common/tracker-parser/parser_text/filter_numbers",
-                         test_parser_text_filter_numbers);
+                         test_parser_text_filter_numbers_stop_words);
 
         g_test_add_func ("/libtracker-common/tracker-parser/stop_words",
                          test_parser_stop_words);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]