[tracker] Fixes GB#526346: New FTS config option to enable/disable stemmer



commit 4ad36d4b0f524114bd2489246fc06a27bc0e0833
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Mon May 24 16:41:09 2010 +0200

    Fixes GB#526346: New FTS config option to enable/disable stemmer
    
    	* Note: By default stemming will be disabled. See bugreport for
    	more details.

 docs/manpages/tracker-fts.cfg.5            |    5 ++
 src/libtracker-fts/tracker-fts-config.c    |   58 ++++++++++++++--
 src/libtracker-fts/tracker-fts-config.h    |    3 +
 src/libtracker-fts/tracker-fts.c           |    8 ++-
 tests/libtracker-fts/tracker-parser-test.c |  102 +++++++++++++++++-----------
 5 files changed, 127 insertions(+), 49 deletions(-)
---
diff --git a/docs/manpages/tracker-fts.cfg.5 b/docs/manpages/tracker-fts.cfg.5
index 176fe54..93587cf 100644
--- a/docs/manpages/tracker-fts.cfg.5
+++ b/docs/manpages/tracker-fts.cfg.5
@@ -23,6 +23,11 @@ Set the minimum length of words to index (0->30).
 Set the maximum length of words to index (0->200).
 
 .TP
+.B EnableStemmer=false
+Set to true if stemming should be applied to each word. Stemming is the process
+of reducing inflected and derived words to their stem, base or root form.
+
+.TP
 .B IgnoreNumbers=true
 Set to true if words starting with numbers should be ignored.
 
diff --git a/src/libtracker-fts/tracker-fts-config.c b/src/libtracker-fts/tracker-fts-config.c
index 736d044..6d53bc4 100644
--- a/src/libtracker-fts/tracker-fts-config.c
+++ b/src/libtracker-fts/tracker-fts-config.c
@@ -35,16 +35,18 @@
 #define GROUP_INDEXING             "Indexing"
 
 /* Default values */
-#define DEFAULT_MIN_WORD_LENGTH            3      /* 0->30 */
-#define DEFAULT_MAX_WORD_LENGTH            30     /* 0->200 */
-#define DEFAULT_MAX_WORDS_TO_INDEX      10000
-#define DEFAULT_IGNORE_NUMBERS           TRUE
-#define DEFAULT_IGNORE_STOP_WORDS        TRUE
+#define DEFAULT_MIN_WORD_LENGTH      3      /* 0->30 */
+#define DEFAULT_MAX_WORD_LENGTH      30     /* 0->200 */
+#define DEFAULT_MAX_WORDS_TO_INDEX   10000
+#define DEFAULT_IGNORE_NUMBERS       TRUE
+#define DEFAULT_IGNORE_STOP_WORDS    TRUE
+#define DEFAULT_ENABLE_STEMMER       FALSE  /* As per GB#526346, disabled */
 
 typedef struct {
 	/* Indexing */
 	gint min_word_length;
 	gint max_word_length;
+	gboolean enable_stemmer;
 	gboolean ignore_numbers;
 	gboolean ignore_stop_words;
 	gint max_words_to_index;
@@ -78,6 +80,7 @@ enum {
 	/* Indexing */
 	PROP_MIN_WORD_LENGTH,
 	PROP_MAX_WORD_LENGTH,
+	PROP_ENABLE_STEMMER,
 	PROP_IGNORE_NUMBERS,
 	PROP_IGNORE_STOP_WORDS,
 
@@ -88,6 +91,7 @@ enum {
 static ObjectToKeyFile conversions[] = {
 	{ G_TYPE_INT,     "min-word-length",    GROUP_INDEXING, "MinWordLength"   },
 	{ G_TYPE_INT,     "max-word-length",    GROUP_INDEXING, "MaxWordLength"   },
+	{ G_TYPE_BOOLEAN, "enable-stemmer",     GROUP_INDEXING, "EnableStemmer"   },
 	{ G_TYPE_BOOLEAN, "ignore-numbers",     GROUP_INDEXING, "IgnoreNumbers"   },
 	{ G_TYPE_BOOLEAN, "ignore-stop-words",  GROUP_INDEXING, "IgnoreStopWords" },
 	{ G_TYPE_INT,     "max-words-to-index", GROUP_INDEXING, "MaxWordsToIndex" },
@@ -125,17 +129,24 @@ tracker_fts_config_class_init (TrackerFTSConfigClass *klass)
 	                                                   DEFAULT_MAX_WORD_LENGTH,
 	                                                   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
 	g_object_class_install_property (object_class,
+	                                 PROP_ENABLE_STEMMER,
+	                                 g_param_spec_boolean ("enable-stemmer",
+	                                                       "Enable Stemmer",
+	                                                       " Flag to enable word stemming utility (default=FALSE)",
+	                                                       DEFAULT_ENABLE_STEMMER,
+	                                                       G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+	g_object_class_install_property (object_class,
 	                                 PROP_IGNORE_NUMBERS,
 	                                 g_param_spec_boolean ("ignore-numbers",
 	                                                       "Ignore numbers",
-	                                                       " Flag to ignore numbers in FTS (default: TRUE)",
+	                                                       " Flag to ignore numbers in FTS (default=TRUE)",
 	                                                       DEFAULT_IGNORE_NUMBERS,
 	                                                       G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
 	g_object_class_install_property (object_class,
 	                                 PROP_IGNORE_STOP_WORDS,
 	                                 g_param_spec_boolean ("ignore-stop-words",
 	                                                       "Ignore stop words",
-	                                                       " Flag to ignore stop words in FTS (default: TRUE)",
+	                                                       " Flag to ignore stop words in FTS (default=TRUE)",
 	                                                       DEFAULT_IGNORE_STOP_WORDS,
 	                                                       G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
 	g_object_class_install_property (object_class,
@@ -172,6 +183,10 @@ config_set_property (GObject      *object,
 		tracker_fts_config_set_max_word_length (TRACKER_FTS_CONFIG (object),
 		                                        g_value_get_int (value));
 		break;
+	case PROP_ENABLE_STEMMER:
+		tracker_fts_config_set_enable_stemmer (TRACKER_FTS_CONFIG (object),
+		                                       g_value_get_boolean (value));
+		break;
 	case PROP_IGNORE_NUMBERS:
 		tracker_fts_config_set_ignore_numbers (TRACKER_FTS_CONFIG (object),
 		                                       g_value_get_boolean (value));
@@ -209,6 +224,9 @@ config_get_property (GObject    *object,
 	case PROP_MAX_WORD_LENGTH:
 		g_value_set_int (value, priv->max_word_length);
 		break;
+	case PROP_ENABLE_STEMMER:
+		g_value_set_boolean (value, priv->enable_stemmer);
+		break;
 	case PROP_IGNORE_NUMBERS:
 		g_value_set_boolean (value, priv->ignore_numbers);
 		break;
@@ -419,6 +437,18 @@ tracker_fts_config_get_max_word_length (TrackerFTSConfig *config)
 }
 
 gboolean
+tracker_fts_config_get_enable_stemmer (TrackerFTSConfig *config)
+{
+	TrackerFTSConfigPrivate *priv;
+
+	g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), DEFAULT_ENABLE_STEMMER);
+
+	priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+
+	return priv->enable_stemmer;
+}
+
+gboolean
 tracker_fts_config_get_ignore_numbers (TrackerFTSConfig *config)
 {
 	TrackerFTSConfigPrivate *priv;
@@ -491,6 +521,20 @@ tracker_fts_config_set_max_word_length (TrackerFTSConfig *config,
 }
 
 void
+tracker_fts_config_set_enable_stemmer (TrackerFTSConfig *config,
+                                       gboolean          value)
+{
+	TrackerFTSConfigPrivate *priv;
+
+	g_return_if_fail (TRACKER_IS_FTS_CONFIG (config));
+
+	priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+
+	priv->enable_stemmer = value;
+	g_object_notify (G_OBJECT (config), "enable-stemmer");
+}
+
+void
 tracker_fts_config_set_ignore_numbers (TrackerFTSConfig *config,
                                        gboolean          value)
 {
diff --git a/src/libtracker-fts/tracker-fts-config.h b/src/libtracker-fts/tracker-fts-config.h
index 9c83e35..aabb71a 100644
--- a/src/libtracker-fts/tracker-fts-config.h
+++ b/src/libtracker-fts/tracker-fts-config.h
@@ -50,6 +50,7 @@ TrackerFTSConfig *tracker_fts_config_new                    (void);
 gboolean          tracker_fts_config_save                   (TrackerFTSConfig *config);
 gint              tracker_fts_config_get_min_word_length    (TrackerFTSConfig *config);
 gint              tracker_fts_config_get_max_word_length    (TrackerFTSConfig *config);
+gboolean          tracker_fts_config_get_enable_stemmer     (TrackerFTSConfig *config);
 gboolean          tracker_fts_config_get_ignore_numbers     (TrackerFTSConfig *config);
 gboolean          tracker_fts_config_get_ignore_stop_words  (TrackerFTSConfig *config);
 gint              tracker_fts_config_get_max_words_to_index (TrackerFTSConfig *config);
@@ -57,6 +58,8 @@ void              tracker_fts_config_set_min_word_length    (TrackerFTSConfig *c
                                                              gint              value);
 void              tracker_fts_config_set_max_word_length    (TrackerFTSConfig *config,
                                                              gint              value);
+void              tracker_fts_config_set_enable_stemmer     (TrackerFTSConfig *config,
+                                                             gboolean          value);
 void              tracker_fts_config_set_ignore_numbers     (TrackerFTSConfig *config,
                                                              gboolean          value);
 void              tracker_fts_config_set_ignore_stop_words  (TrackerFTSConfig *config,
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index bc0b49c..e6a8326 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -2330,6 +2330,7 @@ struct fulltext_vtab {
   const char *zName;		   /* virtual table name */
   int nColumn;			   /* number of columns in virtual table */
   TrackerParser *parser;	   /* tokenizer for inserts and queries */
+  gboolean enable_stemmer;
   gboolean ignore_numbers;
   gboolean ignore_stop_words;
   int max_words;
@@ -3370,6 +3371,7 @@ static int constructVtab(
 
   min_len = tracker_fts_config_get_min_word_length (config);
   max_len = tracker_fts_config_get_max_word_length (config);
+  v->enable_stemmer = tracker_fts_config_get_enable_stemmer (config);
   v->ignore_numbers = tracker_fts_config_get_ignore_numbers (config);
 
   /* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests
@@ -3673,7 +3675,7 @@ static void snippetOffsetsOfColumn(
   tracker_parser_reset (pVtab->parser,
                         zDoc,
                         nDoc,
-                        TRUE,
+                        pVtab->enable_stemmer,
                         pVtab->ignore_stop_words,
                         TRUE,
                         pVtab->ignore_numbers);
@@ -4376,7 +4378,7 @@ static int tokenizeSegment(
   tracker_parser_reset (parser,
                         pSegment,
                         nSegment,
-                        TRUE,
+                        v->enable_stemmer,
                         v->ignore_stop_words,
                         FALSE,
                         v->ignore_numbers);
@@ -4835,7 +4837,7 @@ int Catid,
   tracker_parser_reset (parser,
                         zText,
                         strlen (zText),
-                        TRUE,
+                        v->enable_stemmer,
                         v->ignore_stop_words,
                         TRUE,
                         v->ignore_numbers);
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 5a7d5a0..47edcbf 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -53,9 +53,9 @@ typedef struct {
 	/* Default parser configuration to use */
 	gint              max_word_length;
 	gboolean          enable_stemmer;
-	gboolean          enable_stop_words;
-	gboolean          skip_reserved_words;
-	gboolean          skip_numbers;
+	gboolean          ignore_stop_words;
+	gboolean          ignore_reserved_words;
+	gboolean          ignore_numbers;
 } TrackerParserTestFixture;
 
 /* Common setup for all tests */
@@ -65,8 +65,10 @@ test_common_setup (TrackerParserTestFixture *fixture,
 {
 	TrackerLanguage  *language;
 
-	/* Setup language for parser */
-	language = tracker_language_new (NULL);
+	/* Setup language for parser. We always use English in the unit
+	 *  tests, because we want the English stemming method to be
+	 *  used. */
+	language = tracker_language_new ("en");
 	if (!language) {
 		g_critical ("Language setup failed!");
 		return;
@@ -75,9 +77,9 @@ test_common_setup (TrackerParserTestFixture *fixture,
 	/* Default conf parameters */
 	fixture->max_word_length = 50;
 	fixture->enable_stemmer = TRUE;
-	fixture->enable_stop_words = TRUE;
-	fixture->skip_reserved_words = TRUE;
-	fixture->skip_numbers = TRUE;
+	fixture->ignore_stop_words = TRUE;
+	fixture->ignore_reserved_words = TRUE;
+	fixture->ignore_numbers = TRUE;
 
 	/* Create the parser */
 	fixture->parser = tracker_parser_new (language,
@@ -106,7 +108,7 @@ test_common_teardown (TrackerParserTestFixture *fixture,
 typedef struct TestDataExpectedNWords TestDataExpectedNWords;
 struct TestDataExpectedNWords {
 	const gchar *str;
-	gboolean     skip_numbers;
+	gboolean     ignore_numbers;
 	guint        expected_nwords;
 };
 
@@ -129,9 +131,9 @@ expected_nwords_check (TrackerParserTestFixture *fixture,
 	                      testdata->str,
 	                      strlen (testdata->str),
 	                      fixture->enable_stemmer,
-	                      fixture->enable_stop_words,
-	                      fixture->skip_reserved_words,
-	                      testdata->skip_numbers);
+	                      fixture->ignore_stop_words,
+	                      fixture->ignore_reserved_words,
+	                      testdata->ignore_numbers);
 
 	/* Count number of output words */
 	while ((word = tracker_parser_next (fixture->parser,
@@ -154,6 +156,7 @@ typedef struct TestDataExpectedWord TestDataExpectedWord;
 struct TestDataExpectedWord {
 	const gchar  *str;
 	const gchar  *expected;
+	gboolean      enable_stemmer;
 };
 
 /* Common expected_word test method */
@@ -173,10 +176,10 @@ expected_word_check (TrackerParserTestFixture *fixture,
 	tracker_parser_reset (fixture->parser,
 	                      testdata->str,
 	                      strlen (testdata->str),
-	                      FALSE, /* no stemming for this test */
-	                      fixture->enable_stop_words,
-	                      fixture->skip_reserved_words,
-	                      fixture->skip_numbers);
+	                      testdata->enable_stemmer,
+	                      fixture->ignore_stop_words,
+	                      fixture->ignore_reserved_words,
+	                      fixture->ignore_numbers);
 
 	/* Process next word */
 	word = tracker_parser_next (fixture->parser,
@@ -195,48 +198,55 @@ expected_word_check (TrackerParserTestFixture *fixture,
 #ifdef HAVE_UNAC
 /* Normalization-related tests (unaccenting) */
 static const TestDataExpectedWord test_data_normalization[] = {
-	{ "école",                "ecole" },
-	{ "ÉCOLE",                "ecole" },
-	{ "École",                "ecole" },
+	{ "école",                "ecole", FALSE },
+	{ "ÉCOLE",                "ecole", FALSE },
+	{ "École",                "ecole", FALSE },
 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
-	{ "e" "\xCC\x81" "cole",  "ecole" },
-	{ "E" "\xCC\x81" "COLE",  "ecole" },
-	{ "E" "\xCC\x81" "cole",  "ecole" },
+	{ "e" "\xCC\x81" "cole",  "ecole", FALSE },
+	{ "E" "\xCC\x81" "COLE",  "ecole", FALSE },
+	{ "E" "\xCC\x81" "cole",  "ecole", FALSE },
 #endif
-	{ NULL,                   NULL    }
+	{ NULL,                   NULL,    FALSE }
 };
 
 /* Unaccenting-related tests */
 static const TestDataExpectedWord test_data_unaccent[] = {
-	{ "Murciélago", "murcielago" },
-	{ "camión",     "camion"     },
-	{ "desagüe",    "desague"    },
-	{ NULL,         NULL         }
+	{ "Murciélago", "murcielago", FALSE },
+	{ "camión",     "camion",     FALSE },
+	{ "desagüe",    "desague",    FALSE },
+	{ NULL,         NULL,         FALSE }
 };
 #else
 /* Normalization-related tests (not unaccenting) */
 static const TestDataExpectedWord test_data_normalization[] = {
-	{ "école",                "école" },
-	{ "ÉCOLE",                "école" },
-	{ "École",                "école" },
+	{ "école",                "école", FALSE },
+	{ "ÉCOLE",                "école", FALSE },
+	{ "École",                "école", FALSE },
 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
-	{ "e" "\xCC\x81" "cole",  "école" },
-	{ "E" "\xCC\x81" "COLE",  "école" },
-	{ "E" "\xCC\x81" "cole",  "école" },
+	{ "e" "\xCC\x81" "cole",  "école", FALSE },
+	{ "E" "\xCC\x81" "COLE",  "école", FALSE },
+	{ "E" "\xCC\x81" "cole",  "école", FALSE },
 #endif
-	{ NULL,                   NULL    }
+	{ NULL,                   NULL,    FALSE }
 };
 #endif
 
+/* Stemming-related tests */
+static const TestDataExpectedWord test_data_stemming[] = {
+	{ "ecole", "ecol",  TRUE  },
+	{ "ecole", "ecole", FALSE },
+	{ NULL,    NULL,    FALSE }
+};
+
 /* Casefolding-related tests */
 static const TestDataExpectedWord test_data_casefolding[] = {
-	{ "gross", "gross" },
-	{ "GROSS", "gross" },
-	{ "GrOsS", "gross" },
+	{ "gross", "gross", FALSE },
+	{ "GROSS", "gross", FALSE },
+	{ "GrOsS", "gross", FALSE },
 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't do full-word casefolding */
-	{ "groß",  "gross" },
+	{ "groß",  "gross", FALSE },
 #endif
-	{ NULL,    NULL    }
+	{ NULL,    NULL,    FALSE }
 };
 
 /* Number of expected words tests */
@@ -312,6 +322,20 @@ main (int argc, char **argv)
 		g_free (testpath);
 	}
 
+	/* Add stemming checks */
+	for (i = 0; test_data_stemming[i].str != NULL; i++) {
+		gchar *testpath;
+
+		testpath = g_strdup_printf ("/libtracker-fts/parser/stemming_%d", i);
+		g_test_add (testpath,
+		            TrackerParserTestFixture,
+		            &test_data_stemming[i],
+		            test_common_setup,
+		            expected_word_check,
+		            test_common_teardown);
+		g_free (testpath);
+	}
+
 	/* Add expected number of words checks */
 	for (i = 0; test_data_nwords[i].str != NULL; i++) {
 		gchar *testpath;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]