[tracker] Fixes GB#526346: New FTS config option to enable/disable stemmer
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] Fixes GB#526346: New FTS config option to enable/disable stemmer
- Date: Tue, 25 May 2010 09:03:14 +0000 (UTC)
commit 4ad36d4b0f524114bd2489246fc06a27bc0e0833
Author: Aleksander Morgado <aleksander lanedo com>
Date: Mon May 24 16:41:09 2010 +0200
Fixes GB#526346: New FTS config option to enable/disable stemmer
* Note: By default stemming will be disabled. See bugreport for
more details.
docs/manpages/tracker-fts.cfg.5 | 5 ++
src/libtracker-fts/tracker-fts-config.c | 58 ++++++++++++++--
src/libtracker-fts/tracker-fts-config.h | 3 +
src/libtracker-fts/tracker-fts.c | 8 ++-
tests/libtracker-fts/tracker-parser-test.c | 102 +++++++++++++++++-----------
5 files changed, 127 insertions(+), 49 deletions(-)
---
diff --git a/docs/manpages/tracker-fts.cfg.5 b/docs/manpages/tracker-fts.cfg.5
index 176fe54..93587cf 100644
--- a/docs/manpages/tracker-fts.cfg.5
+++ b/docs/manpages/tracker-fts.cfg.5
@@ -23,6 +23,11 @@ Set the minimum length of words to index (0->30).
Set the maximum length of words to index (0->200).
.TP
+.B EnableStemmer=false
+Set to true if stemming should be applied to each word. Stemming is the process
+for reducing inflected and derived words to their stem, base or root form.
+
+.TP
.B IgnoreNumbers=true
Set to true if words starting with numbers should be ignored.
diff --git a/src/libtracker-fts/tracker-fts-config.c b/src/libtracker-fts/tracker-fts-config.c
index 736d044..6d53bc4 100644
--- a/src/libtracker-fts/tracker-fts-config.c
+++ b/src/libtracker-fts/tracker-fts-config.c
@@ -35,16 +35,18 @@
#define GROUP_INDEXING "Indexing"
/* Default values */
-#define DEFAULT_MIN_WORD_LENGTH 3 /* 0->30 */
-#define DEFAULT_MAX_WORD_LENGTH 30 /* 0->200 */
-#define DEFAULT_MAX_WORDS_TO_INDEX 10000
-#define DEFAULT_IGNORE_NUMBERS TRUE
-#define DEFAULT_IGNORE_STOP_WORDS TRUE
+#define DEFAULT_MIN_WORD_LENGTH 3 /* 0->30 */
+#define DEFAULT_MAX_WORD_LENGTH 30 /* 0->200 */
+#define DEFAULT_MAX_WORDS_TO_INDEX 10000
+#define DEFAULT_IGNORE_NUMBERS TRUE
+#define DEFAULT_IGNORE_STOP_WORDS TRUE
+#define DEFAULT_ENABLE_STEMMER FALSE /* As per GB#526346, disabled */
typedef struct {
/* Indexing */
gint min_word_length;
gint max_word_length;
+ gboolean enable_stemmer;
gboolean ignore_numbers;
gboolean ignore_stop_words;
gint max_words_to_index;
@@ -78,6 +80,7 @@ enum {
/* Indexing */
PROP_MIN_WORD_LENGTH,
PROP_MAX_WORD_LENGTH,
+ PROP_ENABLE_STEMMER,
PROP_IGNORE_NUMBERS,
PROP_IGNORE_STOP_WORDS,
@@ -88,6 +91,7 @@ enum {
static ObjectToKeyFile conversions[] = {
{ G_TYPE_INT, "min-word-length", GROUP_INDEXING, "MinWordLength" },
{ G_TYPE_INT, "max-word-length", GROUP_INDEXING, "MaxWordLength" },
+ { G_TYPE_BOOLEAN, "enable-stemmer", GROUP_INDEXING, "EnableStemmer" },
{ G_TYPE_BOOLEAN, "ignore-numbers", GROUP_INDEXING, "IgnoreNumbers" },
{ G_TYPE_BOOLEAN, "ignore-stop-words", GROUP_INDEXING, "IgnoreStopWords" },
{ G_TYPE_INT, "max-words-to-index", GROUP_INDEXING, "MaxWordsToIndex" },
@@ -125,17 +129,24 @@ tracker_fts_config_class_init (TrackerFTSConfigClass *klass)
DEFAULT_MAX_WORD_LENGTH,
G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
g_object_class_install_property (object_class,
+ PROP_ENABLE_STEMMER,
+ g_param_spec_boolean ("enable-stemmer",
+ "Enable Stemmer",
+ " Flag to enable word stemming utility (default=FALSE)",
+ DEFAULT_ENABLE_STEMMER,
+ G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+ g_object_class_install_property (object_class,
PROP_IGNORE_NUMBERS,
g_param_spec_boolean ("ignore-numbers",
"Ignore numbers",
- " Flag to ignore numbers in FTS (default: TRUE)",
+ " Flag to ignore numbers in FTS (default=TRUE)",
DEFAULT_IGNORE_NUMBERS,
G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
g_object_class_install_property (object_class,
PROP_IGNORE_STOP_WORDS,
g_param_spec_boolean ("ignore-stop-words",
"Ignore stop words",
- " Flag to ignore stop words in FTS (default: TRUE)",
+ " Flag to ignore stop words in FTS (default=TRUE)",
DEFAULT_IGNORE_STOP_WORDS,
G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
g_object_class_install_property (object_class,
@@ -172,6 +183,10 @@ config_set_property (GObject *object,
tracker_fts_config_set_max_word_length (TRACKER_FTS_CONFIG (object),
g_value_get_int (value));
break;
+ case PROP_ENABLE_STEMMER:
+ tracker_fts_config_set_enable_stemmer (TRACKER_FTS_CONFIG (object),
+ g_value_get_boolean (value));
+ break;
case PROP_IGNORE_NUMBERS:
tracker_fts_config_set_ignore_numbers (TRACKER_FTS_CONFIG (object),
g_value_get_boolean (value));
@@ -209,6 +224,9 @@ config_get_property (GObject *object,
case PROP_MAX_WORD_LENGTH:
g_value_set_int (value, priv->max_word_length);
break;
+ case PROP_ENABLE_STEMMER:
+ g_value_set_boolean (value, priv->enable_stemmer);
+ break;
case PROP_IGNORE_NUMBERS:
g_value_set_boolean (value, priv->ignore_numbers);
break;
@@ -419,6 +437,18 @@ tracker_fts_config_get_max_word_length (TrackerFTSConfig *config)
}
gboolean
+tracker_fts_config_get_enable_stemmer (TrackerFTSConfig *config)
+{
+ TrackerFTSConfigPrivate *priv;
+
+ g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), DEFAULT_ENABLE_STEMMER);
+
+ priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+
+ return priv->enable_stemmer;
+}
+
+gboolean
tracker_fts_config_get_ignore_numbers (TrackerFTSConfig *config)
{
TrackerFTSConfigPrivate *priv;
@@ -491,6 +521,20 @@ tracker_fts_config_set_max_word_length (TrackerFTSConfig *config,
}
void
+tracker_fts_config_set_enable_stemmer (TrackerFTSConfig *config,
+ gboolean value)
+{
+ TrackerFTSConfigPrivate *priv;
+
+ g_return_if_fail (TRACKER_IS_FTS_CONFIG (config));
+
+ priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+
+ priv->enable_stemmer = value;
+ g_object_notify (G_OBJECT (config), "enable-stemmer");
+}
+
+void
tracker_fts_config_set_ignore_numbers (TrackerFTSConfig *config,
gboolean value)
{
diff --git a/src/libtracker-fts/tracker-fts-config.h b/src/libtracker-fts/tracker-fts-config.h
index 9c83e35..aabb71a 100644
--- a/src/libtracker-fts/tracker-fts-config.h
+++ b/src/libtracker-fts/tracker-fts-config.h
@@ -50,6 +50,7 @@ TrackerFTSConfig *tracker_fts_config_new (void);
gboolean tracker_fts_config_save (TrackerFTSConfig *config);
gint tracker_fts_config_get_min_word_length (TrackerFTSConfig *config);
gint tracker_fts_config_get_max_word_length (TrackerFTSConfig *config);
+gboolean tracker_fts_config_get_enable_stemmer (TrackerFTSConfig *config);
gboolean tracker_fts_config_get_ignore_numbers (TrackerFTSConfig *config);
gboolean tracker_fts_config_get_ignore_stop_words (TrackerFTSConfig *config);
gint tracker_fts_config_get_max_words_to_index (TrackerFTSConfig *config);
@@ -57,6 +58,8 @@ void tracker_fts_config_set_min_word_length (TrackerFTSConfig *c
gint value);
void tracker_fts_config_set_max_word_length (TrackerFTSConfig *config,
gint value);
+void tracker_fts_config_set_enable_stemmer (TrackerFTSConfig *config,
+ gboolean value);
void tracker_fts_config_set_ignore_numbers (TrackerFTSConfig *config,
gboolean value);
void tracker_fts_config_set_ignore_stop_words (TrackerFTSConfig *config,
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index bc0b49c..e6a8326 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -2330,6 +2330,7 @@ struct fulltext_vtab {
const char *zName; /* virtual table name */
int nColumn; /* number of columns in virtual table */
TrackerParser *parser; /* tokenizer for inserts and queries */
+ gboolean enable_stemmer;
gboolean ignore_numbers;
gboolean ignore_stop_words;
int max_words;
@@ -3370,6 +3371,7 @@ static int constructVtab(
min_len = tracker_fts_config_get_min_word_length (config);
max_len = tracker_fts_config_get_max_word_length (config);
+ v->enable_stemmer = tracker_fts_config_get_enable_stemmer (config);
v->ignore_numbers = tracker_fts_config_get_ignore_numbers (config);
/* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests
@@ -3673,7 +3675,7 @@ static void snippetOffsetsOfColumn(
tracker_parser_reset (pVtab->parser,
zDoc,
nDoc,
- TRUE,
+ pVtab->enable_stemmer,
pVtab->ignore_stop_words,
TRUE,
pVtab->ignore_numbers);
@@ -4376,7 +4378,7 @@ static int tokenizeSegment(
tracker_parser_reset (parser,
pSegment,
nSegment,
- TRUE,
+ v->enable_stemmer,
v->ignore_stop_words,
FALSE,
v->ignore_numbers);
@@ -4835,7 +4837,7 @@ int Catid,
tracker_parser_reset (parser,
zText,
strlen (zText),
- TRUE,
+ v->enable_stemmer,
v->ignore_stop_words,
TRUE,
v->ignore_numbers);
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 5a7d5a0..47edcbf 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -53,9 +53,9 @@ typedef struct {
/* Default parser configuration to use */
gint max_word_length;
gboolean enable_stemmer;
- gboolean enable_stop_words;
- gboolean skip_reserved_words;
- gboolean skip_numbers;
+ gboolean ignore_stop_words;
+ gboolean ignore_reserved_words;
+ gboolean ignore_numbers;
} TrackerParserTestFixture;
/* Common setup for all tests */
@@ -65,8 +65,10 @@ test_common_setup (TrackerParserTestFixture *fixture,
{
TrackerLanguage *language;
- /* Setup language for parser */
- language = tracker_language_new (NULL);
+ /* Setup language for parser. We make sure that always English is used
+ * in the unit tests, because we want the English stemming method to
+ * be used. */
+ language = tracker_language_new ("en");
if (!language) {
g_critical ("Language setup failed!");
return;
@@ -75,9 +77,9 @@ test_common_setup (TrackerParserTestFixture *fixture,
/* Default conf parameters */
fixture->max_word_length = 50;
fixture->enable_stemmer = TRUE;
- fixture->enable_stop_words = TRUE;
- fixture->skip_reserved_words = TRUE;
- fixture->skip_numbers = TRUE;
+ fixture->ignore_stop_words = TRUE;
+ fixture->ignore_reserved_words = TRUE;
+ fixture->ignore_numbers = TRUE;
/* Create the parser */
fixture->parser = tracker_parser_new (language,
@@ -106,7 +108,7 @@ test_common_teardown (TrackerParserTestFixture *fixture,
typedef struct TestDataExpectedNWords TestDataExpectedNWords;
struct TestDataExpectedNWords {
const gchar *str;
- gboolean skip_numbers;
+ gboolean ignore_numbers;
guint expected_nwords;
};
@@ -129,9 +131,9 @@ expected_nwords_check (TrackerParserTestFixture *fixture,
testdata->str,
strlen (testdata->str),
fixture->enable_stemmer,
- fixture->enable_stop_words,
- fixture->skip_reserved_words,
- testdata->skip_numbers);
+ fixture->ignore_stop_words,
+ fixture->ignore_reserved_words,
+ testdata->ignore_numbers);
/* Count number of output words */
while ((word = tracker_parser_next (fixture->parser,
@@ -154,6 +156,7 @@ typedef struct TestDataExpectedWord TestDataExpectedWord;
struct TestDataExpectedWord {
const gchar *str;
const gchar *expected;
+ gboolean enable_stemmer;
};
/* Common expected_word test method */
@@ -173,10 +176,10 @@ expected_word_check (TrackerParserTestFixture *fixture,
tracker_parser_reset (fixture->parser,
testdata->str,
strlen (testdata->str),
- FALSE, /* no stemming for this test */
- fixture->enable_stop_words,
- fixture->skip_reserved_words,
- fixture->skip_numbers);
+ testdata->enable_stemmer,
+ fixture->ignore_stop_words,
+ fixture->ignore_reserved_words,
+ fixture->ignore_numbers);
/* Process next word */
word = tracker_parser_next (fixture->parser,
@@ -195,48 +198,55 @@ expected_word_check (TrackerParserTestFixture *fixture,
#ifdef HAVE_UNAC
/* Normalization-related tests (unaccenting) */
static const TestDataExpectedWord test_data_normalization[] = {
- { "école", "ecole" },
- { "ÉCOLE", "ecole" },
- { "École", "ecole" },
+ { "école", "ecole", FALSE },
+ { "ÉCOLE", "ecole", FALSE },
+ { "École", "ecole", FALSE },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
- { "e" "\xCC\x81" "cole", "ecole" },
- { "E" "\xCC\x81" "COLE", "ecole" },
- { "E" "\xCC\x81" "cole", "ecole" },
+ { "e" "\xCC\x81" "cole", "ecole", FALSE },
+ { "E" "\xCC\x81" "COLE", "ecole", FALSE },
+ { "E" "\xCC\x81" "cole", "ecole", FALSE },
#endif
- { NULL, NULL }
+ { NULL, NULL, FALSE }
};
/* Unaccenting-related tests */
static const TestDataExpectedWord test_data_unaccent[] = {
- { "Murciélago", "murcielago" },
- { "camión", "camion" },
- { "desagüe", "desague" },
- { NULL, NULL }
+ { "Murciélago", "murcielago", FALSE },
+ { "camión", "camion", FALSE },
+ { "desagüe", "desague", FALSE },
+ { NULL, NULL, FALSE }
};
#else
/* Normalization-related tests (not unaccenting) */
static const TestDataExpectedWord test_data_normalization[] = {
- { "école", "école" },
- { "ÉCOLE", "école" },
- { "École", "école" },
+ { "école", "école", FALSE },
+ { "ÉCOLE", "école", FALSE },
+ { "École", "école", FALSE },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
- { "e" "\xCC\x81" "cole", "école" },
- { "E" "\xCC\x81" "COLE", "école" },
- { "E" "\xCC\x81" "cole", "école" },
+ { "e" "\xCC\x81" "cole", "école", FALSE },
+ { "E" "\xCC\x81" "COLE", "école", FALSE },
+ { "E" "\xCC\x81" "cole", "école", FALSE },
#endif
- { NULL, NULL }
+ { NULL, NULL, FALSE }
};
#endif
+/* Stemming-related tests */
+static const TestDataExpectedWord test_data_stemming[] = {
+ { "ecole", "ecol", TRUE },
+ { "ecole", "ecole", FALSE },
+ { NULL, NULL, FALSE }
+};
+
/* Casefolding-related tests */
static const TestDataExpectedWord test_data_casefolding[] = {
- { "gross", "gross" },
- { "GROSS", "gross" },
- { "GrOsS", "gross" },
+ { "gross", "gross", FALSE },
+ { "GROSS", "gross", FALSE },
+ { "GrOsS", "gross", FALSE },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't do full-word casefolding */
- { "groß", "gross" },
+ { "groß", "gross", FALSE },
#endif
- { NULL, NULL }
+ { NULL, NULL, FALSE }
};
/* Number of expected words tests */
@@ -312,6 +322,20 @@ main (int argc, char **argv)
g_free (testpath);
}
+ /* Add stemming checks */
+ for (i = 0; test_data_stemming[i].str != NULL; i++) {
+ gchar *testpath;
+
+ testpath = g_strdup_printf ("/libtracker-fts/parser/stemming_%d", i);
+ g_test_add (testpath,
+ TrackerParserTestFixture,
+ &test_data_stemming[i],
+ test_common_setup,
+ expected_word_check,
+ test_common_teardown);
+ g_free (testpath);
+ }
+
/* Add expected number of words checks */
for (i = 0; test_data_nwords[i].str != NULL; i++) {
gchar *testpath;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]