[tracker] Fixes GB#560220: New FTS config option to enable/disable unaccenting
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] Fixes GB#560220: New FTS config option to enable/disable unaccenting
- Date: Tue, 25 May 2010 11:41:05 +0000 (UTC)
commit 2145eeb70fbe860371b8f7a020b7733b951f8027
Author: Aleksander Morgado <aleksander lanedo com>
Date: Mon May 24 17:50:59 2010 +0200
Fixes GB#560220: New FTS config option to enable/disable unaccenting
* Interactive tracker-parser tester also modified to read the
proper configuration values from tracker-fts.cfg
docs/manpages/tracker-fts.cfg.5 | 4 ++
src/libtracker-fts/tracker-fts-config.c | 44 ++++++++++++++++++
src/libtracker-fts/tracker-fts-config.h | 3 +
src/libtracker-fts/tracker-fts.c | 8 +++
src/libtracker-fts/tracker-parser-glib.c | 5 ++-
src/libtracker-fts/tracker-parser-libicu.c | 5 ++-
src/libtracker-fts/tracker-parser-libunistring.c | 5 ++-
src/libtracker-fts/tracker-parser.h | 1 +
tests/libtracker-fts/tracker-parser-test.c | 54 ++++++++++++++--------
tests/libtracker-fts/tracker-parser.c | 30 +++++-------
10 files changed, 119 insertions(+), 40 deletions(-)
---
diff --git a/docs/manpages/tracker-fts.cfg.5 b/docs/manpages/tracker-fts.cfg.5
index 93587cf..efc8987 100644
--- a/docs/manpages/tracker-fts.cfg.5
+++ b/docs/manpages/tracker-fts.cfg.5
@@ -28,6 +28,10 @@ Set to true if stemming should be applied to each word. Stemming is the process
for reducing inflected and derived words to their stem, base or root form.
.TP
+.B EnableUnaccent=true
+Set to true if combining diacritical marks should be removed from each word.
+
+.TP
.B IgnoreNumbers=true
Set to true if words starting with numbers should be ignored.
diff --git a/src/libtracker-fts/tracker-fts-config.c b/src/libtracker-fts/tracker-fts-config.c
index 6d53bc4..f3b1faa 100644
--- a/src/libtracker-fts/tracker-fts-config.c
+++ b/src/libtracker-fts/tracker-fts-config.c
@@ -41,12 +41,14 @@
#define DEFAULT_IGNORE_NUMBERS TRUE
#define DEFAULT_IGNORE_STOP_WORDS TRUE
#define DEFAULT_ENABLE_STEMMER FALSE /* As per GB#526346, disabled */
+#define DEFAULT_ENABLE_UNACCENT TRUE
typedef struct {
/* Indexing */
gint min_word_length;
gint max_word_length;
gboolean enable_stemmer;
+ gboolean enable_unaccent;
gboolean ignore_numbers;
gboolean ignore_stop_words;
gint max_words_to_index;
@@ -81,6 +83,7 @@ enum {
PROP_MIN_WORD_LENGTH,
PROP_MAX_WORD_LENGTH,
PROP_ENABLE_STEMMER,
+ PROP_ENABLE_UNACCENT,
PROP_IGNORE_NUMBERS,
PROP_IGNORE_STOP_WORDS,
@@ -92,6 +95,7 @@ static ObjectToKeyFile conversions[] = {
{ G_TYPE_INT, "min-word-length", GROUP_INDEXING, "MinWordLength" },
{ G_TYPE_INT, "max-word-length", GROUP_INDEXING, "MaxWordLength" },
{ G_TYPE_BOOLEAN, "enable-stemmer", GROUP_INDEXING, "EnableStemmer" },
+ { G_TYPE_BOOLEAN, "enable-unaccent", GROUP_INDEXING, "EnableUnaccent" },
{ G_TYPE_BOOLEAN, "ignore-numbers", GROUP_INDEXING, "IgnoreNumbers" },
{ G_TYPE_BOOLEAN, "ignore-stop-words", GROUP_INDEXING, "IgnoreStopWords" },
{ G_TYPE_INT, "max-words-to-index", GROUP_INDEXING, "MaxWordsToIndex" },
@@ -136,6 +140,13 @@ tracker_fts_config_class_init (TrackerFTSConfigClass *klass)
DEFAULT_ENABLE_STEMMER,
G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
g_object_class_install_property (object_class,
+ PROP_ENABLE_UNACCENT,
+ g_param_spec_boolean ("enable-unaccent",
+ "Enable Unaccent",
+ " Flag to enable word unaccenting (default=TRUE)",
+ DEFAULT_ENABLE_UNACCENT,
+ G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+ g_object_class_install_property (object_class,
PROP_IGNORE_NUMBERS,
g_param_spec_boolean ("ignore-numbers",
"Ignore numbers",
@@ -187,6 +198,10 @@ config_set_property (GObject *object,
tracker_fts_config_set_enable_stemmer (TRACKER_FTS_CONFIG (object),
g_value_get_boolean (value));
break;
+ case PROP_ENABLE_UNACCENT:
+ tracker_fts_config_set_enable_unaccent (TRACKER_FTS_CONFIG (object),
+ g_value_get_boolean (value));
+ break;
case PROP_IGNORE_NUMBERS:
tracker_fts_config_set_ignore_numbers (TRACKER_FTS_CONFIG (object),
g_value_get_boolean (value));
@@ -227,6 +242,9 @@ config_get_property (GObject *object,
case PROP_ENABLE_STEMMER:
g_value_set_boolean (value, priv->enable_stemmer);
break;
+ case PROP_ENABLE_UNACCENT:
+ g_value_set_boolean (value, priv->enable_unaccent);
+ break;
case PROP_IGNORE_NUMBERS:
g_value_set_boolean (value, priv->ignore_numbers);
break;
@@ -449,6 +467,18 @@ tracker_fts_config_get_enable_stemmer (TrackerFTSConfig *config)
}
gboolean
+tracker_fts_config_get_enable_unaccent (TrackerFTSConfig *config)
+{
+ TrackerFTSConfigPrivate *priv;
+
+ g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), DEFAULT_ENABLE_UNACCENT);
+
+ priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+
+ return priv->enable_unaccent;
+}
+
+gboolean
tracker_fts_config_get_ignore_numbers (TrackerFTSConfig *config)
{
TrackerFTSConfigPrivate *priv;
@@ -535,6 +565,20 @@ tracker_fts_config_set_enable_stemmer (TrackerFTSConfig *config,
}
void
+tracker_fts_config_set_enable_unaccent (TrackerFTSConfig *config,
+ gboolean value)
+{
+ TrackerFTSConfigPrivate *priv;
+
+ g_return_if_fail (TRACKER_IS_FTS_CONFIG (config));
+
+ priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+
+ priv->enable_unaccent = value;
+ g_object_notify (G_OBJECT (config), "enable-unaccent");
+}
+
+void
tracker_fts_config_set_ignore_numbers (TrackerFTSConfig *config,
gboolean value)
{
diff --git a/src/libtracker-fts/tracker-fts-config.h b/src/libtracker-fts/tracker-fts-config.h
index aabb71a..de75fb8 100644
--- a/src/libtracker-fts/tracker-fts-config.h
+++ b/src/libtracker-fts/tracker-fts-config.h
@@ -51,6 +51,7 @@ gboolean tracker_fts_config_save (TrackerFTSConfig *c
gint tracker_fts_config_get_min_word_length (TrackerFTSConfig *config);
gint tracker_fts_config_get_max_word_length (TrackerFTSConfig *config);
gboolean tracker_fts_config_get_enable_stemmer (TrackerFTSConfig *config);
+gboolean tracker_fts_config_get_enable_unaccent (TrackerFTSConfig *config);
gboolean tracker_fts_config_get_ignore_numbers (TrackerFTSConfig *config);
gboolean tracker_fts_config_get_ignore_stop_words (TrackerFTSConfig *config);
gint tracker_fts_config_get_max_words_to_index (TrackerFTSConfig *config);
@@ -60,6 +61,8 @@ void tracker_fts_config_set_max_word_length (TrackerFTSConfig *c
gint value);
void tracker_fts_config_set_enable_stemmer (TrackerFTSConfig *config,
gboolean value);
+void tracker_fts_config_set_enable_unaccent (TrackerFTSConfig *config,
+ gboolean value);
void tracker_fts_config_set_ignore_numbers (TrackerFTSConfig *config,
gboolean value);
void tracker_fts_config_set_ignore_stop_words (TrackerFTSConfig *config,
diff --git a/src/libtracker-fts/tracker-fts.c b/src/libtracker-fts/tracker-fts.c
index e6a8326..3f42bcd 100644
--- a/src/libtracker-fts/tracker-fts.c
+++ b/src/libtracker-fts/tracker-fts.c
@@ -2331,6 +2331,7 @@ struct fulltext_vtab {
int nColumn; /* number of columns in virtual table */
TrackerParser *parser; /* tokenizer for inserts and queries */
gboolean enable_stemmer;
+ gboolean enable_unaccent;
gboolean ignore_numbers;
gboolean ignore_stop_words;
int max_words;
@@ -3372,6 +3373,7 @@ static int constructVtab(
min_len = tracker_fts_config_get_min_word_length (config);
max_len = tracker_fts_config_get_max_word_length (config);
v->enable_stemmer = tracker_fts_config_get_enable_stemmer (config);
+ v->enable_unaccent = tracker_fts_config_get_enable_unaccent (config);
v->ignore_numbers = tracker_fts_config_get_ignore_numbers (config);
/* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests
@@ -3397,6 +3399,9 @@ static int constructVtab(
g_object_set_qdata_full (object, quark_fulltext_vtab, v,
(GDestroyNotify) fulltext_vtab_destroy);
+ /* Config no longer needed */
+ g_object_unref (config);
+
return SQLITE_OK;
}
@@ -3676,6 +3681,7 @@ static void snippetOffsetsOfColumn(
zDoc,
nDoc,
pVtab->enable_stemmer,
+ pVtab->enable_unaccent,
pVtab->ignore_stop_words,
TRUE,
pVtab->ignore_numbers);
@@ -4379,6 +4385,7 @@ static int tokenizeSegment(
pSegment,
nSegment,
v->enable_stemmer,
+ v->enable_unaccent,
v->ignore_stop_words,
FALSE,
v->ignore_numbers);
@@ -4838,6 +4845,7 @@ int Catid,
zText,
strlen (zText),
v->enable_stemmer,
+ v->enable_unaccent,
v->ignore_stop_words,
TRUE,
v->ignore_numbers);
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index fd7d1bd..2c324bb 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -72,6 +72,7 @@ struct TrackerParser {
TrackerLanguage *language;
gboolean enable_stemmer;
+ gboolean enable_unaccent;
gboolean ignore_stop_words;
guint max_word_length;
gboolean ignore_reserved_words;
@@ -456,6 +457,7 @@ tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
gboolean enable_stemmer,
+ gboolean enable_unaccent,
gboolean ignore_stop_words,
gboolean ignore_reserved_words,
gboolean ignore_numbers)
@@ -470,6 +472,7 @@ tracker_parser_reset (TrackerParser *parser,
parser->encoding = get_encoding (txt);
parser->enable_stemmer = enable_stemmer;
+ parser->enable_unaccent = enable_unaccent;
parser->ignore_stop_words = ignore_stop_words;
parser->txt_size = txt_size;
@@ -533,7 +536,7 @@ tracker_parser_process_word (TrackerParser *parser,
tracker_parser_message_hex ("ORIGINAL word",
word, bytes);
- if (do_strip) {
+ if (parser->enable_unaccent && do_strip) {
stripped_word = tracker_parser_unaccent_utf8_word (word,
bytes,
&len);
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index d3fdda4..4814281 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -55,6 +55,7 @@ struct TrackerParser {
TrackerLanguage *language;
gboolean enable_stemmer;
+ gboolean enable_unaccent;
guint max_word_length;
gboolean ignore_stop_words;
gboolean ignore_reserved_words;
@@ -318,6 +319,7 @@ tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
gboolean enable_stemmer,
+ gboolean enable_unaccent,
gboolean ignore_stop_words,
gboolean ignore_reserved_words,
gboolean ignore_numbers)
@@ -331,6 +333,7 @@ tracker_parser_reset (TrackerParser *parser,
g_return_if_fail (txt != NULL);
parser->enable_stemmer = enable_stemmer;
+ parser->enable_unaccent = enable_unaccent;
parser->ignore_stop_words = ignore_stop_words;
parser->txt_size = txt_size;
@@ -486,7 +489,7 @@ process_word_uchar (TrackerParser *parser,
}
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
gsize stripped_word_length;
/* Get unaccented string in UTF-8 */
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index a5fe3ab..02b89a9 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -54,6 +54,7 @@ struct TrackerParser {
TrackerLanguage *language;
gboolean enable_stemmer;
+ gboolean enable_unaccent;
guint max_word_length;
gboolean ignore_stop_words;
gboolean ignore_reserved_words;
@@ -277,6 +278,7 @@ tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
gboolean enable_stemmer,
+ gboolean enable_unaccent,
gboolean ignore_stop_words,
gboolean ignore_reserved_words,
gboolean ignore_numbers)
@@ -285,6 +287,7 @@ tracker_parser_reset (TrackerParser *parser,
g_return_if_fail (txt != NULL);
parser->enable_stemmer = enable_stemmer;
+ parser->enable_unaccent = enable_unaccent;
parser->ignore_stop_words = ignore_stop_words;
parser->txt_size = txt_size;
@@ -407,7 +410,7 @@ process_word_utf8 (TrackerParser *parser,
normalized[new_word_length] = '\0';
/* UNAC stripping needed? (for non-CJK and non-ASCII) */
- if (type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
+ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC) {
gsize stripped_word_length;
stripped = tracker_parser_unaccent_utf8_word (normalized,
diff --git a/src/libtracker-fts/tracker-parser.h b/src/libtracker-fts/tracker-parser.h
index 21ab427..b84d534 100644
--- a/src/libtracker-fts/tracker-parser.h
+++ b/src/libtracker-fts/tracker-parser.h
@@ -36,6 +36,7 @@ void tracker_parser_reset (TrackerParser *parser,
const gchar *txt,
gint txt_size,
gboolean enable_stemmer,
+ gboolean enable_unaccent,
gboolean ignore_stop_words,
gboolean ignore_reserved_words,
gboolean ignore_numbers);
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index 47edcbf..8975f41 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -53,6 +53,7 @@ typedef struct {
/* Default parser configuration to use */
gint max_word_length;
gboolean enable_stemmer;
+ gboolean enable_unaccent;
gboolean ignore_stop_words;
gboolean ignore_reserved_words;
gboolean ignore_numbers;
@@ -77,6 +78,7 @@ test_common_setup (TrackerParserTestFixture *fixture,
/* Default conf parameters */
fixture->max_word_length = 50;
fixture->enable_stemmer = TRUE;
+ fixture->enable_unaccent = TRUE;
fixture->ignore_stop_words = TRUE;
fixture->ignore_reserved_words = TRUE;
fixture->ignore_numbers = TRUE;
@@ -131,6 +133,7 @@ expected_nwords_check (TrackerParserTestFixture *fixture,
testdata->str,
strlen (testdata->str),
fixture->enable_stemmer,
+ fixture->enable_unaccent,
fixture->ignore_stop_words,
fixture->ignore_reserved_words,
testdata->ignore_numbers);
@@ -157,6 +160,7 @@ struct TestDataExpectedWord {
const gchar *str;
const gchar *expected;
gboolean enable_stemmer;
+ gboolean enable_unaccent;
};
/* Common expected_word test method */
@@ -177,6 +181,7 @@ expected_word_check (TrackerParserTestFixture *fixture,
testdata->str,
strlen (testdata->str),
testdata->enable_stemmer,
+ testdata->enable_unaccent,
fixture->ignore_stop_words,
fixture->ignore_reserved_words,
fixture->ignore_numbers);
@@ -198,38 +203,49 @@ expected_word_check (TrackerParserTestFixture *fixture,
#ifdef HAVE_UNAC
/* Normalization-related tests (unaccenting) */
static const TestDataExpectedWord test_data_normalization[] = {
- { "école", "ecole", FALSE },
- { "ÉCOLE", "ecole", FALSE },
- { "École", "ecole", FALSE },
+ { "école", "ecole", FALSE, TRUE },
+ { "ÉCOLE", "ecole", FALSE, TRUE },
+ { "École", "ecole", FALSE, TRUE },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
- { "e" "\xCC\x81" "cole", "ecole", FALSE },
- { "E" "\xCC\x81" "COLE", "ecole", FALSE },
- { "E" "\xCC\x81" "cole", "ecole", FALSE },
+ { "e" "\xCC\x81" "cole", "ecole", FALSE, TRUE },
+ { "E" "\xCC\x81" "COLE", "ecole", FALSE, TRUE },
+ { "E" "\xCC\x81" "cole", "ecole", FALSE, TRUE },
#endif
- { NULL, NULL, FALSE }
+ { NULL, NULL, FALSE, FALSE }
};
/* Unaccenting-related tests */
static const TestDataExpectedWord test_data_unaccent[] = {
- { "Murciélago", "murcielago", FALSE },
- { "camión", "camion", FALSE },
- { "desagüe", "desague", FALSE },
- { NULL, NULL, FALSE }
+ { "Murciélago", "murcielago", FALSE, TRUE },
+ { "camión", "camion", FALSE, TRUE },
+ { "desagüe", "desague", FALSE, TRUE },
+ { "Murciélago", "murciélago", FALSE, FALSE },
+ { "camión", "camión", FALSE, FALSE },
+ { "desagüe", "desagüe", FALSE, FALSE },
+ { NULL, NULL, FALSE, FALSE }
};
#else
/* Normalization-related tests (not unaccenting) */
static const TestDataExpectedWord test_data_normalization[] = {
- { "école", "école", FALSE },
- { "ÉCOLE", "école", FALSE },
- { "École", "école", FALSE },
+ { "école", "école", FALSE, FALSE },
+ { "ÉCOLE", "école", FALSE, FALSE },
+ { "École", "école", FALSE, FALSE },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
- { "e" "\xCC\x81" "cole", "école", FALSE },
- { "E" "\xCC\x81" "COLE", "école", FALSE },
- { "E" "\xCC\x81" "cole", "école", FALSE },
+ { "e" "\xCC\x81" "cole", "école", FALSE, FALSE },
+ { "E" "\xCC\x81" "COLE", "école", FALSE, FALSE },
+ { "E" "\xCC\x81" "cole", "école", FALSE, FALSE },
#endif
- { NULL, NULL, FALSE }
-};
+ { "école", "école", FALSE, TRUE },
+ { "ÉCOLE", "école", FALSE, TRUE },
+ { "École", "école", FALSE, TRUE },
+#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
+ { "e" "\xCC\x81" "cole", "école", FALSE, TRUE },
+ { "E" "\xCC\x81" "COLE", "école", FALSE, TRUE },
+ { "E" "\xCC\x81" "cole", "école", FALSE, TRUE },
#endif
+ { NULL, NULL, FALSE, FALSE }
+};
+#endif /* !HAVE_UNAC */
/* Stemming-related tests */
static const TestDataExpectedWord test_data_stemming[] = {
diff --git a/tests/libtracker-fts/tracker-parser.c b/tests/libtracker-fts/tracker-parser.c
index 1c38215..0aaf6c4 100644
--- a/tests/libtracker-fts/tracker-parser.c
+++ b/tests/libtracker-fts/tracker-parser.c
@@ -29,10 +29,6 @@
#include <libtracker-fts/tracker-fts-config.h>
#include <libtracker-common/tracker-common.h>
-
-#define DEFAULT_MAX_WORD_LENGTH 30
-
-static gint max_word_length = DEFAULT_MAX_WORD_LENGTH;
static gchar *text;
static gchar *filename;
static gboolean verbose;
@@ -46,12 +42,6 @@ static const GOptionEntry options [] = {
NULL
},
{
- "max-word-length", 'm', 0,
- G_OPTION_ARG_INT, &max_word_length,
- "Maximum word length to consider",
- NULL
- },
- {
"text", 't', 0,
G_OPTION_ARG_STRING, &text,
"Specific text to parse",
@@ -119,13 +109,17 @@ load_file_contents (void)
static gboolean
run_parsing (void)
{
+ TrackerFTSConfig *config;
TrackerLanguage *language;
- TrackerParser *parser;
- GTimer *timer;
+ TrackerParser *parser;
+ GTimer *timer;
/* Initialize timing */
timer = g_timer_new ();
+ /* Read config file */
+ config = tracker_fts_config_new ();
+
/* Setup language for parser */
language = tracker_language_new (NULL);
if (!language) {
@@ -135,22 +129,22 @@ run_parsing (void)
/* Create the parser */
parser = tracker_parser_new (language,
- max_word_length);
+ tracker_fts_config_get_max_word_length (config));
if (!parser) {
g_printerr ("Parser creation failed!\n");
g_object_unref (language);
return FALSE;
}
- /* Reset the parser with our string */
+ /* Reset the parser with our string, reading the current FTS config */
tracker_parser_reset (parser,
text,
strlen (text),
+ tracker_fts_config_get_enable_stemmer (config),
+ tracker_fts_config_get_enable_unaccent (config),
+ tracker_fts_config_get_ignore_stop_words (config),
TRUE,
- TRUE,
- TRUE,
- TRUE);
-
+ tracker_fts_config_get_ignore_numbers (config));
/* Loop through all words! */
while (1) {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]