[tracker] Fixes GB#619498: Check if stop word before stemming, not after
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] Fixes GB#619498: Check if stop word before stemming, not after
- Date: Tue, 1 Jun 2010 11:48:19 +0000 (UTC)
commit 9f2c559895aa1cff437b020b65582107f22e8d15
Author: Aleksander Morgado <aleksander lanedo com>
Date: Fri May 28 11:55:17 2010 +0200
Fixes GB#619498: Check if stop word before stemming, not after
* tests/libtracker-fts/tracker-parser-test.c: Added new unit tests
for stop-words.
* src/libtracker-common/tracker-language.c
* docs/manpages/tracker-store.1
New TRACKER_LANGUAGE_STOP_WORDS_DIR environment variable to specify
the path where stop words dictionaries are available, used for
testing.
* src/libtracker-fts/tracker-parser-glib.c
* src/libtracker-fts/tracker-parser-libicu.c
* src/libtracker-fts/tracker-parser-libunistring.c
The stop-word check is now done before word stemming, if applicable.
docs/manpages/tracker-store.1 | 6 ++
src/libtracker-common/tracker-language.c | 23 ++++--
src/libtracker-fts/tracker-parser-glib.c | 34 ++++----
src/libtracker-fts/tracker-parser-libicu.c | 28 ++++---
src/libtracker-fts/tracker-parser-libunistring.c | 30 ++++---
tests/libtracker-fts/tracker-parser-test.c | 102 +++++++++++++++++++---
6 files changed, 165 insertions(+), 58 deletions(-)
---
diff --git a/docs/manpages/tracker-store.1 b/docs/manpages/tracker-store.1
index adb9091..50dbdb4 100644
--- a/docs/manpages/tracker-store.1
+++ b/docs/manpages/tracker-store.1
@@ -68,6 +68,12 @@ This is the directory which tracker uses to load the .ontology files
from. If unset it will default to the correct place. This is used
mainly for testing purposes.
+.TP
+.B TRACKER_LANGUAGE_STOP_WORDS_DIR
+This is the directory which tracker uses to load the stop words
+dictionaries from. If unset it will default to the correct place. This
+is used mainly for testing purposes.
+
.SH FILES
.I $HOME/.config/tracker/tracker-store.cfg
.I $HOME/.config/tracker/tracker-fts.cfg
diff --git a/src/libtracker-common/tracker-language.c b/src/libtracker-common/tracker-language.c
index d67d13d..d06d3ab 100644
--- a/src/libtracker-common/tracker-language.c
+++ b/src/libtracker-common/tracker-language.c
@@ -218,15 +218,26 @@ language_get_stopword_filename (const gchar *language_code)
{
gchar *str;
gchar *filename;
+ const gchar *testpath;
str = g_strconcat ("stopwords.", language_code, NULL);
- filename = g_build_filename (SHAREDIR,
- "tracker",
- "languages",
- str,
- NULL);
- g_free (str);
+ /* Check whether a test path for the stop words dictionaries
+ * was set (used during unit tests) */
+ testpath = g_getenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR");
+ if (!testpath) {
+ filename = g_build_filename (SHAREDIR,
+ "tracker",
+ "languages",
+ str,
+ NULL);
+ } else {
+ filename = g_build_filename (testpath,
+ str,
+ NULL);
+ }
+
+ g_free (str);
return filename;
}
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index 670a46f..d521c9c 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -166,7 +166,8 @@ static gchar *
process_word_utf8 (TrackerParser *parser,
const gchar *word,
gint length,
- gboolean do_strip)
+ gboolean do_strip,
+ gboolean *stop_word)
{
gchar *stem_word;
gchar *str;
@@ -221,6 +222,12 @@ process_word_utf8 (TrackerParser *parser,
return NULL;
}
+ /* Check if stop word */
+ if (parser->ignore_stop_words) {
+ *stop_word = tracker_language_is_stop_word (parser->language,
+ str);
+ }
+
if (!parser->enable_stemmer) {
return str;
}
@@ -306,7 +313,8 @@ pango_next (TrackerParser *parser,
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
- gint *byte_offset_end)
+ gint *byte_offset_end,
+ gboolean *stop_word)
{
TrackerParserWordType word_type;
gunichar word[64];
@@ -477,7 +485,7 @@ parser_next (TrackerParser *parser,
parser->cursor = parser->txt + *byte_offset_end;
- processed_word = process_word_utf8 (parser, utf8, bytes, do_strip);
+ processed_word = process_word_utf8 (parser, utf8, bytes, do_strip, stop_word);
g_free (utf8);
if (processed_word) {
@@ -589,33 +597,27 @@ tracker_parser_next (TrackerParser *parser,
gint *word_length)
{
const gchar *str;
- gint byte_start = 0, byte_end = 0;
+ gint byte_start = 0, byte_end = 0;
str = NULL;
g_free (parser->word);
parser->word = NULL;
+ *stop_word = FALSE;
+
if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
if (pango_next (parser, &byte_start, &byte_end)) {
str = parser->word;
}
- parser->word_position++;
-
- *stop_word = FALSE;
} else {
- if (parser_next (parser, &byte_start, &byte_end)) {
+ if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
str = parser->word;
}
+ }
- if (str &&
- parser->ignore_stop_words &&
- tracker_language_is_stop_word (parser->language, str)) {
- *stop_word = TRUE;
- } else {
- parser->word_position++;
- *stop_word = FALSE;
- }
+ if (!*stop_word) {
+ parser->word_position++;
}
*word_length = parser->word_length;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 3e1ad98..55151ec 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -142,7 +142,8 @@ static gchar *
process_word_uchar (TrackerParser *parser,
const UChar *word,
gint length,
- TrackerParserWordType type)
+ TrackerParserWordType type,
+ gboolean *stop_word)
{
UErrorCode error = U_ZERO_ERROR;
UChar normalized_buffer [WORD_BUFFER_LENGTH];
@@ -279,6 +280,12 @@ process_word_uchar (TrackerParser *parser,
new_word_length);
}
+ /* Check if stop word */
+ if (parser->ignore_stop_words) {
+ *stop_word = tracker_language_is_stop_word (parser->language,
+ utf8_str);
+ }
+
/* Stemming needed? */
if (parser->enable_stemmer) {
/* Input for stemmer ALWAYS in UTF-8, as well as output */
@@ -303,7 +310,8 @@ process_word_uchar (TrackerParser *parser,
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
- gint *byte_offset_end)
+ gint *byte_offset_end,
+ gboolean *stop_word)
{
gsize word_length_uchar = 0;
gsize word_length_utf8 = 0;
@@ -398,7 +406,8 @@ parser_next (TrackerParser *parser,
processed_word = process_word_uchar (parser,
&(parser->utxt[parser->cursor]),
truncated_length,
- type);
+ type,
+ stop_word);
if (!processed_word) {
/* Ignore this word and keep on looping */
parser->cursor = next_word_offset_uchar;
@@ -563,24 +572,21 @@ tracker_parser_next (TrackerParser *parser,
gint *word_length)
{
const gchar *str;
- gint byte_start = 0, byte_end = 0;
+ gint byte_start = 0, byte_end = 0;
str = NULL;
g_free (parser->word);
parser->word = NULL;
- if (parser_next (parser, &byte_start, &byte_end)) {
+ *stop_word = FALSE;
+
+ if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
str = parser->word;
}
- if (str &&
- parser->ignore_stop_words &&
- tracker_language_is_stop_word (parser->language, str)) {
- *stop_word = TRUE;
- } else {
+ if (!*stop_word) {
parser->word_position++;
- *stop_word = FALSE;
}
*word_length = parser->word_length;
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 67dda5f..07f638d 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -142,8 +142,9 @@ get_word_info (TrackerParser *parser,
static gchar *
process_word_utf8 (TrackerParser *parser,
const gchar *word,
- gint length,
- TrackerParserWordType type)
+ gint length,
+ TrackerParserWordType type,
+ gboolean *stop_word)
{
gchar word_buffer [WORD_BUFFER_LENGTH];
gchar *normalized = NULL;
@@ -229,6 +230,12 @@ process_word_utf8 (TrackerParser *parser,
}
}
+ /* Check if stop word */
+ if (parser->ignore_stop_words) {
+ *stop_word = tracker_language_is_stop_word (parser->language,
+ stripped ? stripped : normalized);
+ }
+
/* Stemming needed? */
if (parser->enable_stemmer) {
stemmed = tracker_language_stem_word (parser->language,
@@ -266,7 +273,8 @@ process_word_utf8 (TrackerParser *parser,
static gboolean
parser_next (TrackerParser *parser,
gint *byte_offset_start,
- gint *byte_offset_end)
+ gint *byte_offset_end,
+ gboolean *stop_word)
{
gsize word_length = 0;
gchar *processed_word = NULL;
@@ -328,7 +336,8 @@ parser_next (TrackerParser *parser,
processed_word = process_word_utf8 (parser,
&(parser->txt[parser->cursor]),
truncated_length,
- type);
+ type,
+ stop_word);
if (!processed_word) {
/* Ignore this word and keep on looping */
parser->cursor += word_length;
@@ -443,24 +452,21 @@ tracker_parser_next (TrackerParser *parser,
gint *word_length)
{
const gchar *str;
- gint byte_start = 0, byte_end = 0;
+ gint byte_start = 0, byte_end = 0;
str = NULL;
g_free (parser->word);
parser->word = NULL;
- if (parser_next (parser, &byte_start, &byte_end)) {
+ *stop_word = FALSE;
+
+ if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
str = parser->word;
}
- if (str &&
- parser->ignore_stop_words &&
- tracker_language_is_stop_word (parser->language, str)) {
- *stop_word = TRUE;
- } else {
+ if (!*stop_word) {
parser->word_position++;
- *stop_word = FALSE;
}
*word_length = parser->word_length;
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index c34fc08..813ce38 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -48,15 +48,15 @@
/* Fixture object type */
typedef struct {
/* The parser object */
- TrackerParser *parser;
+ TrackerParser *parser;
/* Default parser configuration to use */
- gint max_word_length;
- gboolean enable_stemmer;
- gboolean enable_unaccent;
- gboolean ignore_stop_words;
- gboolean ignore_reserved_words;
- gboolean ignore_numbers;
+ gint max_word_length;
+ gboolean enable_stemmer;
+ gboolean enable_unaccent;
+ gboolean ignore_stop_words;
+ gboolean ignore_reserved_words;
+ gboolean ignore_numbers;
} TrackerParserTestFixture;
/* Common setup for all tests */
@@ -109,8 +109,8 @@ test_common_teardown (TrackerParserTestFixture *fixture,
typedef struct TestDataExpectedNWords TestDataExpectedNWords;
struct TestDataExpectedNWords {
const gchar *str;
- gboolean ignore_numbers;
- guint expected_nwords;
+ gboolean ignore_numbers;
+ guint expected_nwords;
};
/* Common expected_word test method */
@@ -157,10 +157,10 @@ expected_nwords_check (TrackerParserTestFixture *fixture,
/* Test struct for the expected-word tests */
typedef struct TestDataExpectedWord TestDataExpectedWord;
struct TestDataExpectedWord {
- const gchar *str;
- const gchar *expected;
- gboolean enable_stemmer;
- gboolean enable_unaccent;
+ const gchar *str;
+ const gchar *expected;
+ gboolean enable_stemmer;
+ gboolean enable_unaccent;
};
/* Common expected_word test method */
@@ -199,6 +199,52 @@ expected_word_check (TrackerParserTestFixture *fixture,
g_assert_cmpstr (word, == , testdata->expected);
}
+/* -------------- STOP WORD TESTS ----------------- */
+
+/* Test struct for the stop-word tests */
+typedef struct TestDataStopWord TestDataStopWord;
+struct TestDataStopWord {
+ const gchar *str;
+ gboolean ignore_stop_words;
+ gboolean is_expected_stop_word;
+};
+
+/* Common stop_word test method */
+static void
+stop_word_check (TrackerParserTestFixture *fixture,
+ gconstpointer data)
+{
+ const TestDataStopWord *testdata = data;
+ const gchar *word;
+ gint position;
+ gint byte_offset_start;
+ gint byte_offset_end;
+ gboolean stop_word;
+ gint word_length;
+
+ /* Reset the parser with our string */
+ tracker_parser_reset (fixture->parser,
+ testdata->str,
+ strlen (testdata->str),
+ fixture->max_word_length,
+ fixture->enable_stemmer,
+ fixture->enable_unaccent,
+ testdata->ignore_stop_words,
+ fixture->ignore_reserved_words,
+ fixture->ignore_numbers);
+
+ /* Process next word */
+ word = tracker_parser_next (fixture->parser,
+ &position,
+ &byte_offset_start,
+ &byte_offset_end,
+ &stop_word,
+ &word_length);
+
+ /* Check that the reported stop_word flag matches the expected value */
+ g_assert_cmpuint (stop_word, == , testdata->is_expected_stop_word);
+}
+
/* -------------- LIST OF TESTS ----------------- */
#ifdef HAVE_UNAC
@@ -284,6 +330,15 @@ static const TestDataExpectedNWords test_data_nwords[] = {
{ NULL, FALSE, 0 }
};
+/* Stop-word tests (for english only) */
+static const TestDataStopWord test_data_stop_words[] = {
+ { "hello", TRUE, TRUE }, /* hello is stop word */
+ { "hello", FALSE, FALSE },
+ { "world", TRUE, FALSE }, /* world is not stop word */
+ { "world", FALSE, FALSE },
+ { NULL, FALSE, FALSE }
+};
+
int
main (int argc, char **argv)
{
@@ -295,6 +350,13 @@ main (int argc, char **argv)
}
g_test_init (&argc, &argv, NULL);
+ /* We want the tests to properly find the stopwords dictionaries, so we
+ * need to set the following envvar with the path where the
+ * dictionaries are. */
+ g_setenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR",
+ TOP_SRCDIR "/data/languages",
+ TRUE);
+
/* Add normalization checks */
for (i = 0; test_data_normalization[i].str != NULL; i++) {
gchar *testpath;
@@ -367,5 +429,19 @@ main (int argc, char **argv)
g_free (testpath);
}
+ /* Add stop word checks */
+ for (i = 0; test_data_stop_words[i].str != NULL; i++) {
+ gchar *testpath;
+
+ testpath = g_strdup_printf ("/libtracker-fts/parser/stop_words_%d", i);
+ g_test_add (testpath,
+ TrackerParserTestFixture,
+ &test_data_stop_words[i],
+ test_common_setup,
+ stop_word_check,
+ test_common_teardown);
+ g_free (testpath);
+ }
+
return g_test_run ();
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]