[tracker] Fixes GB#619498: Check if stop word before stemming, not after

From: Aleksander Morgado <aleksm src gnome org>
To: commits-list gnome org
Cc:
Subject: [tracker] Fixes GB#619498: Check if stop word before stemming, not after
Date: Tue, 1 Jun 2010 11:48:19 +0000 (UTC)
commit 9f2c559895aa1cff437b020b65582107f22e8d15
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Fri May 28 11:55:17 2010 +0200

    Fixes GB#619498: Check if stop word before stemming, not after
    
    	* tests/libtracker-fts/tracker-parser-test.c: Added new unit tests
    	for stop-words.
    
    	* src/libtracker-common/tracker-language.c
    	* docs/manpages/tracker-store.1
    	New TRACKER_LANGUAGE_STOP_WORDS_DIR environment variable to specify
    	the path where stop words dictionaries are available, used for
    	testing.
    
    	* src/libtracker-fts/tracker-parser-glib.c
    	* src/libtracker-fts/tracker-parser-libicu.c
    	* src/libtracker-fts/tracker-parser-libunistring.c
    	Check for stopword is done before the word stemming, if applicable.

 docs/manpages/tracker-store.1                    |    6 ++
 src/libtracker-common/tracker-language.c         |   23 ++++--
 src/libtracker-fts/tracker-parser-glib.c         |   34 ++++----
 src/libtracker-fts/tracker-parser-libicu.c       |   28 ++++---
 src/libtracker-fts/tracker-parser-libunistring.c |   30 ++++---
 tests/libtracker-fts/tracker-parser-test.c       |  102 +++++++++++++++++++---
 6 files changed, 165 insertions(+), 58 deletions(-)
---
diff --git a/docs/manpages/tracker-store.1 b/docs/manpages/tracker-store.1
index adb9091..50dbdb4 100644
--- a/docs/manpages/tracker-store.1
+++ b/docs/manpages/tracker-store.1
@@ -68,6 +68,12 @@ This is the directory which tracker uses to load the .ontology files
 from. If unset it will default to the correct place. This is used
 mainly for testing purposes.
 
+.TP
+.B TRACKER_LANGUAGE_STOP_WORDS_DIR
+This is the directory from which tracker loads the stop words
+dictionaries. If unset, it defaults to the installed tracker/languages
+data directory. This is used mainly for testing purposes.
+
 .SH FILES
 .I $HOME/.config/tracker/tracker-store.cfg
 .I $HOME/.config/tracker/tracker-fts.cfg
diff --git a/src/libtracker-common/tracker-language.c b/src/libtracker-common/tracker-language.c
index d67d13d..d06d3ab 100644
--- a/src/libtracker-common/tracker-language.c
+++ b/src/libtracker-common/tracker-language.c
@@ -218,15 +218,26 @@ language_get_stopword_filename (const gchar *language_code)
 {
 	gchar *str;
 	gchar *filename;
+	const gchar *testpath;
 
 	str = g_strconcat ("stopwords.", language_code, NULL);
-	filename = g_build_filename (SHAREDIR,
-	                             "tracker",
-	                             "languages",
-	                             str,
-	                             NULL);
-	g_free (str);
 
+	/* Check whether a test path for the stop words dictionaries was set
+	 *  (used during unit tests) */
+	testpath = g_getenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR");
+	if (!testpath) {
+		filename = g_build_filename (SHAREDIR,
+		                             "tracker",
+		                             "languages",
+		                             str,
+		                             NULL);
+	} else {
+		filename = g_build_filename (testpath,
+		                             str,
+		                             NULL);
+	}
+
+	g_free (str);
 	return filename;
 }
 
diff --git a/src/libtracker-fts/tracker-parser-glib.c b/src/libtracker-fts/tracker-parser-glib.c
index 670a46f..d521c9c 100644
--- a/src/libtracker-fts/tracker-parser-glib.c
+++ b/src/libtracker-fts/tracker-parser-glib.c
@@ -166,7 +166,8 @@ static gchar *
 process_word_utf8 (TrackerParser *parser,
 		   const gchar   *word,
 		   gint           length,
-		   gboolean       do_strip)
+                   gboolean       do_strip,
+                   gboolean      *stop_word)
 {
 	gchar *stem_word;
 	gchar *str;
@@ -221,6 +222,12 @@ process_word_utf8 (TrackerParser *parser,
 			return NULL;
 		}
 
+		/* Check if stop word */
+		if (parser->ignore_stop_words) {
+			*stop_word = tracker_language_is_stop_word (parser->language,
+			                                            str);
+		}
+
 		if (!parser->enable_stemmer) {
 			return str;
 		}
@@ -306,7 +313,8 @@ pango_next (TrackerParser *parser,
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
-             gint          *byte_offset_end)
+             gint          *byte_offset_end,
+             gboolean      *stop_word)
 {
 	TrackerParserWordType word_type;
 	gunichar              word[64];
@@ -477,7 +485,7 @@ parser_next (TrackerParser *parser,
 
 		parser->cursor = parser->txt + *byte_offset_end;
 
-		processed_word = process_word_utf8 (parser, utf8, bytes, do_strip);
+		processed_word = process_word_utf8 (parser, utf8, bytes, do_strip, stop_word);
 		g_free (utf8);
 
 		if (processed_word) {
@@ -589,33 +597,27 @@ tracker_parser_next (TrackerParser *parser,
                      gint          *word_length)
 {
 	const gchar  *str;
-	gint     byte_start = 0, byte_end = 0;
+	gint byte_start = 0, byte_end = 0;
 
 	str = NULL;
 
 	g_free (parser->word);
 	parser->word = NULL;
 
+	*stop_word = FALSE;
+
 	if (parser->encoding == TRACKER_PARSER_ENCODING_CJK) {
 		if (pango_next (parser, &byte_start, &byte_end)) {
 			str = parser->word;
 		}
-		parser->word_position++;
-
-		*stop_word = FALSE;
 	} else {
-		if (parser_next (parser, &byte_start, &byte_end)) {
+		if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
 			str = parser->word;
 		}
+	}
 
-		if (str &&
-		    parser->ignore_stop_words &&
-		    tracker_language_is_stop_word (parser->language, str)) {
-			*stop_word = TRUE;
-		} else {
-			parser->word_position++;
-			*stop_word = FALSE;
-		}
+	if (!*stop_word) {
+		parser->word_position++;
 	}
 
 	*word_length = parser->word_length;
diff --git a/src/libtracker-fts/tracker-parser-libicu.c b/src/libtracker-fts/tracker-parser-libicu.c
index 3e1ad98..55151ec 100644
--- a/src/libtracker-fts/tracker-parser-libicu.c
+++ b/src/libtracker-fts/tracker-parser-libicu.c
@@ -142,7 +142,8 @@ static gchar *
 process_word_uchar (TrackerParser         *parser,
                     const UChar           *word,
                     gint                   length,
-                    TrackerParserWordType  type)
+                    TrackerParserWordType  type,
+                    gboolean              *stop_word)
 {
 	UErrorCode error = U_ZERO_ERROR;
 	UChar normalized_buffer [WORD_BUFFER_LENGTH];
@@ -279,6 +280,12 @@ process_word_uchar (TrackerParser         *parser,
 					    new_word_length);
 	}
 
+	/* Check if stop word */
+	if (parser->ignore_stop_words) {
+		*stop_word = tracker_language_is_stop_word (parser->language,
+		                                            utf8_str);
+	}
+
 	/* Stemming needed? */
 	if (parser->enable_stemmer) {
 		/* Input for stemmer ALWAYS in UTF-8, as well as output */
@@ -303,7 +310,8 @@ process_word_uchar (TrackerParser         *parser,
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
-             gint          *byte_offset_end)
+             gint          *byte_offset_end,
+             gboolean      *stop_word)
 {
 	gsize word_length_uchar = 0;
 	gsize word_length_utf8 = 0;
@@ -398,7 +406,8 @@ parser_next (TrackerParser *parser,
 		processed_word = process_word_uchar (parser,
 		                                     &(parser->utxt[parser->cursor]),
 		                                     truncated_length,
-		                                     type);
+		                                     type,
+		                                     stop_word);
 		if (!processed_word) {
 			/* Ignore this word and keep on looping */
 			parser->cursor = next_word_offset_uchar;
@@ -563,24 +572,21 @@ tracker_parser_next (TrackerParser *parser,
                      gint          *word_length)
 {
 	const gchar  *str;
-	gint     byte_start = 0, byte_end = 0;
+	gint byte_start = 0, byte_end = 0;
 
 	str = NULL;
 
 	g_free (parser->word);
 	parser->word = NULL;
 
-	if (parser_next (parser, &byte_start, &byte_end)) {
+	*stop_word = FALSE;
+
+	if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
 		str = parser->word;
 	}
 
-	if (str &&
-	    parser->ignore_stop_words &&
-	    tracker_language_is_stop_word (parser->language, str)) {
-		*stop_word = TRUE;
-	} else {
+	if (!*stop_word) {
 		parser->word_position++;
-		*stop_word = FALSE;
 	}
 
 	*word_length = parser->word_length;
diff --git a/src/libtracker-fts/tracker-parser-libunistring.c b/src/libtracker-fts/tracker-parser-libunistring.c
index 67dda5f..07f638d 100644
--- a/src/libtracker-fts/tracker-parser-libunistring.c
+++ b/src/libtracker-fts/tracker-parser-libunistring.c
@@ -142,8 +142,9 @@ get_word_info (TrackerParser         *parser,
 static gchar *
 process_word_utf8 (TrackerParser         *parser,
                    const gchar           *word,
-                   gint                  length,
-                   TrackerParserWordType type)
+                   gint                   length,
+                   TrackerParserWordType  type,
+                   gboolean              *stop_word)
 {
 	gchar word_buffer [WORD_BUFFER_LENGTH];
 	gchar *normalized = NULL;
@@ -229,6 +230,12 @@ process_word_utf8 (TrackerParser         *parser,
 		}
 	}
 
+	/* Check if stop word */
+	if (parser->ignore_stop_words) {
+		*stop_word = tracker_language_is_stop_word (parser->language,
+		                                            stripped ? stripped : normalized);
+	}
+
 	/* Stemming needed? */
 	if (parser->enable_stemmer) {
 		stemmed = tracker_language_stem_word (parser->language,
@@ -266,7 +273,8 @@ process_word_utf8 (TrackerParser         *parser,
 static gboolean
 parser_next (TrackerParser *parser,
              gint          *byte_offset_start,
-             gint          *byte_offset_end)
+             gint          *byte_offset_end,
+             gboolean      *stop_word)
 {
 	gsize word_length = 0;
 	gchar *processed_word = NULL;
@@ -328,7 +336,8 @@ parser_next (TrackerParser *parser,
 		processed_word = process_word_utf8 (parser,
 		                                    &(parser->txt[parser->cursor]),
 		                                    truncated_length,
-		                                    type);
+		                                    type,
+		                                    stop_word);
 		if (!processed_word) {
 			/* Ignore this word and keep on looping */
 			parser->cursor += word_length;
@@ -443,24 +452,21 @@ tracker_parser_next (TrackerParser *parser,
                      gint          *word_length)
 {
 	const gchar  *str;
-	gint     byte_start = 0, byte_end = 0;
+	gint byte_start = 0, byte_end = 0;
 
 	str = NULL;
 
 	g_free (parser->word);
 	parser->word = NULL;
 
-	if (parser_next (parser, &byte_start, &byte_end)) {
+	*stop_word = FALSE;
+
+	if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
 		str = parser->word;
 	}
 
-	if (str &&
-	    parser->ignore_stop_words &&
-	    tracker_language_is_stop_word (parser->language, str)) {
-		*stop_word = TRUE;
-	} else {
+	if (!*stop_word) {
 		parser->word_position++;
-		*stop_word = FALSE;
 	}
 
 	*word_length = parser->word_length;
diff --git a/tests/libtracker-fts/tracker-parser-test.c b/tests/libtracker-fts/tracker-parser-test.c
index c34fc08..813ce38 100644
--- a/tests/libtracker-fts/tracker-parser-test.c
+++ b/tests/libtracker-fts/tracker-parser-test.c
@@ -48,15 +48,15 @@
 /* Fixture object type */
 typedef struct {
 	/* The parser object */
-	TrackerParser    *parser;
+	TrackerParser *parser;
 
 	/* Default parser configuration to use */
-	gint              max_word_length;
-	gboolean          enable_stemmer;
-	gboolean          enable_unaccent;
-	gboolean          ignore_stop_words;
-	gboolean          ignore_reserved_words;
-	gboolean          ignore_numbers;
+	gint max_word_length;
+	gboolean enable_stemmer;
+	gboolean enable_unaccent;
+	gboolean ignore_stop_words;
+	gboolean ignore_reserved_words;
+	gboolean ignore_numbers;
 } TrackerParserTestFixture;
 
 /* Common setup for all tests */
@@ -109,8 +109,8 @@ test_common_teardown (TrackerParserTestFixture *fixture,
 typedef struct TestDataExpectedNWords TestDataExpectedNWords;
 struct TestDataExpectedNWords {
 	const gchar *str;
-	gboolean     ignore_numbers;
-	guint        expected_nwords;
+	gboolean ignore_numbers;
+	guint expected_nwords;
 };
 
 /* Common expected_word test method */
@@ -157,10 +157,10 @@ expected_nwords_check (TrackerParserTestFixture *fixture,
 /* Test struct for the expected-word tests */
 typedef struct TestDataExpectedWord TestDataExpectedWord;
 struct TestDataExpectedWord {
-	const gchar  *str;
-	const gchar  *expected;
-	gboolean      enable_stemmer;
-	gboolean      enable_unaccent;
+	const gchar *str;
+	const gchar *expected;
+	gboolean enable_stemmer;
+	gboolean enable_unaccent;
 };
 
 /* Common expected_word test method */
@@ -199,6 +199,52 @@ expected_word_check (TrackerParserTestFixture *fixture,
 	g_assert_cmpstr (word, == , testdata->expected);
 }
 
+/* -------------- STOP WORD TESTS ----------------- */
+
+/* Test struct for the stop-word tests */
+typedef struct TestDataStopWord TestDataStopWord;
+struct TestDataStopWord {
+	const gchar *str;
+	gboolean ignore_stop_words;
+	gboolean is_expected_stop_word;
+};
+
+/* Common stop_word test method */
+static void
+stop_word_check (TrackerParserTestFixture *fixture,
+                 gconstpointer data)
+{
+	const TestDataStopWord *testdata = data;
+	const gchar *word;
+	gint position;
+	gint byte_offset_start;
+	gint byte_offset_end;
+	gboolean stop_word;
+	gint word_length;
+
+	/* Reset the parser with our string */
+	tracker_parser_reset (fixture->parser,
+	                      testdata->str,
+	                      strlen (testdata->str),
+	                      fixture->max_word_length,
+	                      fixture->enable_stemmer,
+	                      fixture->enable_unaccent,
+	                      testdata->ignore_stop_words,
+	                      fixture->ignore_reserved_words,
+	                      fixture->ignore_numbers);
+
+	/* Process next word */
+	word = tracker_parser_next (fixture->parser,
+	                            &position,
+	                            &byte_offset_start,
+	                            &byte_offset_end,
+	                            &stop_word,
+	                            &word_length);
+
+	/* Check that the reported stop word flag matches the expected one */
+	g_assert_cmpuint (stop_word, == , testdata->is_expected_stop_word);
+}
+
 /* -------------- LIST OF TESTS ----------------- */
 
 #ifdef HAVE_UNAC
@@ -284,6 +330,15 @@ static const TestDataExpectedNWords test_data_nwords[] = {
 	{ NULL,                                                     FALSE,  0 }
 };
 
+/* Stop-word tests (for english only) */
+static const TestDataStopWord test_data_stop_words[] = {
+	{ "hello", TRUE,  TRUE  }, /* hello is stop word */
+	{ "hello", FALSE, FALSE },
+	{ "world", TRUE,  FALSE }, /* world is not stop word */
+	{ "world", FALSE, FALSE },
+	{ NULL,    FALSE, FALSE }
+};
+
 int
 main (int argc, char **argv)
 {
@@ -295,6 +350,13 @@ main (int argc, char **argv)
 	}
 	g_test_init (&argc, &argv, NULL);
 
+	/* We want the tests to properly find the stopwords dictionaries, so we
+	 *  need to set the following envvar with the path where the
+	 *  dictionaries are. */
+	g_setenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR",
+	          TOP_SRCDIR "/data/languages",
+	          TRUE);
+
 	/* Add normalization checks */
 	for (i = 0; test_data_normalization[i].str != NULL; i++) {
 		gchar *testpath;
@@ -367,5 +429,19 @@ main (int argc, char **argv)
 		g_free (testpath);
 	}
 
+	/* Add stop word checks */
+	for (i = 0; test_data_stop_words[i].str != NULL; i++) {
+		gchar *testpath;
+
+		testpath = g_strdup_printf ("/libtracker-fts/parser/stop_words_%d", i);
+		g_test_add (testpath,
+		            TrackerParserTestFixture,
+		            &test_data_stop_words[i],
+		            test_common_setup,
+		            stop_word_check,
+		            test_common_teardown);
+		g_free (testpath);
+	}
+
 	return g_test_run ();
 }
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]