tracker r1954 - in branches/indexer-split: . src/libtracker-common tests/libtracker-common
- From: ifrade svn gnome org
- To: svn-commits-list gnome org
- Subject: tracker r1954 - in branches/indexer-split: . src/libtracker-common tests/libtracker-common
- Date: Mon, 28 Jul 2008 13:34:44 +0000 (UTC)
Author: ifrade
Date: Mon Jul 28 13:34:44 2008
New Revision: 1954
URL: http://svn.gnome.org/viewvc/tracker?rev=1954&view=rev
Log:
Added stop word filter
Modified:
branches/indexer-split/ChangeLog
branches/indexer-split/src/libtracker-common/tracker-language.c
branches/indexer-split/src/libtracker-common/tracker-parser.c
branches/indexer-split/tests/libtracker-common/tracker-parser-test.c
Modified: branches/indexer-split/src/libtracker-common/tracker-language.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-language.c (original)
+++ branches/indexer-split/src/libtracker-common/tracker-language.c Mon Jul 28 13:34:44 2008
@@ -425,7 +425,7 @@
priv = GET_PRIV (language);
if (!tracker_config_get_enable_stemmer (priv->config)) {
- return NULL;
+ return g_strdup (word);
}
g_mutex_lock (priv->stemmer_mutex);
Modified: branches/indexer-split/src/libtracker-common/tracker-parser.c
==============================================================================
--- branches/indexer-split/src/libtracker-common/tracker-parser.c (original)
+++ branches/indexer-split/src/libtracker-common/tracker-parser.c Mon Jul 28 13:34:44 2008
@@ -144,6 +144,18 @@
return FALSE;
}
+static gboolean
+is_stop_word (TrackerLanguage *language,
+ const gchar *word)
+{
+ GHashTable *stop_words;
+
+ stop_words = tracker_language_get_stop_words (language);
+
+ return (g_hash_table_lookup (stop_words, word) != NULL);
+}
+
+
static const gchar *
analyze_text (const gchar *text,
TrackerLanguage *language,
@@ -306,11 +318,13 @@
*index_word = tracker_language_stem_word (language,
tmp,
strlen (tmp));
- if (*index_word) {
- g_free (tmp);
- } else {
- *index_word = tmp;
+ g_free (tmp);
+
+ if (filter_words && is_stop_word (language, *index_word)) {
+ g_free (*index_word);
+ *index_word = NULL;
}
+
}
}
}
Modified: branches/indexer-split/tests/libtracker-common/tracker-parser-test.c
==============================================================================
--- branches/indexer-split/tests/libtracker-common/tracker-parser-test.c (original)
+++ branches/indexer-split/tests/libtracker-common/tracker-parser-test.c Mon Jul 28 13:34:44 2008
@@ -9,8 +9,9 @@
/*
* len(word) > 3 : 6 words
* longest word: 10 chars
+ * stop words ("here", "a", "of", "various", "to", "after")
*/
-#define SAMPLE_TEXT "Here a good collection of various words to parse 12345678"
+#define SAMPLE_TEXT "Here a good collection of various words to parse 12345678 after"
TrackerConfig *config;
TrackerLanguage *language;
@@ -72,7 +73,7 @@
3, /* min length of the word */
FALSE, FALSE); /* Filter / Delimit */
g_hash_table_foreach (result, assert_key_length, GINT_TO_POINTER (max_length));
- g_assert_cmpint (g_hash_table_size (result), ==, 7);
+ g_assert_cmpint (g_hash_table_size (result), ==, 8);
tracker_parser_text_free (result);
}
@@ -81,7 +82,7 @@
* Filter numbers
*/
static void
-test_parser_text_filter_numbers (void)
+test_parser_text_filter_numbers_stop_words (void)
{
GHashTable *result = NULL;
@@ -92,12 +93,12 @@
language,
100, /* max words to index */
100, /* max length of the word */
- 1, /* min length of the word */
+ 0, /* min length of the word */
TRUE, FALSE); /* Filter / Delimit */
g_assert (!g_hash_table_lookup (result, "12345678"));
- g_assert_cmpint (g_hash_table_size (result), ==, 9);
+ g_assert_cmpint (g_hash_table_size (result), ==, 4);
tracker_parser_text_free (result);
result = NULL;
@@ -109,10 +110,10 @@
language,
100, /* max words to index */
100, /* max length of the word */
- 1, /* min length of the word */
+ 0, /* min length of the word */
FALSE, FALSE); /* Filter / Delimit */
- g_assert_cmpint (g_hash_table_size (result), ==, 10);
+ g_assert_cmpint (g_hash_table_size (result), ==, 11);
g_assert (g_hash_table_lookup (result, "12345678"));
@@ -123,7 +124,7 @@
static void
test_parser_stop_words (void)
{
- GHashTable *stop_words;
+ GHashTable *stop_words, *result = NULL;
/* Check we have the default stop words */
stop_words = tracker_language_get_stop_words (language);
@@ -134,6 +135,14 @@
tracker_config_set_language (config, "en");
g_assert (g_hash_table_lookup (stop_words, "after"));
+ result = tracker_parser_text (result,
+ SAMPLE_TEXT,
+ 1,
+ language,
+ 100, /* max words to index */
+ 100, /* max length of the word */
+ 1, /* min length of the word */
+ TRUE, FALSE); /* Filter / Delimit */
}
int
@@ -156,7 +165,7 @@
test_parser_text_max_length);
g_test_add_func ("/libtracker-common/tracker-parser/parser_text/filter_numbers",
- test_parser_text_filter_numbers);
+ test_parser_text_filter_numbers_stop_words);
g_test_add_func ("/libtracker-common/tracker-parser/stop_words",
test_parser_stop_words);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]