tracker r1840 - in branches/indexer-split: . src/tracker-indexer tests/tracker-indexer
- From: ifrade svn gnome org
- To: svn-commits-list gnome org
- Subject: tracker r1840 - in branches/indexer-split: . src/tracker-indexer tests/tracker-indexer
- Date: Wed, 2 Jul 2008 17:25:47 +0000 (UTC)
Author: ifrade
Date: Wed Jul 2 17:25:47 2008
New Revision: 1840
URL: http://svn.gnome.org/viewvc/tracker?rev=1840&view=rev
Log:
Implemented word additions in the index. Included unit tests
Modified:
branches/indexer-split/ChangeLog
branches/indexer-split/src/tracker-indexer/tracker-index.c
branches/indexer-split/tests/tracker-indexer/tracker-index-test.c
Modified: branches/indexer-split/src/tracker-indexer/tracker-index.c
==============================================================================
--- branches/indexer-split/src/tracker-indexer/tracker-index.c (original)
+++ branches/indexer-split/src/tracker-indexer/tracker-index.c Wed Jul 2 17:25:47 2008
@@ -42,8 +42,28 @@
* metadata */
};
+/* These functions will also be used in the search code! */
+/*
+ * Extracts the 16-bit score packed into an index element.
+ *
+ * The score lives in bits 8-23 of element->amalgamated: bits 16-23 are the
+ * high byte and bits 8-15 the low byte.
+ *
+ * Returns the score as a signed 16-bit value.
+ */
+static inline gint16
+index_get_score (TrackerIndexElement *element)
+{
+	unsigned char a[2];
+
+	a[0] = (element->amalgamated >> 16) & 0xFF;
+	a[1] = (element->amalgamated >> 8) & 0xFF;
+
+	/* A cast binds tighter than '|': without the outer parentheses only
+	 * the shifted high byte was cast and the result relied on the implicit
+	 * conversion performed by the return statement. */
+	return (gint16) ((a[0] << 8) | a[1]);
+}
+
+
+/*
+ * Returns the service type kept in the most significant byte (bits 24-31)
+ * of element->amalgamated.
+ */
+static inline guint8
+index_get_service_type (TrackerIndexElement *element)
+{
+	guint32 packed = element->amalgamated;
+
+	return (guint8) ((packed >> 24) & 0xFF);
+}
+
+
static guint32
-tracker_index_calc_amalgamated (gint service,
+index_calc_amalgamated (gint service,
gint weight)
{
unsigned char a[4];
@@ -64,12 +84,14 @@
return (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3];
}
+
/* Releases a GArray together with its element storage (the TRUE argument
 * asks g_array_free to free the data segment as well).
 * NOTE(review): presumably the value-destroy callback of the word cache
 * hash table - confirm where the table is created. */
static void
free_cache_values (GArray *array)
{
g_array_free (array, TRUE);
}
+
TrackerIndex *
tracker_index_new (const gchar *file,
gint bucket_count)
@@ -87,6 +109,7 @@
return index;
}
+
void
tracker_index_free (TrackerIndex *index)
{
@@ -99,6 +122,7 @@
g_free (index);
}
+
void
tracker_index_add_word (TrackerIndex *index,
const gchar *word,
@@ -108,21 +132,156 @@
{
TrackerIndexElement elem;
GArray *array;
+ guint i, new_score;
+ TrackerIndexElement *current;
elem.id = service_id;
- elem.amalgamated = tracker_index_calc_amalgamated (service_type, weight);
+ elem.amalgamated = index_calc_amalgamated (service_type, weight);
array = g_hash_table_lookup (index->cache, word);
if (!array) {
- /* create the array if it didn't exist */
+ /* create the array if it didn't exist (first time we find the word) */
array = g_array_new (FALSE, TRUE, sizeof (TrackerIndexElement));
g_hash_table_insert (index->cache, g_strdup (word), array);
+ g_array_append_val (array, elem);
+ return;
+ }
+
+ /* It is not the first time we find the word */
+ for (i = 0; i < array->len; i++) {
+
+ current = &g_array_index (array, TrackerIndexElement, i);
+
+ if (current->id == service_id) {
+ /* The word was already found in the same service_id (file), increase score */
+ new_score = index_get_score (current) + weight;
+ current->amalgamated = index_calc_amalgamated (index_get_service_type (current),
+ new_score);
+ return;
+ }
}
+ /* First time in the file */
g_array_append_val (array, elem);
}
+
+/*
+ * Merges a set of hits for a word into the on-disk index.
+ *
+ * Use for deletes or updates of multiple entities when they are not new.
+ *
+ * @index:    QDBM depot holding one record of TrackerIndexElement per word
+ * @word:     key of the record to update
+ * @new_hits: array of TrackerIndexElement to merge into the record
+ *
+ * For every new hit whose id is already stored, the two scores are added
+ * (the new score may be negative) and the stored hit is removed when the
+ * sum drops below 1. New hits whose id is not stored yet are appended.
+ *
+ * Returns TRUE when the record was written, FALSE on invalid arguments or
+ * when the depot write failed (nothing is reported as stored in that case,
+ * so the caller can keep the hits and retry).
+ */
+static gboolean
+indexer_update_word (DEPOT *index,
+		     const gchar *word,
+		     GArray *new_hits)
+{
+	TrackerIndexElement *previous_hits;
+	GArray *pending_hits = NULL;
+	gboolean write_back = FALSE;
+	gboolean result = TRUE;
+	gint tsiz, old_hit_count, score;
+	gint i, k;
+	guint j;
+
+	g_return_val_if_fail (index, FALSE);
+	g_return_val_if_fail (word, FALSE);
+	g_return_val_if_fail (new_hits, FALSE);
+
+	previous_hits = (TrackerIndexElement *) dpget (index, word, -1, 0, MAX_HIT_BUFFER, &tsiz);
+
+	/* New word in the index */
+	if (previous_hits == NULL) {
+		result = dpput (index,
+				word, -1,
+				(char *) new_hits->data, (new_hits->len * sizeof (TrackerIndexElement)),
+				DP_DCAT);
+
+		if (!result) {
+			g_warning ("Could not store word: %s", word);
+			return FALSE;
+		}
+
+		return TRUE;
+	}
+
+	/* Word already exists */
+	old_hit_count = tsiz / sizeof (TrackerIndexElement);
+
+	for (j = 0; j < new_hits->len; j++) {
+		TrackerIndexElement *new_hit;
+		gboolean edited = FALSE;
+
+		new_hit = &g_array_index (new_hits, TrackerIndexElement, j);
+
+		for (i = 0; i < old_hit_count; i++) {
+			if (previous_hits[i].id != new_hit->id) {
+				continue;
+			}
+
+			write_back = TRUE;
+			edited = TRUE;
+
+			/* NB the score of the new hit can be negative */
+			score = index_get_score (&previous_hits[i]) + index_get_score (new_hit);
+
+			if (score < 1) {
+				/* Deletion: shift all subsequent records
+				 * in the array down one place */
+				for (k = i + 1; k < old_hit_count; k++) {
+					previous_hits[k - 1] = previous_hits[k];
+				}
+
+				old_hit_count--;
+			} else {
+				previous_hits[i].amalgamated = index_calc_amalgamated (index_get_service_type (&previous_hits[i]), score);
+			}
+
+			break;
+		}
+
+		/* Collect hits that could not be updated in place so they
+		 * can be appended later */
+		if (!edited) {
+			if (!pending_hits) {
+				pending_hits = g_array_new (FALSE, TRUE, sizeof (TrackerIndexElement));
+			}
+
+			g_array_append_val (pending_hits, *new_hit);
+			g_debug ("could not update word hit %s - appending", word);
+		}
+	}
+
+	if (write_back) {
+		const char *data = (const char *) previous_hits;
+		gint count = old_hit_count;
+
+		/* Existing hits were modified: store them, followed by any new
+		 * occurrences, with ONE overwrite. A single write cannot leave
+		 * the record half updated, so returning FALSE is safe to retry. */
+		if (pending_hits) {
+			if (old_hit_count > 0) {
+				g_array_prepend_vals (pending_hits, previous_hits, old_hit_count);
+			}
+
+			data = pending_hits->data;
+			count = pending_hits->len;
+		}
+
+		result = dpput (index,
+				word, -1,
+				data, (count * sizeof (TrackerIndexElement)),
+				DP_DOVER);
+	} else if (pending_hits) {
+		/* Nothing stored was touched: just append the new occurrences */
+		result = dpput (index,
+				word, -1,
+				(char *) pending_hits->data, (pending_hits->len * sizeof (TrackerIndexElement)),
+				DP_DCAT);
+	}
+
+	if (!result) {
+		g_warning ("Could not store word: %s", word);
+	}
+
+	if (pending_hits) {
+		g_array_free (pending_hits, TRUE);
+	}
+
+	/* NOTE(review): QDBM documents dpget memory as malloc'ed; g_free only
+	 * matches free when GLib uses the system allocator - confirm. */
+	g_free (previous_hits);
+
+	return result ? TRUE : FALSE;
+}
+
+
static gboolean
cache_flush_foreach (gpointer key,
gpointer value,
@@ -131,28 +290,13 @@
GArray *array;
DEPOT *index;
gchar *word;
-#if 0
- gchar *tmp;
- gint table_size;
-#endif
word = (gchar *) key;
array = (GArray *) value;
index = (DEPOT *) user_data;
-#if 0
- if ((tmp = dpget (index, word, -1, 0, MAX_HIT_BUFFER, &table_size)) != NULL) {
- /* FIXME: missing merge with previous values */
- }
-#endif
-
- if (!dpput (index, word, -1, (char *) array->data, (array->len * sizeof (TrackerIndexElement)), DP_DCAT)) {
- g_warning ("Could not store word: %s", word);
- return FALSE;
- }
-
- /* Mark element for removal */
- return TRUE;
+ /* Mark the element for removal if the insertion was successful */
+ return indexer_update_word (index, word, array);
}
guint
Modified: branches/indexer-split/tests/tracker-indexer/tracker-index-test.c
==============================================================================
--- branches/indexer-split/tests/tracker-indexer/tracker-index-test.c (original)
+++ branches/indexer-split/tests/tracker-indexer/tracker-index-test.c Wed Jul 2 17:25:47 2008
@@ -6,14 +6,26 @@
#include <qdbm/depot.h>
+#define BUCKET_COUNT 100
+
+
-// Private struct. Used here to test results
+/* Private code from the module. Used here to check results */
typedef struct {
guint32 id; /* id the hit belongs to (tracker-index.c stores service_id here) */
guint32 amalgamated; /* packed value: bits 24-31 service type, bits 8-23 score */
} TrackerIndexElement;
-#define BUCKET_COUNT 100
+/*
+ * Test-side copy of the module's score decoder: returns the 16-bit score
+ * stored in bits 8-23 of element->amalgamated (bits 16-23 are the high
+ * byte, bits 8-15 the low byte).
+ */
+gint16
+helper_get_score (TrackerIndexElement *element)
+{
+	unsigned char a[2];
+
+	a[0] = (element->amalgamated >> 16) & 0xFF;
+	a[1] = (element->amalgamated >> 8) & 0xFF;
+
+	/* A cast binds tighter than '|': parenthesise the whole expression so
+	 * the cast applies to the combined value, not just the high byte. */
+	return (gint16) ((a[0] << 8) | a[1]);
+}
/* Helper functions to read the index */
@@ -47,6 +59,29 @@
return result / sizeof (TrackerIndexElement);
}
+/*
+ * Opens the index file read-only and returns the score stored for a word.
+ *
+ * The word is expected to have exactly one hit. Returns -1, after logging a
+ * critical message, when the index cannot be opened, the word is not found
+ * or the number of hits is not 1.
+ */
+gint
+get_score_for_word (const gchar *index_file, const gchar *word)
+{
+	DEPOT *index;
+	TrackerIndexElement *results;
+	gint tsiz = 0;
+	gint score;
+
+	index = dpopen (index_file, DP_OREADER, BUCKET_COUNT);
+	g_return_val_if_fail (index, -1);
+
+	results = (TrackerIndexElement *) dpget (index, word, -1, 0, -1, &tsiz);
+
+	dpclose (index);
+
+	/* Check the pointer before the size: tsiz is not guaranteed to be set
+	 * when dpget fails. */
+	g_return_val_if_fail (results, -1);
+
+	if ((tsiz / sizeof (TrackerIndexElement)) != 1) {
+		g_critical ("Expected exactly one hit for word '%s'", word);
+		/* Release the buffer on the error path too */
+		g_free (results);
+		return -1;
+	}
+
+	score = helper_get_score (&results[0]);
+
+	g_free (results);
+	return score;
+}
+
void
debug_print_index (const gchar *index_file) {
@@ -75,7 +110,11 @@
}
g_print ("\n");
}
+ g_free (results);
+ g_free (iter);
}
+
+ dpclose (index);
}
/* Actual tests */
@@ -138,19 +177,89 @@
index = tracker_index_new (indexname, BUCKET_COUNT);
for ( i = 0; i < 20; i++) {
- tracker_index_add_word (index, "word", i, 1, 1);
+ tracker_index_add_word (index, "test-word", i, 1, 1);
+ }
+
+ tracker_index_flush (index);
+ tracker_index_free (index);
+
+ g_assert_cmpint (get_number_words_in_index (indexname), ==, 1);
+ g_assert_cmpint (get_results_for_word (indexname, "test-word"), ==, 20);
+
+ g_remove (indexname);
+}
+
+static void
+test_add_word_multiple_occurrences () /* same word added 20 times with identical arguments */
+{
+ TrackerIndex *index;
+ gint i;
+ const gchar *indexname = "test-word-multiple-ocurrences.index";
+
+ g_remove (indexname);
+ index = tracker_index_new (indexname, BUCKET_COUNT);
+
+ for ( i = 0; i < 20; i++) {
+ tracker_index_add_word (index, "test-word", 1, 1, 1); /* unlike the n_times test, the third argument does not vary */
}
tracker_index_flush (index);
tracker_index_free (index);
g_assert_cmpint (get_number_words_in_index (indexname), ==, 1);
- g_assert_cmpint (get_results_for_word (indexname, "word"), ==, 20);
+
+ /* There must be only ONE result, and its score must be 20 (one per call above) */
+ g_assert_cmpint (get_results_for_word (indexname, "test-word"), ==, 1);
+ g_assert_cmpint (get_score_for_word (indexname, "test-word"), ==, 20);
g_remove (indexname);
+
}
+/*
+ * Indexes two short texts word by word, flushing the index after each text,
+ * then checks the number of distinct words, the number of hits for a shared
+ * and a non-shared word, and the score of a word repeated inside one text.
+ * The first text is added with 1 as third argument, the second with 2.
+ */
+static void
+test_add_with_flushs ()
+{
+	const gchar *indexname = "test-add-with-flush.index";
+	const gchar *texts[] = {
+		"this is a text to try a kind of real use case of the indexer",
+		"this is another text with some common words",
+		NULL
+	};
+	TrackerIndex *index;
+	gchar **pieces;
+	gchar **piece;
+	gint n;
+
+	g_remove (indexname);
+	index = tracker_index_new (indexname, BUCKET_COUNT);
+
+	/* One pass per text: split on spaces, add every word, then flush */
+	for (n = 0; texts[n] != NULL; n++) {
+		pieces = g_strsplit (texts[n], " ", -1);
+
+		for (piece = pieces; *piece != NULL; piece++) {
+			tracker_index_add_word (index, *piece, n + 1, 1, 1);
+		}
+
+		g_strfreev (pieces);
+		tracker_index_flush (index);
+	}
+
+	tracker_index_free (index);
+
+	g_assert_cmpint (get_number_words_in_index (indexname), ==, 18);
+	g_assert_cmpint (get_results_for_word (indexname, "this"), ==, 2);
+	g_assert_cmpint (get_results_for_word (indexname, "common"), ==, 1);
+	g_assert_cmpint (get_score_for_word (indexname, "a"), ==, 2);
+	g_remove (indexname);
+}
+
+
+
int
main (int argc, char **argv) {
@@ -168,6 +277,11 @@
g_test_add_func ("/tracker/tracker-indexer/tracker-index/add_word_n_times",
test_add_word_n_times);
+ g_test_add_func ("/tracker/tracker-indexer/tracker-index/add_word_multiple_occurrences",
+ test_add_word_multiple_occurrences);
+
+ g_test_add_func ("/tracker/tracker-indexer/tracker-index/add_with_flush",
+ test_add_with_flushs);
result = g_test_run ();
return result;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]