tracker r1840 - in branches/indexer-split: . src/tracker-indexer tests/tracker-indexer



Author: ifrade
Date: Wed Jul  2 17:25:47 2008
New Revision: 1840
URL: http://svn.gnome.org/viewvc/tracker?rev=1840&view=rev

Log:
Implemented word additions in the index. Included unit tests

Modified:
   branches/indexer-split/ChangeLog
   branches/indexer-split/src/tracker-indexer/tracker-index.c
   branches/indexer-split/tests/tracker-indexer/tracker-index-test.c

Modified: branches/indexer-split/src/tracker-indexer/tracker-index.c
==============================================================================
--- branches/indexer-split/src/tracker-indexer/tracker-index.c	(original)
+++ branches/indexer-split/src/tracker-indexer/tracker-index.c	Wed Jul  2 17:25:47 2008
@@ -42,8 +42,28 @@
 			      * metadata */
 };
 
+/* These functions will also be used in the search code! */
+static inline gint16
+index_get_score (TrackerIndexElement *element)
+{
+	unsigned char a[2];
+
+	a[0] = (element->amalgamated >> 16) & 0xFF;
+	a[1] = (element->amalgamated >> 8) & 0xFF;
+
+	return (gint16) (a[0] << 8) | (a[1]);	
+}
+
+
+static inline guint8
+index_get_service_type (TrackerIndexElement *element)
+{
+	return (element->amalgamated >> 24) & 0xFF;
+}
+
+
 static guint32
-tracker_index_calc_amalgamated (gint service,
+index_calc_amalgamated (gint service,
 				gint weight)
 {
 	unsigned char a[4];
@@ -64,12 +84,14 @@
 	return (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3];
 }
 
+
 static void
 free_cache_values (GArray *array)
 {
 	g_array_free (array, TRUE);
 }
 
+
 TrackerIndex *
 tracker_index_new (const gchar *file,
 		   gint         bucket_count)
@@ -87,6 +109,7 @@
 	return index;
 }
 
+
 void
 tracker_index_free (TrackerIndex *index)
 {
@@ -99,6 +122,7 @@
 	g_free (index);
 }
 
+
 void
 tracker_index_add_word (TrackerIndex *index,
 			const gchar  *word,
@@ -108,21 +132,156 @@
 {
 	TrackerIndexElement elem;
 	GArray *array;
+	guint    i, new_score;
+	TrackerIndexElement *current;
 
 	elem.id = service_id;
-	elem.amalgamated = tracker_index_calc_amalgamated (service_type, weight);
+	elem.amalgamated = index_calc_amalgamated (service_type, weight);
 
 	array = g_hash_table_lookup (index->cache, word);
 
 	if (!array) {
-		/* create the array if it didn't exist */
+		/* create the array if it didn't exist (first time we find the word) */
 		array = g_array_new (FALSE, TRUE, sizeof (TrackerIndexElement));
 		g_hash_table_insert (index->cache, g_strdup (word), array);
+		g_array_append_val (array, elem);
+		return;
+	} 
+
+	/* It is not the first time we find the word */
+	for (i = 0; i < array->len; i++) {
+
+		current = &g_array_index (array, TrackerIndexElement, i);
+
+		if (current->id == service_id) {
+			/* The word was already found in the same service_id (file), increase score */
+			new_score = index_get_score (current) + weight;
+			current->amalgamated = index_calc_amalgamated (index_get_service_type (current), 
+								       new_score);
+			return;
+		}
 	}
 
+	/* First time in the file */
 	g_array_append_val (array, elem);
 }
 
+
+/* use for deletes or updates of multiple entities when they are not new */
+static gboolean
+indexer_update_word (DEPOT        *index, 
+		     const gchar  *word, 
+		     GArray       *new_hits)
+{	
+	gint  tsiz, i, score;
+	guint j;
+	gint k;
+					
+	TrackerIndexElement *new_hit, *previous_hits;
+	gboolean write_back = FALSE, edited = FALSE;
+	gint old_hit_count = 0;
+	GArray *pending_hits = NULL;
+	gboolean result;
+
+	g_return_val_if_fail (index, FALSE);
+	g_return_val_if_fail (word, FALSE);
+	g_return_val_if_fail (new_hits, FALSE);
+
+	previous_hits = (TrackerIndexElement *)dpget (index, word, -1, 0, MAX_HIT_BUFFER, &tsiz);
+	
+	/* New word in the index */
+	if (previous_hits == NULL) {
+
+		result = dpput (index, 
+				word, -1, 
+				(char *) new_hits->data, (new_hits->len * sizeof (TrackerIndexElement)), 
+				DP_DCAT);
+
+		if (!result) {
+			g_warning ("Could not store word: %s", word);
+			return FALSE;
+		}
+
+		return TRUE;
+	}
+
+	/* Word already exists */
+	old_hit_count = tsiz / sizeof (TrackerIndexElement);
+
+	for (j = 0; j < new_hits->len; j++) {
+
+		new_hit = &g_array_index (new_hits, TrackerIndexElement, j); 
+
+		edited = FALSE;
+
+		for (i = 0; i < old_hit_count; i++) {
+
+			if (previous_hits[i].id == new_hit->id) {
+
+				write_back = TRUE;
+				
+				/* NB: the score carried by the new hit (parameter) can be negative */
+				score = index_get_score (&previous_hits[i]) + index_get_score (new_hit);
+				/* g_print ("current score for %s is %d and new is %d and final is %d\n", 
+				   word, index_get_score (&previous_hits[i]), index_get_score (new_hit), score);  */
+				
+				
+				/* check for deletion */		
+				if (score < 1) {
+					
+					/* g_print ("Deleting word hit %s\n", word); */
+					
+					/* shift all subsequent records in array down one place */
+					for (k = i + 1; k < old_hit_count; k++) {
+						previous_hits[k - 1] = previous_hits[k];
+					}
+					
+					old_hit_count--;
+					
+				} else {
+					previous_hits[i].amalgamated = index_calc_amalgamated (index_get_service_type (&previous_hits[i]), score);
+				}
+				
+				edited = TRUE;
+				break;
+			}
+		}
+		
+		/* add hits that could not be updated directly here so they can be appended later */
+		if (!edited) {
+
+			if (!pending_hits) {
+				pending_hits = g_array_new (FALSE, TRUE, sizeof (TrackerIndexElement));
+			}
+
+			g_array_append_val (pending_hits, *new_hit);
+			g_debug ("could not update word hit %s - appending", word);
+		}
+	}
+	
+	/* write back if we have modified anything */
+	if (write_back) {
+		dpput (index, 
+		       word, -1, 
+		       (char *) previous_hits, (old_hit_count * sizeof (TrackerIndexElement)), 
+		       DP_DOVER);
+	}
+	
+	/* Append new occurrences */
+	if (pending_hits) {
+		dpput (index, 
+		       word, -1, 
+		       (char *) pending_hits->data, (pending_hits->len * sizeof (TrackerIndexElement)), 
+		       DP_DCAT);
+		g_array_free (pending_hits, TRUE);
+	}
+
+	g_free (previous_hits);
+	
+	return TRUE;
+}
+
+
 static gboolean
 cache_flush_foreach (gpointer key,
 		     gpointer value,
@@ -131,28 +290,13 @@
 	GArray *array;
 	DEPOT  *index;
 	gchar  *word;
-#if 0
-	gchar *tmp;
-	gint   table_size;
-#endif
 
 	word = (gchar *) key;
 	array = (GArray *) value;
 	index = (DEPOT *) user_data;
 
-#if 0
-	if ((tmp = dpget (index, word, -1, 0, MAX_HIT_BUFFER, &table_size)) != NULL) {
-		/* FIXME: missing merge with previous values */
-	}
-#endif
-
-	if (!dpput (index, word, -1, (char *) array->data, (array->len * sizeof (TrackerIndexElement)), DP_DCAT)) {
-		g_warning ("Could not store word: %s", word);
-		return FALSE;
-	}
-
-	/* Mark element for removal */
-	return TRUE;
+	/* Mark element for removal if the insertion was successful */
+	return indexer_update_word (index, word, array);
 }
 
 guint

Modified: branches/indexer-split/tests/tracker-indexer/tracker-index-test.c
==============================================================================
--- branches/indexer-split/tests/tracker-indexer/tracker-index-test.c	(original)
+++ branches/indexer-split/tests/tracker-indexer/tracker-index-test.c	Wed Jul  2 17:25:47 2008
@@ -6,14 +6,26 @@
 
 #include <qdbm/depot.h>
 
+#define BUCKET_COUNT 100
+
+
 
-// Private struct. Used here to test results
+/* Private code from the module. Used here to check results */
 typedef struct {
 	guint32 id;
 	guint32 amalgamated; 
 } TrackerIndexElement;
 
-#define BUCKET_COUNT 100
+gint16
+helper_get_score (TrackerIndexElement *element) 
+{
+	unsigned char a[2];
+
+	a[0] = (element->amalgamated >> 16) & 0xFF;
+	a[1] = (element->amalgamated >> 8) & 0xFF;
+
+	return (gint16) (a[0] << 8) | (a[1]);	
+}
 
 
 /* Helper functions to read the index */
@@ -47,6 +59,29 @@
         return result / sizeof (TrackerIndexElement);
 }
 
+gint
+get_score_for_word (const gchar *index_file, const gchar *word)
+{
+        DEPOT *index;
+        gint tsiz;
+        TrackerIndexElement *results;
+        gint score;
+
+        index = dpopen (index_file, DP_OREADER, BUCKET_COUNT);
+
+        results = (TrackerIndexElement *)dpget (index, word, -1, 0, -1, &tsiz);
+
+        dpclose (index);
+
+        g_return_val_if_fail ((tsiz / sizeof (TrackerIndexElement)) == 1, -1);
+        g_return_val_if_fail (results, -1);
+
+        score = helper_get_score (&results[0]);
+
+        g_free (results);
+        return score;
+}
+
 void
 debug_print_index (const gchar *index_file) {
 
@@ -75,7 +110,11 @@
                         }
                         g_print ("\n");
                 }
+                g_free (results);
+                g_free (iter);
         }
+
+        dpclose (index);
 }
 
 /* Actual tests */
@@ -138,19 +177,89 @@
         index = tracker_index_new (indexname, BUCKET_COUNT);
         
         for ( i = 0; i < 20; i++) {
-                tracker_index_add_word (index, "word", i, 1, 1);
+                tracker_index_add_word (index, "test-word", i, 1, 1);
+        }
+
+        tracker_index_flush (index);
+        tracker_index_free (index);
+
+        g_assert_cmpint (get_number_words_in_index (indexname), ==, 1);
+        g_assert_cmpint (get_results_for_word (indexname, "test-word"), ==, 20);
+
+        g_remove (indexname);
+}
+
+static void
+test_add_word_multiple_occurrences ()
+{
+        TrackerIndex *index;
+        gint i;
+        const gchar *indexname = "test-word-multiple-ocurrences.index";
+
+        g_remove (indexname);
+        index = tracker_index_new (indexname, BUCKET_COUNT);
+        
+        for ( i = 0; i < 20; i++) {
+                tracker_index_add_word (index, "test-word", 1, 1, 1);
         }
 
         tracker_index_flush (index);
         tracker_index_free (index);
 
         g_assert_cmpint (get_number_words_in_index (indexname), ==, 1);
-        g_assert_cmpint (get_results_for_word (indexname, "word"), ==, 20);
+
+        // There must be only ONE result with a high score
+        g_assert_cmpint (get_results_for_word (indexname, "test-word"), ==, 1);
+        g_assert_cmpint (get_score_for_word (indexname, "test-word"), ==, 20);
 
         g_remove (indexname);
+        
 }
 
 
+static void
+test_add_with_flushs () 
+{
+
+        TrackerIndex *index;
+        const gchar *indexname = "test-add-with-flush.index";
+        gchar **pieces;
+        gint i;
+
+        const gchar *text1 = "this is a text to try a kind of real use case of the indexer";
+        const gchar *text2 = "this is another text with some common words";
+        
+        g_remove (indexname);
+        index = tracker_index_new (indexname, BUCKET_COUNT);
+
+        /* Text 1 */
+        pieces = g_strsplit (text1, " ", -1);
+        for (i = 0; pieces[i] != NULL; i++) {
+                tracker_index_add_word (index, pieces[i], 1, 1, 1);
+        }
+        g_strfreev (pieces);
+        tracker_index_flush (index);
+
+        /* Text 2 */
+        pieces = g_strsplit (text2, " ", -1);
+        for (i = 0; pieces[i] != NULL; i++) {
+                tracker_index_add_word (index, pieces[i], 2, 1, 1);
+        }
+        g_strfreev (pieces);
+        tracker_index_flush (index);
+
+        tracker_index_free (index);
+
+        g_assert_cmpint (get_number_words_in_index (indexname), ==, 18);
+        g_assert_cmpint (get_results_for_word (indexname, "this"), ==, 2);
+        g_assert_cmpint (get_results_for_word (indexname, "common"), ==, 1);
+        g_assert_cmpint (get_score_for_word (indexname, "a"), ==, 2);
+        g_remove (indexname);
+
+}
+
+
+
 int
 main (int argc, char **argv) {
 
@@ -168,6 +277,11 @@
         g_test_add_func ("/tracker/tracker-indexer/tracker-index/add_word_n_times",
                          test_add_word_n_times);
 
+        g_test_add_func ("/tracker/tracker-indexer/tracker-index/add_word_multiple_occurrences",
+                         test_add_word_multiple_occurrences);
+
+        g_test_add_func ("/tracker/tracker-indexer/tracker-index/add_with_flush",
+                         test_add_with_flushs);
         result = g_test_run ();
         
         return result;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]