tracker r1257 - in trunk: . src/trackerd



Author: ifrade
Date: Tue Apr  8 20:45:03 2008
New Revision: 1257
URL: http://svn.gnome.org/viewvc/tracker?rev=1257&view=rev

Log:
Patch Bug #525403. Isolating indexer

Modified:
   trunk/ChangeLog
   trunk/src/trackerd/tracker-cache.c
   trunk/src/trackerd/tracker-dbus-search.c
   trunk/src/trackerd/tracker-indexer.c
   trunk/src/trackerd/tracker-indexer.h
   trunk/src/trackerd/tracker-utils.h
   trunk/src/trackerd/trackerd.c

Modified: trunk/src/trackerd/tracker-cache.c
==============================================================================
--- trunk/src/trackerd/tracker-cache.c	(original)
+++ trunk/src/trackerd/tracker-cache.c	Tue Apr  8 20:45:03 2008
@@ -65,7 +65,7 @@
 		break;
 	}
 
-	indexer = tracker_indexer_open (temp_file_name);
+	indexer = tracker_indexer_open (temp_file_name, FALSE);
 
 	g_free (temp_file_name);
 
@@ -162,7 +162,7 @@
 		/* determine is index has been written to significantly before and create new ones if so */
 		if (tracker_indexer_size (tracker->file_index) > 4000000) {
 			index_con.file_index = create_merge_index ("file-index.tmp.");
-			tracker_log ("flushing to %s", dpname (index_con.file_index->word_index));
+			tracker_log ("flushing to %s", tracker_indexer_get_name (index_con.file_index));
 			using_file_tmp = TRUE;
 		} else {
 			index_con.file_index = tracker->file_index;
@@ -170,7 +170,7 @@
 		
 		if (tracker_indexer_size (tracker->email_index) > 4000000) {
 			index_con.email_index = create_merge_index ("email-index.tmp.");
-			tracker_log ("flushing to %s", dpname (index_con.email_index->word_index));
+			tracker_log ("flushing to %s", tracker_indexer_get_name (index_con.email_index));
 			using_email_tmp = TRUE;
 		} else {
 			index_con.email_index = tracker->email_index;

Modified: trunk/src/trackerd/tracker-dbus-search.c
==============================================================================
--- trunk/src/trackerd/tracker-dbus-search.c	(original)
+++ trunk/src/trackerd/tracker-dbus-search.c	Tue Apr  8 20:45:03 2008
@@ -858,97 +858,6 @@
 }
 
 
-/* int levenshtein ()
- * Original license: GNU Lesser Public License
- * from the Dixit project, (http://dixit.sourceforge.net/)
- * Author: Octavian Procopiuc <oprocopiuc gmail com>
- * Created: July 25, 2004
- * Copied into tracker, by Edward Duffy
- */
-
-static int
-levenshtein(char *source, char *target, int maxdist)
-{
-	char n, m;
-	int l;
-	l = strlen (source);
-	if (l > 50)
-		return -1;
-	n = l;
-
-	l = strlen (target);
-	if (l > 50)
-		return -1;
-	m = l;
-
-	if (maxdist == 0)
-		maxdist = MAX(m, n);
-	if (n == 0)
-		return MIN(m, maxdist);
-	if (m == 0)
-		return MIN(n, maxdist);
-
-	// Store the min. value on each column, so that, if it reaches
-	// maxdist, we break early.
-	char mincolval;
-
-	char matrix[51][51];
-
-	char j;
-	char i;
-	char cell;
-
-	for (j = 0; j <= m; j++)
-		matrix[0][(int)j] = j;
-
-	for (i = 1; i <= n; i++) {
-
-		mincolval = MAX(m, i);
-		matrix[(int)i][0] = i;
-
-		char s_i = source[i-1];
-
-		for (j = 1; j <= m; j++) {
-
-			char t_j = target[j-1];
-
-			char cost = (s_i == t_j ? 0 : 1);
-
-			char above = matrix[i-1][(int)j];
-			char left = matrix[(int)i][j-1];
-			char diag = matrix[i-1][j-1];
-			cell = MIN(above + 1, MIN(left + 1, diag + cost));
-
-			// Cover transposition, in addition to deletion,
-			// insertion and substitution. This step is taken from:
-			// Berghel, Hal ; Roach, David : "An Extension of Ukkonen's 
-			// Enhanced Dynamic Programming ASM Algorithm"
-			// (http://www.acm.org/~hlb/publications/asm/asm.html)
-
-			if (i > 2 && j > 2) {
-				char trans = matrix[i-2][j-2] + 1;
-				if (source[i-2] != t_j)
-					trans++;
-				if (s_i != target[j-2])
-					trans++;
-				if (cell > trans)
-					cell = trans;
-			}
-
-			mincolval = MIN(mincolval, cell);
-			matrix[(int)i][(int)j] = cell;
-		}
-
-		if (mincolval >= maxdist)
-			break;
-
-	}
-
-	if (i == n + 1)
-		return (int) matrix[(int)n][(int)m];
-	else
-		return maxdist;
-}
 
 
 void
@@ -956,14 +865,9 @@
 {
 	DBusError	dbus_error;
 	DBusMessage 	*reply;
-	gchar		*term, *str;
+	gchar		*term;
 	gint		maxdist;
-	gint		dist, tsiz;
 	gchar		*winner_str;
-	gint		winner_dist;
-	char		*tmp;
-	int		hits;
-	GTimeVal	start, current;
 
 	/*
 		<method name="Suggest">
@@ -983,60 +887,14 @@
 		return;
 	}
 
-	winner_str = NULL;
-        winner_dist = -1;  /* to initialize winner_dist with something */
-
-
 	Indexer *index = tracker->file_index;
 
-	dpiterinit (index->word_index);
-
-	g_get_current_time (&start);
+        winner_str = tracker_indexer_get_suggestion (index, term, maxdist);
 
-	str = dpiternext (index->word_index, NULL);
-	while (str != NULL) {
-		dist = levenshtein (term, str, 0);
-		if (dist != -1 && dist < maxdist) {
-			hits = 0;
-			if ((tmp = dpget (index->word_index, str, -1, 0, -1, &tsiz)) != NULL) {
-				hits = tsiz / sizeof (WordDetails);
-				free (tmp);
-				if (tsiz % sizeof (WordDetails) != 0) {
-					tracker_set_error (rec, "Possible data error from dpget  Aborting tracker_dbus_method_search_suggest.");
-					g_free (str);
-					if (winner_str) {
-						g_free (winner_str);
-					}
-					return;
-				}
-			}
-			if (hits > 0) {
-				if (winner_str == NULL) {
-					winner_str = strdup (str);
-					winner_dist = dist;
-				}
-				else if (dist < winner_dist) {
-					free (winner_str);
-					winner_str = strdup (str);
-					winner_dist = dist;
-				}
-			}
-			else {
-				tracker_log ("No hits for %s!", str);
-			}
-		}
-		free (str);
-		g_get_current_time (&current);
-		if (current.tv_sec - start.tv_sec >= 2) { /* 2 second time out */
-			tracker_log ("Timeout in tracker_dbus_method_search_suggest");
-			break;
-		}
-		str = dpiternext (index->word_index, NULL);
-	}
-
-	if (winner_str == NULL) {
-		winner_str = strdup (term);
-	}
+        if (!winner_str) {
+                tracker_set_error (rec, "Possible data error in index. Aborting tracker_dbus_method_search_suggest.");
+                return;
+        }
 
 	tracker_log ("Suggested spelling for %s is %s.", term, winner_str);
 

Modified: trunk/src/trackerd/tracker-indexer.c
==============================================================================
--- trunk/src/trackerd/tracker-indexer.c	(original)
+++ trunk/src/trackerd/tracker-indexer.c	Tue Apr  8 20:45:03 2008
@@ -41,7 +41,8 @@
 #include <unistd.h>
 #include <string.h>
 #include <math.h>
-#include <sqlite3.h>
+
+#include <depot.h>
 
 #include <glib.h>
 #include <glib/gstdio.h>
@@ -56,6 +57,16 @@
 
 extern Tracker *tracker;
 
+struct Indexer_ {
+	DEPOT  		*word_index;	/* file hashtable handle for the word -> {serviceID, ServiceTypeID, Score}  */
+	GMutex 		*word_mutex;
+	char   		*name;
+	gpointer  	emails; /* pointer to email indexer */
+	gpointer  	data; /* pointer to file indexer */
+	gboolean	main_index;
+	gboolean	needs_merge; /* should new stuff be added directly or merged later on from a new index */
+};
+
 static inline gint16
 get_score (WordDetails *details)
 {
@@ -257,7 +268,7 @@
 }
 
 Indexer *
-tracker_indexer_open (const gchar *name)
+tracker_indexer_open (const gchar *name, gboolean main_index)
 {
 	char *word_dir;
 	DEPOT *word_index;
@@ -273,7 +284,7 @@
 
 	result = g_new0 (Indexer, 1);
 
-	result->main_index = FALSE;
+	result->main_index = main_index;
 	
 	result->needs_merge = FALSE;
 
@@ -351,6 +362,14 @@
 	
 }
 
+const gchar *   
+tracker_indexer_get_name (Indexer *indexer) 
+{
+        g_return_val_if_fail (indexer != NULL, NULL);
+        
+        return dpname (indexer->word_index);
+}
+
 
 guint32
 tracker_indexer_size (Indexer *indexer)
@@ -499,7 +518,7 @@
 	tracker_indexer_free (src, TRUE);
 
 	if (update) {
-		tracker->file_update_index = tracker_indexer_open ("file-update-index.db");
+		tracker->file_update_index = tracker_indexer_open ("file-update-index.db", FALSE);
 	}
 	
 	tracker->in_merge = FALSE;
@@ -663,7 +682,7 @@
 
 					if (g_file_test (file->data, G_FILE_TEST_EXISTS)) {
 
-                                                Indexer *tmp_index = tracker_indexer_open (name);
+                                                Indexer *tmp_index = tracker_indexer_open (name, FALSE);
 						if (tmp_index) {
 							index_list = g_slist_prepend (index_list, tmp_index);
 						}
@@ -712,9 +731,9 @@
 	tracker_dbus_send_index_status_change_signal ();
 
 	if (type == INDEX_TYPE_FILES) {
-		final_index = tracker_indexer_open ("file-index-final");
+		final_index = tracker_indexer_open ("file-index-final", TRUE);
 	} else {
-		final_index = tracker_indexer_open ("email-index-final");
+		final_index = tracker_indexer_open ("email-index-final", TRUE);
 	}
 
 	if (!final_index) {
@@ -1770,3 +1789,173 @@
 		return query->hit_count;
 	}
 }
+
+/* int levenshtein ()
+ * Original license: GNU Lesser Public License
+ * from the Dixit project, (http://dixit.sourceforge.net/)
+ * Author: Octavian Procopiuc <oprocopiuc gmail com>
+ * Created: July 25, 2004
+ * Copied into tracker, by Edward Duffy
+ */
+
+static int
+levenshtein(const char *source, char *target, int maxdist)
+{
+	char n, m;
+	int l;
+	l = strlen (source);
+	if (l > 50)
+		return -1;
+	n = l;
+
+	l = strlen (target);
+	if (l > 50)
+		return -1;
+	m = l;
+
+	if (maxdist == 0)
+		maxdist = MAX(m, n);
+	if (n == 0)
+		return MIN(m, maxdist);
+	if (m == 0)
+		return MIN(n, maxdist);
+
+	// Store the min. value on each column, so that, if it reaches
+	// maxdist, we break early.
+	char mincolval;
+
+	char matrix[51][51];
+
+	char j;
+	char i;
+	char cell;
+
+	for (j = 0; j <= m; j++)
+		matrix[0][(int)j] = j;
+
+	for (i = 1; i <= n; i++) {
+
+		mincolval = MAX(m, i);
+		matrix[(int)i][0] = i;
+
+		char s_i = source[i-1];
+
+		for (j = 1; j <= m; j++) {
+
+			char t_j = target[j-1];
+
+			char cost = (s_i == t_j ? 0 : 1);
+
+			char above = matrix[i-1][(int)j];
+			char left = matrix[(int)i][j-1];
+			char diag = matrix[i-1][j-1];
+			cell = MIN(above + 1, MIN(left + 1, diag + cost));
+
+			// Cover transposition, in addition to deletion,
+			// insertion and substitution. This step is taken from:
+			// Berghel, Hal ; Roach, David : "An Extension of Ukkonen's 
+			// Enhanced Dynamic Programming ASM Algorithm"
+			// (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+			if (i > 2 && j > 2) {
+				char trans = matrix[i-2][j-2] + 1;
+				if (source[i-2] != t_j)
+					trans++;
+				if (s_i != target[j-2])
+					trans++;
+				if (cell > trans)
+					cell = trans;
+			}
+
+			mincolval = MIN(mincolval, cell);
+			matrix[(int)i][(int)j] = cell;
+		}
+
+		if (mincolval >= maxdist)
+			break;
+
+	}
+
+	if (i == n + 1)
+		return (int) matrix[(int)n][(int)m];
+	else
+		return maxdist;
+}
+
+static int
+count_hits_for_word (Indexer *indexer, const gchar *str) {
+        
+        gint tsiz, hits = 0;
+
+        tsiz = count_hit_size_for_word (indexer, str);
+
+        if (tsiz == -1 || tsiz % sizeof (WordDetails) != 0) {
+                return -1;
+        }
+
+        hits = tsiz / sizeof (WordDetails);
+
+        return hits;
+}
+
+
+char *
+tracker_indexer_get_suggestion (Indexer *indexer, const gchar *term, gint maxdist)
+{
+
+	gchar		*str;
+	gint		dist; 
+	gchar		*winner_str;
+	gint		winner_dist;
+	gint		hits;
+	GTimeVal	start, current;
+
+	winner_str = g_strdup (term);
+        winner_dist = G_MAXINT;  /* Initialize to the worst case */
+
+        dpiterinit (indexer->word_index);
+
+	g_get_current_time (&start);
+
+	str = dpiternext (indexer->word_index, NULL);
+
+	while (str != NULL) {
+
+		dist = levenshtein (term, str, 0);
+
+		if (dist != -1 && dist < maxdist && dist < winner_dist) {
+
+                        hits = count_hits_for_word (indexer, str);
+
+                        if (hits < 0) {
+
+                                g_free (winner_str);
+                                g_free (str);
+                                return NULL;
+
+			} else if (hits > 0) {
+
+                                g_free (winner_str);
+                                winner_str = g_strdup (str);
+                                winner_dist = dist;
+
+                        } else {
+				tracker_log ("No hits for %s!", str);
+			}
+		}
+
+		g_free (str);
+
+		g_get_current_time (&current);
+
+		if (current.tv_sec - start.tv_sec >= 2) { /* 2 second time out */
+			tracker_log ("Timeout in tracker_dbus_method_search_suggest");
+                        break;
+		}
+
+		str = dpiternext (indexer->word_index, NULL);
+	}
+
+        return winner_str;
+}
+

Modified: trunk/src/trackerd/tracker-indexer.h
==============================================================================
--- trunk/src/trackerd/tracker-indexer.h	(original)
+++ trunk/src/trackerd/tracker-indexer.h	Tue Apr  8 20:45:03 2008
@@ -25,9 +25,12 @@
 
 #include <stdlib.h>
 #include <glib.h>
-#include <depot.h>
 
-#include "tracker-utils.h"
+typedef struct {                         /* type of structure for an element of search result */
+	guint32 	id;              /* Service ID number of the document */
+	int 		amalgamated;     /* amalgamation of service_type and score of the word in the document's metadata */
+} WordDetails;
+
 
 typedef struct {                         	 
 	guint32 	service_id;              /* Service ID of the document */
@@ -51,17 +54,7 @@
 } SearchWord;
 
 
-typedef struct {
-	DEPOT  		*word_index;	/* file hashtable handle for the word -> {serviceID, ServiceTypeID, Score}  */
-	GMutex 		*word_mutex;
-	char   		*name;
-	gpointer  	emails; /* pointer to email indexer */
-	gpointer  	data; /* pointer to file indexer */
-	gboolean	main_index;
-	gboolean	needs_merge; /* should new stuff be added directly or merged later on from a new index */
-} Indexer;
-
-
+typedef struct Indexer_ Indexer;
 
 typedef struct {                        
 	Indexer 	*indexer;
@@ -99,13 +92,13 @@
 guint32		tracker_indexer_calc_amalgamated 	(gint service, gint score);
 void		tracker_index_free_hit_list		(GSList *hit_list);
 
-Indexer * 	tracker_indexer_open 			(const gchar *name);
+Indexer * 	tracker_indexer_open 			(const gchar *name, gboolean main_index);
 void		tracker_indexer_close 			(Indexer *indexer);
 gboolean	tracker_indexer_repair 			(const char *name);
 void		tracker_indexer_free 			(Indexer *indexer, gboolean remove_file);
 gboolean	tracker_indexer_has_merge_index 	(Indexer *indexer, gboolean update);
 
-
+const gchar *   tracker_indexer_get_name                (Indexer *indexer);
 guint32		tracker_indexer_size 			(Indexer *indexer);
 gboolean	tracker_indexer_optimize		(Indexer *indexer);
 void		tracker_indexer_sync 			(Indexer *indexer);
@@ -131,5 +124,6 @@
 
 gboolean	tracker_remove_dud_hits 		(Indexer *indexer, const gchar *word, GSList *dud_list);
 
+char *          tracker_indexer_get_suggestion          (Indexer *indexer, const gchar *term, gint maxdist);
 
 #endif

Modified: trunk/src/trackerd/tracker-utils.h
==============================================================================
--- trunk/src/trackerd/tracker-utils.h	(original)
+++ trunk/src/trackerd/tracker-utils.h	Tue Apr  8 20:45:03 2008
@@ -56,26 +56,11 @@
 #define OPTIMIZATION_COUNT		10000
 #define MAX_WORDS_TO_INDEX		10000
 
-/* default indexer options */
-#define MIN_INDEX_BUCKET_COUNT		131072    /* minimum bucket number of word index per division (total buckets = INDEXBNUM * INDEXDIV) */
-#define INDEX_DIVISIONS	        	4        /* no. of divisions of file */
-#define MAX_INDEX_BUCKET_COUNT 		262144	 /* max no of buckets to use  */
-#define INDEX_BUCKET_RATIO		1	 /* desired ratio of unused buckets to have (range 0 to 4)*/
-#define INDEX_PADDING	 		2
-
-
-typedef struct {                         /* type of structure for an element of search result */
-	guint32 	id;              /* Service ID number of the document */
-	int 		amalgamated;     /* amalgamation of service_type and score of the word in the document's metadata */
-} WordDetails;
-
-
 typedef struct {                         
 	int	 	id;              /* word ID of the cached word */
 	int 		count;     	 /* cummulative count of the cached word */
 } CacheWord;
 
-
 typedef enum {
 	DATA_KEYWORD,	
 	DATA_INDEX,

Modified: trunk/src/trackerd/trackerd.c
==============================================================================
--- trunk/src/trackerd/trackerd.c	(original)
+++ trunk/src/trackerd/trackerd.c	Tue Apr  8 20:45:03 2008
@@ -1170,16 +1170,13 @@
 	
 	
 
-	Indexer *index = tracker_indexer_open ("file-index.db");
-	index->main_index = TRUE;
+	Indexer *index = tracker_indexer_open ("file-index.db", TRUE);
 	tracker->file_index = index;
 
-	index = tracker_indexer_open ("file-update-index.db");
-	index->main_index = FALSE;
+	index = tracker_indexer_open ("file-update-index.db", FALSE);
 	tracker->file_update_index = index;
 
-	index = tracker_indexer_open ("email-index.db");
-	index->main_index = TRUE;
+	index = tracker_indexer_open ("email-index.db", TRUE);
 	tracker->email_index = index;
 
 	db_con->word_index = tracker->file_index;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]