tracker r2200 - in branches/indexer-split: . src/tracker-indexer



Author: mr
Date: Mon Sep  8 15:46:57 2008
New Revision: 2200
URL: http://svn.gnome.org/viewvc/tracker?rev=2200&view=rev

Log:
	* src/tracker-indexer/tracker-indexer.c: Implemented a cache that
	we use to tell whether a directory's mtime has changed, and so
	whether we should go into it to reindex. We still check the mtime
	of each file in that directory, but we don't even try to index
	files in a directory whose mtime is the same.

	* src/tracker-indexer/tracker-module.h: Cleaned up header.


Modified:
   branches/indexer-split/ChangeLog
   branches/indexer-split/src/tracker-indexer/tracker-indexer.c
   branches/indexer-split/src/tracker-indexer/tracker-module.h

Modified: branches/indexer-split/src/tracker-indexer/tracker-indexer.c
==============================================================================
--- branches/indexer-split/src/tracker-indexer/tracker-indexer.c	(original)
+++ branches/indexer-split/src/tracker-indexer/tracker-indexer.c	Mon Sep  8 15:46:57 2008
@@ -51,6 +51,7 @@
 #include <sys/statvfs.h>
 
 #include <glib/gstdio.h>
+#include <gio/gio.h>
 #include <gmodule.h>
 
 #include <libtracker-common/tracker-config.h>
@@ -102,6 +103,8 @@
 	GQueue *file_queue;
 	GQueue *modules_queue;
 
+	GHashTable *mtime_cache;
+
 	GList *module_names;
 	gchar *current_module_name;
 	GHashTable *indexer_modules;
@@ -458,6 +461,8 @@
 	g_queue_foreach (priv->modules_queue, (GFunc) g_free, NULL);
 	g_queue_free (priv->modules_queue);
 
+	g_hash_table_unref (priv->mtime_cache);
+
 	g_queue_foreach (priv->dir_queue, (GFunc) path_info_free, NULL);
 	g_queue_free (priv->dir_queue);
 
@@ -763,6 +768,10 @@
 	priv->in_transaction = FALSE;
 	priv->dir_queue = g_queue_new ();
 	priv->file_queue = g_queue_new ();
+	priv->mtime_cache = g_hash_table_new_full (g_str_hash,
+						   g_str_equal,
+						   g_free,
+						   NULL);
 	priv->modules_queue = g_queue_new ();
 	priv->config = tracker_config_new ();
 
@@ -877,7 +886,7 @@
 	}
 
 	if (data->add) {
-		score = -1*tracker_field_get_weight (field);
+		score = -1 * tracker_field_get_weight (field);
 	} else {
 		score = tracker_field_get_weight (field);
 	}
@@ -1220,8 +1229,10 @@
 			g_free (text);
 		}
 	} else {
-		gchar           *old_text = NULL,  *new_text = NULL;
-		TrackerMetadata *old_metadata = NULL;
+		TrackerMetadata *old_metadata;
+		gchar           *old_text;
+		gchar           *new_text;
+
 		/* Update case */
 		g_debug ("Updating file '%s'", info->file->path);
 
@@ -1239,8 +1250,8 @@
 		new_text = tracker_indexer_module_file_get_text (info->module, info->file);
 		
 		if (old_text || new_text) {
-			GHashTable *old_words = NULL;
-			GHashTable *new_words = NULL;
+			GHashTable *old_words;
+			GHashTable *new_words;
 
 			/* Service has/had full text */
 			old_words = tracker_parser_text (NULL, 
@@ -1622,14 +1633,17 @@
 }
 
 static gboolean
-should_index_file (PathInfo    *info,
-		   const gchar *dirname,
-		   const gchar *basename)
+should_index_file (TrackerIndexer *indexer,
+		   PathInfo       *info,
+		   const gchar    *dirname,
+		   const gchar    *basename)
 {
 	TrackerService *service;
 	gchar *service_type;
+	const gchar *str;
+	gboolean is_dir;
+	gboolean should_be_cached;
 	struct stat st;
-	guint id;
 	time_t mtime;
 
 	service_type = tracker_indexer_module_file_get_service_type (info->module, info->file);
@@ -1645,15 +1659,152 @@
 		return TRUE;
 	}
 
-	if (!tracker_db_check_service (service, dirname, basename, &id, &mtime)) {
+	/* Check the file/directory exists in the database. If it
+	 * doesn't we definitely want to index it.
+	 */
+	if (!tracker_db_check_service (service, 
+				       dirname, 
+				       basename, 
+				       NULL, 
+				       &mtime)) {
 		return TRUE;
 	}
 
-	if (g_lstat ((const gchar *) info->file->path, &st) == -1) {
+	/* So, if we are here, then the file or directory DID exist
+	 * in the database already. Now we need to check if the
+	 * parent directory mtime matches the mtime we have for it in
+	 * the database. If it does, then we can ignore any files
+	 * immediately in this parent directory.
+	 */
+	if (g_lstat (info->file->path, &st) == -1) {
 		return TRUE;
 	}
 
-	return (st.st_mtime > mtime);
+	/* It is most efficient to keep a hash table of only those
+	 * paths whose mtime is out of sync with the database, since
+	 * there are likely to be fewer of those than directories
+	 * overall. If the dirname is in the hash table, we say it
+	 * needs reindexing. If not, we assume that it must be up to
+	 * date.
+	 *
+	 * This entry in the hash table is removed after each index is
+	 * complete. 
+	 *
+	 * Note: info->file->path = '/tmp/foo/bar'
+	 *       dirname          = '/tmp/foo'
+	 *       basename         = 'bar'
+	 *
+	 * Example A. PathInfo is file.
+	 *   1) Lookup 'dirname', if exists then
+	 *     --> return TRUE
+	 *   2) Check 'dirname' mtime is newer, if not then
+	 *     --> return FALSE
+	 *   3) Add to hash table
+	 *     --> return TRUE
+	 *
+	 * Example B. PathInfo is directory.
+	 *   1) Lookup 'info->file->path', if exists then
+	 *     --> return TRUE
+	 *   2) Check 'info->file->path' mtime is newer, if not then
+	 *     --> return FALSE
+	 *   3) Add to hash table
+	 *     --> return TRUE
+	 */ 
+	is_dir = S_ISDIR (st.st_mode);
+	should_be_cached = TRUE;
+
+	/* Choose the path we evaluate based on if we have a directory
+	 * or not. All operations are done using the same string.
+	 */
+	if (is_dir) {
+		str = info->file->path;
+	} else {
+		str = dirname;
+	}
+
+	/* Step 1. */
+	if (g_hash_table_lookup (indexer->private->mtime_cache, str)) {
+		gboolean should_index;
+
+		if (!is_dir) {
+			/* Only index files in this directory whose
+			 * mtime is newer than the one in the database.
+			 */
+			should_index = st.st_mtime > mtime;
+		} else {
+			/* We always index directories */
+			should_index = TRUE;
+		}
+
+		g_debug ("%s:'%s' exists in cache, %s",
+			 is_dir ? "Path" : "Parent path",
+ 			 str,
+			 should_index ? "should index" : "should not index");
+
+		return should_index;
+	}
+	
+	/* Step 2. */
+	if (!is_dir) {
+		gchar *parent_dirname;
+		gchar *parent_basename;
+		gboolean exists;
+
+		/* FIXME: What if there is no parent? */
+		parent_dirname = g_path_get_dirname (dirname);
+		parent_basename = g_path_get_basename (dirname);
+
+		/* We don't have the database mtime for the dirname
+		 * yet; we would only have it already if we were
+		 * evaluating info->file->path itself (a directory).
+		 */
+		exists = tracker_db_check_service (service, 
+						   parent_dirname, 
+						   parent_basename, 
+						   NULL, 
+						   &mtime);
+		if (!exists) {
+			g_critical ("Expected path '%s/%s' to exist, not in database?",
+				    parent_dirname,
+				    parent_basename);
+
+			g_free (parent_basename);
+			g_free (parent_dirname);
+
+			return TRUE;
+		}
+
+		if (g_lstat (dirname, &st) == -1) {
+			g_critical ("Expected path '%s' to exist, could not stat()",
+				    parent_dirname);
+
+			g_free (parent_basename);
+			g_free (parent_dirname);
+
+			return TRUE;
+		}
+
+		g_free (parent_basename);
+		g_free (parent_dirname);
+	}
+
+	if (st.st_mtime <= mtime) {
+		g_debug ("%s:'%s' has indifferent mtime and should not be indexed",
+			 is_dir ? "Path" : "Parent path",
+			 str);
+
+		return FALSE;
+	}
+
+	/* Step 3. */
+	g_debug ("%s:'%s' being added to cache and should be indexed",
+		 is_dir ? "Path" : "Parent path",
+		 str);
+
+	g_hash_table_replace (indexer->private->mtime_cache, 
+			      g_strdup (str),
+			      GINT_TO_POINTER (1));
+	
+	return TRUE;
 }
 
 static gboolean
@@ -1674,7 +1825,7 @@
 		return TRUE;
 	}
 
-	if (!should_index_file (info, dirname, basename)) {
+	if (!should_index_file (indexer, info, dirname, basename)) {
 		g_debug ("File is already up to date: '%s'", info->file->path);
 		g_free (dirname);
 		g_free (basename);

Modified: branches/indexer-split/src/tracker-indexer/tracker-module.h
==============================================================================
--- branches/indexer-split/src/tracker-indexer/tracker-module.h	(original)
+++ branches/indexer-split/src/tracker-indexer/tracker-module.h	Mon Sep  8 15:46:57 2008
@@ -1,4 +1,6 @@
-/* Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
  * Copyright (C) 2008, Nokia
 
  * This library is free software; you can redistribute it and/or
@@ -32,42 +34,37 @@
 	gpointer  data;
 };
 
-typedef void              (* TrackerModuleInit)               (void);
-typedef void              (* TrackerModuleShutdown)           (void);
+typedef void              (* TrackerModuleInit)                   (void);
+typedef void              (* TrackerModuleShutdown)               (void);
 
-typedef const gchar *     (* TrackerModuleGetNameFunc)        (void);
-typedef gchar **          (* TrackerModuleGetDirectoriesFunc) (void);
+typedef const gchar *     (* TrackerModuleGetNameFunc)            (void);
+typedef gchar **          (* TrackerModuleGetDirectoriesFunc)     (void);
 
-typedef gpointer          (* TrackerModuleFileGetDataFunc)  (const gchar *path);
-typedef void              (* TrackerModuleFileFreeDataFunc) (gpointer     data);
+typedef gpointer          (* TrackerModuleFileGetDataFunc)        (const gchar  *path);
+typedef void              (* TrackerModuleFileFreeDataFunc)       (gpointer      data);
 
 typedef gchar *           (* TrackerModuleFileGetServiceTypeFunc) (TrackerFile  *file);
 typedef void              (* TrackerModuleFileGetUriFunc)         (TrackerFile  *file,
                                                                    gchar       **dirname,
                                                                    gchar       **basename);
 
-typedef TrackerMetadata * (* TrackerModuleFileGetMetadataFunc) (TrackerFile *file);
-typedef gchar *           (* TrackerModuleFileGetText)         (TrackerFile *path);
-typedef gboolean          (* TrackerModuleFileIterContents)    (TrackerFile *path);
-
-
-void                   tracker_module_init                   (void);
-void                   tracker_module_shutdown               (void);
-
-G_CONST_RETURN gchar * tracker_module_get_name               (void);
-
-gpointer               tracker_module_file_get_data  (const gchar *path);
-void                   tracker_module_file_free_data (gpointer     file_data);
-
-gchar *                tracker_module_file_get_service_type  (TrackerFile *file);
-void                   tracker_module_file_get_uri   (TrackerFile  *file,
-                                                      gchar       **dirname,
-                                                      gchar       **basename);
-
-TrackerMetadata *      tracker_module_file_get_metadata  (TrackerFile *file);
-gchar *                tracker_module_file_get_text      (TrackerFile *file);
-gboolean               tracker_module_file_iter_contents (TrackerFile *file);
-
+typedef TrackerMetadata * (* TrackerModuleFileGetMetadataFunc)    (TrackerFile  *file);
+typedef gchar *           (* TrackerModuleFileGetText)            (TrackerFile  *path);
+typedef gboolean          (* TrackerModuleFileIterContents)       (TrackerFile  *path);
+
+
+void                  tracker_module_init                  (void);
+void                  tracker_module_shutdown              (void);
+G_CONST_RETURN gchar *tracker_module_get_name              (void);
+gpointer              tracker_module_file_get_data         (const gchar  *path);
+void                  tracker_module_file_free_data        (gpointer      file_data);
+gchar *               tracker_module_file_get_service_type (TrackerFile  *file);
+void                  tracker_module_file_get_uri          (TrackerFile  *file,
+                                                            gchar       **dirname,
+                                                            gchar       **basename);
+TrackerMetadata *     tracker_module_file_get_metadata     (TrackerFile  *file);
+gchar *               tracker_module_file_get_text         (TrackerFile  *file);
+gboolean              tracker_module_file_iter_contents    (TrackerFile  *file);
 
 G_END_DECLS
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]