[tracker] tracker-extract: store as many words as the FTS config says.



commit ee7c66892b3968400f622e2c31769b02e38d0b5f
Author: Carlos Garnacho <carlos lanedo com>
Date:   Thu Oct 22 17:31:32 2009 +0200

    tracker-extract: store as many words as the FTS config says.

 src/tracker-extract/Makefile.am                |    2 +
 src/tracker-extract/tracker-extract-msoffice.c |    6 +-
 src/tracker-extract/tracker-extract-oasis.c    |    6 +-
 src/tracker-extract/tracker-extract-pdf.c      |    7 +-
 src/tracker-extract/tracker-fts-config.c       |  430 ++++++++++++++++++++++++
 src/tracker-extract/tracker-fts-config.h       |   65 ++++
 src/tracker-extract/tracker-main.c             |   11 +
 src/tracker-extract/tracker-main.h             |    3 +
 8 files changed, 526 insertions(+), 4 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 0c41af7..c8fed93 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -253,6 +253,8 @@ tracker_extract_SOURCES = 						\
 	tracker-dbus.h							\
 	tracker-extract.c						\
 	tracker-extract.h						\
+	tracker-fts-config.c						\
+	tracker-fts-config.h						\
 	tracker-main.c							\
 	tracker-main.h
 
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 1b1b2c8..ebcb799 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -250,6 +250,8 @@ extract_msoffice (const gchar *uri,
 	GsfInput  *stream;
 	gchar     *filename, *content;
 	gboolean   rdf_type_added = FALSE;
+	TrackerFTSConfig *fts_config;
+	guint n_words;
 
 	gsf_init ();
 
@@ -327,7 +329,9 @@ extract_msoffice (const gchar *uri,
 		g_object_unref (stream);
 	}
 
-	content = extract_content (uri, 1000);
+	fts_config = tracker_main_get_fts_config ();
+	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	content = extract_content (uri, n_words);
 
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 51111a2..49fadcb 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -111,6 +111,8 @@ extract_oasis (const gchar *uri,
 	gchar	      *xml;
 	gchar *filename = g_filename_from_uri (uri, NULL, NULL);
 	gchar *content;
+	TrackerFTSConfig *fts_config;
+	guint n_words;
 	ODTParseInfo   info = {
 		metadata,
 		-1,
@@ -144,7 +146,9 @@ extract_oasis (const gchar *uri,
 		g_free (xml);
 	}
 
-	content = extract_content (filename, 1000);
+	fts_config = tracker_main_get_fts_config ();
+	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	content = extract_content (filename, n_words);
 
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index 8517b78..b6edfd6 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -134,6 +134,8 @@ extract_pdf (const gchar *uri,
 	gchar		*metadata_xml	= NULL;
 	GTime		 creation_date;
 	GError		*error		= NULL;
+	TrackerFTSConfig *fts_config;
+	guint             n_words;
 
 	g_type_init ();
 
@@ -347,8 +349,9 @@ extract_pdf (const gchar *uri,
 		}
 	}
 
-	/* FIXME: Fixed word limit at the moment */
-	content = extract_content (document, 1000);
+	fts_config = tracker_main_get_fts_config ();
+	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	content = extract_content (document, n_words);
 
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
diff --git a/src/tracker-extract/tracker-fts-config.c b/src/tracker-extract/tracker-fts-config.c
new file mode 100644
index 0000000..b452fbf
--- /dev/null
+++ b/src/tracker-extract/tracker-fts-config.c
@@ -0,0 +1,430 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia (urho konttori nokia com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include <libtracker-common/tracker-keyfile-object.h>
+
+#include "tracker-fts-config.h"
+
+#define TRACKER_FTS_CONFIG_GET_PRIVATE(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), TRACKER_TYPE_FTS_CONFIG, TrackerFTSConfigPrivate))
+
+/* GKeyFile defines */
+#define GROUP_INDEXING		   "Indexing"
+
+/* Default values */
+#define DEFAULT_MIN_WORD_LENGTH	   3	  /* 0->30 */
+#define DEFAULT_MAX_WORD_LENGTH	   30	  /* 0->200 */
+#define DEFAULT_MAX_WORDS_TO_INDEX 10000
+
+typedef struct {
+	/* Indexing */
+	gint min_word_length;	 /* backs the "min-word-length" property */
+	gint max_word_length;	 /* backs the "max-word-length" property */
+	gint max_words_to_index; /* backs the "max-words-to-index" property */
+}  TrackerFTSConfigPrivate;
+
+typedef struct {
+	GType  type;		/* value type; selects the load/save helper used */
+	const gchar *property;	/* GObject property name */
+	const gchar *group;	/* GKeyFile group holding the value */
+	const gchar *key;	/* GKeyFile key inside that group */
+} ObjectToKeyFile;
+
+static void config_set_property         (GObject       *object,
+					 guint          param_id,
+					 const GValue  *value,
+					 GParamSpec    *pspec);
+static void config_get_property         (GObject       *object,
+					 guint          param_id,
+					 GValue        *value,
+					 GParamSpec    *pspec);
+static void config_finalize             (GObject       *object);
+static void config_constructed          (GObject       *object);
+static void config_create_with_defaults (TrackerFTSConfig *config,
+					 GKeyFile      *key_file, 
+					 gboolean       overwrite);
+static void config_load                 (TrackerFTSConfig *config);
+
+enum {
+	PROP_0,
+
+	/* Indexing */
+	PROP_MIN_WORD_LENGTH,
+	PROP_MAX_WORD_LENGTH,
+
+	/* Per-file word cap; also stored under GROUP_INDEXING */
+	PROP_MAX_WORDS_TO_INDEX,
+};
+
+static const ObjectToKeyFile conversions[] = { /* property <-> key file map, read-only */
+	{ G_TYPE_INT,     "min-word-length",    GROUP_INDEXING, "MinWordLength"   },
+	{ G_TYPE_INT,     "max-word-length",    GROUP_INDEXING, "MaxWordLength"   },
+	{ G_TYPE_INT,     "max-words-to-index", GROUP_INDEXING, "MaxWordsToIndex" },
+};
+
+G_DEFINE_TYPE (TrackerFTSConfig, tracker_fts_config, TRACKER_TYPE_CONFIG_FILE);
+
+static void
+tracker_fts_config_class_init (TrackerFTSConfigClass *klass)
+{
+	GObjectClass *object_class = G_OBJECT_CLASS (klass);
+
+	object_class->set_property = config_set_property;
+	object_class->get_property = config_get_property;
+	object_class->finalize	   = config_finalize;
+	object_class->constructed  = config_constructed;
+
+	/* Indexing properties; the blurbs double as key file comments (see config_create_with_defaults) */
+	g_object_class_install_property (object_class,
+					 PROP_MIN_WORD_LENGTH,
+					 g_param_spec_int ("min-word-length",
+							   "Minimum word length",
+							   " Set the minimum length of words to index (0->30, default=3)",
+							   0,
+							   30,
+							   DEFAULT_MIN_WORD_LENGTH,
+							   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+	g_object_class_install_property (object_class,
+					 PROP_MAX_WORD_LENGTH,
+					 g_param_spec_int ("max-word-length",
+							   "Maximum word length",
+							   " Set the maximum length of words to index (0->200, default=30)",
+							   0,
+							   200, /* Is this a reasonable limit? */
+							   DEFAULT_MAX_WORD_LENGTH,
+							   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+	g_object_class_install_property (object_class,
+					 PROP_MAX_WORDS_TO_INDEX,
+					 g_param_spec_int ("max-words-to-index",
+							   "Maximum words to index",
+							   " Maximum unique words to index from a file's content (default=10000)",
+							   0,
+							   G_MAXINT,
+							   DEFAULT_MAX_WORDS_TO_INDEX,
+							   G_PARAM_READWRITE | G_PARAM_CONSTRUCT));
+
+	g_type_class_add_private (object_class, sizeof (TrackerFTSConfigPrivate));
+}
+
+static void
+tracker_fts_config_init (TrackerFTSConfig *object)
+{	/* Intentionally empty: no per-instance setup is done here. */
+}
+
+/* GObject::set_property implementation; forwards to the public setters. */
+static void
+config_set_property (GObject      *object,
+		     guint         param_id,
+		     const GValue *value,
+		     GParamSpec   *pspec)
+{
+	TrackerFTSConfig *config = TRACKER_FTS_CONFIG (object);
+
+	switch (param_id) {
+	case PROP_MIN_WORD_LENGTH:
+		tracker_fts_config_set_min_word_length (config, g_value_get_int (value));
+		break;
+	case PROP_MAX_WORD_LENGTH:
+		tracker_fts_config_set_max_word_length (config, g_value_get_int (value));
+		break;
+	case PROP_MAX_WORDS_TO_INDEX:
+		tracker_fts_config_set_max_words_to_index (config, g_value_get_int (value));
+		break;
+
+	default:
+		/* Unknown id: emit the standard GObject warning. */
+		G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
+		break;
+	}
+}
+
+/* GObject::get_property implementation; reads from the private struct. */
+static void
+config_get_property (GObject    *object,
+		     guint       param_id,
+		     GValue     *value,
+		     GParamSpec *pspec)
+{
+	TrackerFTSConfigPrivate *priv = TRACKER_FTS_CONFIG_GET_PRIVATE (object);
+
+	switch (param_id) {
+	/* All three properties are plain integers. */
+	case PROP_MIN_WORD_LENGTH:
+		g_value_set_int (value, priv->min_word_length);
+		break;
+	case PROP_MAX_WORD_LENGTH:
+		g_value_set_int (value, priv->max_word_length);
+		break;
+	case PROP_MAX_WORDS_TO_INDEX:
+		g_value_set_int (value, priv->max_words_to_index);
+		break;
+
+	default:
+		/* Unknown id: emit the standard GObject warning. */
+		G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
+		break;
+	}
+}
+
+/* GObject::finalize implementation.  There is nothing of our own to
+ * release for now; the override is kept for future expansion and just
+ * chains up to the parent class.
+ */
+static void
+config_finalize (GObject *object)
+{
+	G_OBJECT_CLASS (tracker_fts_config_parent_class)->finalize (object);
+}
+
+/* Chain up to the parent class first, then load this object's settings. */
+static void
+config_constructed (GObject *object)
+{
+	G_OBJECT_CLASS (tracker_fts_config_parent_class)->constructed (object);
+	config_load (TRACKER_FTS_CONFIG (object));
+}
+
+/* Write the default value and blurb comment of every conversions[] entry
+ * into @key_file.  Existing keys are kept unless @overwrite is TRUE.
+ */
+static void
+config_create_with_defaults (TrackerFTSConfig *config,
+			     GKeyFile         *key_file,
+			     gboolean          overwrite)
+{
+	guint i;
+
+	g_message ("Loading defaults into GKeyFile...");
+
+	for (i = 0; i < G_N_ELEMENTS (conversions); i++) {
+		const ObjectToKeyFile *conv = &conversions[i];
+
+		/* Keep values that already exist unless told to overwrite */
+		if (!overwrite &&
+		    g_key_file_has_key (key_file, conv->group, conv->key, NULL)) {
+			continue;
+		}
+
+		switch (conv->type) {
+		case G_TYPE_INT:
+			g_key_file_set_integer (key_file,
+						conv->group,
+						conv->key,
+						tracker_keyfile_object_default_int (config,
+										    conv->property));
+			break;
+
+		default:
+			g_assert_not_reached ();
+			break;
+		}
+
+		g_key_file_set_comment (key_file,
+					conv->group,
+					conv->key,
+					tracker_keyfile_object_blurb (config, conv->property),
+					NULL);
+	}
+}
+
+/* Populate @config from its key file.  Keys missing from the file are
+ * first filled in with the property defaults (existing values are kept),
+ * and the file is written out if it did not exist yet, so every entry in
+ * conversions[] is present by the time it is loaded below.
+ */
+static void
+config_load (TrackerFTSConfig *config)
+{
+	TrackerConfigFile *file;
+	guint i;
+
+	file = TRACKER_CONFIG_FILE (config);
+	config_create_with_defaults (config, file->key_file, FALSE);
+
+	if (!file->file_exists) {
+		tracker_config_file_save (file);
+	}
+
+	/* Copy each key file value into the matching GObject property. */
+	for (i = 0; i < G_N_ELEMENTS (conversions); i++) {
+		switch (conversions[i].type) {
+		case G_TYPE_INT:
+			tracker_keyfile_object_load_int (G_OBJECT (file),
+							 conversions[i].property,
+							 file->key_file,
+							 conversions[i].group,
+							 conversions[i].key);
+			break;
+
+		default:
+			/* conversions[] only holds G_TYPE_INT entries. */
+			g_assert_not_reached ();
+			break;
+		}
+	}
+}
+
+/* Copy the current property values into the key file and write it out.
+ * Returns FALSE without saving when no key file is attached; otherwise
+ * returns the result of tracker_config_file_save().
+ */
+static gboolean
+config_save (TrackerFTSConfig *config)
+{
+	TrackerConfigFile *file = TRACKER_CONFIG_FILE (config);
+	guint i;
+
+	if (!file->key_file) {
+		g_critical ("Could not save config, GKeyFile was NULL, has the config been loaded?");
+		return FALSE;
+	}
+
+	g_message ("Setting details to GKeyFile object...");
+
+	for (i = 0; i < G_N_ELEMENTS (conversions); i++) {
+		switch (conversions[i].type) {
+		case G_TYPE_INT:
+			tracker_keyfile_object_save_int (file,
+							 conversions[i].property,
+							 file->key_file,
+							 conversions[i].group,
+							 conversions[i].key);
+			break;
+		default:
+			g_assert_not_reached ();
+			break;
+		}
+	}
+
+	return tracker_config_file_save (file);
+}
+
+/* Create a TrackerFTSConfig whose "domain" property is set to
+ * "tracker-fts". */
+TrackerFTSConfig *
+tracker_fts_config_new (void)
+{
+	return g_object_new (TRACKER_TYPE_FTS_CONFIG, "domain", "tracker-fts", NULL);
+}
+
+gboolean
+tracker_fts_config_save (TrackerFTSConfig *config)
+{
+	g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), FALSE);
+	/* Push the current property values into the key file and save it. */
+	return config_save (config);
+}
+
+/* Returns the current "min-word-length" value held in the private
+ * struct.  If @config is not a TrackerFTSConfig the guard returns
+ * DEFAULT_MIN_WORD_LENGTH instead.
+ */
+gint
+tracker_fts_config_get_min_word_length (TrackerFTSConfig *config)
+{
+	g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), DEFAULT_MIN_WORD_LENGTH);
+
+	return TRACKER_FTS_CONFIG_GET_PRIVATE (config)->min_word_length;
+}
+
+/* Returns the current "max-word-length" value held in the private
+ * struct.  If @config is not a TrackerFTSConfig the guard returns
+ * DEFAULT_MAX_WORD_LENGTH instead.
+ */
+gint
+tracker_fts_config_get_max_word_length (TrackerFTSConfig *config)
+{
+	g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), DEFAULT_MAX_WORD_LENGTH);
+
+	return TRACKER_FTS_CONFIG_GET_PRIVATE (config)->max_word_length;
+}
+
+/* Returns the current "max-words-to-index" value held in the private
+ * struct.  If @config is not a TrackerFTSConfig the guard returns
+ * DEFAULT_MAX_WORDS_TO_INDEX instead.
+ */
+gint
+tracker_fts_config_get_max_words_to_index (TrackerFTSConfig *config)
+{
+	g_return_val_if_fail (TRACKER_IS_FTS_CONFIG (config), DEFAULT_MAX_WORDS_TO_INDEX);
+
+	return TRACKER_FTS_CONFIG_GET_PRIVATE (config)->max_words_to_index;
+}
+
+/* Store a new "min-word-length" value and notify listeners.  Values
+ * rejected by tracker_keyfile_object_validate_int() are ignored.
+ */
+void
+tracker_fts_config_set_min_word_length (TrackerFTSConfig *config,
+					gint              value)
+{
+	TrackerFTSConfigPrivate *priv;
+
+	g_return_if_fail (TRACKER_IS_FTS_CONFIG (config));
+
+	if (tracker_keyfile_object_validate_int (config, "min-word-length", value)) {
+		priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+		priv->min_word_length = value;
+		g_object_notify (G_OBJECT (config), "min-word-length");
+	}
+}
+
+/* Store a new "max-word-length" value and notify listeners.  Values
+ * rejected by tracker_keyfile_object_validate_int() are ignored.
+ */
+void
+tracker_fts_config_set_max_word_length (TrackerFTSConfig *config,
+					gint              value)
+{
+	TrackerFTSConfigPrivate *priv;
+
+	g_return_if_fail (TRACKER_IS_FTS_CONFIG (config));
+
+	if (tracker_keyfile_object_validate_int (config, "max-word-length", value)) {
+		priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+		priv->max_word_length = value;
+		g_object_notify (G_OBJECT (config), "max-word-length");
+	}
+}
+
+/* Store a new "max-words-to-index" value and notify listeners.  Values
+ * rejected by tracker_keyfile_object_validate_int() are ignored.
+ */
+void
+tracker_fts_config_set_max_words_to_index (TrackerFTSConfig *config,
+					   gint              value)
+{
+	TrackerFTSConfigPrivate *priv;
+
+	g_return_if_fail (TRACKER_IS_FTS_CONFIG (config));
+
+	if (tracker_keyfile_object_validate_int (config, "max-words-to-index", value)) {
+		priv = TRACKER_FTS_CONFIG_GET_PRIVATE (config);
+		priv->max_words_to_index = value;
+		g_object_notify (G_OBJECT (config), "max-words-to-index");
+	}
+}
diff --git a/src/tracker-extract/tracker-fts-config.h b/src/tracker-extract/tracker-fts-config.h
new file mode 100644
index 0000000..1919472
--- /dev/null
+++ b/src/tracker-extract/tracker-fts-config.h
@@ -0,0 +1,65 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009, Nokia (urho konttori nokia com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __TRACKER_FTS_CONFIG_H__
+#define __TRACKER_FTS_CONFIG_H__
+
+#include <glib-object.h>
+
+#include <libtracker-common/tracker-config-file.h>
+
+G_BEGIN_DECLS
+
+#define TRACKER_TYPE_FTS_CONFIG	        (tracker_fts_config_get_type ())
+#define TRACKER_FTS_CONFIG(o)	        (G_TYPE_CHECK_INSTANCE_CAST ((o), TRACKER_TYPE_FTS_CONFIG, TrackerFTSConfig))
+#define TRACKER_FTS_CONFIG_CLASS(k)     (G_TYPE_CHECK_CLASS_CAST ((k), TRACKER_TYPE_FTS_CONFIG, TrackerFTSConfigClass))
+#define TRACKER_IS_FTS_CONFIG(o)        (G_TYPE_CHECK_INSTANCE_TYPE ((o), TRACKER_TYPE_FTS_CONFIG))
+#define TRACKER_IS_FTS_CONFIG_CLASS(k)  (G_TYPE_CHECK_CLASS_TYPE ((k), TRACKER_TYPE_FTS_CONFIG))
+#define TRACKER_FTS_CONFIG_GET_CLASS(o) (G_TYPE_INSTANCE_GET_CLASS ((o), TRACKER_TYPE_FTS_CONFIG, TrackerFTSConfigClass))
+
+typedef struct TrackerFTSConfig	     TrackerFTSConfig;
+typedef struct TrackerFTSConfigClass TrackerFTSConfigClass;
+
+struct TrackerFTSConfig {
+	TrackerConfigFile parent;
+};
+
+struct TrackerFTSConfigClass {
+	TrackerConfigFileClass parent_class;
+};
+
+GType             tracker_fts_config_get_type               (void) G_GNUC_CONST;
+
+TrackerFTSConfig *tracker_fts_config_new                    (void);
+gboolean          tracker_fts_config_save                   (TrackerFTSConfig *config);
+gint              tracker_fts_config_get_min_word_length    (TrackerFTSConfig *config);
+gint              tracker_fts_config_get_max_word_length    (TrackerFTSConfig *config);
+gint              tracker_fts_config_get_max_words_to_index (TrackerFTSConfig *config);
+void              tracker_fts_config_set_min_word_length    (TrackerFTSConfig *config,
+							     gint              value);
+void              tracker_fts_config_set_max_word_length    (TrackerFTSConfig *config,
+							     gint              value);
+void              tracker_fts_config_set_max_words_to_index (TrackerFTSConfig *config,
+							     gint              value);
+
+G_END_DECLS
+
+#endif /* __TRACKER_FTS_CONFIG_H__ */
+
diff --git a/src/tracker-extract/tracker-main.c b/src/tracker-extract/tracker-main.c
index fa9dbda..90946c9 100644
--- a/src/tracker-extract/tracker-main.c
+++ b/src/tracker-extract/tracker-main.c
@@ -74,6 +74,7 @@ static gboolean    disable_shutdown;
 static gint        verbosity = -1;
 static gchar      *filename;
 static gchar      *mime_type;
+static TrackerFTSConfig *fts_config;
 
 static GOptionEntry  entries[] = {
 	{ "version", 'V', 0,
@@ -254,6 +255,16 @@ log_handler (const gchar    *domain,
 	}	
 }
 
+TrackerFTSConfig *
+tracker_main_get_fts_config (void)
+{
+	if (G_UNLIKELY (!fts_config)) {
+		fts_config = tracker_fts_config_new ();
+	}
+	/* Created on the first call above; later calls return the same instance. */
+	return fts_config;
+}
+
 int
 main (int argc, char *argv[])
 {
diff --git a/src/tracker-extract/tracker-main.h b/src/tracker-extract/tracker-main.h
index 45c5405..a562fef 100644
--- a/src/tracker-extract/tracker-main.h
+++ b/src/tracker-extract/tracker-main.h
@@ -26,6 +26,7 @@
 
 #include <libtracker-common/tracker-storage.h>
 #include <libtracker-common/tracker-statement-list.h>
+#include "tracker-fts-config.h"
 
 G_BEGIN_DECLS
 
@@ -48,6 +49,8 @@ TrackerExtractData *tracker_get_extract_data        (void);
  */
 void                tracker_main_quit_timeout_reset (void);
 
+TrackerFTSConfig   *tracker_main_get_fts_config (void);
+
 G_END_DECLS
 
 #endif /* __TRACKER_MAIN_H__ */



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]