[tracker] Add FTS support for MS and ODF document formats.



commit 0d83a247e921680d7972b3648fd94c277ee054d1
Author: Carlos Garnacho <carlos lanedo com>
Date:   Thu Oct 8 18:15:37 2009 +0200

    Add FTS support for MS and ODF document formats.

 src/tracker-extract/tracker-extract-msoffice.c |   44 +++++++++++++++++++++++-
 src/tracker-extract/tracker-extract-oasis.c    |   35 +++++++++++++++++++
 2 files changed, 78 insertions(+), 1 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index b1eebd8..1b1b2c8 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -208,6 +208,39 @@ doc_metadata_cb (gpointer key,
 	}
 }
 
+/* Runs wvWare on the file behind uri; returns normalized text (caller frees) or NULL. */
+static gchar *
+extract_content (const gchar *uri,
+		 guint        n_words)
+{
+	gchar *path, *quoted, *command, *output = NULL, *text;
+	GError *error = NULL;
+
+	path = g_filename_from_uri (uri, NULL, NULL);
+	if (!path) {
+		return NULL;
+	}
+
+	/* The command line is shell-parsed, so quote the path (spaces, quotes, ...). */
+	quoted = g_shell_quote (path);
+	command = g_strdup_printf ("wvWare --charset utf-8 -1 -x wvText.xml %s", quoted);
+	g_free (quoted);
+	g_free (path);
+
+	if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
+		g_warning ("Could not extract text from '%s': %s", uri, error->message);
+		g_error_free (error);
+		g_free (command);
+		return NULL;
+	}
+
+	text = tracker_text_normalize (output, n_words, NULL);
+	g_free (command);
+	g_free (output);
+
+	return text;
+}
+
 static void
 extract_msoffice (const gchar *uri,
 		  TrackerSparqlBuilder   *metadata)
@@ -215,7 +248,7 @@ extract_msoffice (const gchar *uri,
 	GsfInput  *input;
 	GsfInfile *infile;
 	GsfInput  *stream;
-	gchar     *filename;
+	gchar     *filename, *content;
 	gboolean   rdf_type_added = FALSE;
 
 	gsf_init ();
@@ -294,7 +327,16 @@ extract_msoffice (const gchar *uri,
 		g_object_unref (stream);
 	}
 
+	content = extract_content (uri, 1000);
+
+	if (content) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+		g_free (content);
+	}
+
 	g_object_unref (infile);
+	g_free (filename);
 
 	gsf_shutdown ();
 }
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index ed5b200..51111a2 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -27,6 +27,7 @@
 #include <libtracker-common/tracker-os-dependant.h>
 #include <libtracker-common/tracker-statement-list.h>
 #include <libtracker-common/tracker-ontology.h>
+#include <libtracker-common/tracker-utils.h>
 
 #include "tracker-main.h"
 
@@ -77,6 +78,31 @@ static TrackerExtractData extract_data[] = {
 	{ NULL, NULL }
 };
 
+/* Runs odt2txt on path (must not be NULL); returns normalized text, or NULL. */
+static gchar *
+extract_content (const gchar *path,
+		 guint        n_words)
+{
+	gchar *quoted, *command, *output = NULL, *text;
+	GError *error = NULL;
+
+	/* The command line is shell-parsed: quote the path so spaces/quotes survive. */
+	quoted = g_shell_quote (path);
+	command = g_strdup_printf ("odt2txt --encoding=utf-8 %s", quoted);
+	g_free (quoted);
+
+	if (!g_spawn_command_line_sync (command, &output, NULL, NULL, &error)) {
+		g_warning ("Could not extract text from '%s': %s", path, error->message);
+		g_error_free (error);
+		g_free (command);
+		return NULL;
+	}
+	text = tracker_text_normalize (output, n_words, NULL);
+	g_free (command);
+	g_free (output);
+	return text;
+}
+
 static void
 extract_oasis (const gchar *uri,
 	       TrackerSparqlBuilder   *metadata)
@@ -84,6 +110,7 @@ extract_oasis (const gchar *uri,
 	gchar	      *argv[5];
 	gchar	      *xml;
 	gchar *filename = g_filename_from_uri (uri, NULL, NULL);
+	gchar *content;
 	ODTParseInfo   info = {
 		metadata,
 		-1,
@@ -117,6 +144,14 @@ extract_oasis (const gchar *uri,
 		g_free (xml);
 	}
 
+	content = extract_content (filename, 1000);
+
+	if (content) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+		g_free (content);
+	}
+
 	g_free (argv[3]);
 	g_free (argv[1]);
 	g_free (argv[0]);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]