[tracker] Fixes GB#616493 - Remove dependency of unzip from the OASIS extractor



commit 7506a09f8859875dceba5fa4edbb651d29004e63
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Wed Apr 21 18:58:09 2010 +0200

    Fixes GB#616493 - Remove dependency of unzip from the OASIS extractor

 src/tracker-extract/Makefile.am                |   14 ++-
 src/tracker-extract/tracker-extract-msoffice.c |  120 ++-----------------
 src/tracker-extract/tracker-extract-oasis.c    |   85 ++++++-------
 src/tracker-extract/tracker-gsf.c              |  153 ++++++++++++++++++++++++
 src/tracker-extract/tracker-gsf.h              |   35 ++++++
 5 files changed, 246 insertions(+), 161 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 1346b0b..eede0c1 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -33,13 +33,12 @@ if HAVE_ENCA
 INCLUDES += $(ENCA_CFLAGS)
 endif
 
-# NOTE: 
+# NOTE:
 # We don't always link with libtracker-common, we only link
 # against it if we directly use functions in the .so
 modules_LTLIBRARIES = 							\
 	libextract-abw.la 						\
 	libextract-mp3.la				 		\
-	libextract-oasis.la 						\
 	libextract-png.la 						\
 	libextract-ps.la 						\
 	libextract-text.la
@@ -69,7 +68,9 @@ modules_LTLIBRARIES += libextract-html.la
 endif
 
 if HAVE_LIBGSF
-modules_LTLIBRARIES += libextract-msoffice.la
+modules_LTLIBRARIES += 							\
+	libextract-msoffice.la						\
+	libextract-oasis.la
 endif
 
 if HAVE_POPPLER_GLIB
@@ -82,7 +83,7 @@ endif
 
 if HAVE_GSTREAMER_HELIX
 modules_LTLIBRARIES += libextract-gstreamer-helix.la
-endif 
+endif
 
 if HAVE_LIBXINE
 modules_LTLIBRARIES += libextract-xine.la
@@ -320,6 +321,11 @@ tracker_extract_LDADD = 						\
 	$(GCOV_LIBS)							\
 	$(GLIB2_LIBS)
 
+if HAVE_LIBGSF
+tracker_extract_SOURCES += tracker-gsf.c tracker-gsf.h
+tracker_extract_LDADD += $(LIBGSF_LIBS)
+endif
+
 if HAVE_LIBSTREAMANALYZER
 tracker_extract_SOURCES += tracker-topanalyzer.cpp tracker-topanalyzer.h
 tracker_extract_LDADD += $(LIBSTREAMANALYZER_LIBS)
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index d7e6aa1..708a9dd 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -40,6 +40,7 @@
 #include <libtracker-extract/tracker-extract.h>
 
 #include "tracker-main.h"
+#include "tracker-gsf.h"
 
 /* Powerpoint files comprise of structures. Each structure contains a
  * header. Within that header is a record type that specifies what
@@ -2086,116 +2087,6 @@ xml_text_handler_document_data (GMarkupParseContext  *context,
 	}
 }
 
-/**
- * based on find_member() from vsd_utils.c:
- * http://vsdump.sourcearchive.com/documentation/0.0.44/vsd__utils_8c-source.html
- */
-static GsfInput *
-find_member (GsfInfile *arch,
-             char const *name)
-{
-	gchar const *slash = strchr (name, '/');
-
-	if (slash) {
-		gchar *dirname = g_strndup (name, slash - name);
-		GsfInput *member;
-
-		if ((member = gsf_infile_child_by_name (arch, dirname)) != NULL) {
-			GsfInfile *dir = GSF_INFILE (member);
-			member = find_member (dir, slash + 1);
-			g_object_unref (dir);
-		}
-
-		g_free (dirname);
-		return member;
-	} else {
-		return gsf_infile_child_by_name (arch, name);
-	}
-}
-
-
-#define XML_BUFFER_SIZE            8192         /* bytes */
-/* Note: 20 MBytes of max size is really assumed to be a safe limit. */
-#define XML_MAX_BYTES_READ         (20u << 20)  /* bytes */
-
-static void
-parse_xml_contents (const gchar *file_uri,
-                    const gchar *xml_filename,
-                    GMarkupParseContext *context)
-{
-	gchar *filename;
-	GError *error = NULL;
-	GsfInfile *infile = NULL;
-	GsfInput *src = NULL;
-	GsfInput *member = NULL;
-
-	g_debug ("Parsing '%s' XML file from '%s' zip archive...",
-	         xml_filename, file_uri);
-
-	/* Get filename from the given URI */
-	if ((filename = g_filename_from_uri (file_uri,
-	                                     NULL, &error)) == NULL) {
-		g_warning ("Can't get filename from uri '%s': %s",
-		           file_uri, error ? error->message : "no error given");
-	}
-	/* Create a new Input GSF object for the given file */
-	else if ((src = gsf_input_stdio_new (filename, &error)) == NULL) {
-		g_warning ("Failed creating a GSF Input object for '%s': %s",
-		           filename, error ? error->message : "no error given");
-	}
-	/* Input object is a Zip file */
-	else if ((infile = gsf_infile_zip_new (src, &error)) == NULL) {
-		g_warning ("'%s' Not a zip file: %s",
-		           filename, error ? error->message : "no error given");
-	}
-	/* Look for requested filename inside the ZIP file */
-	else if ((member = find_member (infile, xml_filename)) == NULL) {
-		g_warning ("No member '%s' in zip file '%s'",
-		           xml_filename, filename);
-	}
-	/* Load whole contents of the internal file in the xml buffer */
-	else {
-		guint8 buf[XML_BUFFER_SIZE];
-		size_t remaining_size, chunk_size, accum;
-
-		/* Get whole size of the contents to read */
-		remaining_size = (size_t) gsf_input_size (GSF_INPUT (member));
-
-		/* Note that gsf_input_read() needs to be able to read ALL specified
-		 *  number of bytes, or it will fail */
-		chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
-
-		accum = 0;
-		while (accum  <= XML_MAX_BYTES_READ &&
-		       chunk_size > 0 &&
-		       gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
-
-			/* update accumulated count */
-			accum += chunk_size;
-
-			/* Pass the read stream to the context parser... */
-			g_markup_parse_context_parse (context, buf, chunk_size, NULL);
-
-			/* update bytes to be read */
-			remaining_size -= chunk_size;
-			chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
-		}
-	}
-
-	/* it's safe to call g_free on NULL pointers */
-	g_free (filename);
-	/* but better don't do it in g_object_unref or g_error_free */
-	if (error)
-		g_error_free (error);
-	if (infile)
-		g_object_unref (infile);
-	if (src)
-		g_object_unref (src);
-	if (member)
-		g_object_unref (member);
-}
-
-
 static gboolean
 xml_read (MsOfficeXMLParserInfo *parser_info,
           const gchar           *xml_filename,
@@ -2254,7 +2145,9 @@ xml_read (MsOfficeXMLParserInfo *parser_info,
 	if (context) {
 		/* Load the internal XML file from the Zip archive, and parse it
 		 * using the given context */
-		parse_xml_contents (parser_info->uri, xml_filename, context);
+		tracker_gsf_parse_xml_in_zip (parser_info->uri,
+		                              xml_filename,
+		                              context);
 		g_markup_parse_context_free (context);
 	}
 
@@ -2395,9 +2288,12 @@ extract_msoffice_xml (const gchar          *uri,
 	info.content = g_string_new ("");
 
 	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
 	/* Load the internal XML file from the Zip archive, and parse it
 	 * using the given context */
-	parse_xml_contents (uri, "[Content_Types].xml", context);
+	tracker_gsf_parse_xml_in_zip (uri,
+	                              "[Content_Types].xml",
+	                              context);
 
 	if (info.content) {
 		gchar *content;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 3fea090..e2f482c 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -23,6 +23,7 @@
 #include <libtracker-extract/tracker-extract.h>
 
 #include "tracker-main.h"
+#include "tracker-gsf.h"
 
 #include <unistd.h>
 
@@ -72,16 +73,19 @@ static TrackerExtractData extract_data[] = {
 #define ODT_BUFFER_SIZE            8193  /* bytes */
 
 static gchar *
-extract_content (const gchar *path,
-                 guint        n_words,
-                 gsize        n_bytes)
+extract_oasis_content (const gchar *uri,
+                       guint        n_words,
+                       gsize        n_bytes)
 {
 	const gchar *argv[4];
 	gint fdz;
 	FILE *fz;
 	GError *error = NULL;
 	gchar *text = NULL;
+	gchar *path;
 
+	/* Newly allocated string with the file path */
+	path = g_filename_from_uri (uri, NULL, NULL);
 
 	/* Setup command to be executed */
 	argv[0] = "odt2txt";
@@ -164,6 +168,8 @@ extract_content (const gchar *path,
 		text = g_string_free (normalized, FALSE);
 	}
 
+	g_free (path);
+
 	return text;
 }
 
@@ -172,71 +178,60 @@ extract_oasis (const gchar          *uri,
                TrackerSparqlBuilder *preupdate,
                TrackerSparqlBuilder *metadata)
 {
-	gchar *argv[5];
-	gchar *xml;
-	gchar *filename;
 	gchar *content;
 	TrackerFTSConfig *fts_config;
 	guint n_words;
 	gsize n_bytes;
+	ODTParseInfo info;
+	GMarkupParseContext *context;
+	GMarkupParser parser = {
+		xml_start_element_handler,
+		xml_end_element_handler,
+		xml_text_handler,
+		NULL,
+		NULL
+	};
+
+	/* Setup conf */
+	fts_config = tracker_main_get_fts_config ();
 
-	filename = g_filename_from_uri (uri, NULL, NULL);
-
-	argv[0] = g_strdup ("unzip");
-	argv[1] = g_strdup ("-p");
-	argv[2] = filename;
-	argv[3] = g_strdup ("meta.xml");
-	argv[4] = NULL;
+	g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
 
-	/* No need to unlink meta.xml, as it goes to stdout of the
-	 *  spawned child (-p option in unzip) */
+	/* First, parse metadata */
 
 	tracker_sparql_builder_predicate (metadata, "a");
 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
 
-	if (tracker_spawn (argv, 10, &xml, NULL)) {
-		ODTParseInfo info;
-		GMarkupParseContext *context;
-		GMarkupParser parser = {
-			xml_start_element_handler,
-			xml_end_element_handler,
-			xml_text_handler,
-			NULL,
-			NULL
-		};
-
-		info.metadata = metadata;
-		info.current = ODT_TAG_TYPE_UNKNOWN;
-		info.uri = uri;
-
-		context = g_markup_parse_context_new (&parser, 0, &info, NULL);
-		g_markup_parse_context_parse (context, xml, -1, NULL);
-
-		g_markup_parse_context_free (context);
-		g_free (xml);
-	}
+	/* Create parse info */
+	info.metadata = metadata;
+	info.current = ODT_TAG_TYPE_UNKNOWN;
+	info.uri = uri;
+
+	/* Create parsing context */
+	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
+	/* Load the internal XML file from the Zip archive, and parse it
+	 * using the given context */
+	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context);
+	g_markup_parse_context_free (context);
+
+	/* Next, parse contents */
 
-	fts_config = tracker_main_get_fts_config ();
 	/* Set max words to read from content */
 	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+
 	/* Set max bytes to read from content.
 	 * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
 	 *  points are really pretty rare */
 	n_bytes = 3 * n_words * tracker_fts_config_get_max_word_length(fts_config);
 
-	content = extract_content (filename, n_words, n_bytes);
-
+	/* Extract content with the given limitations */
+	content = extract_oasis_content (uri, n_words, n_bytes);
 	if (content) {
 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
 		tracker_sparql_builder_object_unvalidated (metadata, content);
 		g_free (content);
 	}
-
-	g_free (argv[3]);
-	g_free (argv[1]);
-	g_free (argv[0]);
-
-	g_free (filename);
 }
 
 static void
diff --git a/src/tracker-extract/tracker-gsf.c b/src/tracker-extract/tracker-gsf.c
new file mode 100644
index 0000000..9bf1608
--- /dev/null
+++ b/src/tracker-extract/tracker-gsf.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <gsf/gsf.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-input-stdio.h>
+#include <gsf/gsf-infile-zip.h>
+
+#include "tracker-gsf.h"
+
+/* Size of the buffer to use */
+#define XML_BUFFER_SIZE            8192         /* bytes */
+/* Note: 20 MBytes of max size is really assumed to be a safe limit. */
+#define XML_MAX_BYTES_READ         (20u << 20)  /* bytes */
+
+/**
+ * Finds the member called @name inside @arch, descending into one child
+ * container per '/'-separated path component. Returns NULL if a lookup fails.
+ * Based on find_member() from vsd_utils.c:
+ * http://vsdump.sourcearchive.com/documentation/0.0.44/vsd__utils_8c-source.html
+ */
+static GsfInput *
+find_member (GsfInfile *arch,
+             gchar const *name)
+{
+	gchar const *separator = strchr (name, '/');
+	gchar *component;
+	GsfInput *found;
+
+	/* No separator left: @name is a direct child of this container */
+	if (separator == NULL)
+		return gsf_infile_child_by_name (arch, name);
+
+	/* Resolve the leading path component, then recurse into it */
+	component = g_strndup (name, separator - name);
+	found = gsf_infile_child_by_name (arch, component);
+	g_free (component);
+
+	if (found != NULL) {
+		GsfInfile *subdir = GSF_INFILE (found);
+
+		found = find_member (subdir, separator + 1);
+		g_object_unref (subdir);
+	}
+
+	return found;
+}
+
+/**
+ * tracker_gsf_parse_xml_in_zip:
+ * @zip_file_uri: URI of the ZIP archive
+ * @xml_filename: Name of the XML file stored inside the ZIP archive
+ * @context: Markup context to be used when parsing the XML
+ *
+ * This function reads and parses the contents of an XML file stored
+ *  inside a ZIP compressed archive. Reading and parsing is done buffered, and
+ *  maximum size of the uncompressed XML file is limited to about 20 MBytes.
+ */
+void
+tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
+                              const gchar         *xml_filename,
+                              GMarkupParseContext *context)
+{
+	gchar *filename;
+	GError *error = NULL;
+	GsfInfile *infile = NULL;
+	GsfInput *src = NULL;
+	GsfInput *member = NULL;
+
+	g_debug ("Parsing '%s' XML file from '%s' zip archive...",
+	         xml_filename, zip_file_uri);
+
+	/* Get filename from the given URI */
+	if ((filename = g_filename_from_uri (zip_file_uri,
+	                                     NULL, &error)) == NULL) {
+		g_warning ("Can't get filename from uri '%s': %s",
+		           zip_file_uri, error ? error->message : "no error given");
+	}
+	/* Create a new Input GSF object for the given file */
+	else if ((src = gsf_input_stdio_new (filename, &error)) == NULL) {
+		g_warning ("Failed creating a GSF Input object for '%s': %s",
+		           zip_file_uri, error ? error->message : "no error given");
+	}
+	/* Input object is a Zip file */
+	else if ((infile = gsf_infile_zip_new (src, &error)) == NULL) {
+		g_warning ("'%s' Not a zip file: %s",
+		           zip_file_uri, error ? error->message : "no error given");
+	}
+	/* Look for requested filename inside the ZIP file */
+	else if ((member = find_member (infile, xml_filename)) == NULL) {
+		g_warning ("No member '%s' in zip file '%s'",
+		           xml_filename, zip_file_uri);
+	}
+	/* Read the member chunk by chunk, feeding each chunk to the parser */
+	else {
+		guint8 buf[XML_BUFFER_SIZE];
+		size_t remaining_size, chunk_size, accum = 0;
+
+		/* Get whole size of the contents to read */
+		remaining_size = (size_t) gsf_input_size (GSF_INPUT (member));
+		/* Note that gsf_input_read() needs to be able to read ALL specified
+		 *  number of bytes, or it will fail */
+		chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
+
+		while (accum <= XML_MAX_BYTES_READ &&
+		       chunk_size > 0 &&
+		       gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
+			/* update accumulated count */
+			accum += chunk_size;
+			/* A context in error state must not be fed again, so stop here */
+			if (!g_markup_parse_context_parse (context, (const gchar *) buf,
+			                                   chunk_size, &error)) {
+				g_warning ("Failed parsing '%s' in zip file '%s': %s",
+				           xml_filename, zip_file_uri,
+				           error ? error->message : "no error given");
+				break;
+			}
+			/* update bytes to be read */
+			remaining_size -= chunk_size;
+			chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
+		}
+	}
+
+	g_free (filename);
+	g_clear_error (&error);
+	if (infile)
+		g_object_unref (infile);
+	if (src)
+		g_object_unref (src);
+	if (member)
+		g_object_unref (member);
+}
+
diff --git a/src/tracker-extract/tracker-gsf.h b/src/tracker-extract/tracker-gsf.h
new file mode 100644
index 0000000..26c34b3
--- /dev/null
+++ b/src/tracker-extract/tracker-gsf.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __TRACKER_GSF_H__
+#define __TRACKER_GSF_H__
+
+#include <glib.h>
+#include <gsf/gsf.h>
+
+G_BEGIN_DECLS
+
+void tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
+                                   const gchar         *xml_filename,
+                                   GMarkupParseContext *context);
+
+G_END_DECLS
+
+#endif /* __TRACKER_GSF_H__ */
+



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]