[tracker] Fixes GB#623005: Tracker plugin for odt, odp, ods files



commit 9a3c8cfc2cd0b6b4bb8420c244551dc139bf80f6
Author: Murugappan Nataraj <murugappan nataraj nokia com>
Date:   Tue Aug 17 18:59:45 2010 +0530

    Fixes GB#623005: Tracker plugin for odt, odp, ods files

 src/tracker-extract/tracker-extract-msoffice.c |    4 +-
 src/tracker-extract/tracker-extract-oasis.c    |  407 ++++++++++++++++++------
 src/tracker-extract/tracker-gsf.c              |   15 +-
 src/tracker-extract/tracker-gsf.h              |    7 +-
 4 files changed, 330 insertions(+), 103 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 7edd0f3..099c7aa 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -2082,7 +2082,7 @@ xml_read (MsOfficeXMLParserInfo *parser_info,
 		 * using the given context */
 		tracker_gsf_parse_xml_in_zip (parser_info->uri,
 		                              xml_filename,
-		                              context);
+		                              context, NULL);
 		g_markup_parse_context_free (context);
 	}
 
@@ -2237,7 +2237,7 @@ extract_msoffice_xml (const gchar          *uri,
 	 * using the given context */
 	tracker_gsf_parse_xml_in_zip (uri,
 	                              "[Content_Types].xml",
-	                              context);
+	                              context, NULL);
 
 	if (info.content) {
 		gchar *content;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index b1534f2..596ecfb 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -37,86 +37,126 @@ typedef enum {
 	ODT_TAG_TYPE_COMMENTS,
 	ODT_TAG_TYPE_STATS,
 	ODT_TAG_TYPE_CREATED,
-	ODT_TAG_TYPE_GENERATOR
+	ODT_TAG_TYPE_GENERATOR,
+	ODT_TAG_TYPE_WORD_TEXT,
+	ODT_TAG_TYPE_SLIDE_TEXT,
+	ODT_TAG_TYPE_SPREADSHEET_TEXT
 } ODTTagType;
 
+typedef enum {
+	FILE_TYPE_INVALID, /* MIME type matched none of the three handled below */
+	FILE_TYPE_ODP, /* application/vnd.oasis.opendocument.presentation */
+	FILE_TYPE_ODT, /* application/vnd.oasis.opendocument.text */
+	FILE_TYPE_ODS /* application/vnd.oasis.opendocument.spreadsheet */
+} ODTFileType;
+
 typedef struct {
 	TrackerSparqlBuilder *metadata;
 	ODTTagType current;
 	const gchar *uri;
 	gboolean title_already_set;
-} ODTParseInfo;
-
-static void xml_start_element_handler (GMarkupParseContext   *context,
-                                       const gchar           *element_name,
-                                       const gchar          **attribute_names,
-                                       const gchar          **attribute_values,
-                                       gpointer               user_data,
-                                       GError               **error);
-static void xml_end_element_handler   (GMarkupParseContext   *context,
-                                       const gchar           *element_name,
-                                       gpointer               user_data,
-                                       GError               **error);
-static void xml_text_handler          (GMarkupParseContext   *context,
-                                       const gchar           *text,
-                                       gsize                  text_len,
-                                       gpointer               user_data,
-                                       GError               **error);
-static void extract_oasis             (const gchar           *filename,
-                                       TrackerSparqlBuilder  *preupdate,
-                                       TrackerSparqlBuilder  *metadata);
+} ODTMetadataParseInfo;
+
+typedef struct {
+	ODTTagType current; /* tag type of the element being parsed; -1 when its text is skipped */
+	gboolean styles_present; /* TRUE while inside an ODT index or text:section element */
+	ODTFileType file_type; /* selects which element rules the content handlers apply */
+	GString *content; /* plain text accumulated by the content text handler */
+	gulong bytes_pending; /* remaining byte budget; reduced by each chunk of text written */
+} ODTContentParseInfo;
+
+GQuark maximum_size_error_quark = 0;
+
+static void xml_start_element_handler_metadata (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                const gchar          **attribute_names,
+                                                const gchar          **attribute_values,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_end_element_handler_metadata   (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_text_handler_metadata          (GMarkupParseContext   *context,
+                                                const gchar           *text,
+                                                gsize                  text_len,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_start_element_handler_content  (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                const gchar          **attribute_names,
+                                                const gchar          **attribute_values,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_end_element_handler_content    (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_text_handler_content           (GMarkupParseContext   *context,
+                                                const gchar           *text,
+                                                gsize                  text_len,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void extract_oasis                      (const gchar           *filename,
+                                                TrackerSparqlBuilder  *preupdate,
+                                                TrackerSparqlBuilder  *metadata);
+static void extract_oasis_content              (const gchar           *uri,
+                                                gulong                 total_bytes,
+                                                ODTFileType            file_type,
+                                                TrackerSparqlBuilder  *metadata);
 
 static TrackerExtractData extract_data[] = {
 	{ "application/vnd.oasis.opendocument.*", extract_oasis },
 	{ NULL, NULL }
 };
 
-static gchar *
-extract_oasis_content (const gchar *uri,
-                       gsize        n_bytes)
+static void
+extract_oasis_content (const gchar          *uri,
+                       gulong                total_bytes,
+                       ODTFileType           file_type,
+                       TrackerSparqlBuilder *metadata)
 {
+	gchar *content = NULL; /* must stay NULL on the error path: it is g_free()d unconditionally below */
+	ODTContentParseInfo info;
+	GMarkupParseContext *context;
 	GError *error = NULL;
-	const gchar *argv[4];
-	gchar *text = NULL;
-	gchar *path;
-	gint fd;
-
-	/* Newly allocated string with the file path */
-	path = g_filename_from_uri (uri, NULL, NULL);
-
-	/* Setup command to be executed */
-	argv[0] = "odt2txt";
-	argv[1] = "--encoding=utf-8";
-	argv[2] = path;
-	argv[3] = NULL;
-
-	g_debug ("Executing command:'%s %s %s' "
-	         "(max_bytes: %" G_GSIZE_FORMAT ")",
-	         argv[0], argv[1], argv[2], n_bytes);
-
-	/* Fork & spawn */
-	if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
-	                               (gchar **)argv,
-	                               NULL,
-	                               G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
-	                               tracker_spawn_child_func,
-	                               GINT_TO_POINTER (10),
-	                               NULL,
-	                               NULL,
-	                               &fd,
-	                               NULL,
-	                               &error)) {
-		g_warning ("Spawning failed, could not extract text from '%s': %s",
-		           path, error ? error->message : NULL);
-		g_clear_error (&error);
+	GMarkupParser parser = {
+		xml_start_element_handler_content,
+		xml_end_element_handler_content,
+		xml_text_handler_content,
+		NULL,
+		NULL
+	};
+
+	/* Create parse info: text is gathered in info.content, at most total_bytes of it */
+	info.current = ODT_TAG_TYPE_UNKNOWN;
+	info.file_type = file_type;
+	info.styles_present = FALSE;
+	info.content = g_string_new ("");
+	info.bytes_pending = total_bytes;
+
+	/* Create parsing context */
+	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
+	/* Load the internal XML file from the Zip archive, and parse it
+	 * using the given context */
+	tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);
+
+	if (!error || g_error_matches (error, maximum_size_error_quark, 0)) { /* reaching the size cap still yields usable text */
+		content = g_string_free (info.content, FALSE);
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
 	} else {
-		/* Read up to n_bytes from FD (also closes FD) */
-		text = tracker_read_text_from_fd (fd, n_bytes, FALSE);
+		g_warning ("Got error parsing XML file: %s\n", error->message);
+		g_string_free (info.content, TRUE);
 	}
 
-	g_free (path);
+	if (error) {
+		g_error_free (error);
+	}
 
-	return text;
+	g_free (content);
+	g_markup_parse_context_free (context);
 }
 
 static void
@@ -124,18 +164,25 @@ extract_oasis (const gchar          *uri,
                TrackerSparqlBuilder *preupdate,
                TrackerSparqlBuilder *metadata)
 {
-	gchar *content;
 	TrackerConfig *config;
-	ODTParseInfo info;
+	ODTMetadataParseInfo info;
+	ODTFileType file_type;
+	GFile *file = NULL;
+	GFileInfo *file_info = NULL;
+	const gchar *mime_used;
 	GMarkupParseContext *context;
 	GMarkupParser parser = {
-		xml_start_element_handler,
-		xml_end_element_handler,
-		xml_text_handler,
+		xml_start_element_handler_metadata,
+		xml_end_element_handler_metadata,
+		xml_text_handler_metadata,
 		NULL,
 		NULL
 	};
 
+	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
+		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
+	}
+
 	/* Setup conf */
 	config = tracker_main_get_config ();
 
@@ -157,30 +204,62 @@ extract_oasis (const gchar          *uri,
 
 	/* Load the internal XML file from the Zip archive, and parse it
 	 * using the given context */
-	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context);
+	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
 	g_markup_parse_context_free (context);
 
 	/* Next, parse contents */
+	file = g_file_new_for_uri (uri);
 
-	/* Extract content with the given limitations */
-	content = extract_oasis_content (uri,
-	                                 tracker_config_get_max_bytes (config));
-	if (content) {
-		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
-		tracker_sparql_builder_object_unvalidated (metadata, content);
-		g_free (content);
+	if (!file) {
+		g_warning ("Could not create GFile for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	file_info = g_file_query_info (file,
+	                               G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
+	                               G_FILE_QUERY_INFO_NONE,
+	                               NULL,
+	                               NULL);
+	g_object_unref (file);
+
+	if (!file_info) {
+		g_warning ("Could not get GFileInfo for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	mime_used = g_file_info_get_content_type (file_info);
+
+	if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
+		file_type = FILE_TYPE_ODT;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
+		file_type = FILE_TYPE_ODP;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
+		file_type = FILE_TYPE_ODS;
+	} else {
+		g_message ("Mime type was not recognised:'%s'", mime_used);
+		file_type = FILE_TYPE_INVALID;
 	}
+
+	g_object_unref (file_info);
+
+	/* Extract content with the given limitations */
+	extract_oasis_content (uri, 
+	                       tracker_config_get_max_bytes (config), 
+	                       file_type, 
+	                       metadata);
 }
 
 static void
-xml_start_element_handler (GMarkupParseContext  *context,
-                           const gchar          *element_name,
-                           const gchar         **attribute_names,
-                           const gchar         **attribute_values,
-                           gpointer              user_data,
-                           GError              **error)
+xml_start_element_handler_metadata (GMarkupParseContext  *context,
+                                    const gchar          *element_name,
+                                    const gchar         **attribute_names,
+                                    const gchar         **attribute_values,
+                                    gpointer              user_data,
+                                    GError              **error)
 {
-	ODTParseInfo *data = user_data;
+	ODTMetadataParseInfo *data = user_data;
 
 	if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
 		data->current = ODT_TAG_TYPE_TITLE;
@@ -221,22 +300,22 @@ xml_start_element_handler (GMarkupParseContext  *context,
 }
 
 static void
-xml_end_element_handler (GMarkupParseContext  *context,
-                         const gchar          *element_name,
-                         gpointer              user_data,
-                         GError              **error)
+xml_end_element_handler_metadata (GMarkupParseContext  *context,
+                                  const gchar          *element_name,
+                                  gpointer              user_data,
+                                  GError              **error)
 {
-	((ODTParseInfo*) user_data)->current = -1;
+	((ODTMetadataParseInfo*) user_data)->current = -1;
 }
 
 static void
-xml_text_handler (GMarkupParseContext  *context,
-                  const gchar          *text,
-                  gsize                 text_len,
-                  gpointer              user_data,
-                  GError              **error)
+xml_text_handler_metadata (GMarkupParseContext  *context,
+                           const gchar          *text,
+                           gsize                 text_len,
+                           gpointer              user_data,
+                           GError              **error)
 {
-	ODTParseInfo *data;
+	ODTMetadataParseInfo *data;
 	TrackerSparqlBuilder *metadata;
 	const gchar *uri;
 	gchar *date;
@@ -315,6 +394,150 @@ xml_text_handler (GMarkupParseContext  *context,
 	}
 }
 
+static void
+xml_start_element_handler_content (GMarkupParseContext  *context,
+                                   const gchar          *element_name,
+                                   const gchar         **attribute_names,
+                                   const gchar         **attribute_values,
+                                   gpointer              user_data,
+                                   GError              **error)
+{ /* Start-tag callback for content.xml: sets data->current to say whether the element's text is extracted */
+	ODTContentParseInfo *data = user_data;
+	const gchar **a;
+	const gchar **v;
+
+	switch (data->file_type) { /* the element rules differ per document type */
+	case FILE_TYPE_ODT:
+		if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
+		    data->styles_present = TRUE; /* while set, every text:p is extracted without a style check */
+		} else if (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) {
+			data->current = ODT_TAG_TYPE_WORD_TEXT;
+		} else if (g_ascii_strcasecmp (element_name, "text:p") == 0) {
+			if (data->styles_present) {
+				data->current = ODT_TAG_TYPE_WORD_TEXT;
+				break; /* leaves the switch, skipping the style-name check below */
+			}
+
+			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
+				if (g_ascii_strcasecmp (*a, "text:style-name") != 0) { /* only this attribute is inspected */
+					continue;
+				}
+
+				if ((g_ascii_strcasecmp (*v, "title-article") == 0) ||
+				    (g_ascii_strcasecmp (*v, "para-padding") == 0) ||
+				    (g_ascii_strcasecmp (*v, "para-screen") == 0)) {
+					data->current = ODT_TAG_TYPE_WORD_TEXT;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "text:h") == 0) {
+			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
+				if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
+					continue;
+				}
+
+				if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) { /* style name starts with "Heading", case-insensitively */
+					data->current = ODT_TAG_TYPE_WORD_TEXT;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "text:span") == 0) {
+			data->current = ODT_TAG_TYPE_WORD_TEXT;
+		} else if ((g_ascii_strcasecmp (element_name, "text:a") == 0) ||
+			   (g_ascii_strcasecmp (element_name, "text:s") == 0)) {
+			data->current = ODT_TAG_TYPE_WORD_TEXT;
+		} else {
+			data->current = -1; /* any other element: its text is not extracted */
+		}
+		break;
+
+	case FILE_TYPE_ODP:
+		data->current = ODT_TAG_TYPE_SLIDE_TEXT; /* no filtering: every start tag enables text extraction */
+		break;
+
+	case FILE_TYPE_ODS:
+		if (g_ascii_strncasecmp (element_name, "text", 4) == 0) { /* any element whose name starts with "text" */
+			data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT;
+		} else {
+			data->current = -1;
+		}
+		break;
+
+	case FILE_TYPE_INVALID:
+		g_message ("Open Office Document type: %d invalid", data->file_type); /* emitted once per element; current is left as it was */
+		break;
+	}
+}
+
+static void
+xml_end_element_handler_content (GMarkupParseContext  *context,
+                                 const gchar          *element_name,
+                                 gpointer              user_data,
+                                 GError              **error)
+{ /* End-tag callback for content.xml: clears the state set by the start-tag callback */
+	ODTContentParseInfo *data = user_data;
+
+	switch (data->file_type) {
+	case FILE_TYPE_ODT:
+		if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
+		    data->styles_present = FALSE; /* cleared on any of these end tags, even if an outer one is still open */
+		}
+		break;
+	default:
+		break;
+	}
+
+	if ((g_ascii_strcasecmp (element_name, "text:a") != 0) && /* applies to every file type, not only ODT */
+	    (g_ascii_strcasecmp (element_name, "text:s") != 0)) {
+		data->current = -1; /* closing text:a / text:s keeps current, presumably so the parent's trailing text is still read */
+	}
+}
+
+static void
+xml_text_handler_content (GMarkupParseContext  *context,
+                          const gchar          *text,
+                          gsize                 text_len,
+                          gpointer              user_data,
+                          GError              **error)
+{ /* Text callback for content.xml: appends text to data->content while the byte budget lasts */
+	ODTContentParseInfo *data = user_data;
+	gsize written_bytes = 0;
+
+	switch (data->current) {
+	case ODT_TAG_TYPE_WORD_TEXT:
+	case ODT_TAG_TYPE_SLIDE_TEXT:
+	case ODT_TAG_TYPE_SPREADSHEET_TEXT:
+		if (data->bytes_pending == 0) { /* budget used up: report it through the maximum-size error quark */
+			g_set_error_literal (error,
+			                     maximum_size_error_quark, 0,
+			                     "Maximum text limit reached");
+			break;
+		}
+
+		/* Look for valid UTF-8 text */
+		if (tracker_text_validate_utf8 (text,
+		                                MIN (text_len, data->bytes_pending),
+		                                &data->content,
+		                                &written_bytes)) {
+			if (data->content->len > 0 && data->content->str[data->content->len - 1] != ' ') { /* len check avoids reading str[-1] */
+				/* If some bytes found to be valid, append an extra whitespace
+				 * as separator */
+				g_string_append_c (data->content, ' ');
+			}
+		}
+
+		data->bytes_pending -= written_bytes; /* the separator space is not charged to the budget */
+		break;
+
+	default:
+		break;
+	}
+}
+
 TrackerExtractData *
 tracker_extract_get_data (void)
 {
diff --git a/src/tracker-extract/tracker-gsf.c b/src/tracker-extract/tracker-gsf.c
index 7c607d6..69a143e 100644
--- a/src/tracker-extract/tracker-gsf.c
+++ b/src/tracker-extract/tracker-gsf.c
@@ -77,9 +77,10 @@ find_member (GsfInfile *arch,
  *  maximum size of the uncompressed XML file is limited to be to 20MBytes.
  */
 void
-tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
-                              const gchar         *xml_filename,
-                              GMarkupParseContext *context)
+tracker_gsf_parse_xml_in_zip (const gchar          *zip_file_uri,
+                              const gchar          *xml_filename,
+                              GMarkupParseContext  *context,
+                              GError              **err)
 {
 	gchar *filename;
 	GError *error = NULL;
@@ -124,7 +125,8 @@ tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
 		chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
 
 		accum = 0;
-		while (accum  <= XML_MAX_BYTES_READ &&
+		while (!error &&
+		       accum  <= XML_MAX_BYTES_READ &&
 		       chunk_size > 0 &&
 		       gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
 
@@ -132,7 +134,7 @@ tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
 			accum += chunk_size;
 
 			/* Pass the read stream to the context parser... */
-			g_markup_parse_context_parse (context, buf, chunk_size, NULL);
+			g_markup_parse_context_parse (context, buf, chunk_size, &error);
 
 			/* update bytes to be read */
 			remaining_size -= chunk_size;
@@ -141,8 +143,9 @@ tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
 	}
 
 	g_free (filename);
+
 	if (error)
-		g_error_free (error);
+		g_propagate_error (err, error);
 	if (infile)
 		g_object_unref (infile);
 	if (src)
diff --git a/src/tracker-extract/tracker-gsf.h b/src/tracker-extract/tracker-gsf.h
index 26c34b3..c29230b 100644
--- a/src/tracker-extract/tracker-gsf.h
+++ b/src/tracker-extract/tracker-gsf.h
@@ -25,9 +25,10 @@
 
 G_BEGIN_DECLS
 
-void tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
-                                   const gchar         *xml_filename,
-                                   GMarkupParseContext *context);
+void tracker_gsf_parse_xml_in_zip (const gchar          *zip_file_uri,
+                                   const gchar          *xml_filename,
+                                   GMarkupParseContext  *context,
+                                   GError              **error);
 
 G_END_DECLS
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]