[tracker] Fixes GB#623005: Tracker plugin for odt, odp, ods files
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] Fixes GB#623005: Tracker plugin for odt, odp, ods files
- Date: Tue, 17 Aug 2010 14:47:23 +0000 (UTC)
commit 9a3c8cfc2cd0b6b4bb8420c244551dc139bf80f6
Author: Murugappan Nataraj <murugappan nataraj nokia com>
Date: Tue Aug 17 18:59:45 2010 +0530
Fixes GB#623005: Tracker plugin for odt, odp, ods files
src/tracker-extract/tracker-extract-msoffice.c | 4 +-
src/tracker-extract/tracker-extract-oasis.c | 407 ++++++++++++++++++------
src/tracker-extract/tracker-gsf.c | 15 +-
src/tracker-extract/tracker-gsf.h | 7 +-
4 files changed, 330 insertions(+), 103 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 7edd0f3..099c7aa 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -2082,7 +2082,7 @@ xml_read (MsOfficeXMLParserInfo *parser_info,
* using the given context */
tracker_gsf_parse_xml_in_zip (parser_info->uri,
xml_filename,
- context);
+ context, NULL);
g_markup_parse_context_free (context);
}
@@ -2237,7 +2237,7 @@ extract_msoffice_xml (const gchar *uri,
* using the given context */
tracker_gsf_parse_xml_in_zip (uri,
"[Content_Types].xml",
- context);
+ context, NULL);
if (info.content) {
gchar *content;
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index b1534f2..596ecfb 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -37,86 +37,126 @@ typedef enum {
ODT_TAG_TYPE_COMMENTS,
ODT_TAG_TYPE_STATS,
ODT_TAG_TYPE_CREATED,
- ODT_TAG_TYPE_GENERATOR
+ ODT_TAG_TYPE_GENERATOR,
+ ODT_TAG_TYPE_WORD_TEXT,
+ ODT_TAG_TYPE_SLIDE_TEXT,
+ ODT_TAG_TYPE_SPREADSHEET_TEXT
} ODTTagType;
+typedef enum { /* Document flavour; extract_oasis picks it from the file's MIME content type */
+ FILE_TYPE_INVALID, /* MIME type matched none of the three below */
+ FILE_TYPE_ODP, /* application/vnd.oasis.opendocument.presentation */
+ FILE_TYPE_ODT, /* application/vnd.oasis.opendocument.text */
+ FILE_TYPE_ODS /* application/vnd.oasis.opendocument.spreadsheet */
+} ODTFileType;
+
typedef struct {
TrackerSparqlBuilder *metadata;
ODTTagType current;
const gchar *uri;
gboolean title_already_set;
-} ODTParseInfo;
-
-static void xml_start_element_handler (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error);
-static void xml_end_element_handler (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error);
-static void xml_text_handler (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error);
-static void extract_oasis (const gchar *filename,
- TrackerSparqlBuilder *preupdate,
- TrackerSparqlBuilder *metadata);
+} ODTMetadataParseInfo;
+
+typedef struct { /* Parse state passed as user_data to the content.xml handlers */
+ ODTTagType current; /* tag type chosen by the start-element handler; -1 means "ignore text" */
+ gboolean styles_present; /* TRUE while inside an ODT table-of-content / index / section element */
+ ODTFileType file_type; /* selects the per-format rules applied by the handlers */
+ GString *content; /* text accumulated so far */
+ gulong bytes_pending; /* remaining byte budget; reduced by the text handler as it writes */
+} ODTContentParseInfo;
+
+GQuark maximum_size_error_quark = 0;
+
+static void xml_start_element_handler_metadata (GMarkupParseContext *context,
+ const gchar *element_name,
+ const gchar **attribute_names,
+ const gchar **attribute_values,
+ gpointer user_data,
+ GError **error);
+static void xml_end_element_handler_metadata (GMarkupParseContext *context,
+ const gchar *element_name,
+ gpointer user_data,
+ GError **error);
+static void xml_text_handler_metadata (GMarkupParseContext *context,
+ const gchar *text,
+ gsize text_len,
+ gpointer user_data,
+ GError **error);
+static void xml_start_element_handler_content (GMarkupParseContext *context,
+ const gchar *element_name,
+ const gchar **attribute_names,
+ const gchar **attribute_values,
+ gpointer user_data,
+ GError **error);
+static void xml_end_element_handler_content (GMarkupParseContext *context,
+ const gchar *element_name,
+ gpointer user_data,
+ GError **error);
+static void xml_text_handler_content (GMarkupParseContext *context,
+ const gchar *text,
+ gsize text_len,
+ gpointer user_data,
+ GError **error);
+static void extract_oasis (const gchar *filename,
+ TrackerSparqlBuilder *preupdate,
+ TrackerSparqlBuilder *metadata);
+static void extract_oasis_content (const gchar *uri,
+ gulong total_bytes,
+ ODTFileType file_type,
+ TrackerSparqlBuilder *metadata);
static TrackerExtractData extract_data[] = {
{ "application/vnd.oasis.opendocument.*", extract_oasis },
{ NULL, NULL }
};
-static gchar *
-extract_oasis_content (const gchar *uri,
- gsize n_bytes)
+static void /* Parses content.xml inside the archive at uri and, unless parsing failed, adds the collected text to metadata as nie:plainTextContent */
+extract_oasis_content (const gchar *uri,
+ gulong total_bytes,
+ ODTFileType file_type,
+ TrackerSparqlBuilder *metadata)
{
+ gchar *content = NULL; /* must start NULL: the error branch never assigns it, yet g_free (content) runs on every path */
+ ODTContentParseInfo info;
+ GMarkupParseContext *context;
GError *error = NULL;
- const gchar *argv[4];
- gchar *text = NULL;
- gchar *path;
- gint fd;
-
- /* Newly allocated string with the file path */
- path = g_filename_from_uri (uri, NULL, NULL);
-
- /* Setup command to be executed */
- argv[0] = "odt2txt";
- argv[1] = "--encoding=utf-8";
- argv[2] = path;
- argv[3] = NULL;
-
- g_debug ("Executing command:'%s %s %s' "
- "(max_bytes: %" G_GSIZE_FORMAT ")",
- argv[0], argv[1], argv[2], n_bytes);
-
- /* Fork & spawn */
- if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
- (gchar **)argv,
- NULL,
- G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
- tracker_spawn_child_func,
- GINT_TO_POINTER (10),
- NULL,
- NULL,
- &fd,
- NULL,
- &error)) {
- g_warning ("Spawning failed, could not extract text from '%s': %s",
- path, error ? error->message : NULL);
- g_clear_error (&error);
+ GMarkupParser parser = {
+ xml_start_element_handler_content,
+ xml_end_element_handler_content,
+ xml_text_handler_content,
+ NULL,
+ NULL
+ };
+
+ /* Create parse info */
+ info.current = ODT_TAG_TYPE_UNKNOWN;
+ info.file_type = file_type;
+ info.styles_present = FALSE;
+ info.content = g_string_new ("");
+ info.bytes_pending = total_bytes;
+
+ /* Create parsing context */
+ context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
+ /* Load the internal XML file from the Zip archive, and parse it
+ * using the given context */
+ tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);
+
+ if (!error || g_error_matches (error, maximum_size_error_quark, 0)) { /* the size-limit error raised by the text handler still leaves usable text */
+ content = g_string_free (info.content, FALSE); /* FALSE: keep the character data and take ownership of it */
+ tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+ tracker_sparql_builder_object_unvalidated (metadata, content);
} else {
- /* Read up to n_bytes from FD (also closes FD) */
- text = tracker_read_text_from_fd (fd, n_bytes, FALSE);
+ g_warning ("Got error parsing XML file: %s\n", error->message);
+ g_string_free (info.content, TRUE);
}
- g_free (path);
+ if (error) {
+ g_error_free (error);
+ }
- return text;
+ g_free (content); /* NULL (no-op) unless the success branch took ownership above */
+ g_markup_parse_context_free (context);
}
static void
@@ -124,18 +164,25 @@ extract_oasis (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
- gchar *content;
TrackerConfig *config;
- ODTParseInfo info;
+ ODTMetadataParseInfo info;
+ ODTFileType file_type;
+ GFile *file = NULL;
+ GFileInfo *file_info = NULL;
+ const gchar *mime_used;
GMarkupParseContext *context;
GMarkupParser parser = {
- xml_start_element_handler,
- xml_end_element_handler,
- xml_text_handler,
+ xml_start_element_handler_metadata,
+ xml_end_element_handler_metadata,
+ xml_text_handler_metadata,
NULL,
NULL
};
+ if (G_UNLIKELY (maximum_size_error_quark == 0)) {
+ maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
+ }
+
/* Setup conf */
config = tracker_main_get_config ();
@@ -157,30 +204,62 @@ extract_oasis (const gchar *uri,
/* Load the internal XML file from the Zip archive, and parse it
* using the given context */
- tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context);
+ tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
g_markup_parse_context_free (context);
/* Next, parse contents */
+ file = g_file_new_for_uri (uri);
- /* Extract content with the given limitations */
- content = extract_oasis_content (uri,
- tracker_config_get_max_bytes (config));
- if (content) {
- tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
- tracker_sparql_builder_object_unvalidated (metadata, content);
- g_free (content);
+ if (!file) {
+ g_warning ("Could not create GFile for URI:'%s'",
+ uri);
+ return;
+ }
+
+ file_info = g_file_query_info (file,
+ G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
+ G_FILE_QUERY_INFO_NONE,
+ NULL,
+ NULL);
+ g_object_unref (file);
+
+ if (!file_info) {
+ g_warning ("Could not get GFileInfo for URI:'%s'",
+ uri);
+ return;
+ }
+
+ mime_used = g_file_info_get_content_type (file_info);
+
+ if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
+ file_type = FILE_TYPE_ODT;
+ } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
+ file_type = FILE_TYPE_ODP;
+ } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
+ file_type = FILE_TYPE_ODS;
+ } else {
+ g_message ("Mime type was not recognised:'%s'", mime_used);
+ file_type = FILE_TYPE_INVALID;
}
+
+ g_object_unref (file_info);
+
+ /* Extract content with the given limitations */
+ extract_oasis_content (uri,
+ tracker_config_get_max_bytes (config),
+ file_type,
+ metadata);
}
static void
-xml_start_element_handler (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error)
+xml_start_element_handler_metadata (GMarkupParseContext *context,
+ const gchar *element_name,
+ const gchar **attribute_names,
+ const gchar **attribute_values,
+ gpointer user_data,
+ GError **error)
{
- ODTParseInfo *data = user_data;
+ ODTMetadataParseInfo *data = user_data;
if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
data->current = ODT_TAG_TYPE_TITLE;
@@ -221,22 +300,22 @@ xml_start_element_handler (GMarkupParseContext *context,
}
static void
-xml_end_element_handler (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error)
+xml_end_element_handler_metadata (GMarkupParseContext *context,
+ const gchar *element_name,
+ gpointer user_data,
+ GError **error)
{
- ((ODTParseInfo*) user_data)->current = -1;
+ ((ODTMetadataParseInfo*) user_data)->current = -1;
}
static void
-xml_text_handler (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error)
+xml_text_handler_metadata (GMarkupParseContext *context,
+ const gchar *text,
+ gsize text_len,
+ gpointer user_data,
+ GError **error)
{
- ODTParseInfo *data;
+ ODTMetadataParseInfo *data;
TrackerSparqlBuilder *metadata;
const gchar *uri;
gchar *date;
@@ -315,6 +394,150 @@ xml_text_handler (GMarkupParseContext *context,
}
}
+static void /* Start-element callback for content.xml: sets data->current from data->file_type and the element name/attributes */
+xml_start_element_handler_content (GMarkupParseContext *context,
+ const gchar *element_name,
+ const gchar **attribute_names,
+ const gchar **attribute_values,
+ gpointer user_data,
+ GError **error)
+{
+ ODTContentParseInfo *data = user_data;
+ const gchar **a;
+ const gchar **v;
+
+ switch (data->file_type) {
+ case FILE_TYPE_ODT:
+ if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
+ data->styles_present = TRUE; /* while set, every text:p below is accepted without a style-name check */
+ } else if (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) {
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
+ } else if (g_ascii_strcasecmp (element_name, "text:p") == 0) {
+ if (data->styles_present) {
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
+ break; /* leaves the switch, skipping the attribute scan */
+ }
+
+ for (a = attribute_names, v = attribute_values; *a; ++a, ++v) { /* NOTE(review): if no style matches, data->current keeps its previous value */
+ if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
+ continue;
+ }
+
+ if ((g_ascii_strcasecmp (*v, "title-article") == 0) ||
+ (g_ascii_strcasecmp (*v, "para-padding") == 0) ||
+ (g_ascii_strcasecmp (*v, "para-screen") == 0)) {
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
+ }
+ }
+ } else if (g_ascii_strcasecmp (element_name, "text:h") == 0) {
+ for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
+ if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
+ continue;
+ }
+
+ if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) { /* only the first 7 characters are compared, case-insensitively */
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
+ }
+ }
+ } else if (g_ascii_strcasecmp (element_name, "text:span") == 0) {
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
+ } else if ((g_ascii_strcasecmp (element_name, "text:a") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:s") == 0)) {
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
+ } else {
+ data->current = -1; /* -1 hits the default case of the text handler's switch, so this element's text is not collected */
+ }
+ break;
+
+ case FILE_TYPE_ODP:
+ data->current = ODT_TAG_TYPE_SLIDE_TEXT; /* no element filtering for presentations */
+ break;
+
+ case FILE_TYPE_ODS:
+ if (g_ascii_strncasecmp (element_name, "text", 4) == 0) { /* any element whose name begins with "text" */
+ data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT;
+ } else {
+ data->current = -1;
+ }
+ break;
+
+ case FILE_TYPE_INVALID:
+ g_message ("Open Office Document type: %d invalid", data->file_type); /* logged once per start element; data->current is left as it was */
+ break;
+ }
+}
+
+static void /* End-element callback for content.xml: clears styles_present when an ODT index/section closes, then resets data->current */
+xml_end_element_handler_content (GMarkupParseContext *context,
+ const gchar *element_name,
+ gpointer user_data,
+ GError **error)
+{
+ ODTContentParseInfo *data = user_data;
+
+ switch (data->file_type) {
+ case FILE_TYPE_ODT:
+ if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
+ data->styles_present = FALSE; /* NOTE(review): a plain flag, not a depth counter, so closing a nested one of these clears it while the outer is still open */
+ }
+ break;
+ default:
+ break;
+ }
+
+ if ((g_ascii_strcasecmp (element_name, "text:a") != 0) &&
+ (g_ascii_strcasecmp (element_name, "text:s") != 0)) {
+ data->current = -1; /* closing text:a or text:s leaves data->current untouched; every other end tag resets it */
+ }
+}
+
+static void /* Text callback for content.xml: appends text to data->content when data->current is one of the three *_TEXT tag types, within the byte budget */
+xml_text_handler_content (GMarkupParseContext *context,
+ const gchar *text,
+ gsize text_len,
+ gpointer user_data,
+ GError **error)
+{
+ ODTContentParseInfo *data = user_data;
+ gsize written_bytes = 0;
+
+ switch (data->current) {
+ case ODT_TAG_TYPE_WORD_TEXT:
+ case ODT_TAG_TYPE_SLIDE_TEXT:
+ case ODT_TAG_TYPE_SPREADSHEET_TEXT:
+ if (data->bytes_pending == 0) { /* budget used up: report it through the GError */
+ g_set_error_literal (error,
+ maximum_size_error_quark, 0, /* same domain/code that extract_oasis_content tests with g_error_matches */
+ "Maximum text limit reached");
+ break;
+ }
+
+ /* Look for valid UTF-8 text */
+ if (tracker_text_validate_utf8 (text,
+ MIN (text_len, data->bytes_pending), /* never offer more than the remaining budget */
+ &data->content,
+ &written_bytes)) {
+ if (data->content->str[data->content->len - 1] != ' ') {
+ /* If some bytes found to be valid, append an extra whitespace
+ * as separator */
+ g_string_append_c (data->content, ' ');
+ }
+ }
+
+ data->bytes_pending -= written_bytes; /* written_bytes is 0 unless tracker_text_validate_utf8 stored a count in it (presumably bytes appended, TODO confirm) */
+ break;
+
+ default:
+ break;
+ }
+}
+
TrackerExtractData *
tracker_extract_get_data (void)
{
diff --git a/src/tracker-extract/tracker-gsf.c b/src/tracker-extract/tracker-gsf.c
index 7c607d6..69a143e 100644
--- a/src/tracker-extract/tracker-gsf.c
+++ b/src/tracker-extract/tracker-gsf.c
@@ -77,9 +77,10 @@ find_member (GsfInfile *arch,
* maximum size of the uncompressed XML file is limited to be to 20MBytes.
*/
void
-tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
- const gchar *xml_filename,
- GMarkupParseContext *context)
+tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
+ const gchar *xml_filename,
+ GMarkupParseContext *context,
+ GError **err)
{
gchar *filename;
GError *error = NULL;
@@ -124,7 +125,8 @@ tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
accum = 0;
- while (accum <= XML_MAX_BYTES_READ &&
+ while (!error &&
+ accum <= XML_MAX_BYTES_READ &&
chunk_size > 0 &&
gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
@@ -132,7 +134,7 @@ tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
accum += chunk_size;
/* Pass the read stream to the context parser... */
- g_markup_parse_context_parse (context, buf, chunk_size, NULL);
+ g_markup_parse_context_parse (context, buf, chunk_size, &error);
/* update bytes to be read */
remaining_size -= chunk_size;
@@ -141,8 +143,9 @@ tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
}
g_free (filename);
+
if (error)
- g_error_free (error);
+ g_propagate_error (err, error);
if (infile)
g_object_unref (infile);
if (src)
diff --git a/src/tracker-extract/tracker-gsf.h b/src/tracker-extract/tracker-gsf.h
index 26c34b3..c29230b 100644
--- a/src/tracker-extract/tracker-gsf.h
+++ b/src/tracker-extract/tracker-gsf.h
@@ -25,9 +25,10 @@
G_BEGIN_DECLS
-void tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
- const gchar *xml_filename,
- GMarkupParseContext *context);
+void tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri,
+ const gchar *xml_filename,
+ GMarkupParseContext *context,
+ GError **error);
G_END_DECLS
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Thread Index] [Date Index] [Author Index]