[tracker] tracker-extract: Split msoffice and msoffice-xml extractors
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] tracker-extract: Split msoffice and msoffice-xml extractors
- Date: Wed, 24 Nov 2010 12:24:30 +0000 (UTC)
commit b98fa7891390ef13f4478319afcf4952b41d7578
Author: Aleksander Morgado <aleksander lanedo com>
Date: Wed Nov 24 11:01:34 2010 +0100
tracker-extract: Split msoffice and msoffice-xml extractors
src/tracker-extract/Makefile.am | 12 +
src/tracker-extract/tracker-extract-msoffice-xml.c | 714 ++++++++++++++++++++
src/tracker-extract/tracker-extract-msoffice.c | 663 ------------------
3 files changed, 726 insertions(+), 663 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index af5adc3..7afe879 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -52,6 +52,7 @@ endif
if HAVE_LIBGSF
modules_LTLIBRARIES += \
libextract-msoffice.la \
+ libextract-msoffice-xml.la \
libextract-oasis.la
endif
@@ -205,6 +206,17 @@ libextract_msoffice_la_LIBADD = \
$(TRACKER_EXTRACT_LIBS) \
$(LIBGSF_LIBS)
+# MS Office XML
+libextract_msoffice_xml_la_SOURCES = tracker-extract-msoffice-xml.c
+libextract_msoffice_xml_la_CFLAGS = $(LIBGSF_CFLAGS)
+libextract_msoffice_xml_la_LDFLAGS = $(module_flags)
+libextract_msoffice_xml_la_LIBADD = \
+ $(top_builddir)/src/libtracker-extract/libtracker-extract-@TRACKER_API_VERSION@.la \
+ $(top_builddir)/src/libtracker-common/libtracker-common.la \
+ $(BUILD_LIBS) \
+ $(TRACKER_EXTRACT_LIBS) \
+ $(LIBGSF_LIBS)
+
# PDF
libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
libextract_pdf_la_CXXFLAGS = $(POPPLER_CFLAGS)
diff --git a/src/tracker-extract/tracker-extract-msoffice-xml.c b/src/tracker-extract/tracker-extract-msoffice-xml.c
new file mode 100644
index 0000000..7e34dd3
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-msoffice-xml.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (C) 2008-2010 Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+
+#include <gsf/gsf.h>
+#include <gsf/gsf-doc-meta-data.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-infile-msole.h>
+#include <gsf/gsf-input-stdio.h>
+#include <gsf/gsf-msole-utils.h>
+#include <gsf/gsf-utils.h>
+#include <gsf/gsf-infile-zip.h>
+
+#include <libtracker-common/tracker-utils.h>
+#include <libtracker-common/tracker-os-dependant.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-main.h"
+#include "tracker-gsf.h"
+
+/*
+ * Identifies what the XML parser is currently looking at.
+ *
+ * Most values name the element whose character data is about to be
+ * delivered to a text callback; they are stored in
+ * MsOfficeXMLParserInfo.tag_type by the start-element handlers and reset
+ * to MS_OFFICE_XML_TAG_INVALID by the end-element handler.
+ *
+ * The last two values are not element tags: they are passed to
+ * xml_read() to select which set of parser callbacks to use.
+ */
+typedef enum {
+ MS_OFFICE_XML_TAG_INVALID,
+ /* Metadata elements, set by xml_start_element_handler_core_data() */
+ MS_OFFICE_XML_TAG_TITLE,
+ MS_OFFICE_XML_TAG_SUBJECT,
+ MS_OFFICE_XML_TAG_AUTHOR,
+ MS_OFFICE_XML_TAG_MODIFIED,
+ MS_OFFICE_XML_TAG_COMMENTS,
+ MS_OFFICE_XML_TAG_CREATED,
+ MS_OFFICE_XML_TAG_GENERATOR,
+ MS_OFFICE_XML_TAG_NUM_OF_PAGES,
+ MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS,
+ MS_OFFICE_XML_TAG_NUM_OF_WORDS,
+ MS_OFFICE_XML_TAG_NUM_OF_LINES,
+ MS_OFFICE_XML_TAG_APPLICATION,
+ MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS,
+ /* Body text elements, set by xml_start_element_handler_text_data() */
+ MS_OFFICE_XML_TAG_SLIDE_TEXT,
+ MS_OFFICE_XML_TAG_WORD_TEXT,
+ MS_OFFICE_XML_TAG_XLS_SHARED_TEXT,
+ /* Sub-document kinds, only used as the 'type' argument of xml_read() */
+ MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA,
+ MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA
+} MsOfficeXMLTagType;
+
+/*
+ * Kind of Office Open XML document being extracted. The value is derived
+ * from the file's MIME type in extract_msoffice_xml(); anything that is
+ * not one of the four registered MIME types maps to FILE_TYPE_INVALID.
+ */
+typedef enum {
+ FILE_TYPE_INVALID,
+ FILE_TYPE_PPTX,
+ FILE_TYPE_PPSX,
+ FILE_TYPE_DOCX,
+ FILE_TYPE_XLSX
+} MsOfficeXMLFileType;
+
+/*
+ * State shared between the GMarkup callbacks in this file; a pointer to
+ * it is passed as the parser's user_data.
+ */
+typedef struct {
+ /* SPARQL builder that receives the extracted predicates/objects */
+ TrackerSparqlBuilder *metadata;
+ /* Document kind, decides which elements are treated as text */
+ MsOfficeXMLFileType file_type;
+ /* Element currently open; reset to INVALID on every end element */
+ MsOfficeXMLTagType tag_type;
+ /* Set when a DOCX style-related element is seen; cleared at the end of each w:p */
+ gboolean style_element_present;
+ /* Set when a w:t element carries xml:space="preserve"; cleared at the end of each w:p */
+ gboolean preserve_attribute_present;
+ /* URI of the document being extracted (not owned by this struct) */
+ const gchar *uri;
+ /* Accumulated plain text; created lazily by the text handler, may be NULL */
+ GString *content;
+ /* TRUE once nie:title has been emitted, so further titles are skipped */
+ gboolean title_already_set;
+ /* TRUE once nie:generator has been emitted, so further generators are skipped */
+ gboolean generator_already_set;
+ /* Remaining number of text bytes that may still be extracted */
+ gulong bytes_pending;
+} MsOfficeXMLParserInfo;
+
+/*
+ * Error domain used by xml_text_handler_document_data() to signal that the
+ * text byte budget is exhausted. It is created on first use in
+ * extract_msoffice_xml().
+ */
+static GQuark maximum_size_error_quark = 0;
+
+/* Forward declaration: the function is referenced by the table below. */
+static void extract_msoffice_xml (const gchar *uri,
+ TrackerSparqlBuilder *preupdate,
+ TrackerSparqlBuilder *metadata);
+
+/*
+ * MIME types handled by this module, each mapped to its extraction
+ * function. The table is terminated by a { NULL, NULL } entry and is
+ * returned to callers by tracker_extract_get_data().
+ */
+static TrackerExtractData data[] = {
+ /* MS Office 2007 (Office Open XML) formats */
+ { "application/vnd.openxmlformats-officedocument.presentationml.presentation", extract_msoffice_xml },
+ { "application/vnd.openxmlformats-officedocument.presentationml.slideshow", extract_msoffice_xml },
+ { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", extract_msoffice_xml },
+ { "application/vnd.openxmlformats-officedocument.wordprocessingml.document", extract_msoffice_xml },
+ { NULL, NULL }
+};
+
+/*
+ * Returns TRUE when any attribute named @wanted (compared
+ * case-insensitively) has a value that starts, case-insensitively, with
+ * one of the NULL-terminated @prefixes.
+ */
+static gboolean
+text_data_attribute_has_prefix (const gchar        **attribute_names,
+                                const gchar        **attribute_values,
+                                const gchar         *wanted,
+                                const gchar * const *prefixes)
+{
+	gint i;
+
+	for (i = 0; attribute_names[i] != NULL; i++) {
+		const gchar * const *prefix;
+
+		if (g_ascii_strcasecmp (attribute_names[i], wanted) != 0) {
+			continue;
+		}
+
+		for (prefix = prefixes; *prefix != NULL; prefix++) {
+			if (g_ascii_strncasecmp (attribute_values[i], *prefix, strlen (*prefix)) == 0) {
+				return TRUE;
+			}
+		}
+	}
+
+	return FALSE;
+}
+
+/*
+ * GMarkup start-element callback used while parsing the text parts of a
+ * document. Depending on the document kind it records in info->tag_type
+ * that the upcoming character data is body text, and for DOCX it also
+ * tracks the style/preserve flags of the current paragraph.
+ *
+ * @user_data is the MsOfficeXMLParserInfo of the running parse; @context
+ * and @error are unused.
+ */
+static void
+xml_start_element_handler_text_data (GMarkupParseContext  *context,
+                                     const gchar          *element_name,
+                                     const gchar         **attribute_names,
+                                     const gchar         **attribute_values,
+                                     gpointer              user_data,
+                                     GError              **error)
+{
+	static const gchar * const paragraph_styles[] = {
+		"Heading", "TOC", "Section", "Title", "Subtitle", NULL
+	};
+	static const gchar * const run_styles[] = {
+		"SubtleEmphasis", "SubtleReference", NULL
+	};
+	static const gchar * const preserve_values[] = {
+		"preserve", NULL
+	};
+	MsOfficeXMLParserInfo *info = user_data;
+	gint idx;
+
+	if (info->file_type == FILE_TYPE_INVALID) {
+		g_message ("Microsoft document type:%d invalid", info->file_type);
+		return;
+	}
+
+	/* Presentations: every element is treated as potential slide text */
+	if (info->file_type == FILE_TYPE_PPTX ||
+	    info->file_type == FILE_TYPE_PPSX) {
+		info->tag_type = MS_OFFICE_XML_TAG_SLIDE_TEXT;
+		return;
+	}
+
+	/* Spreadsheets: named sheets and <t> elements carry text */
+	if (info->file_type == FILE_TYPE_XLSX) {
+		if (g_ascii_strcasecmp (element_name, "sheet") == 0) {
+			for (idx = 0; attribute_names[idx] != NULL; idx++) {
+				if (g_ascii_strcasecmp (attribute_names[idx], "name") == 0) {
+					info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "t") == 0) {
+			info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
+		}
+		return;
+	}
+
+	if (info->file_type != FILE_TYPE_DOCX) {
+		return;
+	}
+
+	/* Word documents */
+	if (g_ascii_strcasecmp (element_name, "w:pStyle") == 0) {
+		if (text_data_attribute_has_prefix (attribute_names, attribute_values,
+		                                    "w:val", paragraph_styles)) {
+			info->style_element_present = TRUE;
+		}
+	} else if (g_ascii_strcasecmp (element_name, "w:rStyle") == 0) {
+		if (text_data_attribute_has_prefix (attribute_names, attribute_values,
+		                                    "w:val", run_styles)) {
+			info->style_element_present = TRUE;
+		}
+	} else if (g_ascii_strcasecmp (element_name, "w:sz") == 0) {
+		/* A font size value of 38 or more counts as a styled element */
+		for (idx = 0; attribute_names[idx] != NULL; idx++) {
+			if (g_ascii_strcasecmp (attribute_names[idx], "w:val") == 0 &&
+			    atoi (attribute_values[idx]) >= 38) {
+				info->style_element_present = TRUE;
+			}
+		}
+	} else if (g_ascii_strcasecmp (element_name, "w:smartTag") == 0 ||
+	           g_ascii_strcasecmp (element_name, "w:sdtContent") == 0 ||
+	           g_ascii_strcasecmp (element_name, "w:hyperlink") == 0) {
+		info->style_element_present = TRUE;
+	} else if (g_ascii_strcasecmp (element_name, "w:t") == 0) {
+		if (text_data_attribute_has_prefix (attribute_names, attribute_values,
+		                                    "xml:space", preserve_values)) {
+			info->preserve_attribute_present = TRUE;
+		}
+
+		info->tag_type = MS_OFFICE_XML_TAG_WORD_TEXT;
+	}
+}
+
+/*
+ * GMarkup end-element callback shared by all parsers in this file.
+ * Closing a "w:p" element clears the per-paragraph style/preserve flags;
+ * closing any element marks the current tag as invalid so that following
+ * character data is ignored until another known element is opened.
+ *
+ * @user_data is the MsOfficeXMLParserInfo of the running parse; @context
+ * and @error are unused.
+ */
+static void
+xml_end_element_handler_document_data (GMarkupParseContext  *context,
+                                       const gchar          *element_name,
+                                       gpointer              user_data,
+                                       GError              **error)
+{
+	MsOfficeXMLParserInfo *state = user_data;
+	gboolean paragraph_closed;
+
+	paragraph_closed = (g_ascii_strcasecmp (element_name, "w:p") == 0);
+
+	if (paragraph_closed) {
+		state->style_element_present = FALSE;
+		state->preserve_attribute_present = FALSE;
+	}
+
+	state->tag_type = MS_OFFICE_XML_TAG_INVALID;
+}
+
+/*
+ * GMarkup start-element callback used while parsing the core/extended
+ * properties parts. It maps the element name (case-insensitively) to the
+ * matching MsOfficeXMLTagType and stores it in info->tag_type; unknown
+ * elements select MS_OFFICE_XML_TAG_INVALID. "cp:lastModifiedBy" is
+ * recognised but deliberately leaves the current tag untouched.
+ *
+ * @user_data is the MsOfficeXMLParserInfo of the running parse; the other
+ * parameters except @element_name are unused.
+ */
+static void
+xml_start_element_handler_core_data (GMarkupParseContext  *context,
+                                     const gchar          *element_name,
+                                     const gchar         **attribute_names,
+                                     const gchar         **attribute_values,
+                                     gpointer              user_data,
+                                     GError              **error)
+{
+	static const struct {
+		const gchar        *element;
+		MsOfficeXMLTagType  tag;
+	} known_elements[] = {
+		{ "dc:title",         MS_OFFICE_XML_TAG_TITLE },
+		{ "dc:subject",       MS_OFFICE_XML_TAG_SUBJECT },
+		{ "dc:creator",       MS_OFFICE_XML_TAG_AUTHOR },
+		{ "dc:description",   MS_OFFICE_XML_TAG_COMMENTS },
+		{ "dcterms:created",  MS_OFFICE_XML_TAG_CREATED },
+		{ "meta:generator",   MS_OFFICE_XML_TAG_GENERATOR },
+		{ "dcterms:modified", MS_OFFICE_XML_TAG_MODIFIED },
+		{ "Pages",            MS_OFFICE_XML_TAG_NUM_OF_PAGES },
+		{ "Slides",           MS_OFFICE_XML_TAG_NUM_OF_PAGES },
+		{ "Paragraphs",       MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS },
+		{ "Characters",       MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS },
+		{ "Words",            MS_OFFICE_XML_TAG_NUM_OF_WORDS },
+		{ "Lines",            MS_OFFICE_XML_TAG_NUM_OF_LINES },
+		{ "Application",      MS_OFFICE_XML_TAG_APPLICATION }
+	};
+	MsOfficeXMLParserInfo *state = user_data;
+	guint n;
+
+	/* Do nothing ? (the tag type is intentionally left as it was) */
+	if (g_ascii_strcasecmp (element_name, "cp:lastModifiedBy") == 0) {
+		return;
+	}
+
+	for (n = 0; n < G_N_ELEMENTS (known_elements); n++) {
+		if (g_ascii_strcasecmp (element_name, known_elements[n].element) == 0) {
+			state->tag_type = known_elements[n].tag;
+			return;
+		}
+	}
+
+	state->tag_type = MS_OFFICE_XML_TAG_INVALID;
+}
+
+/* Appends "@predicate @value" to the metadata builder, @value being a plain literal. */
+static void
+core_data_add_literal (MsOfficeXMLParserInfo *info,
+                       const gchar           *predicate,
+                       const gchar           *value)
+{
+	tracker_sparql_builder_predicate (info->metadata, predicate);
+	tracker_sparql_builder_object_unvalidated (info->metadata, value);
+}
+
+/*
+ * Appends "@predicate <date>" to the metadata builder, where <date> is
+ * whatever tracker_date_guess() makes of @value. Nothing is emitted when
+ * no date could be produced, so that a predicate is never left without
+ * its object.
+ */
+static void
+core_data_add_date (MsOfficeXMLParserInfo *info,
+                    const gchar           *predicate,
+                    const gchar           *value)
+{
+	gchar *date;
+
+	date = tracker_date_guess (value);
+
+	if (date) {
+		core_data_add_literal (info, predicate, date);
+		g_free (date);
+	}
+}
+
+/*
+ * GMarkup text callback used while parsing the core/extended properties
+ * parts. Depending on the element currently open (info->tag_type) the
+ * character data is turned into the matching metadata statement.
+ *
+ * @text:      character data; GMarkup does not guarantee NUL termination
+ * @text_len:  number of valid bytes in @text
+ * @user_data: the MsOfficeXMLParserInfo of the running parse
+ *
+ * @context and @error are unused.
+ */
+static void
+xml_core_handler_document_data (GMarkupParseContext  *context,
+                                const gchar          *text,
+                                gsize                 text_len,
+                                gpointer              user_data,
+                                GError              **error)
+{
+	MsOfficeXMLParserInfo *info = user_data;
+	gchar *value;
+
+	/* Everything below needs a C string, so work on a terminated copy
+	 * limited to the bytes that really belong to this text node. */
+	value = g_strndup (text, text_len);
+
+	switch (info->tag_type) {
+	/* Ignore tags that may not happen inside the core subdocument */
+	case MS_OFFICE_XML_TAG_WORD_TEXT:
+	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
+	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
+		break;
+
+	case MS_OFFICE_XML_TAG_TITLE:
+		if (info->title_already_set) {
+			g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
+			           value, info->uri);
+		} else {
+			info->title_already_set = TRUE;
+			core_data_add_literal (info, "nie:title", value);
+		}
+		break;
+
+	case MS_OFFICE_XML_TAG_SUBJECT:
+		core_data_add_literal (info, "nie:subject", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_AUTHOR:
+		/* The author becomes an anonymous nco:Contact node */
+		tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
+
+		tracker_sparql_builder_object_blank_open (info->metadata);
+		tracker_sparql_builder_predicate (info->metadata, "a");
+		tracker_sparql_builder_object (info->metadata, "nco:Contact");
+
+		core_data_add_literal (info, "nco:fullname", value);
+		tracker_sparql_builder_object_blank_close (info->metadata);
+		break;
+
+	case MS_OFFICE_XML_TAG_COMMENTS:
+		core_data_add_literal (info, "nie:comment", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_CREATED:
+		core_data_add_date (info, "nie:contentCreated", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_GENERATOR:
+		if (info->generator_already_set) {
+			g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
+			           value, info->uri);
+		} else {
+			info->generator_already_set = TRUE;
+			core_data_add_literal (info, "nie:generator", value);
+		}
+		break;
+
+	case MS_OFFICE_XML_TAG_APPLICATION:
+		/* FIXME: Same code as MS_OFFICE_XML_TAG_GENERATOR should be
+		 *        used, but nie:generator has max cardinality of 1
+		 *        and this would cause errors.
+		 */
+		break;
+
+	case MS_OFFICE_XML_TAG_MODIFIED:
+		core_data_add_date (info, "nie:contentLastModified", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
+		core_data_add_literal (info, "nfo:pageCount", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
+		core_data_add_literal (info, "nfo:characterCount", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
+		core_data_add_literal (info, "nfo:wordCount", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
+		core_data_add_literal (info, "nfo:lineCount", value);
+		break;
+
+	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
+		/* TODO: There is no ontology for this. */
+		break;
+
+	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
+	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
+		/* Nothing as we are using it in defining type of data */
+		break;
+
+	case MS_OFFICE_XML_TAG_INVALID:
+		/* Nothing is logged here, otherwise every uninteresting
+		 * element would produce a message. */
+		break;
+	}
+
+	g_free (value);
+}
+
+/*
+ * GMarkup text callback used while parsing the text parts of a document.
+ * Character data belonging to a text element (as flagged in
+ * info->tag_type by the start-element handler) is validated as UTF-8 and
+ * appended, followed by a space, to info->content. The number of bytes
+ * written is deducted from info->bytes_pending.
+ *
+ * Once the byte budget is exhausted, @error is set in the
+ * maximum_size_error_quark domain instead of appending anything.
+ *
+ * @text:      character data; GMarkup does not guarantee NUL termination
+ * @text_len:  number of valid bytes in @text
+ * @user_data: the MsOfficeXMLParserInfo of the running parse
+ */
+static void
+xml_text_handler_document_data (GMarkupParseContext  *context,
+                                const gchar          *text,
+                                gsize                 text_len,
+                                gpointer              user_data,
+                                GError              **error)
+{
+	MsOfficeXMLParserInfo *info = user_data;
+	gsize written_bytes = 0;
+	gboolean is_text = FALSE;
+
+	/* If reached max bytes to extract, just return */
+	if (info->bytes_pending == 0) {
+		g_set_error_literal (error,
+		                     maximum_size_error_quark,
+		                     0,
+		                     "Maximum text limit reached");
+		return;
+	}
+
+	/* Create content string if not already done before */
+	if (G_UNLIKELY (info->content == NULL)) {
+		info->content = g_string_new ("");
+	}
+
+	switch (info->tag_type) {
+	case MS_OFFICE_XML_TAG_WORD_TEXT:
+	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
+		is_text = TRUE;
+		break;
+
+	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT: {
+		gchar *terminated;
+
+		/* Cells whose content parses as a non-zero number are
+		 * skipped. atoi() needs a C string, so test a terminated
+		 * copy rather than reading past the end of @text. */
+		terminated = g_strndup (text, text_len);
+		is_text = (atoi (terminated) == 0);
+		g_free (terminated);
+		break;
+	}
+
+	/* Ignore tags that may not happen inside the text subdocument */
+	case MS_OFFICE_XML_TAG_TITLE:
+	case MS_OFFICE_XML_TAG_SUBJECT:
+	case MS_OFFICE_XML_TAG_AUTHOR:
+	case MS_OFFICE_XML_TAG_COMMENTS:
+	case MS_OFFICE_XML_TAG_CREATED:
+	case MS_OFFICE_XML_TAG_GENERATOR:
+	case MS_OFFICE_XML_TAG_APPLICATION:
+	case MS_OFFICE_XML_TAG_MODIFIED:
+	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
+	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
+	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
+	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
+	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
+	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
+	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
+	case MS_OFFICE_XML_TAG_INVALID:
+		break;
+	}
+
+	if (!is_text) {
+		return;
+	}
+
+	tracker_text_validate_utf8 (text,
+	                            MIN (text_len, info->bytes_pending),
+	                            &info->content,
+	                            &written_bytes);
+	g_string_append_c (info->content, ' ');
+
+	/* Clamp so the unsigned budget can never wrap around */
+	info->bytes_pending -= MIN (written_bytes, info->bytes_pending);
+}
+
+/*
+ * Parses one XML part of the document's zip container.
+ *
+ * @parser_info:  state of the overall extraction; its uri names the zip
+ *                file. Content, the "already set" flags and the remaining
+ *                byte budget are read from it and written back after the
+ *                part has been parsed.
+ * @xml_filename: name of the part inside the zip container
+ * @type:         MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA to parse the part as
+ *                metadata, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA to parse it
+ *                as body text; any other value parses nothing.
+ *
+ * Returns: always TRUE.
+ */
+static gboolean
+xml_read (MsOfficeXMLParserInfo *parser_info,
+          const gchar           *xml_filename,
+          MsOfficeXMLTagType     type)
+{
+	/* The callback tables have static storage so that they stay valid
+	 * for as long as the parse context refers to them. */
+	static const GMarkupParser core_data_parser = {
+		xml_start_element_handler_core_data,
+		xml_end_element_handler_document_data,
+		xml_core_handler_document_data,
+		NULL,
+		NULL
+	};
+	static const GMarkupParser text_data_parser = {
+		xml_start_element_handler_text_data,
+		xml_end_element_handler_document_data,
+		xml_text_handler_document_data,
+		NULL,
+		NULL
+	};
+	const GMarkupParser *parser;
+	GMarkupParseContext *context;
+	MsOfficeXMLParserInfo info;
+
+	switch (type) {
+	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
+		parser = &core_data_parser;
+		break;
+
+	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
+		parser = &text_data_parser;
+		break;
+
+	default:
+		/* Not a sub-document type: nothing to parse */
+		return TRUE;
+	}
+
+	/* The nested parse runs on its own copy of the state so that the
+	 * per-element fields of the outer parse are not disturbed. The copy
+	 * inherits everything accumulated so far (content, flags, remaining
+	 * byte budget) and starts with clean per-element fields. */
+	info = *parser_info;
+	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
+	info.style_element_present = FALSE;
+	info.preserve_attribute_present = FALSE;
+
+	context = g_markup_parse_context_new (parser, 0, &info, NULL);
+
+	if (context) {
+		/* Load the internal XML file from the Zip archive, and parse it
+		 * using the given context */
+		tracker_gsf_parse_xml_in_zip (parser_info->uri,
+		                              xml_filename,
+		                              context,
+		                              NULL);
+		g_markup_parse_context_free (context);
+	}
+
+	/* Hand the results back; the content string may have been created
+	 * during this parse and would otherwise be lost. */
+	parser_info->content = info.content;
+	parser_info->title_already_set = info.title_already_set;
+	parser_info->generator_already_set = info.generator_already_set;
+	parser_info->bytes_pending = info.bytes_pending;
+
+	return TRUE;
+}
+
+/*
+ * GMarkup start-element callback used while parsing "[Content_Types].xml".
+ * For every <Override> element it looks at the PartName/ContentType
+ * attributes and, when the content type is one this extractor cares
+ * about for the current document kind, parses that part with xml_read().
+ *
+ * Core and extended properties parts are parsed as metadata for every
+ * document kind; the main document, slides/diagram data or
+ * sheets/shared strings are parsed as text depending on info->file_type.
+ *
+ * @user_data is the MsOfficeXMLParserInfo of the running parse; @context
+ * and @error are unused.
+ */
+static void
+xml_start_element_handler_content_types (GMarkupParseContext  *context,
+                                         const gchar          *element_name,
+                                         const gchar         **attribute_names,
+                                         const gchar         **attribute_values,
+                                         gpointer              user_data,
+                                         GError              **error)
+{
+	MsOfficeXMLParserInfo *info;
+	const gchar *part_name;
+	const gchar *content_type;
+	const gchar *zip_member;
+	gint i;
+
+	info = user_data;
+
+	if (g_ascii_strcasecmp (element_name, "Override") != 0) {
+		info->tag_type = MS_OFFICE_XML_TAG_INVALID;
+		return;
+	}
+
+	part_name = NULL;
+	content_type = NULL;
+
+	for (i = 0; attribute_names[i]; i++) {
+		if (g_ascii_strcasecmp (attribute_names[i], "PartName") == 0) {
+			part_name = attribute_values[i];
+		} else if (g_ascii_strcasecmp (attribute_names[i], "ContentType") == 0) {
+			content_type = attribute_values[i];
+		}
+	}
+
+	/* Both part_name and content_type MUST be NON-NULL */
+	if (!part_name || !content_type) {
+		g_message ("Invalid file (part_name:%s, content_type:%s)",
+		           part_name ? part_name : "none",
+		           content_type ? content_type : "none");
+		return;
+	}
+
+	/* Part names are written with a leading '/', which the name of the
+	 * member inside the zip does not have. Only strip the character
+	 * when it really is a slash, and refuse empty names, so that a
+	 * malformed file can neither lose a letter nor make us read past
+	 * the end of the string. */
+	zip_member = (part_name[0] == '/') ? part_name + 1 : part_name;
+
+	if (zip_member[0] == '\0') {
+		g_message ("Invalid file (part_name:%s, content_type:%s)",
+		           part_name,
+		           content_type);
+		return;
+	}
+
+	if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
+	    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
+		xml_read (info, zip_member, MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA);
+		return;
+	}
+
+	switch (info->file_type) {
+	case FILE_TYPE_DOCX:
+		if (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) {
+			xml_read (info, zip_member, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
+		}
+		break;
+
+	case FILE_TYPE_PPTX:
+	case FILE_TYPE_PPSX:
+		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0) ||
+		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) {
+			xml_read (info, zip_member, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
+		}
+		break;
+
+	case FILE_TYPE_XLSX:
+		if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0) ||
+		    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0)) {
+			xml_read (info, zip_member, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
+		}
+		break;
+
+	case FILE_TYPE_INVALID:
+		g_message ("Invalid file type:'%d'", info->file_type);
+		break;
+	}
+}
+
+/*
+ * Extractor entry point for Office Open XML documents (see the data[]
+ * table for the registered MIME types).
+ *
+ * The document kind is derived from the file's content type, then
+ * "[Content_Types].xml" inside the zip container is parsed; its handler
+ * in turn parses the metadata and text parts. Metadata statements are
+ * appended to @metadata while parsing, and any collected text is added
+ * at the end as nie:plainTextContent.
+ *
+ * @uri:       URI of the document
+ * @preupdate: unused by this extractor
+ * @metadata:  SPARQL builder receiving the extracted statements
+ */
+static void
+extract_msoffice_xml (const gchar          *uri,
+                      TrackerSparqlBuilder *preupdate,
+                      TrackerSparqlBuilder *metadata)
+{
+	MsOfficeXMLParserInfo info;
+	MsOfficeXMLFileType file_type;
+	TrackerConfig *config;
+	GFile *file;
+	GFileInfo *file_info;
+	GMarkupParseContext *context = NULL;
+	GError *error = NULL;
+	GMarkupParser parser = {
+		xml_start_element_handler_content_types,
+		xml_end_element_handler_document_data,
+		NULL,
+		NULL,
+		NULL
+	};
+	const gchar *mime_used;
+
+	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
+		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
+	}
+
+	file = g_file_new_for_uri (uri);
+
+	if (!file) {
+		g_warning ("Could not create GFile for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	file_info = g_file_query_info (file,
+	                               G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
+	                               G_FILE_QUERY_INFO_NONE,
+	                               NULL,
+	                               NULL);
+	g_object_unref (file);
+
+	if (!file_info) {
+		g_warning ("Could not get GFileInfo for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	mime_used = g_file_info_get_content_type (file_info);
+
+	if (!mime_used) {
+		/* No content type attribute available: nothing to compare */
+		g_message ("Mime type was not recognised:'%s'", "none");
+		file_type = FILE_TYPE_INVALID;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") == 0) {
+		file_type = FILE_TYPE_DOCX;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.presentation") == 0) {
+		file_type = FILE_TYPE_PPTX;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.slideshow") == 0) {
+		file_type = FILE_TYPE_PPSX;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") == 0) {
+		file_type = FILE_TYPE_XLSX;
+	} else {
+		g_message ("Mime type was not recognised:'%s'", mime_used);
+		file_type = FILE_TYPE_INVALID;
+	}
+
+	/* mime_used points into file_info, so it must not be used after this */
+	g_object_unref (file_info);
+
+	/* Setup conf */
+	config = tracker_main_get_config ();
+
+	g_debug ("Extracting MsOffice XML format...");
+
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+	/* Every field is set explicitly: the callbacks read all of them */
+	info.metadata = metadata;
+	info.file_type = file_type;
+	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
+	info.style_element_present = FALSE;
+	info.preserve_attribute_present = FALSE;
+	info.uri = uri;
+	info.content = NULL;
+	info.title_already_set = FALSE;
+	info.generator_already_set = FALSE;
+	info.bytes_pending = tracker_config_get_max_bytes (config);
+
+	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
+
+	/* Load the internal XML file from the Zip archive, and parse it
+	 * using the given context */
+	tracker_gsf_parse_xml_in_zip (uri,
+	                              "[Content_Types].xml",
+	                              context,
+	                              &error);
+
+	if (error) {
+		/* Whatever was extracted before the failure is still used
+		 * below; the error itself only needs reporting and freeing. */
+		g_debug ("Parsing of MsOffice XML document '%s' stopped: %s",
+		         uri,
+		         error->message ? error->message : "no error given");
+		g_clear_error (&error);
+	}
+
+	/* If we got any content, add it */
+	if (info.content) {
+		gchar *content;
+
+		content = g_string_free (info.content, FALSE);
+		info.content = NULL;
+
+		if (content) {
+			tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+			tracker_sparql_builder_object_unvalidated (metadata, content);
+			g_free (content);
+		}
+	}
+
+	g_markup_parse_context_free (context);
+}
+
+/*
+ * Exported accessor for this module's MIME-type/extractor table.
+ *
+ * Returns: the static data[] array, terminated by a { NULL, NULL } entry.
+ * The array is owned by this file and must not be freed.
+ * NOTE(review): presumably looked up by name by the tracker-extract module
+ * loader - confirm against the loader code.
+ */
+TrackerExtractData *
+tracker_extract_get_data (void)
+{
+ return data;
+}
diff --git a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
index 04caa24..72a89d2 100644
--- a/src/tracker-extract/tracker-extract-msoffice.c
+++ b/src/tracker-extract/tracker-extract-msoffice.c
@@ -126,74 +126,20 @@ typedef struct {
gsize length;
} ExcelExtendedStringRecord;
-typedef enum {
- MS_OFFICE_XML_TAG_INVALID,
- MS_OFFICE_XML_TAG_TITLE,
- MS_OFFICE_XML_TAG_SUBJECT,
- MS_OFFICE_XML_TAG_AUTHOR,
- MS_OFFICE_XML_TAG_MODIFIED,
- MS_OFFICE_XML_TAG_COMMENTS,
- MS_OFFICE_XML_TAG_CREATED,
- MS_OFFICE_XML_TAG_GENERATOR,
- MS_OFFICE_XML_TAG_NUM_OF_PAGES,
- MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS,
- MS_OFFICE_XML_TAG_NUM_OF_WORDS,
- MS_OFFICE_XML_TAG_NUM_OF_LINES,
- MS_OFFICE_XML_TAG_APPLICATION,
- MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS,
- MS_OFFICE_XML_TAG_SLIDE_TEXT,
- MS_OFFICE_XML_TAG_WORD_TEXT,
- MS_OFFICE_XML_TAG_XLS_SHARED_TEXT,
- MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA,
- MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA
-} MsOfficeXMLTagType;
-
-typedef enum {
- FILE_TYPE_INVALID,
- FILE_TYPE_PPTX,
- FILE_TYPE_PPSX,
- FILE_TYPE_DOCX,
- FILE_TYPE_XLSX
-} MsOfficeXMLFileType;
-
-typedef struct {
- TrackerSparqlBuilder *metadata;
- MsOfficeXMLFileType file_type;
- MsOfficeXMLTagType tag_type;
- gboolean style_element_present;
- gboolean preserve_attribute_present;
- const gchar *uri;
- GString *content;
- gboolean title_already_set;
- gboolean generator_already_set;
- gulong bytes_pending;
-} MsOfficeXMLParserInfo;
-
typedef struct {
TrackerSparqlBuilder *metadata;
const gchar *uri;
} MetadataInfo;
-static GQuark maximum_size_error_quark = 0;
-
static void extract_msoffice (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata);
-static void extract_msoffice_xml (const gchar *uri,
- TrackerSparqlBuilder *preupdate,
- TrackerSparqlBuilder *metadata);
static TrackerExtractData data[] = {
{ "application/msword", extract_msoffice },
- /* Powerpoint files */
{ "application/vnd.ms-powerpoint", extract_msoffice },
{ "application/vnd.ms-excel", extract_msoffice },
{ "application/vnd.ms-*", extract_msoffice },
- /* MSoffice2007*/
- { "application/vnd.openxmlformats-officedocument.presentationml.presentation", extract_msoffice_xml },
- { "application/vnd.openxmlformats-officedocument.presentationml.slideshow", extract_msoffice_xml },
- { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", extract_msoffice_xml },
- { "application/vnd.openxmlformats-officedocument.wordprocessingml.document", extract_msoffice_xml },
{ NULL, NULL }
};
@@ -1736,615 +1682,6 @@ extract_msoffice (const gchar *uri,
gsf_shutdown ();
}
-static void
-xml_start_element_handler_text_data (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error)
-{
- MsOfficeXMLParserInfo *info = user_data;
- const gchar **a;
- const gchar **v;
-
- switch (info->file_type) {
- case FILE_TYPE_DOCX:
- if (g_ascii_strcasecmp (element_name, "w:pStyle") == 0) {
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "w:val") != 0) {
- continue;
- }
-
- if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strncasecmp (*v, "TOC", 3) == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strncasecmp (*v, "Section", 7) == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strncasecmp (*v, "Title", 5) == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strncasecmp (*v, "Subtitle", 8) == 0) {
- info->style_element_present = TRUE;
- }
- }
- } else if (g_ascii_strcasecmp (element_name, "w:rStyle") == 0) {
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "w:val") != 0) {
- continue;
- }
-
- if (g_ascii_strncasecmp (*v, "SubtleEmphasis", 14) == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strncasecmp (*v, "SubtleReference", 15) == 0) {
- info->style_element_present = TRUE;
- }
- }
- } else if (g_ascii_strcasecmp (element_name, "w:sz") == 0) {
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "w:val") != 0) {
- continue;
- }
-
- if (atoi (*v) >= 38) {
- info->style_element_present = TRUE;
- }
- }
- } else if (g_ascii_strcasecmp (element_name, "w:smartTag") == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strcasecmp (element_name, "w:sdtContent") == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strcasecmp (element_name, "w:hyperlink") == 0) {
- info->style_element_present = TRUE;
- } else if (g_ascii_strcasecmp (element_name, "w:t") == 0) {
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "xml:space") != 0) {
- continue;
- }
-
- if (g_ascii_strncasecmp (*v, "preserve", 8) == 0) {
- info->preserve_attribute_present = TRUE;
- }
- }
-
- info->tag_type = MS_OFFICE_XML_TAG_WORD_TEXT;
- }
- break;
-
- case FILE_TYPE_XLSX:
- if (g_ascii_strcasecmp (element_name, "sheet") == 0) {
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "name") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
- }
- }
-
- } else if (g_ascii_strcasecmp (element_name, "t") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
- }
- break;
-
- case FILE_TYPE_PPTX:
- case FILE_TYPE_PPSX:
- info->tag_type = MS_OFFICE_XML_TAG_SLIDE_TEXT;
- break;
-
- case FILE_TYPE_INVALID:
- g_message ("Microsoft document type:%d invalid", info->file_type);
- break;
- }
-}
-
-static void
-xml_end_element_handler_document_data (GMarkupParseContext *context,
- const gchar *element_name,
- gpointer user_data,
- GError **error)
-{
- MsOfficeXMLParserInfo *info = user_data;
-
- if (g_ascii_strcasecmp (element_name, "w:p") == 0) {
- info->style_element_present = FALSE;
- info->preserve_attribute_present = FALSE;
- }
-
- ((MsOfficeXMLParserInfo*) user_data)->tag_type = MS_OFFICE_XML_TAG_INVALID;
-}
-
-static void
-xml_start_element_handler_core_data (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error)
-{
- MsOfficeXMLParserInfo *info = user_data;
-
- if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_TITLE;
- } else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_SUBJECT;
- } else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_AUTHOR;
- } else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_COMMENTS;
- } else if (g_ascii_strcasecmp (element_name, "dcterms:created") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_CREATED;
- } else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_GENERATOR;
- } else if (g_ascii_strcasecmp (element_name, "dcterms:modified") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_MODIFIED;
- } else if (g_ascii_strcasecmp (element_name, "cp:lastModifiedBy") == 0) {
- /* Do nothing ? */
- } else if (g_ascii_strcasecmp (element_name, "Pages") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
- } else if (g_ascii_strcasecmp (element_name, "Slides") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
- } else if (g_ascii_strcasecmp (element_name, "Paragraphs") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS;
- } else if (g_ascii_strcasecmp (element_name, "Characters") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS;
- } else if (g_ascii_strcasecmp (element_name, "Words") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_WORDS;
- } else if (g_ascii_strcasecmp (element_name, "Lines") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_LINES;
- } else if (g_ascii_strcasecmp (element_name, "Application") == 0) {
- info->tag_type = MS_OFFICE_XML_TAG_APPLICATION;
- } else {
- info->tag_type = MS_OFFICE_XML_TAG_INVALID;
- }
-}
-
-static void
-xml_core_handler_document_data (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error)
-{
- MsOfficeXMLParserInfo *info = user_data;
-
- switch (info->tag_type) {
- /* Ignore tags that may not happen inside the core subdocument */
- case MS_OFFICE_XML_TAG_WORD_TEXT:
- case MS_OFFICE_XML_TAG_SLIDE_TEXT:
- case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
- break;
-
- case MS_OFFICE_XML_TAG_TITLE:
- if (info->title_already_set) {
- g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
- text, info->uri);
- } else {
- info->title_already_set = TRUE;
- tracker_sparql_builder_predicate (info->metadata, "nie:title");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- }
- break;
-
- case MS_OFFICE_XML_TAG_SUBJECT:
- tracker_sparql_builder_predicate (info->metadata, "nie:subject");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- break;
-
- case MS_OFFICE_XML_TAG_AUTHOR:
- tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
-
- tracker_sparql_builder_object_blank_open (info->metadata);
- tracker_sparql_builder_predicate (info->metadata, "a");
- tracker_sparql_builder_object (info->metadata, "nco:Contact");
-
- tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- tracker_sparql_builder_object_blank_close (info->metadata);
- break;
-
- case MS_OFFICE_XML_TAG_COMMENTS:
- tracker_sparql_builder_predicate (info->metadata, "nie:comment");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- break;
-
- case MS_OFFICE_XML_TAG_CREATED: {
- gchar *date;
-
- date = tracker_date_guess (text);
- tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
- tracker_sparql_builder_object_unvalidated (info->metadata, date);
- g_free (date);
- break;
- }
-
- case MS_OFFICE_XML_TAG_GENERATOR:
- if (info->generator_already_set) {
- g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
- text, info->uri);
- } else {
- info->generator_already_set = TRUE;
- tracker_sparql_builder_predicate (info->metadata, "nie:generator");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- }
- break;
-
- case MS_OFFICE_XML_TAG_APPLICATION:
- /* FIXME: Same code as MS_OFFICE_XML_TAG_GENERATOR should be
- * used, but nie:generator has max cardinality of 1
- * and this would cause errors.
- */
- break;
-
- case MS_OFFICE_XML_TAG_MODIFIED: {
- gchar *date;
-
- date = tracker_date_guess (text);
- tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
- tracker_sparql_builder_object_unvalidated (info->metadata, date);
- g_free (date);
- break;
- }
-
- case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
- tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- break;
-
- case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
- tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- break;
-
- case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
- tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- break;
-
- case MS_OFFICE_XML_TAG_NUM_OF_LINES:
- tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
- tracker_sparql_builder_object_unvalidated (info->metadata, text);
- break;
-
- case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
- /* TODO: There is no ontology for this. */
- break;
-
- case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
- case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
- /* Nothing as we are using it in defining type of data */
- break;
-
- case MS_OFFICE_XML_TAG_INVALID:
- /* Here we cant use log otheriwse it will print for other non useful files */
- break;
- }
-}
-
-static void
-xml_text_handler_document_data (GMarkupParseContext *context,
- const gchar *text,
- gsize text_len,
- gpointer user_data,
- GError **error)
-{
- MsOfficeXMLParserInfo *info = user_data;
- gsize written_bytes = 0;
-
- /* If reached max bytes to extract, just return */
- if (info->bytes_pending == 0) {
- g_set_error_literal (error,
- maximum_size_error_quark,
- 0,
- "Maximum text limit reached");
- return;
- }
-
- /* Create content string if not already done before */
- if (G_UNLIKELY (info->content == NULL)) {
- info->content = g_string_new ("");
- }
-
- switch (info->tag_type) {
- case MS_OFFICE_XML_TAG_WORD_TEXT:
- tracker_text_validate_utf8 (text,
- MIN (text_len, info->bytes_pending),
- &info->content,
- &written_bytes);
- g_string_append_c (info->content, ' ');
- info->bytes_pending -= written_bytes;
- break;
-
- case MS_OFFICE_XML_TAG_SLIDE_TEXT:
- tracker_text_validate_utf8 (text,
- MIN (text_len, info->bytes_pending),
- &info->content,
- &written_bytes);
- g_string_append_c (info->content, ' ');
- info->bytes_pending -= written_bytes;
- break;
-
- case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
- if (atoi (text) == 0) {
- tracker_text_validate_utf8 (text,
- MIN (text_len, info->bytes_pending),
- &info->content,
- &written_bytes);
- g_string_append_c (info->content, ' ');
- info->bytes_pending -= written_bytes;
- }
- break;
-
- /* Ignore tags that may not happen inside the text subdocument */
- case MS_OFFICE_XML_TAG_TITLE:
- case MS_OFFICE_XML_TAG_SUBJECT:
- case MS_OFFICE_XML_TAG_AUTHOR:
- case MS_OFFICE_XML_TAG_COMMENTS:
- case MS_OFFICE_XML_TAG_CREATED:
- case MS_OFFICE_XML_TAG_GENERATOR:
- case MS_OFFICE_XML_TAG_APPLICATION:
- case MS_OFFICE_XML_TAG_MODIFIED:
- case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
- case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
- case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
- case MS_OFFICE_XML_TAG_NUM_OF_LINES:
- case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
- case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
- case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
- case MS_OFFICE_XML_TAG_INVALID:
- break;
- }
-}
-
-static gboolean
-xml_read (MsOfficeXMLParserInfo *parser_info,
- const gchar *xml_filename,
- MsOfficeXMLTagType type)
-{
- GMarkupParseContext *context;
- MsOfficeXMLParserInfo info;
- TrackerConfig *config;
-
- /* Setup conf */
- config = tracker_main_get_config ();
-
- /* FIXME: Can we use the original info here? */
- info.metadata = parser_info->metadata;
- info.file_type = parser_info->file_type;
- info.tag_type = MS_OFFICE_XML_TAG_INVALID;
- info.style_element_present = FALSE;
- info.preserve_attribute_present = FALSE;
- info.uri = parser_info->uri;
- info.content = parser_info->content;
- info.title_already_set = parser_info->title_already_set;
- info.bytes_pending = tracker_config_get_max_bytes (config);
- switch (type) {
- case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA: {
- GMarkupParser parser = {
- xml_start_element_handler_core_data,
- xml_end_element_handler_document_data,
- xml_core_handler_document_data,
- NULL,
- NULL
- };
-
- context = g_markup_parse_context_new (&parser,
- 0,
- &info,
- NULL);
- break;
- }
-
- case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA: {
- GMarkupParser parser = {
- xml_start_element_handler_text_data,
- xml_end_element_handler_document_data,
- xml_text_handler_document_data,
- NULL,
- NULL
- };
-
- context = g_markup_parse_context_new (&parser,
- 0,
- &info,
- NULL);
- break;
- }
-
- default:
- context = NULL;
- break;
- }
-
- if (context) {
- /* Load the internal XML file from the Zip archive, and parse it
- * using the given context */
- tracker_gsf_parse_xml_in_zip (parser_info->uri,
- xml_filename,
- context, NULL);
- g_markup_parse_context_free (context);
- }
-
- return TRUE;
-}
-
-static void
-xml_start_element_handler_content_types (GMarkupParseContext *context,
- const gchar *element_name,
- const gchar **attribute_names,
- const gchar **attribute_values,
- gpointer user_data,
- GError **error)
-{
- MsOfficeXMLParserInfo *info;
- const gchar *part_name;
- const gchar *content_type;
- gint i;
-
- info = user_data;
-
- if (g_ascii_strcasecmp (element_name, "Override") != 0) {
- info->tag_type = MS_OFFICE_XML_TAG_INVALID;
- return;
- }
-
- part_name = NULL;
- content_type = NULL;
-
- for (i = 0; attribute_names[i]; i++) {
- if (g_ascii_strcasecmp (attribute_names[i], "PartName") == 0) {
- part_name = attribute_values[i];
- } else if (g_ascii_strcasecmp (attribute_names[i], "ContentType") == 0) {
- content_type = attribute_values[i];
- }
- }
-
- /* Both part_name and content_type MUST be NON-NULL */
- if (!part_name || !content_type) {
- g_message ("Invalid file (part_name:%s, content_type:%s)",
- part_name ? part_name : "none",
- content_type ? content_type : "none");
- return;
- }
-
- if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
- (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
- xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA);
- return;
- }
-
- switch (info->file_type) {
- case FILE_TYPE_DOCX:
- if (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) {
- xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
- }
- break;
-
- case FILE_TYPE_PPTX:
- case FILE_TYPE_PPSX:
- if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0) ||
- (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) {
- xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
- }
- break;
-
- case FILE_TYPE_XLSX:
- if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0) ||
- (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0)) {
- xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
- }
- break;
-
- case FILE_TYPE_INVALID:
- g_message ("Invalid file type:'%d'", info->file_type);
- break;
- }
-}
-
-static void
-extract_msoffice_xml (const gchar *uri,
- TrackerSparqlBuilder *preupdate,
- TrackerSparqlBuilder *metadata)
-{
- MsOfficeXMLParserInfo info;
- MsOfficeXMLFileType file_type;
- TrackerConfig *config;
- GFile *file;
- GFileInfo *file_info;
- GMarkupParseContext *context = NULL;
- GError *error = NULL;
- gulong total_bytes;
- GMarkupParser parser = {
- xml_start_element_handler_content_types,
- xml_end_element_handler_document_data,
- NULL,
- NULL,
- NULL
- };
- const gchar *mime_used;
-
- if (G_UNLIKELY (maximum_size_error_quark == 0)) {
- maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
- }
-
- file = g_file_new_for_uri (uri);
-
- if (!file) {
- g_warning ("Could not create GFile for URI:'%s'",
- uri);
- return;
- }
-
- file_info = g_file_query_info (file,
- G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
- G_FILE_QUERY_INFO_NONE,
- NULL,
- NULL);
- g_object_unref (file);
-
- if (!file_info) {
- g_warning ("Could not get GFileInfo for URI:'%s'",
- uri);
- return;
- }
-
- mime_used = g_file_info_get_content_type (file_info);
-
- if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") == 0) {
- file_type = FILE_TYPE_DOCX;
- } else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.presentation") == 0) {
- file_type = FILE_TYPE_PPTX;
- } else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.slideshow") == 0) {
- file_type = FILE_TYPE_PPSX;
- } else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") == 0) {
- file_type = FILE_TYPE_XLSX;
- } else {
- g_message ("Mime type was not recognised:'%s'", mime_used);
- file_type = FILE_TYPE_INVALID;
- }
-
- g_object_unref (file_info);
-
- /* Setup conf */
- config = tracker_main_get_config ();
-
- g_debug ("Extracting MsOffice XML format...");
-
- tracker_sparql_builder_predicate (metadata, "a");
- tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
- total_bytes = tracker_config_get_max_bytes (config);
- info.metadata = metadata;
- info.file_type = file_type;
- info.tag_type = MS_OFFICE_XML_TAG_INVALID;
- info.style_element_present = FALSE;
- info.preserve_attribute_present = FALSE;
- info.uri = uri;
- info.content = NULL;
- info.title_already_set = FALSE;
- info.bytes_pending = total_bytes;
- context = g_markup_parse_context_new (&parser, 0, &info, NULL);
-
- /* Load the internal XML file from the Zip archive, and parse it
- * using the given context */
- tracker_gsf_parse_xml_in_zip (uri,
- "[Content_Types].xml",
- context,
- &error);
-
- /* If we got any content, add it */
- if (info.content) {
- gchar *content;
-
- content = g_string_free (info.content, FALSE);
- info.content = NULL;
-
- if (content) {
- tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
- tracker_sparql_builder_object_unvalidated (metadata, content);
- g_free (content);
- }
- }
-
- g_markup_parse_context_free (context);
-}
-
TrackerExtractData *
tracker_extract_get_data (void)
{
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index] [Date Index] [Author Index]