[tracker] tracker-extract: Protect all single valued properties in ooxml extractor



commit ad762d208563c5c646cf663206617969a72d33b5
Author: Carlos Garnacho <carlosg gnome org>
Date:   Mon Mar 14 01:04:39 2016 +0100

    tracker-extract: Protect all single valued properties in ooxml extractor
    
    In case of malformed documents or unexpected input, avoid creating SPARQL
    that would break cardinality constraints; warn nicely instead.

 src/tracker-extract/tracker-extract-msoffice-xml.c |  151 +++++++++++++++-----
 1 files changed, 112 insertions(+), 39 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-msoffice-xml.c b/src/tracker-extract/tracker-extract-msoffice-xml.c
index 71155be..14a8345 100644
--- a/src/tracker-extract/tracker-extract-msoffice-xml.c
+++ b/src/tracker-extract/tracker-extract-msoffice-xml.c
@@ -77,6 +77,17 @@ typedef struct {
 
        /* Metadata-parsing specific things */
        TrackerSparqlBuilder *metadata;
+       guint has_title      : 1;
+       guint has_subject    : 1;
+       guint has_publisher  : 1;
+       guint has_comment    : 1;
+       guint has_generator  : 1;
+       guint has_page_count : 1;
+       guint has_char_count : 1;
+       guint has_word_count : 1;
+       guint has_line_count : 1;
+       guint has_content_created       : 1;
+       guint has_content_last_modified : 1;
        gboolean title_already_set;
        gboolean generator_already_set;
 
@@ -420,54 +431,82 @@ msoffice_xml_metadata_parse (GMarkupParseContext  *context,
                break;
 
        case MS_OFFICE_XML_TAG_TITLE:
-               if (info->title_already_set) {
+               if (info->has_title) {
                        g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
                                   text, info->uri);
                } else {
-                       info->title_already_set = TRUE;
+                       info->has_title = TRUE;
                        tracker_sparql_builder_predicate (info->metadata, "nie:title");
                        tracker_sparql_builder_object_unvalidated (info->metadata, text);
                }
                break;
 
        case MS_OFFICE_XML_TAG_SUBJECT:
-               tracker_sparql_builder_predicate (info->metadata, "nie:subject");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               if (info->has_subject) {
+                       g_warning ("Avoiding additional subject (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_subject = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nie:subject");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               }
                break;
 
        case MS_OFFICE_XML_TAG_AUTHOR:
-               tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
+               if (info->has_publisher) {
+                       g_warning ("Avoiding additional publisher (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_publisher = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
 
-               tracker_sparql_builder_object_blank_open (info->metadata);
-               tracker_sparql_builder_predicate (info->metadata, "a");
-               tracker_sparql_builder_object (info->metadata, "nco:Contact");
+                       tracker_sparql_builder_object_blank_open (info->metadata);
+                       tracker_sparql_builder_predicate (info->metadata, "a");
+                       tracker_sparql_builder_object (info->metadata, "nco:Contact");
 
-               tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
-               tracker_sparql_builder_object_blank_close (info->metadata);
+                       tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+                       tracker_sparql_builder_object_blank_close (info->metadata);
+               }
                break;
 
        case MS_OFFICE_XML_TAG_COMMENTS:
-               tracker_sparql_builder_predicate (info->metadata, "nie:comment");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               if (info->has_comment) {
+                       g_warning ("Avoiding additional comment (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_comment = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nie:comment");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               }
                break;
 
-       case MS_OFFICE_XML_TAG_CREATED: {
-               gchar *date;
-
-               date = tracker_date_guess (text);
-               tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
-               tracker_sparql_builder_object_unvalidated (info->metadata, date);
-               g_free (date);
+       case MS_OFFICE_XML_TAG_CREATED:
+               if (info->has_content_created) {
+                       g_warning ("Avoiding additional creation time (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       gchar *date;
+
+                       date = tracker_date_guess (text);
+                       if (date) {
+                               info->has_content_created = TRUE;
+                               tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
+                               tracker_sparql_builder_object_unvalidated (info->metadata, date);
+                               g_free (date);
+                       } else {
+                               g_warning ("Could not parse creation time (%s) from MsOffice XML document '%s'",
+                                          text, info->uri);
+                       }
+               }
                break;
-       }
 
        case MS_OFFICE_XML_TAG_GENERATOR:
-               if (info->generator_already_set) {
+               if (info->has_generator) {
                        g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
                                   text, info->uri);
                } else {
-                       info->generator_already_set = TRUE;
+                       info->has_generator = TRUE;
                        tracker_sparql_builder_predicate (info->metadata, "nie:generator");
                        tracker_sparql_builder_object_unvalidated (info->metadata, text);
                }
@@ -480,34 +519,68 @@ msoffice_xml_metadata_parse (GMarkupParseContext  *context,
                 */
                break;
 
-       case MS_OFFICE_XML_TAG_MODIFIED: {
-               gchar *date;
-
-                date = tracker_date_guess (text);
-               tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
-               tracker_sparql_builder_object_unvalidated (info->metadata, date);
-                g_free (date);
+       case MS_OFFICE_XML_TAG_MODIFIED:
+               if (info->has_content_last_modified) {
+                       g_warning ("Avoiding additional last modification time (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       gchar *date;
+
+                       date = tracker_date_guess (text);
+                       if (date) {
+                               info->has_content_last_modified = TRUE;
+                               tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
+                               tracker_sparql_builder_object_unvalidated (info->metadata, date);
+                               g_free (date);
+                       } else {
+                               g_warning ("Could not parse last modification time (%s) from MsOffice XML document '%s'",
+                                          text, info->uri);
+                       }
+               }
                break;
-       }
 
        case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
-               tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               if (info->has_page_count) {
+                       g_warning ("Avoiding additional page count (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_page_count = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               }
                break;
 
        case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
-               tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               if (info->has_char_count) {
+                       g_warning ("Avoiding additional character count (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_char_count = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               }
                break;
 
        case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
-               tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               if (info->has_word_count) {
+                       g_warning ("Avoiding additional word count (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_word_count = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               }
                break;
 
        case MS_OFFICE_XML_TAG_NUM_OF_LINES:
-               tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
-               tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               if (info->has_line_count) {
+                       g_warning ("Avoiding additional line count (%s) in MsOffice XML document '%s'",
+                                  text, info->uri);
+               } else {
+                       info->has_line_count = TRUE;
+                       tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
+                       tracker_sparql_builder_object_unvalidated (info->metadata, text);
+               }
                break;
 
        case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]