[tracker] tracker-extract: Protect all single valued properties in oasis extractor



commit cd1b468d2b4eda10bdc5bfa53017e0d5302eb6d6
Author: Carlos Garnacho <carlosg gnome org>
Date:   Mon Mar 14 01:06:20 2016 +0100

    tracker-extract: Protect all single valued properties in oasis extractor
    
    In case of malformed documents or unexpected input, avoid creating sparql
    that will break cardinality constraints, warn nicely instead.

 src/tracker-extract/tracker-extract-oasis.c |  105 ++++++++++++++++++++-------
 1 files changed, 78 insertions(+), 27 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index a580b72..5e4bea0 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -58,7 +58,14 @@ typedef struct {
        TrackerSparqlBuilder *metadata;
        ODTTagType current;
        const gchar *uri;
-       gboolean title_already_set;
+       guint has_title           : 1;
+       guint has_subject         : 1;
+       guint has_publisher       : 1;
+       guint has_comment         : 1;
+       guint has_generator       : 1;
+       guint has_word_count      : 1;
+       guint has_page_count      : 1;
+       guint has_content_created : 1;
 } ODTMetadataParseInfo;
 
 typedef struct {
@@ -163,7 +170,7 @@ tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
 {
        TrackerSparqlBuilder *metadata;
        TrackerConfig *config;
-       ODTMetadataParseInfo info;
+       ODTMetadataParseInfo info = { 0 };
        ODTFileType file_type;
        GFile *file;
        gchar *uri;
@@ -201,7 +208,6 @@ tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
        info.metadata = metadata;
        info.current = ODT_TAG_TYPE_UNKNOWN;
        info.uri = uri;
-       info.title_already_set = FALSE;
 
        /* Create parsing context */
        context = g_markup_parse_context_new (&parser, 0, &info, NULL);
@@ -263,11 +269,23 @@ xml_start_element_handler_metadata (GMarkupParseContext  *context,
 
                for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
                        if (g_ascii_strcasecmp (*a, "meta:word-count") == 0) {
-                               tracker_sparql_builder_predicate (metadata, "nfo:wordCount");
-                               tracker_sparql_builder_object_unvalidated (metadata, *v);
+                               if (data->has_word_count) {
+                                       g_warning ("Avoiding additional word count (%s) in OASIS document 
'%s'",
+                                                  *v, data->uri);
+                               } else {
+                                       data->has_word_count = TRUE;
+                                       tracker_sparql_builder_predicate (metadata, "nfo:wordCount");
+                                       tracker_sparql_builder_object_unvalidated (metadata, *v);
+                               }
                        } else if (g_ascii_strcasecmp (*a, "meta:page-count") == 0) {
-                               tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
-                               tracker_sparql_builder_object_unvalidated (metadata, *v);
+                               if (data->has_page_count) {
+                                       g_warning ("Avoiding additional page count (%s) in OASIS document 
'%s'",
+                                                  *v, data->uri);
+                               } else {
+                                       data->has_page_count = TRUE;
+                                       tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
+                                       tracker_sparql_builder_object_unvalidated (metadata, *v);
+                               }
                        }
                }
 
@@ -311,31 +329,43 @@ xml_text_handler_metadata (GMarkupParseContext  *context,
 
        switch (data->current) {
        case ODT_TAG_TYPE_TITLE:
-               if (data->title_already_set) {
+               if (data->has_title) {
                        g_warning ("Avoiding additional title (%s) in OASIS document '%s'",
                                   text, data->uri);
                } else {
-                       data->title_already_set = TRUE;
+                       data->has_title = TRUE;
                        tracker_sparql_builder_predicate (metadata, "nie:title");
                        tracker_sparql_builder_object_unvalidated (metadata, text);
                }
                break;
 
        case ODT_TAG_TYPE_SUBJECT:
-               tracker_sparql_builder_predicate (metadata, "nie:subject");
-               tracker_sparql_builder_object_unvalidated (metadata, text);
+               if (data->has_subject) {
+                       g_warning ("Avoiding additional subject (%s) in OASIS document '%s'",
+                                  text, data->uri);
+               } else {
+                       data->has_subject = TRUE;
+                       tracker_sparql_builder_predicate (metadata, "nie:subject");
+                       tracker_sparql_builder_object_unvalidated (metadata, text);
+               }
                break;
 
        case ODT_TAG_TYPE_AUTHOR:
-               tracker_sparql_builder_predicate (metadata, "nco:publisher");
+               if (data->has_publisher) {
+                       g_warning ("Avoiding additional publisher (%s) in OASIS document '%s'",
+                                  text, data->uri);
+               } else {
+                       data->has_publisher = TRUE;
+                       tracker_sparql_builder_predicate (metadata, "nco:publisher");
 
-               tracker_sparql_builder_object_blank_open (metadata);
-               tracker_sparql_builder_predicate (metadata, "a");
-               tracker_sparql_builder_object (metadata, "nco:Contact");
+                       tracker_sparql_builder_object_blank_open (metadata);
+                       tracker_sparql_builder_predicate (metadata, "a");
+                       tracker_sparql_builder_object (metadata, "nco:Contact");
 
-               tracker_sparql_builder_predicate (metadata, "nco:fullname");
-               tracker_sparql_builder_object_unvalidated (metadata, text);
-               tracker_sparql_builder_object_blank_close (metadata);
+                       tracker_sparql_builder_predicate (metadata, "nco:fullname");
+                       tracker_sparql_builder_object_unvalidated (metadata, text);
+                       tracker_sparql_builder_object_blank_close (metadata);
+               }
                break;
 
        case ODT_TAG_TYPE_KEYWORDS: {
@@ -357,22 +387,43 @@ xml_text_handler_metadata (GMarkupParseContext  *context,
        }
 
        case ODT_TAG_TYPE_COMMENTS:
-               tracker_sparql_builder_predicate (metadata, "nie:comment");
-               tracker_sparql_builder_object_unvalidated (metadata, text);
+               if (data->has_comment) {
+                       g_warning ("Avoiding additional comment (%s) in OASIS document '%s'",
+                                  text, data->uri);
+               } else {
+                       data->has_comment = TRUE;
+                       tracker_sparql_builder_predicate (metadata, "nie:comment");
+                       tracker_sparql_builder_object_unvalidated (metadata, text);
+               }
                break;
 
        case ODT_TAG_TYPE_CREATED:
-               date = tracker_date_guess (text);
-               if (date) {
-                       tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
-                       tracker_sparql_builder_object_unvalidated (metadata, date);
-                       g_free (date);
+               if (data->has_content_created) {
+                       g_warning ("Avoiding additional creation time (%s) in OASIS document '%s'",
+                                  text, data->uri);
+               } else {
+                       date = tracker_date_guess (text);
+                       if (date) {
+                               data->has_content_created = TRUE;
+                               tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
+                               tracker_sparql_builder_object_unvalidated (metadata, date);
+                               g_free (date);
+                       } else {
+                               g_warning ("Could not parse creation time (%s) in OASIS document '%s'",
+                                          text, data->uri);
+                       }
                }
                break;
 
        case ODT_TAG_TYPE_GENERATOR:
-               tracker_sparql_builder_predicate (metadata, "nie:generator");
-               tracker_sparql_builder_object_unvalidated (metadata, text);
+               if (data->has_generator) {
+                       g_warning ("Avoiding additional creation time (%s) in OASIS document '%s'",
+                                  text, data->uri);
+               } else {
+                       data->has_generator = TRUE;
+                       tracker_sparql_builder_predicate (metadata, "nie:generator");
+                       tracker_sparql_builder_object_unvalidated (metadata, text);
+               }
                break;
 
        default:


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]