[tracker/epub-metadata-improvements: 1/5] tracker-extract-epub: Extract more metadata, including author to nco:PersonContact



commit 00a6be90d67057833ec29636dd32a1b7936fa59e
Author: Martin Franco <mfranco gmx com>
Date:   Wed Aug 21 13:28:19 2013 -0500

    tracker-extract-epub: Extract more metadata, including author to nco:PersonContact
    
    Added metadata: language, description, author family/given/other name,
    subject, isbn and uuid.

 src/tracker-extract/tracker-extract-epub.c |  210 +++++++++++++++++++++++++++-
 1 files changed, 203 insertions(+), 7 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-epub.c b/src/tracker-extract/tracker-extract-epub.c
index 3c2f397..4ba3a02 100644
--- a/src/tracker-extract/tracker-extract-epub.c
+++ b/src/tracker-extract/tracker-extract-epub.c
@@ -29,8 +29,20 @@
 typedef enum {
        OPF_TAG_TYPE_UNKNOWN,
        OPF_TAG_TYPE_TITLE,
+       OPF_TAG_TYPE_CREATED,
+
        OPF_TAG_TYPE_AUTHOR,
-       OPF_TAG_TYPE_CREATED
+       OPF_TAG_TYPE_EDITOR,
+       OPF_TAG_TYPE_ILLUSTRATOR,
+       OPF_TAG_TYPE_CONTRIBUTOR,
+
+       OPF_TAG_TYPE_LANGUAGE,
+       OPF_TAG_TYPE_SUBJECT,
+       OPF_TAG_TYPE_DESCRIPTION,
+       OPF_TAG_TYPE_UUID,
+       OPF_TAG_TYPE_ISBN,
+       OPF_TAG_TYPE_PUBLISHER,
+       OPF_TAG_TYPE_RATING  // calibre addition, should it be indexed? how?
 } OPFTagType;
 
 typedef struct {
@@ -40,6 +52,7 @@ typedef struct {
        GList *pages;
        guint in_metadata : 1;
        guint in_manifest : 1;
+       gchar *savedstring;
 } OPFData;
 
 typedef struct {
@@ -86,6 +99,7 @@ opf_xml_start_element_handler (GMarkupParseContext  *context,
 {
        OPFData *data = user_data;
        gint i;
+       gboolean has_role_attr = FALSE;
 
        if (g_strcmp0 (element_name, "metadata") == 0) {
                data->in_metadata = TRUE;
@@ -97,12 +111,30 @@ opf_xml_start_element_handler (GMarkupParseContext  *context,
                        data->element = OPF_TAG_TYPE_TITLE;
                } else if (g_strcmp0 (element_name, "dc:creator") == 0) {
                        for (i = 0; attribute_names[i] != NULL; i++) {
-                               if (g_strcmp0 (attribute_names[i], "opf:role") == 0 &&
-                                   g_strcmp0 (attribute_values[i], "aut") == 0) {
-                                       data->element = OPF_TAG_TYPE_AUTHOR;
-                                       break;
+                               if (g_strcmp0 (attribute_names[i], "opf:file-as") == 0) {
+                                       g_debug ("Found creator file-as tag");
+                                       data->savedstring = g_strdup(attribute_values[i]);
+                               } else if (g_strcmp0 (attribute_names[i], "opf:role") == 0) {
+                                       has_role_attr = TRUE;
+                                       if(g_strcmp0 (attribute_values[i], "aut") == 0) {
+                                               data->element = OPF_TAG_TYPE_AUTHOR;
+                                       } else if(g_strcmp0 (attribute_values[i], "edt") == 0) {
+                                               data->element = OPF_TAG_TYPE_EDITOR;
+                                       } else if(g_strcmp0 (attribute_values[i], "ill") == 0) {
+                                               data->element = OPF_TAG_TYPE_ILLUSTRATOR;
+                                       } else {
+                                               data->element = OPF_TAG_TYPE_UNKNOWN;
+                                               g_debug ("Unknown role, skipping");
+                                               if(data->savedstring) {
+                                                       free(data->savedstring);
+                                                       data->savedstring = NULL;
+                                               }
+                                       }
                                }
                        }
+                       if (!has_role_attr) {
+                               data->element = OPF_TAG_TYPE_AUTHOR;
+                       }
                } else if (g_strcmp0 (element_name, "dc:date") == 0) {
                        for (i = 0; attribute_names[i] != NULL; i++) {
                                if (g_strcmp0 (attribute_names[i], "opf:event") == 0 &&
@@ -111,6 +143,36 @@ opf_xml_start_element_handler (GMarkupParseContext  *context,
                                        break;
                                }
                        }
+               } else if (g_strcmp0 (element_name, "dc:publisher") == 0) {
+                       data->element = OPF_TAG_TYPE_PUBLISHER;
+               } else if (g_strcmp0 (element_name, "dc:description") == 0) {
+                       data->element = OPF_TAG_TYPE_DESCRIPTION;
+               } else if (g_strcmp0 (element_name, "dc:language") == 0) {
+                       data->element = OPF_TAG_TYPE_LANGUAGE;
+               } else if (g_strcmp0 (element_name, "dc:identifier") == 0) {
+                       data->element = OPF_TAG_TYPE_UUID;
+                       for (i = 0; attribute_names[i] != NULL; i++) {
+                               if (g_strcmp0 (attribute_names[i], "opf:scheme") == 0) {
+                                       if (g_ascii_strncasecmp (attribute_values[i], "isbn", 4) == 0) {
+                                               data->element = OPF_TAG_TYPE_ISBN;
+                                       }
+                               }
+                       }
+                       /*
+               } else if (g_strcmp0 (element_name, "meta") == 0) {
+                       for (i = 0; attribute_names[i] != NULL; i++) {
+                               if (g_strcmp0 (attribute_names[i], "name") == 0) {
+                                       if (g_strcmp0 (attribute_values[i], "calibre:rating") == 0) {
+                                               anybool = TRUE;
+                                       }
+                               } else if(anybool && g_strcmp0 (attribute_names[i], "content")) {
+                                       data->element = OPF_TAG_TYPE_RATING;
+                                       data->savedstring = g_strdup(attribute_values[i]);
+                               }
+                       }
+               } else if (g_strcmp0 (element_name, "dc:subject") == 0) {
+                       data->element = OPF_TAG_TYPE_SUBJECT;
+               */
                }
        } else if (data->in_manifest &&
                   g_strcmp0 (element_name, "item") == 0) {
@@ -158,10 +220,11 @@ opf_xml_text_handler (GMarkupParseContext   *context,
                       GError               **error)
 {
        OPFData *data = user_data;
-       gchar *date;
+       gchar *date, *fname, *gname, *oname;
+       int i, j, len;
 
        switch (data->element) {
-       case OPF_TAG_TYPE_AUTHOR:
+       case OPF_TAG_TYPE_PUBLISHER:
                tracker_sparql_builder_predicate (data->metadata, "nco:publisher");
 
                tracker_sparql_builder_object_blank_open (data->metadata);
@@ -172,6 +235,114 @@ opf_xml_text_handler (GMarkupParseContext   *context,
                tracker_sparql_builder_object_unvalidated (data->metadata, text);
                tracker_sparql_builder_object_blank_close (data->metadata);
                break;
+       case OPF_TAG_TYPE_AUTHOR:
+       case OPF_TAG_TYPE_EDITOR:
+       case OPF_TAG_TYPE_ILLUSTRATOR:
+       case OPF_TAG_TYPE_CONTRIBUTOR:
+               fname = NULL;
+               gname = NULL;
+               oname = NULL;
+               // parse name.  may not work for dissimilar cultures.
+               if(data->savedstring != NULL) {
+                       // <family name>, <given name> <other name>
+                       g_debug ("EPUB Parsing opf:file-as attribute: %s", data->savedstring);
+                       len = strlen(data->savedstring);
+                       for (i=0; i < len; i++)
+                               if(data->savedstring[i] == ',') {
+                                       fname = strndup (data->savedstring, i);
+                                       g_debug ("Found family name: %s", fname);
+                                       for(; data->savedstring[i] == ',' || data->savedstring[i] == ' '; i++);
+                                       j = i;
+                                       break;
+                               }
+                       if(i == len) {
+                               g_debug ("Found only one name");
+                               fname = strdup(data->savedstring);
+                       } else {
+                               for(; i <= len; i++) {
+                                       if (i == len || data->savedstring[i] == ' ') {
+                                               gname = strndup (data->savedstring + j, i-j);
+                                               g_debug ("Found given name: %s", gname);
+                                               for(; data->savedstring[i] == ',' || data->savedstring[i] == ' '; i++);
+                                               if (i != len) {
+                                                       oname = strdup (data->savedstring + i);
+                                                       g_debug ("Found other name: %s", oname);
+                                               }
+                                               break;
+                                       }
+                               }
+                       }
+               } else {
+                       // <given name> <other name> <family name>
+                       g_debug ("Parsing name, no opf:file-as found: %s", text);
+                       j = 0;
+                       len = strlen (text);
+                       for (i=0; i<len; i++) {
+                               if (text[i] == ' ') {
+                                       gname = strndup (text, i);
+                                       g_debug ("Found Given Name: %s", gname);
+                                       j = i+1;
+                                       break;
+                               }
+                       }
+                       if (j == 0) {
+                               fname = strdup (data->savedstring);
+                               g_debug ("Found Only One Name: %s", fname);
+                       } else {
+                               for (i=len-1; i>=j-1; i--) {
+                                       if (text[i] == ' ') {
+                                               fname = strdup (text + i+1);
+                                               g_debug ("Found Family Name: %s", fname);
+                                               if (i > j) {
+                                                       oname = strndup (text+j, i-j);
+                                                       g_debug ("Found Other Name: %s", oname);
+                                               }
+                                               break;
+                                       }
+                               }
+                       }
+               }
+
+               tracker_sparql_builder_predicate (data->metadata, "nco:creator");
+
+               tracker_sparql_builder_object_blank_open (data->metadata);
+               tracker_sparql_builder_predicate (data->metadata, "a");
+               tracker_sparql_builder_object (data->metadata, "nco:PersonContact");
+
+               if (fname) {
+                       tracker_sparql_builder_predicate (data->metadata, "nco:nameFamily");
+                       tracker_sparql_builder_object_unvalidated (data->metadata, fname);
+                       free(fname);
+               }
+               if (gname) {
+                       tracker_sparql_builder_predicate (data->metadata, "nco:nameGiven");
+                       tracker_sparql_builder_object_unvalidated (data->metadata, gname);
+                       free(gname);
+               }
+               if (oname) {
+                       tracker_sparql_builder_predicate (data->metadata, "nco:nameOther");
+                       tracker_sparql_builder_object_unvalidated (data->metadata, oname);
+                       free(oname);
+               }
+
+               tracker_sparql_builder_object_blank_open (data->metadata);
+               tracker_sparql_builder_predicate (data->metadata, "a");
+               tracker_sparql_builder_object (data->metadata, "nco:Role");
+               tracker_sparql_builder_predicate (data->metadata, "nco:role");
+
+               if (data->element == OPF_TAG_TYPE_AUTHOR)
+                       tracker_sparql_builder_object_unvalidated (data->metadata, "aut");
+               else if (data->element == OPF_TAG_TYPE_EDITOR)
+                       tracker_sparql_builder_object_unvalidated (data->metadata, "edt");
+               else if (data->element == OPF_TAG_TYPE_EDITOR)
+                       tracker_sparql_builder_object_unvalidated (data->metadata, "ill");
+               else
+                       g_assert("Unknown role");
+
+               tracker_sparql_builder_object_blank_close (data->metadata);
+
+               tracker_sparql_builder_object_blank_close (data->metadata);
+               break;
        case OPF_TAG_TYPE_TITLE:
                tracker_sparql_builder_predicate (data->metadata, "nie:title");
                tracker_sparql_builder_object_unvalidated (data->metadata, text);
@@ -182,10 +353,35 @@ opf_xml_text_handler (GMarkupParseContext   *context,
                tracker_sparql_builder_object_unvalidated (data->metadata, date);
                g_free (date);
                break;
+       case OPF_TAG_TYPE_LANGUAGE:
+               tracker_sparql_builder_predicate (data->metadata, "nie:language");
+               tracker_sparql_builder_object_unvalidated (data->metadata, text);
+               break;
+       case OPF_TAG_TYPE_SUBJECT:
+               tracker_sparql_builder_predicate (data->metadata, "nie:subject");
+               tracker_sparql_builder_object_unvalidated (data->metadata, text);
+               break;
+       case OPF_TAG_TYPE_DESCRIPTION:
+               tracker_sparql_builder_predicate (data->metadata, "nie:description");
+               tracker_sparql_builder_object_unvalidated (data->metadata, text);
+               break;
+       case OPF_TAG_TYPE_UUID:
+               tracker_sparql_builder_predicate (data->metadata, "nie:identifier");
+               tracker_sparql_builder_object_unvalidated (data->metadata, text);
+               break;
+       case OPF_TAG_TYPE_ISBN:
+               tracker_sparql_builder_predicate (data->metadata, "nie:identifier");
+               tracker_sparql_builder_object_unvalidated (data->metadata, text);
+               break;
+//     case OPF_TAG_TYPE_RATING:
        case OPF_TAG_TYPE_UNKNOWN:
        default:
                break;
        }
+       if(data->savedstring) {
+               free(data->savedstring);
+               data->savedstring = NULL;
+       }
 }
 
 /* Methods to extract XHTML text content */


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]