[tracker/epub-metadata-improvements: 1/5] tracker-extract-epub: Extract more metadata, including author to nco:PersonContact
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/epub-metadata-improvements: 1/5] tracker-extract-epub: Extract more metadata, including author to nco:PersonContact
- Date: Wed, 11 Sep 2013 16:30:00 +0000 (UTC)
commit 00a6be90d67057833ec29636dd32a1b7936fa59e
Author: Martin Franco <mfranco gmx com>
Date: Wed Aug 21 13:28:19 2013 -0500
tracker-extract-epub: Extract more metadata, including author to nco:PersonContact
Added metadata: language, description, author family/given/other name,
subject, isbn and uuid.
src/tracker-extract/tracker-extract-epub.c | 210 +++++++++++++++++++++++++++-
1 files changed, 203 insertions(+), 7 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-epub.c b/src/tracker-extract/tracker-extract-epub.c
index 3c2f397..4ba3a02 100644
--- a/src/tracker-extract/tracker-extract-epub.c
+++ b/src/tracker-extract/tracker-extract-epub.c
@@ -29,8 +29,20 @@
typedef enum {
OPF_TAG_TYPE_UNKNOWN,
OPF_TAG_TYPE_TITLE,
+ OPF_TAG_TYPE_CREATED,
+
OPF_TAG_TYPE_AUTHOR,
- OPF_TAG_TYPE_CREATED
+ OPF_TAG_TYPE_EDITOR,
+ OPF_TAG_TYPE_ILLUSTRATOR,
+ OPF_TAG_TYPE_CONTRIBUTOR,
+
+ OPF_TAG_TYPE_LANGUAGE,
+ OPF_TAG_TYPE_SUBJECT,
+ OPF_TAG_TYPE_DESCRIPTION,
+ OPF_TAG_TYPE_UUID,
+ OPF_TAG_TYPE_ISBN,
+ OPF_TAG_TYPE_PUBLISHER,
+ OPF_TAG_TYPE_RATING // calibre addition, should it be indexed? how?
} OPFTagType;
typedef struct {
@@ -40,6 +52,7 @@ typedef struct {
GList *pages;
guint in_metadata : 1;
guint in_manifest : 1;
+ gchar *savedstring;
} OPFData;
typedef struct {
@@ -86,6 +99,7 @@ opf_xml_start_element_handler (GMarkupParseContext *context,
{
OPFData *data = user_data;
gint i;
+ gboolean has_role_attr = FALSE;
if (g_strcmp0 (element_name, "metadata") == 0) {
data->in_metadata = TRUE;
@@ -97,12 +111,30 @@ opf_xml_start_element_handler (GMarkupParseContext *context,
data->element = OPF_TAG_TYPE_TITLE;
} else if (g_strcmp0 (element_name, "dc:creator") == 0) {
for (i = 0; attribute_names[i] != NULL; i++) {
- if (g_strcmp0 (attribute_names[i], "opf:role") == 0 &&
- g_strcmp0 (attribute_values[i], "aut") == 0) {
- data->element = OPF_TAG_TYPE_AUTHOR;
- break;
+ if (g_strcmp0 (attribute_names[i], "opf:file-as") == 0) {
+ g_debug ("Found creator file-as tag");
+ data->savedstring = g_strdup(attribute_values[i]);
+ } else if (g_strcmp0 (attribute_names[i], "opf:role") == 0) {
+ has_role_attr = TRUE;
+ if(g_strcmp0 (attribute_values[i], "aut") == 0) {
+ data->element = OPF_TAG_TYPE_AUTHOR;
+ } else if(g_strcmp0 (attribute_values[i], "edt") == 0) {
+ data->element = OPF_TAG_TYPE_EDITOR;
+ } else if(g_strcmp0 (attribute_values[i], "ill") == 0) {
+ data->element = OPF_TAG_TYPE_ILLUSTRATOR;
+ } else {
+ data->element = OPF_TAG_TYPE_UNKNOWN;
+ g_debug ("Unknown role, skipping");
+ if(data->savedstring) {
+ free(data->savedstring);
+ data->savedstring = NULL;
+ }
+ }
}
}
+ if (!has_role_attr) {
+ data->element = OPF_TAG_TYPE_AUTHOR;
+ }
} else if (g_strcmp0 (element_name, "dc:date") == 0) {
for (i = 0; attribute_names[i] != NULL; i++) {
if (g_strcmp0 (attribute_names[i], "opf:event") == 0 &&
@@ -111,6 +143,36 @@ opf_xml_start_element_handler (GMarkupParseContext *context,
break;
}
}
+ } else if (g_strcmp0 (element_name, "dc:publisher") == 0) {
+ data->element = OPF_TAG_TYPE_PUBLISHER;
+ } else if (g_strcmp0 (element_name, "dc:description") == 0) {
+ data->element = OPF_TAG_TYPE_DESCRIPTION;
+ } else if (g_strcmp0 (element_name, "dc:language") == 0) {
+ data->element = OPF_TAG_TYPE_LANGUAGE;
+ } else if (g_strcmp0 (element_name, "dc:identifier") == 0) {
+ data->element = OPF_TAG_TYPE_UUID;
+ for (i = 0; attribute_names[i] != NULL; i++) {
+ if (g_strcmp0 (attribute_names[i], "opf:scheme") == 0) {
+ if (g_ascii_strncasecmp (attribute_values[i], "isbn", 4) == 0) {
+ data->element = OPF_TAG_TYPE_ISBN;
+ }
+ }
+ }
+ /*
+ } else if (g_strcmp0 (element_name, "meta") == 0) {
+ for (i = 0; attribute_names[i] != NULL; i++) {
+ if (g_strcmp0 (attribute_names[i], "name") == 0) {
+ if (g_strcmp0 (attribute_values[i], "calibre:rating") == 0) {
+ anybool = TRUE;
+ }
+ } else if(anybool && g_strcmp0 (attribute_names[i], "content")) {
+ data->element = OPF_TAG_TYPE_RATING;
+ data->savedstring = g_strdup(attribute_values[i]);
+ }
+ }
+ } else if (g_strcmp0 (element_name, "dc:subject") == 0) {
+ data->element = OPF_TAG_TYPE_SUBJECT;
+ */
}
} else if (data->in_manifest &&
g_strcmp0 (element_name, "item") == 0) {
@@ -158,10 +220,11 @@ opf_xml_text_handler (GMarkupParseContext *context,
GError **error)
{
OPFData *data = user_data;
- gchar *date;
+ gchar *date, *fname, *gname, *oname;
+ int i, j, len;
switch (data->element) {
- case OPF_TAG_TYPE_AUTHOR:
+ case OPF_TAG_TYPE_PUBLISHER:
tracker_sparql_builder_predicate (data->metadata, "nco:publisher");
tracker_sparql_builder_object_blank_open (data->metadata);
@@ -172,6 +235,114 @@ opf_xml_text_handler (GMarkupParseContext *context,
tracker_sparql_builder_object_unvalidated (data->metadata, text);
tracker_sparql_builder_object_blank_close (data->metadata);
break;
+ case OPF_TAG_TYPE_AUTHOR:
+ case OPF_TAG_TYPE_EDITOR:
+ case OPF_TAG_TYPE_ILLUSTRATOR:
+ case OPF_TAG_TYPE_CONTRIBUTOR:
+ fname = NULL;
+ gname = NULL;
+ oname = NULL;
+ // parse name. may not work for dissimilar cultures.
+ if(data->savedstring != NULL) {
+ // <family name>, <given name> <other name>
+ g_debug ("EPUB Parsing opf:file-as attribute: %s", data->savedstring);
+ len = strlen(data->savedstring);
+ for (i=0; i < len; i++)
+ if(data->savedstring[i] == ',') {
+ fname = strndup (data->savedstring, i);
+ g_debug ("Found family name: %s", fname);
+ for(; data->savedstring[i] == ',' || data->savedstring[i] == ' '; i++);
+ j = i;
+ break;
+ }
+ if(i == len) {
+ g_debug ("Found only one name");
+ fname = strdup(data->savedstring);
+ } else {
+ for(; i <= len; i++) {
+ if (i == len || data->savedstring[i] == ' ') {
+ gname = strndup (data->savedstring + j, i-j);
+ g_debug ("Found given name: %s", gname);
+ for(; data->savedstring[i] == ',' || data->savedstring[i] == ' '; i++);
+ if (i != len) {
+ oname = strdup (data->savedstring + i);
+ g_debug ("Found other name: %s", oname);
+ }
+ break;
+ }
+ }
+ }
+ } else {
+ // <given name> <other name> <family name>
+ g_debug ("Parsing name, no opf:file-as found: %s", text);
+ j = 0;
+ len = strlen (text);
+ for (i=0; i<len; i++) {
+ if (text[i] == ' ') {
+ gname = strndup (text, i);
+ g_debug ("Found Given Name: %s", gname);
+ j = i+1;
+ break;
+ }
+ }
+ if (j == 0) {
+ fname = strdup (data->savedstring);
+ g_debug ("Found Only One Name: %s", fname);
+ } else {
+ for (i=len-1; i>=j-1; i--) {
+ if (text[i] == ' ') {
+ fname = strdup (text + i+1);
+ g_debug ("Found Family Name: %s", fname);
+ if (i > j) {
+ oname = strndup (text+j, i-j);
+ g_debug ("Found Other Name: %s", oname);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ tracker_sparql_builder_predicate (data->metadata, "nco:creator");
+
+ tracker_sparql_builder_object_blank_open (data->metadata);
+ tracker_sparql_builder_predicate (data->metadata, "a");
+ tracker_sparql_builder_object (data->metadata, "nco:PersonContact");
+
+ if (fname) {
+ tracker_sparql_builder_predicate (data->metadata, "nco:nameFamily");
+ tracker_sparql_builder_object_unvalidated (data->metadata, fname);
+ free(fname);
+ }
+ if (gname) {
+ tracker_sparql_builder_predicate (data->metadata, "nco:nameGiven");
+ tracker_sparql_builder_object_unvalidated (data->metadata, gname);
+ free(gname);
+ }
+ if (oname) {
+ tracker_sparql_builder_predicate (data->metadata, "nco:nameOther");
+ tracker_sparql_builder_object_unvalidated (data->metadata, oname);
+ free(oname);
+ }
+
+ tracker_sparql_builder_object_blank_open (data->metadata);
+ tracker_sparql_builder_predicate (data->metadata, "a");
+ tracker_sparql_builder_object (data->metadata, "nco:Role");
+ tracker_sparql_builder_predicate (data->metadata, "nco:role");
+
+ if (data->element == OPF_TAG_TYPE_AUTHOR)
+ tracker_sparql_builder_object_unvalidated (data->metadata, "aut");
+ else if (data->element == OPF_TAG_TYPE_EDITOR)
+ tracker_sparql_builder_object_unvalidated (data->metadata, "edt");
+ else if (data->element == OPF_TAG_TYPE_EDITOR)
+ tracker_sparql_builder_object_unvalidated (data->metadata, "ill");
+ else
+ g_assert("Unknown role");
+
+ tracker_sparql_builder_object_blank_close (data->metadata);
+
+ tracker_sparql_builder_object_blank_close (data->metadata);
+ break;
case OPF_TAG_TYPE_TITLE:
tracker_sparql_builder_predicate (data->metadata, "nie:title");
tracker_sparql_builder_object_unvalidated (data->metadata, text);
@@ -182,10 +353,35 @@ opf_xml_text_handler (GMarkupParseContext *context,
tracker_sparql_builder_object_unvalidated (data->metadata, date);
g_free (date);
break;
+ case OPF_TAG_TYPE_LANGUAGE:
+ tracker_sparql_builder_predicate (data->metadata, "nie:language");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ break;
+ case OPF_TAG_TYPE_SUBJECT:
+ tracker_sparql_builder_predicate (data->metadata, "nie:subject");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ break;
+ case OPF_TAG_TYPE_DESCRIPTION:
+ tracker_sparql_builder_predicate (data->metadata, "nie:description");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ break;
+ case OPF_TAG_TYPE_UUID:
+ tracker_sparql_builder_predicate (data->metadata, "nie:identifier");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ break;
+ case OPF_TAG_TYPE_ISBN:
+ tracker_sparql_builder_predicate (data->metadata, "nie:identifier");
+ tracker_sparql_builder_object_unvalidated (data->metadata, text);
+ break;
+// case OPF_TAG_TYPE_RATING:
case OPF_TAG_TYPE_UNKNOWN:
default:
break;
}
+ if(data->savedstring) {
+ free(data->savedstring);
+ data->savedstring = NULL;
+ }
}
/* Methods to extract XHTML text content */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]