[evince] pdf:Add support for PDF/X and additional xmp metadata
- From: Germán Poo-Caamaño <gpoo src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [evince] pdf:Add support for PDF/X and additional xmp metadata
- Date: Sun, 8 Jul 2018 03:17:42 +0000 (UTC)
commit f109d4f5c2f18bd021f69cc27c378c73ae12821f
Author: Evangelos Rigas <erigas rnd2 org>
Date: Fri Jun 15 19:03:16 2018 +0100
pdf:Add support for PDF/X and additional xmp metadata
Recognise pdf as PDF/X and extract the following metadata
from the embedded XMP:
* Author,
* Creator,
* Producer,
* Keywords,
* Subject,
* Title, and
* dates
Fixes #93
backend/pdf/ev-poppler.cc | 403 +++++++++++++++++++++++++++++++++++++---------
1 file changed, 328 insertions(+), 75 deletions(-)
---
diff --git a/backend/pdf/ev-poppler.cc b/backend/pdf/ev-poppler.cc
index 4c6768ed..90300c5d 100644
--- a/backend/pdf/ev-poppler.cc
+++ b/backend/pdf/ev-poppler.cc
@@ -1,6 +1,7 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; c-indent-level: 8 -*- */
/* this file is part of evince, a gnome document viewer
*
+ * Copyright (C) 2018, Evangelos Rigas <erigas rnd2 org>
* Copyright (C) 2009, Juanjo Marín <juanj marin juntadeandalucia es>
* Copyright (C) 2004, Red Hat, Inc.
*
@@ -73,6 +74,20 @@
#define LICENSE_WEB_STATEMENT "/x:xmpmeta/rdf:RDF/rdf:Description/xmpRights:WebStatement"
/* license field from Creative Commons schema, http://creativecommons.org/ns */
#define LICENSE_URI "/x:xmpmeta/rdf:RDF/rdf:Description/cc:license/@rdf:resource"
+/* fields for authors and keywords */
+#define AUTHORS "/x:xmpmeta/rdf:RDF/rdf:Description/dc:creator/rdf:Seq/rdf:li"
+#define KEYWORDS "/x:xmpmeta/rdf:RDF/rdf:Description/dc:subject/rdf:Bag/rdf:li"
+/* fields for title and subject */
+#define TITLE "/x:xmpmeta/rdf:RDF/rdf:Description/dc:title/rdf:Alt/rdf:li[lang('%s')]"
+#define SUBJECT "/x:xmpmeta/rdf:RDF/rdf:Description/dc:description/rdf:Alt/rdf:li[lang('%s')]"
+/* fields for creation and modification dates */
+#define MOD_DATE "/x:xmpmeta/rdf:RDF/rdf:Description/xmp:ModifyDate"
+#define CREATE_DATE "/x:xmpmeta/rdf:RDF/rdf:Description/xmp:CreateDate"
+#define META_DATE "/x:xmpmeta/rdf:RDF/rdf:Description/xmp:MetadataDate"
+/* fields for pdf creator tool and producer */
+#define CREATOR "/x:xmpmeta/rdf:RDF/rdf:Description/xmp:CreatorTool"
+#define PRODUCER "/x:xmpmeta/rdf:RDF/rdf:Description/pdf:Producer"
+
typedef struct {
EvFileExporterFormat format;
@@ -555,61 +570,66 @@ pdf_document_get_thumbnail_surface (EvDocument *document,
return surface;
}
+static xmlChar *
+pdf_document_get_format_from_path (xmlXPathContextPtr xpathCtx,
+ const char* xpath)
+{
+ xmlXPathObjectPtr xpathObj;
+ xmlChar *result = NULL;
+
+ /* add pdf/a and pdf/x namespaces */
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "x", BAD_CAST "adobe:ns:meta/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "rdf", BAD_CAST "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "pdfaid", BAD_CAST "http://www.aiim.org/pdfa/ns/id/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "pdfxid", BAD_CAST "http://www.npes.org/pdfx/ns/id/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "pdfx", BAD_CAST "http://ns.adobe.com/pdfx/1.3/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "pdf", BAD_CAST "http://ns.adobe.com/pdf/1.3/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "xmp", BAD_CAST "http://ns.adobe.com/xap/1.0/");
+
+ xpathObj = xmlXPathEvalExpression (BAD_CAST xpath, xpathCtx);
+ if (xpathObj == NULL)
+ return NULL;
+
+ if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
+ result = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+
+ xmlXPathFreeObject (xpathObj);
+ return result;
+}
+
/* reference:
http://www.pdfa.org/lib/exe/fetch.php?id=pdfa%3Aen%3Atechdoc&cache=cache&media=pdfa:techdoc:tn0001_pdfa-1_and_namespaces_2008-03-18.pdf
*/
static char *
-pdf_document_get_format_from_metadata (xmlDocPtr doc,
- xmlXPathContextPtr xpathCtx)
+pdf_document_get_format_from_metadata (xmlXPathContextPtr xpathCtx)
{
- xmlXPathObjectPtr xpathObj;
xmlChar *part = NULL;
xmlChar *conf = NULL;
+ xmlChar *pdfxid = NULL;
char *result = NULL;
int i;
- /* add pdf/a namespaces */
- xmlXPathRegisterNs (xpathCtx, BAD_CAST "x", BAD_CAST "adobe:ns:meta/");
- xmlXPathRegisterNs (xpathCtx, BAD_CAST "rdf", BAD_CAST "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
- xmlXPathRegisterNs (xpathCtx, BAD_CAST "pdfaid", BAD_CAST "http://www.aiim.org/pdfa/ns/id/");
-
/* reads pdf/a part */
/* first syntax: child node */
- xpathObj = xmlXPathEvalExpression (BAD_CAST "/x:xmpmeta/rdf:RDF/rdf:Description/pdfaid:part",
xpathCtx);
- if (xpathObj != NULL) {
- if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
- part = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
-
- xmlXPathFreeObject (xpathObj);
- }
+ part = pdf_document_get_format_from_path (xpathCtx, "/x:xmpmeta/rdf:RDF/rdf:Description/pdfaid:part");
if (part == NULL) {
/* second syntax: attribute */
- xpathObj = xmlXPathEvalExpression (BAD_CAST
"/x:xmpmeta/rdf:RDF/rdf:Description/@pdfaid:part", xpathCtx);
- if (xpathObj != NULL) {
- if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
- part = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
-
- xmlXPathFreeObject (xpathObj);
- }
+ part = pdf_document_get_format_from_path (xpathCtx,
"/x:xmpmeta/rdf:RDF/rdf:Description/@pdfaid:part");
}
/* reads pdf/a conformance */
/* first syntax: child node */
- xpathObj = xmlXPathEvalExpression (BAD_CAST "/x:xmpmeta/rdf:RDF/rdf:Description/pdfaid:conformance",
xpathCtx);
- if (xpathObj != NULL) {
- if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
- conf = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
-
- xmlXPathFreeObject (xpathObj);
- }
+ conf = pdf_document_get_format_from_path (xpathCtx,
"/x:xmpmeta/rdf:RDF/rdf:Description/pdfaid:conformance");
if (conf == NULL) {
/* second syntax: attribute */
- xpathObj = xmlXPathEvalExpression (BAD_CAST
"/x:xmpmeta/rdf:RDF/rdf:Description/@pdfaid:conformance", xpathCtx);
- if (xpathObj != NULL) {
- if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0)
- conf = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+ conf = pdf_document_get_format_from_path (xpathCtx,
"/x:xmpmeta/rdf:RDF/rdf:Description/@pdfaid:conformance");
+ }
- xmlXPathFreeObject (xpathObj);
- }
+ /* reads pdf/x id */
+ /* first syntax: pdfxid */
+ pdfxid = pdf_document_get_format_from_path (xpathCtx,
"/x:xmpmeta/rdf:RDF/rdf:Description/pdfxid:GTS_PDFXVersion");
+ if (pdfxid == NULL) {
+ /* second syntax: pdfx */
+ pdfxid = pdf_document_get_format_from_path (xpathCtx,
"/x:xmpmeta/rdf:RDF/rdf:Description/pdfx:GTS_PDFXVersion");
}
if (part != NULL && conf != NULL) {
@@ -620,17 +640,232 @@ pdf_document_get_format_from_metadata (xmlDocPtr doc,
/* return buffer */
result = g_strdup_printf ("PDF/A - %s%s", part, conf);
}
+ else if (pdfxid != NULL) {
+ result = g_strdup_printf ("%s", pdfxid);
+ }
/* Cleanup */
xmlFree (part);
xmlFree (conf);
+ xmlFree (pdfxid);
+ return result;
+}
+
+static char *
+pdf_document_get_lists_from_dc_tags (xmlXPathContextPtr xpathCtx,
+ const char* xpath)
+{
+ xmlXPathObjectPtr xpathObj;
+ int i;
+ char* elements = NULL;
+ char* tmp_elements = NULL;
+ char* result = NULL;
+ xmlChar* content;
+
+ /* add pdf/a namespaces */
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "x", BAD_CAST "adobe:ns:meta/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "rdf", BAD_CAST "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "dc", BAD_CAST "http://purl.org/dc/elements/1.1/");
+
+ /* reads pdf/a sequence*/
+ xpathObj = xmlXPathEvalExpression (BAD_CAST xpath, xpathCtx);
+ if (xpathObj == NULL)
+ return NULL;
+
+ if (xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr != 0) {
+ for (i = 0; i < xpathObj->nodesetval->nodeNr; i++) {
+ content = xmlNodeGetContent (xpathObj->nodesetval->nodeTab[i]);
+ if (i) {
+ tmp_elements = g_strdup (elements);
+ g_free (elements);
+ elements = g_strdup_printf ("%s, %s", tmp_elements, content);
+ g_free (tmp_elements);
+ } else {
+ elements = g_strdup_printf ("%s", content);
+ }
+ xmlFree(content);
+ }
+ }
+ xmlXPathFreeObject (xpathObj);
+
+
+ if (elements != NULL) {
+ /* return buffer */
+ result = g_strdup (elements);
+ }
+
+ /* Cleanup */
+ g_free (elements);
return result;
}
+static char *
+pdf_document_get_author_from_metadata (xmlXPathContextPtr xpathCtx)
+{
+ return pdf_document_get_lists_from_dc_tags (xpathCtx, AUTHORS);
+}
+
+static char *
+pdf_document_get_keywords_from_metadata (xmlXPathContextPtr xpathCtx)
+{
+ return pdf_document_get_lists_from_dc_tags (xpathCtx, KEYWORDS);
+}
+
+static char *
+pdf_document_get_localized_object_from_metadata (xmlXPathContextPtr xpathCtx,
+ const char* xpath)
+{
+ xmlXPathObjectPtr xpathObj;
+ xmlChar *marked = NULL;
+ const char *language_string;
+ char *aux;
+ gchar **tags;
+ gchar *tag, *tag_aux;
+ int i, j;
+ char *loc_object= NULL;
+
+ /* register namespaces */
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "x", BAD_CAST "adobe:ns:meta/");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "rdf", BAD_CAST "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "dc", BAD_CAST "http://purl.org/dc/elements/1.1/");
+ /* XMP Rights Management Schema */
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "xmpRights", BAD_CAST "http://ns.adobe.com/xap/1.0/rights/");
+ /* Creative Commons Schema */
+ xmlXPathRegisterNs (xpathCtx, BAD_CAST "cc", BAD_CAST "http://creativecommons.org/ns#");
+
+ /* 1) checking for a suitable localized string */
+ language_string = pango_language_to_string (gtk_get_default_language ());
+ tags = g_strsplit (language_string, "-", -1);
+ i = g_strv_length (tags);
+ while (i-- && !loc_object) {
+ tag = g_strdup (tags[0]);
+ for (j = 1; j <= i; j++) {
+ tag_aux = g_strdup_printf ("%s-%s", tag, tags[j]);
+ g_free (tag);
+ tag = tag_aux;
+ }
+ aux = g_strdup_printf (xpath, tag);
+ xpathObj = xmlXPathEvalExpression (BAD_CAST aux, xpathCtx);
+ if (xpathObj != NULL) {
+ if (xpathObj->nodesetval != NULL &&
+ xpathObj->nodesetval->nodeNr != 0)
+ loc_object = (gchar *)xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+ xmlXPathFreeObject (xpathObj);
+ }
+ g_free (tag);
+ g_free (aux);
+ }
+ g_strfreev (tags);
+
+ /* 2) if not, use the default string */
+ if (!loc_object) {
+ aux = g_strdup_printf (xpath, "x-default");
+ xpathObj = xmlXPathEvalExpression (BAD_CAST aux, xpathCtx);
+ if (xpathObj != NULL) {
+ if (xpathObj->nodesetval != NULL &&
+ xpathObj->nodesetval->nodeNr != 0)
+ loc_object = (gchar *)xmlNodeGetContent (xpathObj->nodesetval->nodeTab[0]);
+ xmlXPathFreeObject (xpathObj);
+ }
+ g_free (aux);
+ }
+ return loc_object;
+}
+
+static char *
+pdf_document_get_title_from_metadata (xmlXPathContextPtr xpathCtx)
+{
+ return pdf_document_get_localized_object_from_metadata (xpathCtx, TITLE);
+}
+
+static char *
+pdf_document_get_subject_from_metadata (xmlXPathContextPtr xpathCtx)
+{
+ return pdf_document_get_localized_object_from_metadata (xpathCtx, SUBJECT);
+}
+
+static void
+pdf_document_get_dates_from_metadata (GTime *result, xmlXPathContextPtr xpathCtx)
+{
+ xmlChar *modifydate = NULL;
+ xmlChar *createdate = NULL;
+ xmlChar *metadate = NULL;
+ char *datestr;
+ GTimeVal tmp_time;
+ int i;
+
+ /* reads modify date */
+ modifydate = pdf_document_get_format_from_path (xpathCtx, MOD_DATE);
+ /* reads pdf create date */
+ createdate = pdf_document_get_format_from_path (xpathCtx, CREATE_DATE);
+ /* reads pdf metadata date */
+ metadate = pdf_document_get_format_from_path (xpathCtx, META_DATE);
+
+ if (modifydate != NULL) {
+ /* return buffer */
+ datestr = g_strdup_printf ("%s", modifydate);
+ g_time_val_from_iso8601 (datestr, &tmp_time);
+ result[0] = (GTime)tmp_time.tv_sec;
+ g_free (datestr);
+ }
+
+ if (createdate != NULL) {
+ datestr = g_strdup_printf ("%s", createdate);
+ g_time_val_from_iso8601 (datestr, &tmp_time);
+ result[1] = (GTime)tmp_time.tv_sec;
+ g_free (datestr);
+ }
+
+ if (createdate != NULL) {
+ datestr = g_strdup_printf ("%s", metadate);
+ g_time_val_from_iso8601 (datestr, &tmp_time);
+ result[1] = (GTime)tmp_time.tv_sec;
+ g_free (datestr);
+ }
+
+ /* Cleanup */
+ xmlFree (modifydate);
+ xmlFree (createdate);
+ xmlFree (metadate);
+}
+
+static char *
+pdf_document_get_creatortool_from_metadata (xmlXPathContextPtr xpathCtx)
+{
+ xmlChar *creatortool = NULL;
+ char *result = NULL;
+
+ /* reads CreatorTool */
+ creatortool = pdf_document_get_format_from_path (xpathCtx, CREATOR);
+ if (creatortool != NULL) {
+ result = g_strdup_printf ("%s", creatortool);
+ }
+
+ /* Cleanup */
+ xmlFree (creatortool);
+ return result;
+}
+
+static char *
+pdf_document_get_producer_from_metadata (xmlXPathContextPtr xpathCtx)
+{
+ xmlChar *producer = NULL;
+ char *result = NULL;
+
+ /* reads Producer */
+ producer = pdf_document_get_format_from_path (xpathCtx, PRODUCER);
+ if (producer != NULL) {
+ result = g_strdup_printf ("%s", producer);
+ }
+
+ /* Cleanup */
+ xmlFree (producer);
+ return result;
+}
+
static EvDocumentLicense *
-pdf_document_get_license_from_metadata (xmlDocPtr doc,
- xmlXPathContextPtr xpathCtx)
+pdf_document_get_license_from_metadata (xmlXPathContextPtr xpathCtx)
{
xmlXPathObjectPtr xpathObj;
xmlChar *marked = NULL;
@@ -678,41 +913,7 @@ pdf_document_get_license_from_metadata (xmlDocPtr doc,
* Schema. This field is recomended to be checked by Creative
* Commons */
/* 1) checking for a suitable localized string */
- language_string = pango_language_to_string (gtk_get_default_language ());
- tags = g_strsplit (language_string, "-", -1);
- i = g_strv_length (tags);
- while (i-- && !license->text) {
- tag = g_strdup (tags[0]);
- for (j = 1; j <= i; j++) {
- tag_aux = g_strdup_printf ("%s-%s", tag, tags[j]);
- g_free (tag);
- tag = tag_aux;
- }
- aux = g_strdup_printf (LICENSE_TEXT, tag);
- xpathObj = xmlXPathEvalExpression (BAD_CAST aux, xpathCtx);
- if (xpathObj != NULL) {
- if (xpathObj->nodesetval != NULL &&
- xpathObj->nodesetval->nodeNr != 0)
- license->text = (gchar *)xmlNodeGetContent
(xpathObj->nodesetval->nodeTab[0]);
- xmlXPathFreeObject (xpathObj);
- }
- g_free (tag);
- g_free (aux);
- }
- g_strfreev(tags);
-
- /* 2) if not, use the default string */
- if (!license->text) {
- aux = g_strdup_printf (LICENSE_TEXT, "x-default");
- xpathObj = xmlXPathEvalExpression (BAD_CAST aux, xpathCtx);
- if (xpathObj != NULL) {
- if (xpathObj->nodesetval != NULL &&
- xpathObj->nodesetval->nodeNr != 0)
- license->text = (gchar *)xmlNodeGetContent
(xpathObj->nodesetval->nodeTab[0]);
- xmlXPathFreeObject (xpathObj);
- }
- g_free (aux);
- }
+ license->text = pdf_document_get_localized_object_from_metadata(xpathCtx, LICENSE_TEXT);
/* Checking the license URI as defined by the Creative Commons
* Schema. This field is recomended to be checked by Creative
@@ -754,6 +955,13 @@ pdf_document_parse_metadata (const gchar *metadata,
xmlDocPtr doc;
xmlXPathContextPtr xpathCtx;
gchar *fmt;
+ gchar *author;
+ gchar *keywords;
+ gchar *title;
+ gchar *subject;
+ gchar *creatortool;
+ gchar *producer;
+ GTime dates[3] = {0};
doc = xmlParseMemory (metadata, strlen (metadata));
if (doc == NULL)
@@ -765,13 +973,58 @@ pdf_document_parse_metadata (const gchar *metadata,
return; /* invalid xpath context */
}
- fmt = pdf_document_get_format_from_metadata (doc, xpathCtx);
+ fmt = pdf_document_get_format_from_metadata (xpathCtx);
if (fmt != NULL) {
g_free (info->format);
info->format = fmt;
}
- info->license = pdf_document_get_license_from_metadata (doc, xpathCtx);
+ author = pdf_document_get_author_from_metadata (xpathCtx);
+ if (author != NULL) {
+ g_free (info->author);
+ info->author = author;
+ }
+
+ keywords = pdf_document_get_keywords_from_metadata (xpathCtx);
+ if (keywords != NULL) {
+ g_free (info->keywords);
+ info->keywords = keywords;
+ }
+
+ title = pdf_document_get_title_from_metadata (xpathCtx);
+ if (title != NULL) {
+ g_free (info->title);
+ info->title = title;
+ }
+
+ subject = pdf_document_get_subject_from_metadata (xpathCtx);
+ if (subject != NULL) {
+ g_free (info->subject);
+ info->subject = subject;
+ }
+
+ creatortool = pdf_document_get_creatortool_from_metadata (xpathCtx);
+ if (creatortool != NULL) {
+ g_free (info->creator);
+ info->creator = creatortool;
+ }
+
+ producer = pdf_document_get_producer_from_metadata (xpathCtx);
+ if (producer != NULL) {
+ g_free (info->producer);
+ info->producer = producer;
+ }
+
+ pdf_document_get_dates_from_metadata (dates, xpathCtx);
+ if (dates[0] != 0){
+ info->modified_date = dates[0];
+ }
+
+ if (dates[1] != 0) {
+ info->creation_date = dates[1];
+ }
+
+ info->license = pdf_document_get_license_from_metadata (xpathCtx);
xmlXPathFreeContext (xpathCtx);
xmlFreeDoc (doc);
@@ -806,7 +1059,7 @@ pdf_document_get_info (EvDocument *document)
EV_DOCUMENT_INFO_MOD_DATE |
EV_DOCUMENT_INFO_LINEARIZED |
EV_DOCUMENT_INFO_N_PAGES |
- EV_DOCUMENT_INFO_SECURITY |
+ EV_DOCUMENT_INFO_SECURITY |
EV_DOCUMENT_INFO_PAPER_SIZE |
EV_DOCUMENT_INFO_LICENSE;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]