[tracker] tracker-extract: Fixed PDF extractor to use TrackerSparqlBuilder
- From: Martyn James Russell <mr src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] tracker-extract: Fixed PDF extractor to use TrackerSparqlBuilder
- Date: Wed, 4 Nov 2009 16:15:21 +0000 (UTC)
commit 6ae5b426ae51d5a46e1a3d6e0c555a1e62d5311d
Author: Martyn Russell <martyn lanedo com>
Date: Wed Nov 4 16:03:48 2009 +0000
tracker-extract: Fixed PDF extractor to use TrackerSparqlBuilder
src/tracker-extract/tracker-extract-pdf.c | 422 ++++++++++++++++-------------
1 files changed, 234 insertions(+), 188 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index b6edfd6..17181c6 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -1,7 +1,7 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
- * Copyright (C) 2008, Nokia
+ * Copyright (C) 2008-2009, Nokia
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -36,52 +36,49 @@
#include <libtracker-common/tracker-utils.h>
#include <libtracker-common/tracker-type-utils.h>
-#define NMM_PREFIX TRACKER_NMM_PREFIX
-#define DC_PREFIX TRACKER_DC_PREFIX
-#define NIE_PREFIX TRACKER_NIE_PREFIX
-#define NFO_PREFIX TRACKER_NFO_PREFIX
-#define NCO_PREFIX TRACKER_NCO_PREFIX
-#define RDF_PREFIX TRACKER_RDF_PREFIX
+typedef struct {
+ gchar *title;
+ gchar *subject;
+ gchar *creation_date;
+ gchar *author;
+ gchar *date;
+ gchar *keywords;
+} PDFData;
-static void extract_pdf (const gchar *uri,
- TrackerSparqlBuilder *metadata);
+static void extract_pdf (const gchar *uri,
+ TrackerSparqlBuilder *metadata);
static TrackerExtractData data[] = {
{ "application/pdf", extract_pdf },
{ NULL, NULL }
};
-typedef struct {
- gchar *author, *title, *creation_date, *subject;
-} PdfData;
-
-typedef struct {
- gchar *creator, *title, *date;
-} PdfNeedsMergeData;
-
-
static void
-insert_keywords (TrackerSparqlBuilder *metadata, const gchar *uri, gchar *keywords)
+insert_keywords (TrackerSparqlBuilder *metadata,
+ gchar *keywords)
{
- char *lasts, *keyw;
+ char *saveptr, *p;
size_t len;
- keyw = keywords;
+ p = keywords;
keywords = strchr (keywords, '"');
- if (keywords)
+
+ if (keywords) {
keywords++;
- else
- keywords = keyw;
+ } else {
+ keywords = p;
+ }
len = strlen (keywords);
- if (keywords[len - 1] == '"')
+ if (keywords[len - 1] == '"') {
keywords[len - 1] = '\0';
+ }
- for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
- keyw = strtok_r (NULL, ",; ", &lasts)) {
- tracker_statement_list_insert (metadata, uri,
- NIE_PREFIX "keyword",
- (const gchar*) keyw);
+ for (p = strtok_r (keywords, ",; ", &saveptr);
+ p;
+ p = strtok_r (NULL, ",; ", &saveptr)) {
+ tracker_sparql_builder_predicate (metadata, "nie:keyword");
+ tracker_sparql_builder_object_unvalidated (metadata, p);
}
}
@@ -122,232 +119,287 @@ extract_content (PopplerDocument *document,
}
static void
-extract_pdf (const gchar *uri,
- TrackerSparqlBuilder *metadata)
+write_pdf_data (PDFData data,
+ TrackerSparqlBuilder *metadata)
+{
+ if (!tracker_is_empty_string (data.title)) {
+ tracker_sparql_builder_predicate (metadata, "nie:title");
+ tracker_sparql_builder_object_unvalidated (metadata, data.title);
+ g_free (data.title);
+ }
+
+ if (!tracker_is_empty_string (data.subject)) {
+ tracker_sparql_builder_predicate (metadata, "nie:subject");
+ tracker_sparql_builder_object_unvalidated (metadata, data.subject);
+ g_free (data.subject);
+ }
+
+ if (!tracker_is_empty_string (data.author)) {
+ tracker_sparql_builder_predicate (metadata, "nco:creator");
+ tracker_sparql_builder_object_blank_open (metadata);
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nco:Contact");
+ tracker_sparql_builder_predicate (metadata, "nco:fullname");
+ tracker_sparql_builder_object_unvalidated (metadata, data.author);
+ tracker_sparql_builder_object_blank_close (metadata);
+ g_free (data.author);
+ }
+
+ if (!tracker_is_empty_string (data.date)) {
+ tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
+ tracker_sparql_builder_object_unvalidated (metadata, data.date);
+ g_free (data.date);
+ }
+
+ if (!tracker_is_empty_string (data.keywords)) {
+ insert_keywords (metadata, data.keywords);
+ g_free (data.keywords);
+ }
+}
+
+static void
+extract_pdf (const gchar *uri,
+ TrackerSparqlBuilder *metadata)
{
- PdfData pdf_data = { 0 };
- PdfNeedsMergeData merge_data = { 0 };
- TrackerXmpData xmp_data = { 0 };
- PopplerDocument *document;
- gchar *author, *title, *subject, *content;
- gchar *keywords = NULL;
- gchar *metadata_xml = NULL;
- GTime creation_date;
- GError *error = NULL;
TrackerFTSConfig *fts_config;
- guint n_words;
+ GTime creation_date;
+ GError *error = NULL;
+ TrackerXmpData xd = { 0 };
+ PDFData pd = { 0 }; /* actual data */
+ PDFData md = { 0 }; /* for merging */
+ PopplerDocument *document;
+ gchar *xml = NULL;
+ gchar *content;
+ guint n_words;
g_type_init ();
document = poppler_document_new_from_file (uri, NULL, &error);
- if (document == NULL || error) {
+ if (error) {
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+ uri,
+ error->message ? error->message : "no error given");
+ g_error_free (error);
+
+ return;
+ }
+
+ if (!document) {
+ g_warning ("Could not create PopplerDocument from uri:'%s', "
+ "NULL returned without an error",
+ uri);
return;
}
- tracker_statement_list_insert (metadata, uri,
- RDF_PREFIX "type",
- NFO_PREFIX "PaginatedTextDocument");
+ tracker_sparql_builder_subject_iri (metadata, uri);
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
g_object_get (document,
- "title", &title,
- "author", &author,
- "subject", &subject,
- "keywords", &keywords,
+ "title", &pd.title,
+ "author", &pd.author,
+ "subject", &pd.subject,
+ "keywords", &pd.keywords,
"creation-date", &creation_date,
NULL);
/* metadata property not present in older poppler versions */
if (g_object_class_find_property (G_OBJECT_GET_CLASS (document), "metadata")) {
- g_object_get (document, "metadata", &metadata_xml, NULL);
- }
-
- if (!tracker_is_empty_string (title)) {
- pdf_data.title = title;
- }
-
- if (!tracker_is_empty_string (author)) {
- pdf_data.author = author;
- }
-
- if (!tracker_is_empty_string (subject)) {
- pdf_data.subject = subject;
+ g_object_get (document, "metadata", &xml, NULL);
}
if (creation_date > 0) {
- pdf_data.creation_date = tracker_date_to_string ((time_t) creation_date);
+ pd.creation_date = tracker_date_to_string ((time_t) creation_date);
}
- if (metadata_xml) {
-
- tracker_read_xmp (metadata_xml, strlen (metadata_xml), uri, &xmp_data);
-
- merge_data.creator = tracker_coalesce (2, pdf_data.author,
- xmp_data.creator);
+ if (xml) {
+ tracker_read_xmp (xml, strlen (xml), uri, &xd);
+ g_free (xml);
+ xml = NULL;
- merge_data.date = tracker_coalesce (3, pdf_data.creation_date,
- xmp_data.date,
- xmp_data.DateTimeOriginal);
+ md.title = tracker_coalesce (2, pd.title, xd.title);
+ md.subject = tracker_coalesce (2, pd.subject, xd.subject);
+ md.date = tracker_coalesce (3, pd.creation_date, xd.date, xd.DateTimeOriginal);
+ md.author = tracker_coalesce (2, pd.author, xd.creator);
- merge_data.title = tracker_coalesce (2, pdf_data.title,
- xmp_data.title);
+ write_pdf_data (md, metadata);
-
- if (pdf_data.subject) {
- insert_keywords (metadata, uri, pdf_data.subject);
- g_free (pdf_data.subject);
- }
-
- if (merge_data.creator) {
- tracker_statement_list_insert (metadata, ":", RDF_PREFIX "type", NCO_PREFIX "Contact");
- tracker_statement_list_insert (metadata, ":", NCO_PREFIX "fullname", merge_data.creator);
- tracker_statement_list_insert (metadata, uri, NCO_PREFIX "creator", ":");
- g_free (merge_data.creator);
+ if (xd.keywords) {
+ insert_keywords (metadata, xd.keywords);
+ g_free (xd.keywords);
}
- if (merge_data.date) {
- tracker_statement_list_insert (metadata, uri, NIE_PREFIX "contentCreated", merge_data.date);
- g_free (merge_data.date);
+ if (xd.publisher) {
+ tracker_sparql_builder_predicate (metadata, "nco:publisher");
+ tracker_sparql_builder_object_blank_open (metadata);
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nco:Contact");
+ tracker_sparql_builder_predicate (metadata, "nco:fullname");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.publisher);
+ tracker_sparql_builder_object_blank_close (metadata);
+ g_free (xd.publisher);
}
- if (merge_data.title) {
- tracker_statement_list_insert (metadata, uri, NIE_PREFIX "title", merge_data.title);
- g_free (merge_data.title);
+ if (xd.type) {
+ tracker_sparql_builder_predicate (metadata, "dc:type");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.type);
+ g_free (xd.type);
}
- if (xmp_data.keywords) {
- insert_keywords (metadata, uri, xmp_data.keywords);
- g_free (xmp_data.keywords);
+ if (xd.format) {
+ tracker_sparql_builder_predicate (metadata, "dc:format");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.format);
+ g_free (xd.format);
}
- if (xmp_data.subject) {
- insert_keywords (metadata, uri, xmp_data.subject);
- g_free (xmp_data.subject);
+ if (xd.identifier) {
+ tracker_sparql_builder_predicate (metadata, "dc:identifier");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.identifier);
+ g_free (xd.identifier);
}
- if (xmp_data.publisher) {
- tracker_statement_list_insert (metadata, ":", RDF_PREFIX "type", NCO_PREFIX "Contact");
- tracker_statement_list_insert (metadata, ":", NCO_PREFIX "fullname", xmp_data.publisher);
- tracker_statement_list_insert (metadata, uri, NCO_PREFIX "publisher", ":");
- g_free (xmp_data.publisher);
+ if (xd.source) {
+ tracker_sparql_builder_predicate (metadata, "dc:source");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.source);
+ g_free (xd.source);
}
- if (xmp_data.type) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "type", xmp_data.type);
- g_free (xmp_data.type);
+ if (xd.language) {
+ tracker_sparql_builder_predicate (metadata, "dc:language");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.language);
+ g_free (xd.language);
}
- if (xmp_data.format) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "format", xmp_data.format);
- g_free (xmp_data.format);
+ if (xd.relation) {
+ tracker_sparql_builder_predicate (metadata, "dc:relation");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.relation);
+ g_free (xd.relation);
}
- if (xmp_data.identifier) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "identifier", xmp_data.identifier);
- g_free (xmp_data.identifier);
+ if (xd.coverage) {
+ tracker_sparql_builder_predicate (metadata, "dc:coverage");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.coverage);
+ g_free (xd.coverage);
}
- if (xmp_data.source) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "source", xmp_data.source);
- g_free (xmp_data.source);
+ if (xd.license) {
+ tracker_sparql_builder_predicate (metadata, "nie:license");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.license);
+ g_free (xd.license);
}
- if (xmp_data.language) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "language", xmp_data.language);
- g_free (xmp_data.language);
+ if (xd.Make || xd.Model) {
+ gchar *camera;
+
+ if ((xd.Make == NULL || xd.Model == NULL) ||
+ (xd.Make && xd.Model && strstr (xd.Model, xd.Make) == NULL)) {
+ camera = tracker_merge (" ", 2, xd.Make, xd.Model);
+ } else {
+ camera = g_strdup (xd.Model);
+ g_free (xd.Model);
+ g_free (xd.Make);
+ }
+
+ tracker_sparql_builder_predicate (metadata, "nmm:camera");
+ tracker_sparql_builder_object_unvalidated (metadata, camera);
+ g_free (camera);
}
- if (xmp_data.relation) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "relation", xmp_data.relation);
- g_free (xmp_data.relation);
+ if (xd.Orientation) {
+ tracker_sparql_builder_predicate (metadata, "nfo:orientation");
+ tracker_sparql_builder_object (metadata, xd.Orientation);
+ g_free (xd.Orientation);
}
- if (xmp_data.coverage) {
- tracker_statement_list_insert (metadata, uri, DC_PREFIX "coverage", xmp_data.coverage);
- g_free (xmp_data.coverage);
+ if (xd.rights) {
+ tracker_sparql_builder_predicate (metadata, "nie:copyright");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.rights);
+ g_free (xd.rights);
}
- if (xmp_data.license) {
- tracker_statement_list_insert (metadata, uri, NIE_PREFIX "license", xmp_data.license);
- g_free (xmp_data.license);
+ if (xd.WhiteBalance) {
+ tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance");
+ tracker_sparql_builder_object (metadata, xd.WhiteBalance);
+ g_free (xd.WhiteBalance);
}
- if (xmp_data.Make || xmp_data.Model) {
- gchar *final_camera = tracker_merge (" ", 2, xmp_data.Make, xmp_data.Model);
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "camera", final_camera);
- g_free (final_camera);
- }
+ if (xd.FNumber) {
+ gdouble value;
- if (xmp_data.Orientation) {
- tracker_statement_list_insert (metadata, uri, NFO_PREFIX "orientation", xmp_data.Orientation);
- g_free (xmp_data.Orientation);
+ value = g_strtod (xd.FNumber, NULL);
+ tracker_sparql_builder_predicate (metadata, "nmm:fnumber");
+ tracker_sparql_builder_object_double (metadata, value);
+ g_free (xd.FNumber);
}
- if (xmp_data.rights) {
- tracker_statement_list_insert (metadata, uri, NIE_PREFIX "copyright", xmp_data.rights);
- g_free (xmp_data.rights);
+ if (xd.Flash) {
+ tracker_sparql_builder_predicate (metadata, "nmm:flash");
+ tracker_sparql_builder_object (metadata, xd.Flash);
+ g_free (xd.Flash);
}
- if (xmp_data.WhiteBalance) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "whiteBalance", xmp_data.WhiteBalance);
- g_free (xmp_data.WhiteBalance);
- }
+ if (xd.FocalLength) {
+ gdouble value;
- if (xmp_data.FNumber) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "fnumber", xmp_data.FNumber);
- g_free (xmp_data.FNumber);
+ value = g_strtod (xd.FocalLength, NULL);
+ tracker_sparql_builder_predicate (metadata, "nmm:focalLength");
+ tracker_sparql_builder_object_double (metadata, value);
+ g_free (xd.FocalLength);
}
- if (xmp_data.Flash) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "flash", xmp_data.Flash);
- g_free (xmp_data.Flash);
+ if (xd.Artist || xd.contributor) {
+ gchar *artist;
+
+ artist = tracker_coalesce (2, xd.Artist, xd.contributor);
+ tracker_sparql_builder_predicate (metadata, "nco:contributor");
+ tracker_sparql_builder_object_blank_open (metadata);
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nco:Contact");
+ tracker_sparql_builder_predicate (metadata, "nco:fullname");
+ tracker_sparql_builder_object_unvalidated (metadata, artist);
+ tracker_sparql_builder_object_blank_close (metadata);
+ g_free (artist);
}
- if (xmp_data.FocalLength) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "focalLength", xmp_data.FocalLength);
- g_free (xmp_data.FocalLength);
- }
+ if (xd.ExposureTime) {
+ gdouble value;
- if (xmp_data.Artist || xmp_data.contributor) {
- gchar *final_artist = tracker_coalesce (2, xmp_data.Artist, xmp_data.contributor);
- tracker_statement_list_insert (metadata, ":", RDF_PREFIX "type", NCO_PREFIX "Contact");
- tracker_statement_list_insert (metadata, ":", NCO_PREFIX "fullname", final_artist);
- tracker_statement_list_insert (metadata, uri, NCO_PREFIX "contributor", ":");
- g_free (final_artist);
+ value = g_strtod (xd.ExposureTime, NULL);
+ tracker_sparql_builder_predicate (metadata, "nmm:exposureTime");
+ tracker_sparql_builder_object_double (metadata, value);
+ g_free (xd.ExposureTime);
}
- if (xmp_data.ExposureTime) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "exposureTime", xmp_data.ExposureTime);
- g_free (xmp_data.ExposureTime);
- }
+ if (xd.ISOSpeedRatings) {
+ gdouble value;
- if (xmp_data.ISOSpeedRatings) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "isoSpeed", xmp_data.ISOSpeedRatings);
- g_free (xmp_data.ISOSpeedRatings);
+ value = g_strtod (xd.ISOSpeedRatings, NULL);
+ tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed");
+ tracker_sparql_builder_object_double (metadata, value);
+ g_free (xd.ISOSpeedRatings);
}
- if (xmp_data.description) {
- tracker_statement_list_insert (metadata, uri, NIE_PREFIX "description", xmp_data.description);
- g_free (xmp_data.description);
+ if (xd.description) {
+ tracker_sparql_builder_predicate (metadata, "nie:description");
+ tracker_sparql_builder_object_unvalidated (metadata, xd.description);
+ g_free (xd.description);
}
- if (xmp_data.MeteringMode) {
- tracker_statement_list_insert (metadata, uri, NMM_PREFIX "meteringMode", xmp_data.MeteringMode);
- g_free (xmp_data.MeteringMode);
+ if (xd.MeteringMode) {
+ tracker_sparql_builder_predicate (metadata, "nmm:meteringMode");
+ tracker_sparql_builder_object (metadata, xd.MeteringMode);
+ g_free (xd.MeteringMode);
}
-
+ } else {
+ /* So if we are here we have NO XMP data and we just
+ * write what we know from Poppler.
+ */
+ write_pdf_data (pd, metadata);
}
- if (!tracker_is_empty_string (keywords)) {
- char *lasts, *keyw;
-
- for (keyw = strtok_r (keywords, ",;", &lasts); keyw;
- keyw = strtok_r (NULL, ",;", &lasts)) {
- tracker_statement_list_insert (metadata,
- uri, NIE_PREFIX "keyword",
- (const gchar*) keyw);
- }
- }
+ tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
+ tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
fts_config = tracker_main_get_fts_config ();
n_words = tracker_fts_config_get_max_words_to_index (fts_config);
@@ -359,12 +411,6 @@ extract_pdf (const gchar *uri,
g_free (content);
}
- tracker_statement_list_insert_with_int (metadata, uri,
- NFO_PREFIX "pageCount",
- poppler_document_get_n_pages (document));
-
- g_free (keywords);
- g_free (metadata_xml);
g_object_unref (document);
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]