[tracker] tracker-extract: Fixed PDF extractor to use TrackerSparqlBuilder



commit 6ae5b426ae51d5a46e1a3d6e0c555a1e62d5311d
Author: Martyn Russell <martyn lanedo com>
Date:   Wed Nov 4 16:03:48 2009 +0000

    tracker-extract: Fixed PDF extractor to use TrackerSparqlBuilder

 src/tracker-extract/tracker-extract-pdf.c |  422 ++++++++++++++++-------------
 1 files changed, 234 insertions(+), 188 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index b6edfd6..17181c6 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -1,7 +1,7 @@
 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 /*
  * Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
- * Copyright (C) 2008, Nokia
+ * Copyright (C) 2008-2009, Nokia
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -36,52 +36,49 @@
 #include <libtracker-common/tracker-utils.h>
 #include <libtracker-common/tracker-type-utils.h>
 
-#define NMM_PREFIX TRACKER_NMM_PREFIX
-#define DC_PREFIX TRACKER_DC_PREFIX
-#define NIE_PREFIX TRACKER_NIE_PREFIX
-#define NFO_PREFIX TRACKER_NFO_PREFIX
-#define NCO_PREFIX TRACKER_NCO_PREFIX
-#define RDF_PREFIX TRACKER_RDF_PREFIX
+typedef struct {
+	gchar *title;
+	gchar *subject;
+	gchar *creation_date;
+	gchar *author;
+	gchar *date;
+	gchar *keywords;
+} PDFData;
 
-static void extract_pdf (const gchar *uri,
-			 TrackerSparqlBuilder   *metadata);
+static void extract_pdf (const gchar          *uri,
+			 TrackerSparqlBuilder *metadata);
 
 static TrackerExtractData data[] = {
 	{ "application/pdf", extract_pdf },
 	{ NULL, NULL }
 };
 
-typedef struct {
-	gchar *author, *title, *creation_date, *subject;
-} PdfData;
-
-typedef struct {
-	gchar *creator, *title, *date;
-} PdfNeedsMergeData;
-
-
 static void
-insert_keywords (TrackerSparqlBuilder *metadata, const gchar *uri, gchar *keywords)
+insert_keywords (TrackerSparqlBuilder *metadata,
+		 gchar                *keywords)
 {
-	char *lasts, *keyw;
+	char *saveptr, *p;
 	size_t len;
 
-	keyw = keywords;
+	p = keywords;
 	keywords = strchr (keywords, '"');
-	if (keywords)
+
+	if (keywords) {
 		keywords++;
-	else 
-		keywords = keyw;
+	} else {
+		keywords = p;
+	}
 
 	len = strlen (keywords);
-	if (keywords[len - 1] == '"')
+	if (keywords[len - 1] == '"') {
 		keywords[len - 1] = '\0';
+	}
 
-	for (keyw = strtok_r (keywords, ",; ", &lasts); keyw; 
-	     keyw = strtok_r (NULL, ",; ", &lasts)) {
-		tracker_statement_list_insert (metadata, uri, 
-		                               NIE_PREFIX "keyword", 
-		                               (const gchar*) keyw);
+	for (p = strtok_r (keywords, ",; ", &saveptr);
+	     p;
+	     p = strtok_r (NULL, ",; ", &saveptr)) {
+		tracker_sparql_builder_predicate (metadata, "nie:keyword");
+		tracker_sparql_builder_object_unvalidated (metadata, p);
 	}
 }
 
@@ -122,232 +119,287 @@ extract_content (PopplerDocument *document,
 }
 
 static void
-extract_pdf (const gchar *uri,
-	     TrackerSparqlBuilder  *metadata)
+write_pdf_data (PDFData               data,
+		TrackerSparqlBuilder *metadata)
+{
+	if (!tracker_is_empty_string (data.title)) {
+		tracker_sparql_builder_predicate (metadata, "nie:title");
+		tracker_sparql_builder_object_unvalidated (metadata, data.title);
+		g_free (data.title);
+	}
+
+	if (!tracker_is_empty_string (data.subject)) {
+		tracker_sparql_builder_predicate (metadata, "nie:subject");
+		tracker_sparql_builder_object_unvalidated (metadata, data.subject);
+		g_free (data.subject);
+	}
+
+	if (!tracker_is_empty_string (data.author)) {
+		tracker_sparql_builder_predicate (metadata, "nco:creator");
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nco:Contact");
+		tracker_sparql_builder_predicate (metadata, "nco:fullname");
+		tracker_sparql_builder_object_unvalidated (metadata, data.author);
+		tracker_sparql_builder_object_blank_close (metadata);
+		g_free (data.author);
+	}
+
+	if (!tracker_is_empty_string (data.date)) {
+		tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
+		tracker_sparql_builder_object_unvalidated (metadata, data.date);
+		g_free (data.date);
+	}
+
+	if (!tracker_is_empty_string (data.keywords)) {
+		insert_keywords (metadata, data.keywords);
+		g_free (data.keywords);
+	}
+}
+
+static void
+extract_pdf (const gchar          *uri,
+	     TrackerSparqlBuilder *metadata)
 {
-	PdfData pdf_data = { 0 };
-	PdfNeedsMergeData merge_data = { 0 };
-	TrackerXmpData xmp_data = { 0 };
-	PopplerDocument *document;
-	gchar		*author, *title, *subject, *content;
-	gchar		*keywords	= NULL;
-	gchar		*metadata_xml	= NULL;
-	GTime		 creation_date;
-	GError		*error		= NULL;
 	TrackerFTSConfig *fts_config;
-	guint             n_words;
+	GTime creation_date;
+	GError *error = NULL;
+	TrackerXmpData xd = { 0 };
+	PDFData pd = { 0 }; /* actual data */
+	PDFData md = { 0 }; /* for merging */
+	PopplerDocument *document;
+	gchar *xml = NULL;
+	gchar *content;
+	guint n_words;
 
 	g_type_init ();
 
 	document = poppler_document_new_from_file (uri, NULL, &error);
 
-	if (document == NULL || error) {
+	if (error) {
+		g_warning ("Couldn't create PopplerDocument from uri:'%s', %s", 
+			   uri,
+			   error->message ? error->message : "no error given");
+		g_error_free (error);
+
+		return;
+	}
+
+	if (!document) {
+		g_warning ("Could not create PopplerDocument from uri:'%s', "
+			   "NULL returned without an error",
+			   uri);
 		return;
 	}
 
-	tracker_statement_list_insert (metadata, uri, 
-	                               RDF_PREFIX "type", 
-	                               NFO_PREFIX "PaginatedTextDocument");
+	tracker_sparql_builder_subject_iri (metadata, uri);
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
 
 	g_object_get (document,
-	              "title", &title,
-	              "author", &author,
-	              "subject", &subject,
-	              "keywords", &keywords,
+	              "title", &pd.title,
+	              "author", &pd.author,
+	              "subject", &pd.subject,
+	              "keywords", &pd.keywords,
 	              "creation-date", &creation_date,
 	              NULL);
 
 	/* metadata property not present in older poppler versions */
 	if (g_object_class_find_property (G_OBJECT_GET_CLASS (document), "metadata")) {
-		g_object_get (document, "metadata", &metadata_xml, NULL);
-	}
-
-	if (!tracker_is_empty_string (title)) {
-		pdf_data.title = title;
-	}
-
-	if (!tracker_is_empty_string (author)) {
-		pdf_data.author = author;
-	}
-
-	if (!tracker_is_empty_string (subject)) {
-		pdf_data.subject = subject;
+		g_object_get (document, "metadata", &xml, NULL);
 	}
 
 	if (creation_date > 0) {
-		pdf_data.creation_date = tracker_date_to_string ((time_t) creation_date);
+		pd.creation_date = tracker_date_to_string ((time_t) creation_date);
 	}
 
-	if (metadata_xml) {
-
-		tracker_read_xmp (metadata_xml, strlen (metadata_xml), uri, &xmp_data);
-
-		merge_data.creator =  tracker_coalesce (2, pdf_data.author,
-		                                        xmp_data.creator);
+	if (xml) {
+		tracker_read_xmp (xml, strlen (xml), uri, &xd);
+		g_free (xml);
+		xml = NULL;
 
-		merge_data.date =  tracker_coalesce (3, pdf_data.creation_date,
-		                                     xmp_data.date,
-		                                     xmp_data.DateTimeOriginal);
+		md.title = tracker_coalesce (2, pd.title, xd.title);
+		md.subject = tracker_coalesce (2, pd.subject, xd.subject);
+		md.date = tracker_coalesce (3, pd.creation_date, xd.date, xd.DateTimeOriginal);
+		md.author = tracker_coalesce (2, pd.author, xd.creator);
 
-		merge_data.title =  tracker_coalesce (2, pdf_data.title,
-		                                        xmp_data.title);
+		write_pdf_data (md, metadata);
 
-
-		if (pdf_data.subject) {
-			insert_keywords (metadata, uri, pdf_data.subject);
-			g_free (pdf_data.subject);
-		}
-
-		if (merge_data.creator) {
-			tracker_statement_list_insert (metadata, ":", RDF_PREFIX "type", NCO_PREFIX "Contact");
-			tracker_statement_list_insert (metadata, ":", NCO_PREFIX "fullname", merge_data.creator);
-			tracker_statement_list_insert (metadata, uri, NCO_PREFIX "creator", ":");
-			g_free (merge_data.creator);
+		if (xd.keywords) {
+			insert_keywords (metadata, xd.keywords);
+			g_free (xd.keywords);
 		}
 
-		if (merge_data.date) {
-			tracker_statement_list_insert (metadata, uri, NIE_PREFIX "contentCreated", merge_data.date);
-			g_free (merge_data.date);
+		if (xd.publisher) {
+			tracker_sparql_builder_predicate (metadata, "nco:publisher");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.publisher);
+			tracker_sparql_builder_object_blank_close (metadata);
+			g_free (xd.publisher);
 		}
 
-		if (merge_data.title) {
-			tracker_statement_list_insert (metadata, uri, NIE_PREFIX "title", merge_data.title);
-			g_free (merge_data.title);
+		if (xd.type) {
+			tracker_sparql_builder_predicate (metadata, "dc:type");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.type);
+			g_free (xd.type);
 		}
 
-		if (xmp_data.keywords) {
-			insert_keywords (metadata, uri, xmp_data.keywords);
-			g_free (xmp_data.keywords);
+		if (xd.format) {
+			tracker_sparql_builder_predicate (metadata, "dc:format");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.format);
+			g_free (xd.format);
 		}
 
-		if (xmp_data.subject) {
-			insert_keywords (metadata, uri, xmp_data.subject);
-			g_free (xmp_data.subject);
+		if (xd.identifier) {
+			tracker_sparql_builder_predicate (metadata, "dc:identifier");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.identifier);
+			g_free (xd.identifier);
 		}
 
-		if (xmp_data.publisher) {
-			tracker_statement_list_insert (metadata, ":", RDF_PREFIX "type", NCO_PREFIX "Contact");
-			tracker_statement_list_insert (metadata, ":", NCO_PREFIX "fullname", xmp_data.publisher);
-			tracker_statement_list_insert (metadata, uri, NCO_PREFIX "publisher", ":");
-			g_free (xmp_data.publisher);
+		if (xd.source) {
+			tracker_sparql_builder_predicate (metadata, "dc:source");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.source);
+			g_free (xd.source);
 		}
 
-		if (xmp_data.type) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "type", xmp_data.type);
-			g_free (xmp_data.type);
+		if (xd.language) {
+			tracker_sparql_builder_predicate (metadata, "dc:language");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.language);
+			g_free (xd.language);
 		}
 
-		if (xmp_data.format) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "format", xmp_data.format);
-			g_free (xmp_data.format);
+		if (xd.relation) {
+			tracker_sparql_builder_predicate (metadata, "dc:relation");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.relation);
+			g_free (xd.relation);
 		}
 
-		if (xmp_data.identifier) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "identifier", xmp_data.identifier);
-			g_free (xmp_data.identifier);
+		if (xd.coverage) {
+			tracker_sparql_builder_predicate (metadata, "dc:coverage");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.coverage);
+			g_free (xd.coverage);
 		}
 
-		if (xmp_data.source) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "source", xmp_data.source);
-			g_free (xmp_data.source);
+		if (xd.license) {
+			tracker_sparql_builder_predicate (metadata, "nie:license");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.license);
+			g_free (xd.license);
 		}
 
-		if (xmp_data.language) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "language", xmp_data.language);
-			g_free (xmp_data.language);
+		if (xd.Make || xd.Model) {
+			gchar *camera;
+
+			if ((xd.Make == NULL || xd.Model == NULL) ||
+			    (xd.Make && xd.Model && strstr (xd.Model, xd.Make) == NULL)) {
+				camera = tracker_merge (" ", 2, xd.Make, xd.Model);
+			} else {
+				camera = g_strdup (xd.Model);
+				g_free (xd.Model);
+				g_free (xd.Make);
+			}
+
+			tracker_sparql_builder_predicate (metadata, "nmm:camera");
+			tracker_sparql_builder_object_unvalidated (metadata, camera);
+			g_free (camera);
 		}
 
-		if (xmp_data.relation) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "relation", xmp_data.relation);
-			g_free (xmp_data.relation);
+		if (xd.Orientation) {
+			tracker_sparql_builder_predicate (metadata, "nfo:orientation");
+			tracker_sparql_builder_object (metadata, xd.Orientation);
+			g_free (xd.Orientation);
 		}
 
-		if (xmp_data.coverage) {
-			tracker_statement_list_insert (metadata, uri, DC_PREFIX "coverage", xmp_data.coverage);
-			g_free (xmp_data.coverage);
+		if (xd.rights) {
+			tracker_sparql_builder_predicate (metadata, "nie:copyright");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.rights);
+			g_free (xd.rights);
 		}
 
-		if (xmp_data.license) {
-			tracker_statement_list_insert (metadata, uri, NIE_PREFIX "license", xmp_data.license);
-			g_free (xmp_data.license);
+		if (xd.WhiteBalance) {
+			tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance");
+			tracker_sparql_builder_object (metadata, xd.WhiteBalance);
+			g_free (xd.WhiteBalance);
 		}
 
-		if (xmp_data.Make || xmp_data.Model) {
-			gchar *final_camera = tracker_merge (" ", 2, xmp_data.Make, xmp_data.Model); 
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "camera", final_camera);
-			g_free (final_camera);
-		}
+		if (xd.FNumber) {
+			gdouble value;
 
-		if (xmp_data.Orientation) {
-			tracker_statement_list_insert (metadata, uri, NFO_PREFIX "orientation", xmp_data.Orientation);
-			g_free (xmp_data.Orientation);
+			value = g_strtod (xd.FNumber, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:fnumber");
+			tracker_sparql_builder_object_double (metadata, value);
+			g_free (xd.FNumber);
 		}
 
-		if (xmp_data.rights) {
-			tracker_statement_list_insert (metadata, uri, NIE_PREFIX "copyright", xmp_data.rights);
-			g_free (xmp_data.rights);
+		if (xd.Flash) {
+			tracker_sparql_builder_predicate (metadata, "nmm:flash");
+			tracker_sparql_builder_object (metadata, xd.Flash);
+			g_free (xd.Flash);
 		}
 
-		if (xmp_data.WhiteBalance) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "whiteBalance", xmp_data.WhiteBalance);
-			g_free (xmp_data.WhiteBalance);
-		}
+		if (xd.FocalLength) {
+			gdouble value;
 
-		if (xmp_data.FNumber) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "fnumber", xmp_data.FNumber);
-			g_free (xmp_data.FNumber);
+			value = g_strtod (xd.FocalLength, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:focalLength");
+			tracker_sparql_builder_object_double (metadata, value);
+			g_free (xd.FocalLength);
 		}
 
-		if (xmp_data.Flash) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "flash", xmp_data.Flash);
-			g_free (xmp_data.Flash);
+		if (xd.Artist || xd.contributor) {
+			gchar *artist;
+
+			artist = tracker_coalesce (2, xd.Artist, xd.contributor);
+			tracker_sparql_builder_predicate (metadata, "nco:contributor");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, artist);
+			tracker_sparql_builder_object_blank_close (metadata);
+			g_free (artist);
 		}
 
-		if (xmp_data.FocalLength) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "focalLength", xmp_data.FocalLength);
-			g_free (xmp_data.FocalLength);
-		}
+		if (xd.ExposureTime) {
+			gdouble value;
 
-		if (xmp_data.Artist || xmp_data.contributor) {
-			gchar *final_artist =  tracker_coalesce (2, xmp_data.Artist, xmp_data.contributor);
-			tracker_statement_list_insert (metadata, ":", RDF_PREFIX "type", NCO_PREFIX "Contact");
-			tracker_statement_list_insert (metadata, ":", NCO_PREFIX "fullname", final_artist);
-			tracker_statement_list_insert (metadata, uri, NCO_PREFIX "contributor", ":");
-			g_free (final_artist);
+			value = g_strtod (xd.ExposureTime, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:exposureTime");
+			tracker_sparql_builder_object_double (metadata, value);
+			g_free (xd.ExposureTime);
 		}
 
-		if (xmp_data.ExposureTime) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "exposureTime", xmp_data.ExposureTime);
-			g_free (xmp_data.ExposureTime);
-		}
+		if (xd.ISOSpeedRatings) {
+			gdouble value;
 
-		if (xmp_data.ISOSpeedRatings) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "isoSpeed", xmp_data.ISOSpeedRatings);
-			g_free (xmp_data.ISOSpeedRatings);
+			value = g_strtod (xd.ISOSpeedRatings, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed");
+			tracker_sparql_builder_object_double (metadata, value);
+			g_free (xd.ISOSpeedRatings);
 		}
 
-		if (xmp_data.description) {
-			tracker_statement_list_insert (metadata, uri, NIE_PREFIX "description", xmp_data.description);
-			g_free (xmp_data.description);
+		if (xd.description) {
+			tracker_sparql_builder_predicate (metadata, "nie:description");
+			tracker_sparql_builder_object_unvalidated (metadata, xd.description);
+			g_free (xd.description);
 		}
 
-		if (xmp_data.MeteringMode) {
-			tracker_statement_list_insert (metadata, uri, NMM_PREFIX "meteringMode", xmp_data.MeteringMode);
-			g_free (xmp_data.MeteringMode);
+		if (xd.MeteringMode) {
+			tracker_sparql_builder_predicate (metadata, "nmm:meteringMode");
+			tracker_sparql_builder_object (metadata, xd.MeteringMode);
+			g_free (xd.MeteringMode);
 		}
-
+	} else {
+		/* So if we are here we have NO XMP data and we just
+		 * write what we know from Poppler.
+		 */
+		write_pdf_data (pd, metadata);
 	}
 
-	if (!tracker_is_empty_string (keywords)) {
-		char *lasts, *keyw;
-
-		for (keyw = strtok_r (keywords, ",;", &lasts); keyw; 
-		     keyw = strtok_r (NULL, ",;", &lasts)) {
-			tracker_statement_list_insert (metadata,
-					  uri, NIE_PREFIX "keyword",
-					  (const gchar*) keyw);
-		}
-	}
+	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
+	tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
 
 	fts_config = tracker_main_get_fts_config ();
 	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
@@ -359,12 +411,6 @@ extract_pdf (const gchar *uri,
 		g_free (content);
 	}
 
-	tracker_statement_list_insert_with_int (metadata, uri,
-					   NFO_PREFIX "pageCount",
-					   poppler_document_get_n_pages (document));
-
-	g_free (keywords);
-	g_free (metadata_xml);
 	g_object_unref (document);
 }
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]