[tracker] tracker-extract: Ported the PDF extractor to not use poppler-glib



commit f4eaeb71405c2f3a90fef429e90d2fee62641a7b
Author: Philip Van Hoof <philip codeminded be>
Date:   Thu Apr 29 14:43:20 2010 +0200

    tracker-extract: Ported the PDF extractor to not use poppler-glib
    
    Poppler's GLib bindings don't make it possible to use a TextOutputDev
    for poppler_page_get_text when there *is* Cairo support. This isn't good for
    us because that means that Cairo surfaces are needlessly made for each image
    embedded in the PDF, wasting resources.

 configure.ac                                |   40 +-
 src/tracker-extract/Makefile.am             |    8 +-
 src/tracker-extract/tracker-extract-pdf.c   |  593 --------------------
 src/tracker-extract/tracker-extract-pdf.cpp |  805 +++++++++++++++++++++++++++
 4 files changed, 829 insertions(+), 617 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 419492c..562ac15 100644
--- a/configure.ac
+++ b/configure.ac
@@ -143,7 +143,7 @@ UPOWER_REQUIRED=0.9.0
 GDKPIXBUF_REQUIRED=2.12.0
 QUILL_REQUIRED=1.0.0
 UNAC_REQUIRED=1.0.0
-POPPLER_GLIB_REQUIRED=0.4.5
+POPPLER_REQUIRED=0.12.2
 CAIRO_REQUIRED=1.0
 GDK_REQUIRED=1.0
 LIBVORBIS_REQUIRED=0.22
@@ -1122,37 +1122,37 @@ if test x$enable_unzip_psgz_files != "xno"; then
 fi
 
 ##################################################################
-# Check for poppler's glib bingings
+# Check for poppler
 ##################################################################
 
-AC_ARG_ENABLE(poppler-glib,
-	      AS_HELP_STRING([--enable-poppler-glib],
+AC_ARG_ENABLE(poppler,
+	      AS_HELP_STRING([--enable-poppler],
 	      		     [enable extractor for PDF data [[default=auto]]]),,
-	      [enable_poppler_glib=auto])
+	      [enable_poppler=auto])
 
-if test "x$enable_poppler_glib" != "xno" ; then
-   PKG_CHECK_MODULES(POPPLER_GLIB,
-   		     [poppler-glib >= $POPPLER_GLIB_REQUIRED],
-		     [have_poppler_glib=yes],
-		     [have_poppler_glib=no])
+if test "x$enable_poppler" != "xno" ; then
+   PKG_CHECK_MODULES(POPPLER,
+   		     [poppler >= $POPPLER_REQUIRED],
+		     [have_poppler=yes],
+		     [have_poppler=no])
 
-   AC_SUBST(POPPLER_GLIB_CFLAGS)
-   AC_SUBST(POPPLER_GLIB_LIBS)
+   AC_SUBST(POPPLER_CFLAGS)
+   AC_SUBST(POPPLER_LIBS)
 
-   if test "x$have_poppler_glib" = "xyes"; then
-      AC_DEFINE(HAVE_POPPLER_GLIB, [], [Define if we have poppler-glib])
+   if test "x$have_poppler" = "xyes"; then
+      AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler])
    fi
 else
-   have_poppler_glib="no  (disabled)"
+   have_poppler="no  (disabled)"
 fi
 
-if test "x$enable_poppler_glib" = "xyes"; then
-   if test "x$have_poppler_glib" != "xyes"; then
-      AC_MSG_ERROR([Couldn't find poppler-glib >= $POPPLER_GLIB_REQUIRED.])
+if test "x$enable_poppler" = "xyes"; then
+   if test "x$have_poppler" != "xyes"; then
+      AC_MSG_ERROR([Couldn't find poppler >= $POPPLER_REQUIRED.])
    fi
 fi
 
-AM_CONDITIONAL(HAVE_POPPLER_GLIB, test "x$have_poppler_glib" = "xyes")
+AM_CONDITIONAL(HAVE_POPPLER, test "x$have_poppler" = "xyes")
 
 ##################################################################
 # Check for libexif
@@ -1798,7 +1798,7 @@ Metadata Extractors:
 
 	Support libstreamanalyzer:		$have_libstreamanalyzer
 	Support PNG:				yes
-	Support PDF:				$have_poppler_glib
+	Support PDF:				$have_poppler
 	Support JPEG:				$have_libjpeg (xmp: $have_exempi, exif: $have_libexif, iptc: $have_libiptcdata)
 	Support TIFF:				$have_libtiff (xmp: $have_exempi, exif: yes, iptc: $have_libiptcdata)
 	Support Vorbis (ogg/etc):		$have_libvorbis
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index eede0c1..37454ba 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -20,7 +20,7 @@ INCLUDES = 								\
 	$(LIBGSF_CFLAGS) 						\
 	$(LIBXML2_CFLAGS) 						\
 	$(LIBPNG_CFLAGS) 						\
-	$(POPPLER_GLIB_CFLAGS) 						\
+	$(POPPLER_CFLAGS) 						\
 	$(GSTREAMER_CFLAGS) 						\
 	$(XINE_CFLAGS) 							\
 	$(TOTEM_PL_PARSER_CFLAGS)
@@ -73,7 +73,7 @@ modules_LTLIBRARIES += 							\
 	libextract-oasis.la
 endif
 
-if HAVE_POPPLER_GLIB
+if HAVE_POPPLER
 modules_LTLIBRARIES += libextract-pdf.la
 endif
 
@@ -214,12 +214,12 @@ libextract_msoffice_la_LIBADD = 					\
 	$(GCOV_LIBS)
 
 # PDF
-libextract_pdf_la_SOURCES = tracker-extract-pdf.c
+libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
 libextract_pdf_la_LDFLAGS = $(module_flags)
 libextract_pdf_la_LIBADD = 						\
 	$(top_builddir)/src/libtracker-extract/libtracker-extract- TRACKER_API_VERSION@.la \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la	\
-	$(POPPLER_GLIB_LIBS)						\
+	$(POPPLER_LIBS)							\
 	$(GLIB2_LIBS)							\
 	$(GCOV_LIBS)
 
diff --git a/src/tracker-extract/tracker-extract-pdf.cpp b/src/tracker-extract/tracker-extract-pdf.cpp
new file mode 100644
index 0000000..7e6b1e7
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -0,0 +1,805 @@
+/*
+ * Copyright (C) 2009, Nokia <ivan frade nokia com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ *
+ * Author: Philip Van Hoof <philip codeminded be>
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+
+/* Poppler includes*/
+#include <GlobalParams.h>
+#include <PDFDoc.h>
+#include <Outline.h>
+#include <ErrorCodes.h>
+#include <UnicodeMap.h>
+#include <PDFDocEncoding.h>
+#include <TextOutputDev.h>
+#include <Gfx.h>
+#include <Link.h>
+
+#include <libtracker-common/tracker-date-time.h>
+#include <libtracker-common/tracker-utils.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-main.h"
+
+typedef struct {
+	gchar *title;
+	gchar *subject;
+	gchar *creation_date;
+	gchar *author;
+	gchar *date;
+	gchar *keywords;
+} PDFData;
+
+static void extract_pdf (const gchar          *uri,
+                         TrackerSparqlBuilder *preupdate,
+                         TrackerSparqlBuilder *metadata);
+
+static TrackerExtractData data[] = {
+	{ "application/pdf", extract_pdf },
+	{ NULL, NULL }
+};
+
+/**
+ * Philip ported this from a poppler-glib based version to a C++ libpopler
+ * version because the TextOutputDev allows us to extract text and metadata much
+ * faster than the default CairoOutputDev that poppler-glib uses in case it got
+ * compiled with support for Cairo. Regretfully can't this be selected at
+ * runtime in the poppler-glib bindings. Apologies to the GObject/GLib fans. */
+
+static gchar *
+unicode_to_char (Unicode *unicode,
+                 int      len)
+{
+	static UnicodeMap *uMap = NULL;
+	if (uMap == NULL) {
+		GooString *enc = new GooString("UTF-8");
+		uMap = globalParams->getUnicodeMap(enc);
+		uMap->incRefCnt ();
+		delete enc;
+	}
+
+	GooString gstr;
+	gchar buf[8]; /* 8 is enough for mapping an unicode char to a string */
+	int i, n;
+
+	for (i = 0; i < len; ++i) {
+		n = uMap->mapUnicode(unicode[i], buf, sizeof(buf));
+		gstr.append(buf, n);
+	}
+
+	return g_strdup (gstr.getCString ());
+}
+
+static void
+read_toc (GooList  *items,
+          GString **toc)
+{
+	guint length, i;
+
+	if (!items)
+		return;
+
+	if (!*toc) {
+		*toc = g_string_new ("");
+	}
+
+	length = items->getLength ();
+
+	for (i = 0; i < length; i++) {
+		OutlineItem *item;
+		LinkAction *link_action;
+
+		item = (OutlineItem *) items->get (i);
+
+		link_action = item->getAction ();
+
+		if (!link_action) {
+			continue;
+		}
+
+		switch (link_action->getKind()) {
+			case actionGoTo: {
+				guint title_length = item->getTitleLength ();
+				LinkGoTo *gto = dynamic_cast <LinkGoTo *> (link_action);
+				GooString *named_dest = gto->getNamedDest ();
+
+				if (title_length > 0) {
+					gchar *str = unicode_to_char (item->getTitle(),
+					                              title_length);
+					g_string_append_printf (*toc, "%s ", str);
+					g_free (str);
+				}
+
+				if (named_dest)
+					g_string_append_printf (*toc, "%s ", named_dest->getCString ());
+
+				break;
+			}
+
+			case actionLaunch: {
+				guint title_length = item->getTitleLength ();
+				LinkLaunch *lan = dynamic_cast <LinkLaunch *> (link_action);
+				GooString *filen, *param;
+
+				filen = lan->getFileName();
+				param = lan->getParams();
+
+				if (title_length > 0) {
+					gchar *str = unicode_to_char (item->getTitle(),
+					                              title_length);
+					g_string_append_printf (*toc, "%s ", str);
+					g_free (str);
+				}
+
+				if (filen)
+					g_string_append_printf (*toc, "%s ", filen->getCString ());
+
+				if (param)
+					g_string_append_printf (*toc, "%s ", param->getCString ());
+
+				break;
+			}
+
+			case actionURI: {
+				LinkURI *uri = dynamic_cast <LinkURI *> (link_action);
+				GooString *muri;
+
+				muri = uri->getURI();
+
+				if (muri)
+					g_string_append_printf (*toc, "%s ", muri->getCString ());
+
+				break;
+			}
+
+			case actionNamed: {
+				guint title_length = item->getTitleLength ();
+				LinkNamed *named = dynamic_cast <LinkNamed *> (link_action);
+				GooString *named_dest = named->getName ();
+
+				if (title_length > 0) {
+					gchar *str = unicode_to_char (item->getTitle(),
+					                              title_length);
+					g_string_append_printf (*toc, "%s ", str);
+					g_free (str);
+				}
+
+				if (named_dest)
+					g_string_append_printf (*toc, "%s ", named_dest->getCString ());
+
+				break;
+			}
+
+			case actionMovie: {
+				guint title_length = item->getTitleLength ();
+
+				if (title_length > 0) {
+					gchar *str = unicode_to_char (item->getTitle(),
+					                              title_length);
+					g_string_append_printf (*toc, "%s ", str);
+					g_free (str);
+				}
+
+				break;
+			}
+
+			case actionRendition:
+			case actionSound:
+			case actionJavaScript:
+			case actionUnknown:
+			case actionGoToR:
+				/* Do nothing */
+				break;
+		}
+
+		if (item->hasKids ())
+			read_toc (item->getKids (), toc);
+	}
+
+}
+
+static void
+read_outline (PDFDoc               *document,
+              TrackerSparqlBuilder *metadata)
+{
+	Outline *outline;
+	GString *toc = NULL;
+	GooList *items;
+
+	outline = document->getOutline();
+
+	if (!outline) {
+		return;
+	}
+
+	items = outline->getItems ();
+
+	if (items == NULL)
+		return;
+
+	read_toc (items, &toc);
+
+	if (toc) {
+		if (toc->len > 0) {
+			tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
+			tracker_sparql_builder_object_unvalidated (metadata, toc->str);
+		}
+
+		g_string_free (toc, TRUE);
+	}
+}
+
+static void
+insert_keywords (TrackerSparqlBuilder *metadata,
+                 gchar                *keywords)
+{
+	char *saveptr, *p;
+	size_t len;
+
+	p = keywords;
+	keywords = strchr (keywords, '"');
+
+	if (keywords) {
+		keywords++;
+	} else {
+		keywords = p;
+	}
+
+	len = strlen (keywords);
+	if (keywords[len - 1] == '"') {
+		keywords[len - 1] = '\0';
+	}
+
+	for (p = strtok_r (keywords, ",; ", &saveptr);
+	     p;
+	     p = strtok_r (NULL, ",; ", &saveptr)) {
+		tracker_sparql_builder_predicate (metadata, "nao:hasTag");
+
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nao:Tag");
+
+		tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
+		tracker_sparql_builder_object_unvalidated (metadata, p);
+
+		tracker_sparql_builder_object_blank_close (metadata);
+	}
+}
+
+static void
+page_get_size (Page    *page,
+               gdouble *width,
+               gdouble *height)
+{
+  gdouble page_width, page_height;
+  gint rotate;
+
+  rotate = page->getRotate ();
+
+  if (rotate == 90 || rotate == 270) {
+    page_height = page->getCropWidth ();
+    page_width = page->getCropHeight ();
+  } else {
+    page_width = page->getCropWidth ();
+    page_height = page->getCropHeight ();
+  }
+
+  if (width != NULL)
+    *width = page_width;
+  if (height != NULL)
+    *height = page_height;
+}
+
+static gchar *
+extract_content (PDFDoc *document,
+                 guint   n_words)
+{
+	Page *page;
+	Catalog *catalog;
+	GString *string;
+	gint n_pages, i, words;
+	gchar *t;
+
+	n_pages = document->getNumPages();
+	string = g_string_new ("");
+	words = i = 0;
+	catalog = document->getCatalog();
+
+	while (i < n_pages && words < n_words) {
+		guint normalized_words = 0;
+		Gfx *gfx;
+		GooString *sel_text;
+		TextOutputDev *text_dev;
+		PDFRectangle pdf_selection;
+		gdouble height = 0, width = 0;
+
+		page = catalog->getPage (i + 1);
+		i++;
+
+		text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+		gfx = page->createGfx (text_dev,
+		                       72.0, 72.0, 0,
+		                       gFalse, /* useMediaBox */
+		                       gTrue, /* Crop */
+		                       -1, -1, -1, -1,
+		                       gFalse, /* printing */
+		                       catalog,
+		                       NULL, NULL, NULL, NULL);
+
+		page->display(gfx);
+		text_dev->endPage();
+
+		page_get_size (page, &width, &height);
+
+		pdf_selection.x1 = 0;
+		pdf_selection.y1 = 0;
+		pdf_selection.x2 = width;
+		pdf_selection.y2 = height;
+
+		sel_text = text_dev->getSelectionText (&pdf_selection, selectionStyleWord);
+
+		t = tracker_text_normalize (sel_text->getCString (), n_words - words, &normalized_words);
+
+		words += normalized_words;
+		g_string_append (string, t);
+
+		g_free (t);
+
+		delete gfx;
+		delete text_dev;
+	}
+
+	return g_string_free (string, FALSE);
+}
+
+static void
+write_pdf_data (PDFData               data,
+                TrackerSparqlBuilder *metadata)
+{
+	if (!tracker_is_empty_string (data.title)) {
+		tracker_sparql_builder_predicate (metadata, "nie:title");
+		tracker_sparql_builder_object_unvalidated (metadata, data.title);
+	}
+
+	if (!tracker_is_empty_string (data.subject)) {
+		tracker_sparql_builder_predicate (metadata, "nie:subject");
+		tracker_sparql_builder_object_unvalidated (metadata, data.subject);
+	}
+
+	if (!tracker_is_empty_string (data.author)) {
+		tracker_sparql_builder_predicate (metadata, "nco:creator");
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nco:Contact");
+		tracker_sparql_builder_predicate (metadata, "nco:fullname");
+		tracker_sparql_builder_object_unvalidated (metadata, data.author);
+		tracker_sparql_builder_object_blank_close (metadata);
+	}
+
+	if (!tracker_is_empty_string (data.date)) {
+		tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
+		tracker_sparql_builder_object_unvalidated (metadata, data.date);
+	}
+
+	if (!tracker_is_empty_string (data.keywords)) {
+		insert_keywords (metadata, data.keywords);
+	}
+}
+
+
+static PDFDoc*
+poppler_document_new_pdf_from_file (const char  *uri,
+                                    const char  *password)
+{
+	PDFDoc *newDoc;
+	GooString *filename_g;
+	GooString *password_g;
+	gchar *filename;
+
+	if (!globalParams) {
+		globalParams = new GlobalParams();
+	}
+
+	filename = g_filename_from_uri (uri, NULL, NULL);
+	if (!filename)
+		return NULL;
+
+	filename_g = new GooString (filename);
+	g_free (filename);
+
+	password_g = NULL;
+	if (password != NULL) {
+		if (g_utf8_validate (password, -1, NULL)) {
+			gchar *password_latin;
+
+			password_latin = g_convert (password, -1,
+			                            "ISO-8859-1",
+			                            "UTF-8",
+			                            NULL, NULL, NULL);
+			password_g = new GooString (password_latin);
+			g_free (password_latin);
+		} else {
+			password_g = new GooString (password);
+		}
+	}
+
+	newDoc = new PDFDoc(filename_g, password_g, password_g);
+	delete password_g;
+
+	return newDoc;
+}
+
+static gchar*
+info_dict_get_string (Dict *info_dict, const gchar *key)
+{
+	Object obj;
+	GooString *goo_value;
+	gchar *result;
+
+	if (!info_dict->lookup ((gchar *)key, &obj)->isString ()) {
+		obj.free ();
+		return NULL;
+	}
+
+	goo_value = obj.getString ();
+
+	if (goo_value->hasUnicodeMarker()) {
+		result = g_convert (goo_value->getCString () + 2,
+		                    goo_value->getLength () - 2,
+		                    "UTF-8", "UTF-16BE", NULL, NULL, NULL);
+	} else {
+		int len;
+		gunichar *ucs4_temp;
+		int i;
+
+		len = goo_value->getLength ();
+		ucs4_temp = g_new (gunichar, len + 1);
+		for (i = 0; i < len; ++i) {
+			ucs4_temp[i] = pdfDocEncoding[(unsigned char)goo_value->getChar(i)];
+		}
+		ucs4_temp[i] = 0;
+		result = g_ucs4_to_utf8 (ucs4_temp, -1, NULL, NULL, NULL);
+		g_free (ucs4_temp);
+	}
+
+	obj.free ();
+
+	return result;
+}
+
+static void
+extract_pdf (const gchar          *uri,
+             TrackerSparqlBuilder *preupdate,
+             TrackerSparqlBuilder *metadata)
+{
+	TrackerFTSConfig *fts_config;
+	TrackerXmpData *xd = NULL;
+	PDFData pd = { 0 }; /* actual data */
+	PDFData md = { 0 }; /* for merging */
+	PDFDoc *document;
+	gchar *content;
+	guint n_words;
+	Object obj;
+	Catalog *catalog;
+
+	g_type_init ();
+
+	document = poppler_document_new_pdf_from_file (uri, NULL);
+
+	if (!document->isOk()) {
+		int fopen_errno;
+		switch (document->getErrorCode()) {
+			case errEncrypted:
+				tracker_sparql_builder_predicate (metadata, "a");
+				tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+				tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+				tracker_sparql_builder_object_boolean (metadata, TRUE);
+				break;
+			case errBadCatalog:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', Failed to read the document catalog", uri);
+				break;
+			case errDamaged:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', PDF document is damaged", uri);
+				break;
+			case errOpenFile:
+				fopen_errno = document->getFopenErrno();
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+				           uri, g_strerror (fopen_errno));
+				break;
+			default:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', no error given", uri);
+				break;
+		}
+
+		delete document;
+		return;
+	}
+
+	if (!document) {
+		g_warning ("Could not create PopplerDocument from uri:'%s', "
+		           "NULL returned without an error",
+		           uri);
+		return;
+	}
+
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+	document->getDocInfo (&obj);
+	if (obj.isDict ()) {
+		Dict *info_dict = obj.getDict();
+		pd.title = info_dict_get_string (info_dict, "Title");
+		pd.author = info_dict_get_string (info_dict, "Author");
+		pd.subject = info_dict_get_string (info_dict, "Subject");
+		pd.keywords = info_dict_get_string (info_dict, "Keywords");
+		pd.creation_date = info_dict_get_string (info_dict, "CreationDate");
+	}
+	obj.free ();
+
+
+	catalog = document->getCatalog ();
+	if (catalog && catalog->isOk ()) {
+		GooString *s = catalog->readMetadata ();
+		if (s != NULL) {
+			const gchar *xml;
+
+			xml = s->getCString();
+			xd = tracker_xmp_new (xml, strlen (xml), uri);
+
+			if (!xd) {
+				xd = g_new0 (TrackerXmpData, 1);
+			}
+
+			delete s;
+
+			/* The casts here are well understood and known */
+			md.title = (gchar *) tracker_coalesce_strip (3, pd.title, xd->title, xd->title2, xd->pdf_title);
+			md.subject = (gchar *) tracker_coalesce_strip (2, pd.subject, xd->subject);
+			md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
+			md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);
+
+			write_pdf_data (md, metadata);
+
+			if (xd->keywords) {
+				insert_keywords (metadata, xd->keywords);
+			}
+
+			if (xd->pdf_keywords) {
+				insert_keywords (metadata, xd->pdf_keywords);
+			}
+
+			if (xd->publisher) {
+				tracker_sparql_builder_predicate (metadata, "nco:publisher");
+				tracker_sparql_builder_object_blank_open (metadata);
+				tracker_sparql_builder_predicate (metadata, "a");
+				tracker_sparql_builder_object (metadata, "nco:Contact");
+				tracker_sparql_builder_predicate (metadata, "nco:fullname");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->publisher);
+				tracker_sparql_builder_object_blank_close (metadata);
+			}
+
+			if (xd->type) {
+				tracker_sparql_builder_predicate (metadata, "dc:type");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->type);
+			}
+
+			if (xd->format) {
+				tracker_sparql_builder_predicate (metadata, "dc:format");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->format);
+			}
+
+			if (xd->identifier) {
+				tracker_sparql_builder_predicate (metadata, "dc:identifier");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->identifier);
+			}
+
+			if (xd->source) {
+				tracker_sparql_builder_predicate (metadata, "dc:source");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->source);
+			}
+
+			if (xd->language) {
+				tracker_sparql_builder_predicate (metadata, "dc:language");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->language);
+			}
+
+			if (xd->relation) {
+				tracker_sparql_builder_predicate (metadata, "dc:relation");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->relation);
+			}
+
+			if (xd->coverage) {
+				tracker_sparql_builder_predicate (metadata, "dc:coverage");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->coverage);
+			}
+
+			if (xd->license) {
+				tracker_sparql_builder_predicate (metadata, "nie:license");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->license);
+			}
+
+			if (xd->make || xd->model) {
+				gchar *camera;
+
+				if ((xd->make == NULL || xd->model == NULL) ||
+					(xd->make && xd->model && strstr (xd->model, xd->make) == NULL)) {
+					camera = tracker_merge_const (" ", 2, xd->make, xd->model);
+				} else {
+					camera = g_strdup (xd->model);
+				}
+
+				tracker_sparql_builder_predicate (metadata, "nmm:camera");
+				tracker_sparql_builder_object_unvalidated (metadata, camera);
+				g_free (camera);
+			}
+
+			if (xd->orientation) {
+				tracker_sparql_builder_predicate (metadata, "nfo:orientation");
+				tracker_sparql_builder_object (metadata, xd->orientation);
+			}
+
+			if (xd->rights) {
+				tracker_sparql_builder_predicate (metadata, "nie:copyright");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->rights);
+			}
+
+			if (xd->white_balance) {
+				tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance");
+				tracker_sparql_builder_object (metadata, xd->white_balance);
+			}
+
+			if (xd->fnumber) {
+				gdouble value;
+
+				value = g_strtod (xd->fnumber, NULL);
+				tracker_sparql_builder_predicate (metadata, "nmm:fnumber");
+				tracker_sparql_builder_object_double (metadata, value);
+			}
+
+			if (xd->flash) {
+				tracker_sparql_builder_predicate (metadata, "nmm:flash");
+				tracker_sparql_builder_object (metadata, xd->flash);
+			}
+
+			if (xd->focal_length) {
+				gdouble value;
+
+				value = g_strtod (xd->focal_length, NULL);
+				tracker_sparql_builder_predicate (metadata, "nmm:focalLength");
+				tracker_sparql_builder_object_double (metadata, value);
+			}
+
+			/* Question: Shouldn't xd->Artist be merged with md.author instead? */
+
+			if (xd->artist || xd->contributor) {
+				const gchar *artist;
+
+				artist = tracker_coalesce_strip (2, xd->artist, xd->contributor);
+				tracker_sparql_builder_predicate (metadata, "nco:contributor");
+				tracker_sparql_builder_object_blank_open (metadata);
+				tracker_sparql_builder_predicate (metadata, "a");
+				tracker_sparql_builder_object (metadata, "nco:Contact");
+				tracker_sparql_builder_predicate (metadata, "nco:fullname");
+				tracker_sparql_builder_object_unvalidated (metadata, artist);
+				tracker_sparql_builder_object_blank_close (metadata);
+			}
+
+			if (xd->exposure_time) {
+				gdouble value;
+
+				value = g_strtod (xd->exposure_time, NULL);
+				tracker_sparql_builder_predicate (metadata, "nmm:exposureTime");
+				tracker_sparql_builder_object_double (metadata, value);
+			}
+
+			if (xd->iso_speed_ratings) {
+				gdouble value;
+
+				value = g_strtod (xd->iso_speed_ratings, NULL);
+				tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed");
+				tracker_sparql_builder_object_double (metadata, value);
+			}
+
+			if (xd->description) {
+				tracker_sparql_builder_predicate (metadata, "nie:description");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->description);
+			}
+
+			if (xd->metering_mode) {
+				tracker_sparql_builder_predicate (metadata, "nmm:meteringMode");
+				tracker_sparql_builder_object (metadata, xd->metering_mode);
+			}
+
+			if (xd->address || xd->country || xd->city) {
+				tracker_sparql_builder_predicate (metadata, "mlo:location");
+
+				tracker_sparql_builder_object_blank_open (metadata);
+				tracker_sparql_builder_predicate (metadata, "a");
+				tracker_sparql_builder_object (metadata, "mlo:GeoPoint");
+
+				if (xd->address) {
+					tracker_sparql_builder_predicate (metadata, "mlo:address");
+					tracker_sparql_builder_object_unvalidated (metadata, xd->address);
+				}
+
+				if (xd->state) {
+					tracker_sparql_builder_predicate (metadata, "mlo:state");
+					tracker_sparql_builder_object_unvalidated (metadata, xd->state);
+				}
+
+				if (xd->city) {
+					tracker_sparql_builder_predicate (metadata, "mlo:city");
+					tracker_sparql_builder_object_unvalidated (metadata, xd->city);
+				}
+
+				if (xd->country) {
+					tracker_sparql_builder_predicate (metadata, "mlo:country");
+					tracker_sparql_builder_object_unvalidated (metadata, xd->country);
+				}
+
+				tracker_sparql_builder_object_blank_close (metadata);
+			}
+
+			/* PDF keywords aren't used ATM (why not?) */
+			g_free (pd.keywords);
+
+			g_free (pd.title);
+			g_free (pd.subject);
+			g_free (pd.creation_date);
+			g_free (pd.author);
+			g_free (pd.date);
+
+			tracker_xmp_free (xd);
+		}
+	} else {
+		/* So if we are here we have NO XMP data and we just
+		 * write what we know from Poppler.
+		 */
+		write_pdf_data (pd, metadata);
+	}
+
+	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
+	tracker_sparql_builder_object_int64 (metadata, document->getNumPages());
+
+	fts_config = tracker_main_get_fts_config ();
+	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	content = extract_content (document, n_words);
+
+	if (content) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+		g_free (content);
+	}
+
+	read_outline (document, metadata);
+
+	delete document;
+}
+
+TrackerExtractData *
+tracker_extract_get_data (void)
+{
+	return data;
+}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]