[tracker/tracker-0.10] tracker-extract, pdf: Use poppler-glib again for PDF extraction



commit ce04cd1ce89f88c90fd5bfe60249251b04f7abd0
Author: Carlos Garnacho <carlosg gnome org>
Date:   Tue Feb 15 16:53:44 2011 +0100

    tracker-extract,pdf: Use poppler-glib again for PDF extraction
    
    The old code has been resurrected with a few changes to
    make it work with Poppler 0.16.0. Now that poppler_page_get_text()
    uses TextOutputDev underneath, there is no longer any need to
    use the C++ interface directly, which is not guaranteed
    to be API compatible.
    
    Fixes Bug GB#636375 - 0.9.30: tracker-extract-pdf.cpp:
    error: GlobalParams.h: No such file or directory, reported
    by Andre Klapper.

 configure.ac                                |    8 +-
 src/tracker-extract/Makefile.am             |    4 +-
 src/tracker-extract/tracker-extract-pdf.c   |  594 ++++++++++++++++++
 src/tracker-extract/tracker-extract-pdf.cpp |  858 ---------------------------
 4 files changed, 600 insertions(+), 864 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 25a92fe..4712359 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,7 +178,7 @@ UPOWER_REQUIRED=0.9.0
 GDKPIXBUF_REQUIRED=2.12.0
 QT_REQUIRED=4.7.1
 MEEGOTOUCH_REQUIRED=0.20
-POPPLER_REQUIRED=0.12.2
+POPPLER_REQUIRED=0.16.0
 CAIRO_REQUIRED=1.0
 GDK_REQUIRED=1.0
 LIBVORBIS_REQUIRED=0.22
@@ -1638,7 +1638,7 @@ AC_ARG_ENABLE(poppler,
 
 if test "x$enable_poppler" != "xno" ; then
    PKG_CHECK_MODULES(POPPLER,
-                     [poppler >= $POPPLER_REQUIRED],
+                     [poppler-glib >= $POPPLER_REQUIRED],
                      [have_poppler=yes],
                      [have_poppler=no])
 
@@ -1646,7 +1646,7 @@ if test "x$enable_poppler" != "xno" ; then
    AC_SUBST(POPPLER_LIBS)
 
    if test "x$have_poppler" = "xyes"; then
-      AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler])
+      AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler-glib])
    fi
 else
    have_poppler="no  (disabled)"
@@ -1654,7 +1654,7 @@ fi
 
 if test "x$enable_poppler" = "xyes"; then
    if test "x$have_poppler" != "xyes"; then
-      AC_MSG_ERROR([Couldn't find poppler >= $POPPLER_REQUIRED.])
+      AC_MSG_ERROR([Couldn't find poppler-glib >= $POPPLER_REQUIRED.])
    fi
 fi
 
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 5c83b28..e2027a9 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -236,8 +236,8 @@ libextract_msoffice_xml_la_LIBADD = \
 	$(TRACKER_EXTRACT_MODULES_LIBS)
 
 # PDF
-libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
-libextract_pdf_la_CXXFLAGS = \
+libextract_pdf_la_SOURCES = tracker-extract-pdf.c
+libextract_pdf_la_CFLAGS = \
 	$(TRACKER_EXTRACT_MODULES_CFLAGS) \
 	$(POPPLER_CFLAGS)
 libextract_pdf_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
new file mode 100644
index 0000000..c9ee8f7
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -0,0 +1,594 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008-2009, Nokia <ivan frade nokia com>
+ * Copyright (C) 2010, Amit Aggarwal <amitcs06 gmail com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+#include <glib/poppler.h>
+
+#include <libtracker-common/tracker-date-time.h>
+#include <libtracker-common/tracker-utils.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-main.h"
+
+/* String metadata collected for one PDF document. extract_pdf() keeps two
+ * instances: one with the values read from Poppler and one with the values
+ * merged with the XMP packet.
+ */
+typedef struct {
+	gchar *title;
+	gchar *subject;
+	gchar *creation_date; /* built from Poppler's "creation-date" property */
+	gchar *author;
+	gchar *date;          /* merged date, written out as nie:contentCreated */
+	gchar *keywords;
+} PDFData;
+
+/* Extractor for PDF files; registered in the table below */
+static void extract_pdf (const gchar          *uri,
+                         TrackerSparqlBuilder *preupdate,
+                         TrackerSparqlBuilder *metadata);
+
+/* MIME type / extractor pairs handed out by tracker_extract_get_data().
+ * The table ends with a { NULL, NULL } entry.
+ */
+static TrackerExtractData data[] = {
+	{ "application/pdf", extract_pdf },
+	{ NULL, NULL }
+};
+
+/*
+ * read_toc:
+ * @index: outline iterator to walk, may be NULL. The iterator is freed
+ *         before this function returns.
+ * @toc: location of the GString that collects the text. *toc is created
+ *       on demand when @index is not NULL.
+ *
+ * Walks the outline level pointed to by @index, recursing into the children
+ * of every entry, and appends the title plus the action specific strings of
+ * each entry to *toc, each followed by a space.
+ */
+static void
+read_toc (PopplerIndexIter  *index,
+          GString          **toc)
+{
+	if (!index) {
+		return;
+	}
+
+	if (!*toc) {
+		*toc = g_string_new ("");
+	}
+
+	do {
+		PopplerAction *action;
+		PopplerIndexIter *iter;
+
+		action = poppler_index_iter_get_action (index);
+
+		/* A missing action only skips the text of this entry, its
+		 * children are still visited below.
+		 */
+		if (action) {
+			switch (action->type) {
+				case POPPLER_ACTION_GOTO_DEST: {
+					PopplerDest *dest = action->goto_dest.dest;
+
+					if (!tracker_is_empty_string (action->goto_dest.title)) {
+						g_string_append_printf (*toc, "%s ", action->goto_dest.title);
+					}
+
+					if (dest && !tracker_is_empty_string (dest->named_dest)) {
+						g_string_append_printf (*toc, "%s ", dest->named_dest);
+					}
+
+					break;
+				}
+
+				case POPPLER_ACTION_LAUNCH: {
+					if (!tracker_is_empty_string (action->launch.title)) {
+						g_string_append_printf (*toc, "%s ", action->launch.title);
+					}
+
+					if (!tracker_is_empty_string (action->launch.file_name)) {
+						g_string_append_printf (*toc, "%s ", action->launch.file_name);
+					}
+
+					if (!tracker_is_empty_string (action->launch.params)) {
+						g_string_append_printf (*toc, "%s ", action->launch.params);
+					}
+
+					break;
+				}
+
+				case POPPLER_ACTION_URI: {
+					if (!tracker_is_empty_string (action->uri.uri)) {
+						g_string_append_printf (*toc, "%s ", action->uri.uri);
+					}
+
+					break;
+				}
+
+				case POPPLER_ACTION_NAMED: {
+					if (!tracker_is_empty_string (action->named.title)) {
+						g_string_append_printf (*toc, "%s, ", action->named.title);
+					}
+
+					if (!tracker_is_empty_string (action->named.named_dest)) {
+						g_string_append_printf (*toc, "%s ", action->named.named_dest);
+					}
+
+					break;
+				}
+
+				case POPPLER_ACTION_MOVIE: {
+					/* Only the title shared by all action
+					 * types is used, so go through the
+					 * generic member instead of casting to
+					 * an unrelated action struct.
+					 */
+					if (!tracker_is_empty_string (action->any.title)) {
+						g_string_append_printf (*toc, "%s ", action->any.title);
+					}
+
+					break;
+				}
+
+				default:
+					/* Nothing to extract from any other
+					 * action type.
+					 */
+					break;
+			}
+
+			/* The action is a copy owned by us; not freeing it
+			 * leaked one PopplerAction per outline entry.
+			 */
+			poppler_action_free (action);
+		}
+
+		/* read_toc() copes with a NULL child and frees a valid one */
+		iter = poppler_index_iter_get_child (index);
+		read_toc (iter, toc);
+	} while (poppler_index_iter_next (index));
+
+	poppler_index_iter_free (index);
+}
+
+/*
+ * read_outline:
+ * @document: document whose outline is read.
+ * @metadata: builder receiving the statement.
+ *
+ * Flattens the document outline into one string and, when that string is
+ * not empty, adds it to @metadata as nfo:tableOfContents. Nothing is added
+ * for documents without an outline.
+ */
+static void
+read_outline (PopplerDocument      *document,
+              TrackerSparqlBuilder *metadata)
+{
+	GString *contents = NULL;
+	PopplerIndexIter *root;
+
+	root = poppler_index_iter_new (document);
+
+	if (root != NULL) {
+		/* read_toc() allocates the string and releases the iterator */
+		read_toc (root, &contents);
+	}
+
+	if (contents == NULL) {
+		return;
+	}
+
+	if (contents->len != 0) {
+		tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
+		tracker_sparql_builder_object_unvalidated (metadata, contents->str);
+	}
+
+	g_string_free (contents, TRUE);
+}
+
+/*
+ * insert_keywords:
+ * @metadata: builder receiving the tags.
+ * @keywords: keyword list, modified in place while it is tokenized.
+ *            NULL and empty strings are accepted and produce no tags.
+ *
+ * Splits @keywords on commas, semicolons and spaces and adds one nao:Tag
+ * blank node (nao:hasTag / nao:prefLabel) per token. Everything up to and
+ * including the first double quote is skipped and a trailing double quote
+ * is dropped before splitting.
+ */
+static void
+insert_keywords (TrackerSparqlBuilder *metadata,
+                 gchar                *keywords)
+{
+	char *saveptr, *p;
+	size_t len;
+
+	if (!keywords) {
+		return;
+	}
+
+	/* Start right after the first double quote, if there is one */
+	p = strchr (keywords, '"');
+
+	if (p) {
+		keywords = p + 1;
+	}
+
+	/* Only look at the last character when there is one: with an empty
+	 * string, len - 1 wraps around and indexes outside of the buffer.
+	 */
+	len = strlen (keywords);
+	if (len > 0 && keywords[len - 1] == '"') {
+		keywords[len - 1] = '\0';
+	}
+
+	for (p = strtok_r (keywords, ",; ", &saveptr);
+	     p;
+	     p = strtok_r (NULL, ",; ", &saveptr)) {
+		tracker_sparql_builder_predicate (metadata, "nao:hasTag");
+
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nao:Tag");
+
+		tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
+		tracker_sparql_builder_object_unvalidated (metadata, p);
+
+		tracker_sparql_builder_object_blank_close (metadata);
+	}
+}
+
+/*
+ * extract_content:
+ * @document: document to read the text from.
+ * @n_bytes: maximum number of text bytes to collect.
+ *
+ * Concatenates the text of the pages of @document, in page order, until
+ * all pages were read, the @n_bytes budget is used up, or 5 seconds have
+ * elapsed.
+ *
+ * Returns: a newly allocated string (possibly empty), to be freed with
+ * g_free() by the caller.
+ */
+static gchar *
+extract_content (PopplerDocument *document,
+                 gsize            n_bytes)
+{
+	gint n_pages, i = 0;
+	GString *string;
+	GTimer *timer;
+
+	n_pages = poppler_document_get_n_pages (document);
+	string = g_string_new ("");
+	timer = g_timer_new ();
+
+	while (i < n_pages && n_bytes > 0 &&
+	       g_timer_elapsed (timer, NULL) < 5) {
+		PopplerPage *page;
+		/* Start from 0 so the budget is never adjusted by an
+		 * indeterminate value, should the validator not store a count.
+		 */
+		gsize written_bytes = 0;
+		gchar *text;
+
+		page = poppler_document_get_page (document, i);
+		i++;
+
+		if (!page) {
+			continue;
+		}
+
+		text = poppler_page_get_text (page);
+
+		if (text) {
+			if (tracker_text_validate_utf8 (text,
+			                                MIN (strlen (text), n_bytes),
+			                                &string,
+			                                &written_bytes)) {
+				g_string_append_c (string, ' ');
+			}
+
+			/* Never let the unsigned budget wrap around */
+			n_bytes -= MIN (written_bytes, n_bytes);
+
+			g_free (text);
+		}
+
+		/* Released on every path; pages without text used to leak */
+		g_object_unref (page);
+	}
+
+	g_timer_destroy (timer);
+
+	return g_string_free (string, FALSE);
+}
+
+/*
+ * write_pdf_data:
+ * @data: values to write; NULL and empty members are skipped.
+ * @metadata: builder receiving the statements.
+ *
+ * Emits, in this order, nie:title, nie:subject, nco:creator (as an
+ * nco:Contact blank node), nie:contentCreated and the keyword tags for the
+ * members of @data that are set.
+ */
+static void
+write_pdf_data (PDFData               data,
+                TrackerSparqlBuilder *metadata)
+{
+	const PDFData *values = &data;
+
+	if (!tracker_is_empty_string (values->title)) {
+		tracker_sparql_builder_predicate (metadata, "nie:title");
+		tracker_sparql_builder_object_unvalidated (metadata, values->title);
+	}
+
+	if (!tracker_is_empty_string (values->subject)) {
+		tracker_sparql_builder_predicate (metadata, "nie:subject");
+		tracker_sparql_builder_object_unvalidated (metadata, values->subject);
+	}
+
+	if (!tracker_is_empty_string (values->author)) {
+		/* The author is described by an anonymous contact */
+		tracker_sparql_builder_predicate (metadata, "nco:creator");
+
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nco:Contact");
+		tracker_sparql_builder_predicate (metadata, "nco:fullname");
+		tracker_sparql_builder_object_unvalidated (metadata, values->author);
+		tracker_sparql_builder_object_blank_close (metadata);
+	}
+
+	if (!tracker_is_empty_string (values->date)) {
+		tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
+		tracker_sparql_builder_object_unvalidated (metadata, values->date);
+	}
+
+	if (!tracker_is_empty_string (values->keywords)) {
+		insert_keywords (metadata, values->keywords);
+	}
+}
+
+/*
+ * extract_pdf:
+ * @uri: URI of the PDF file.
+ * @preupdate: not used by this extractor.
+ * @metadata: builder receiving the extracted statements.
+ *
+ * Opens @uri with Poppler and describes it in @metadata: the document
+ * information (title, author, subject, keywords, creation date) merged
+ * with the XMP packet when one is present, the page count, the plain text
+ * content limited to the configured number of bytes, and the table of
+ * contents. Encrypted documents are only flagged with
+ * nfo:isContentEncrypted; any other load failure is logged and nothing is
+ * written.
+ */
+static void
+extract_pdf (const gchar          *uri,
+             TrackerSparqlBuilder *preupdate,
+             TrackerSparqlBuilder *metadata)
+{
+	TrackerConfig *config;
+	GTime creation_date = 0;
+	GError *error = NULL;
+	TrackerXmpData *xd = NULL;
+	PDFData pd = { 0 }; /* actual data */
+	PDFData md = { 0 }; /* for merging */
+	PopplerDocument *document;
+	gchar *xml = NULL;
+	gchar *content;
+	guint n_bytes;
+
+	g_type_init ();
+
+	document = poppler_document_new_from_file (uri, NULL, &error);
+
+	if (error) {
+		if (error->code == POPPLER_ERROR_ENCRYPTED) {
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+			tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+			tracker_sparql_builder_object_boolean (metadata, TRUE);
+		} else {
+			g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+			           uri,
+			           error->message ? error->message : "no error given");
+		}
+
+		/* Freed on both paths; the encrypted case used to return
+		 * without releasing the error.
+		 */
+		g_error_free (error);
+		return;
+	}
+
+	if (!document) {
+		g_warning ("Could not create PopplerDocument from uri:'%s', "
+		           "NULL returned without an error",
+		           uri);
+		return;
+	}
+
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+	g_object_get (document,
+	              "title", &pd.title,
+	              "author", &pd.author,
+	              "subject", &pd.subject,
+	              "keywords", &pd.keywords,
+	              "creation-date", &creation_date,
+	              "metadata", &xml,
+	              NULL);
+
+	if (creation_date > 0) {
+		pd.creation_date = tracker_date_to_string ((time_t) creation_date);
+	}
+
+	if (xml &&
+	    (xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) {
+		/* The casts here are well understood and known.
+		 *
+		 * md only borrows strings owned by pd and xd, so it is never
+		 * freed. The first argument is the number of candidates that
+		 * follow; the title call passed 3 for four candidates, which
+		 * left xd->pdf_title out.
+		 */
+		md.title = (gchar *) tracker_coalesce_strip (4, pd.title, xd->title, xd->title2, xd->pdf_title);
+		md.subject = (gchar *) tracker_coalesce_strip (2, pd.subject, xd->subject);
+		md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
+		md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);
+
+		write_pdf_data (md, metadata);
+
+		/* md.keywords is left unset, the three keyword sources are
+		 * written one after the other instead.
+		 */
+		if (xd->keywords) {
+			insert_keywords (metadata, xd->keywords);
+		}
+
+		if (xd->pdf_keywords) {
+			insert_keywords (metadata, xd->pdf_keywords);
+		}
+
+		if (pd.keywords) {
+			insert_keywords (metadata, pd.keywords);
+		}
+
+		if (xd->publisher) {
+			tracker_sparql_builder_predicate (metadata, "nco:publisher");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->publisher);
+			tracker_sparql_builder_object_blank_close (metadata);
+		}
+
+		if (xd->type) {
+			tracker_sparql_builder_predicate (metadata, "dc:type");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->type);
+		}
+
+		if (xd->format) {
+			tracker_sparql_builder_predicate (metadata, "dc:format");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->format);
+		}
+
+		if (xd->identifier) {
+			tracker_sparql_builder_predicate (metadata, "dc:identifier");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->identifier);
+		}
+
+		if (xd->source) {
+			tracker_sparql_builder_predicate (metadata, "dc:source");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->source);
+		}
+
+		if (xd->language) {
+			tracker_sparql_builder_predicate (metadata, "dc:language");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->language);
+		}
+
+		if (xd->relation) {
+			tracker_sparql_builder_predicate (metadata, "dc:relation");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->relation);
+		}
+
+		if (xd->coverage) {
+			tracker_sparql_builder_predicate (metadata, "dc:coverage");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->coverage);
+		}
+
+		if (xd->license) {
+			tracker_sparql_builder_predicate (metadata, "nie:license");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->license);
+		}
+
+		if (xd->make || xd->model) {
+			gchar *camera;
+
+			/* Use the model alone when it already contains the make */
+			if ((xd->make == NULL || xd->model == NULL) ||
+			    (xd->make && xd->model && strstr (xd->model, xd->make) == NULL)) {
+				camera = tracker_merge_const (" ", 2, xd->make, xd->model);
+			} else {
+				camera = g_strdup (xd->model);
+			}
+
+			tracker_sparql_builder_predicate (metadata, "nmm:camera");
+			tracker_sparql_builder_object_unvalidated (metadata, camera);
+			g_free (camera);
+		}
+
+		if (xd->orientation) {
+			tracker_sparql_builder_predicate (metadata, "nfo:orientation");
+			tracker_sparql_builder_object (metadata, xd->orientation);
+		}
+
+		if (xd->rights) {
+			tracker_sparql_builder_predicate (metadata, "nie:copyright");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->rights);
+		}
+
+		if (xd->white_balance) {
+			tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance");
+			tracker_sparql_builder_object (metadata, xd->white_balance);
+		}
+
+		if (xd->fnumber) {
+			gdouble value;
+
+			value = g_strtod (xd->fnumber, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:fnumber");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		if (xd->flash) {
+			tracker_sparql_builder_predicate (metadata, "nmm:flash");
+			tracker_sparql_builder_object (metadata, xd->flash);
+		}
+
+		if (xd->focal_length) {
+			gdouble value;
+
+			value = g_strtod (xd->focal_length, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:focalLength");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		/* Question: Shouldn't xd->Artist be merged with md.author instead? */
+
+		if (xd->artist || xd->contributor) {
+			const gchar *artist;
+
+			artist = tracker_coalesce_strip (2, xd->artist, xd->contributor);
+			tracker_sparql_builder_predicate (metadata, "nco:contributor");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, artist);
+			tracker_sparql_builder_object_blank_close (metadata);
+		}
+
+		if (xd->exposure_time) {
+			gdouble value;
+
+			value = g_strtod (xd->exposure_time, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:exposureTime");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		if (xd->iso_speed_ratings) {
+			gdouble value;
+
+			value = g_strtod (xd->iso_speed_ratings, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		if (xd->description) {
+			tracker_sparql_builder_predicate (metadata, "nie:description");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->description);
+		}
+
+		if (xd->metering_mode) {
+			tracker_sparql_builder_predicate (metadata, "nmm:meteringMode");
+			tracker_sparql_builder_object (metadata, xd->metering_mode);
+		}
+
+		if (xd->address || xd->country || xd->city) {
+			tracker_sparql_builder_predicate (metadata, "mlo:location");
+
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "mlo:GeoPoint");
+
+			if (xd->address) {
+				tracker_sparql_builder_predicate (metadata, "mlo:address");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->address);
+			}
+
+			if (xd->state) {
+				tracker_sparql_builder_predicate (metadata, "mlo:state");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->state);
+			}
+
+			if (xd->city) {
+				tracker_sparql_builder_predicate (metadata, "mlo:city");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->city);
+			}
+
+			if (xd->country) {
+				tracker_sparql_builder_predicate (metadata, "mlo:country");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->country);
+			}
+
+			tracker_sparql_builder_object_blank_close (metadata);
+		}
+
+		tracker_xmp_free (xd);
+	} else {
+		/* So if we are here we have NO XMP data and we just
+		 * write what we know from Poppler.
+		 */
+		write_pdf_data (pd, metadata);
+	}
+
+	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
+	tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
+
+	config = tracker_main_get_config ();
+	n_bytes = tracker_config_get_max_bytes (config);
+	content = extract_content (document, n_bytes);
+
+	if (content) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+		g_free (content);
+	}
+
+	read_outline (document, metadata);
+
+	g_free (xml);
+	g_free (pd.keywords);
+	g_free (pd.title);
+	g_free (pd.subject);
+	g_free (pd.creation_date);
+	g_free (pd.author);
+	g_free (pd.date);
+
+	g_object_unref (document);
+}
+
+/*
+ * tracker_extract_get_data:
+ *
+ * Returns: the static table of MIME type / extractor pairs provided by
+ * this module.
+ */
+TrackerExtractData *
+tracker_extract_get_data (void)
+{
+	TrackerExtractData *table = &data[0];
+
+	return table;
+}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]