[tracker/poppler-glib: 1/16] tracker-extract, pdf: Use poppler-glib again for PDF extraction
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/poppler-glib: 1/16] tracker-extract, pdf: Use poppler-glib again for PDF extraction
- Date: Wed, 2 Mar 2011 14:52:51 +0000 (UTC)
commit f4b4794c1dbac3437c6dc45e0140aa984a4de094
Author: Carlos Garnacho <carlosg gnome org>
Date: Tue Feb 15 16:53:44 2011 +0100
tracker-extract,pdf: Use poppler-glib again for PDF extraction
The old code has been resurrected with a few changes to
make it work with Poppler 0.16.0. Now that poppler_page_get_text()
uses TextOutputDev underneath, there is no longer any need to
use the C++ interface directly, which is not guaranteed
to be API compatible.
Fixes Bug 636375 - 0.9.30: tracker-extract-pdf.cpp:
error: GlobalParams.h: No such file or directory, reported
by Andre Klapper.
configure.ac | 8 +-
src/tracker-extract/Makefile.am | 4 +-
src/tracker-extract/tracker-extract-pdf.c | 594 ++++++++++++++++++
src/tracker-extract/tracker-extract-pdf.cpp | 858 ---------------------------
4 files changed, 600 insertions(+), 864 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 25a92fe..4712359 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,7 +178,7 @@ UPOWER_REQUIRED=0.9.0
GDKPIXBUF_REQUIRED=2.12.0
QT_REQUIRED=4.7.1
MEEGOTOUCH_REQUIRED=0.20
-POPPLER_REQUIRED=0.12.2
+POPPLER_REQUIRED=0.16.0
CAIRO_REQUIRED=1.0
GDK_REQUIRED=1.0
LIBVORBIS_REQUIRED=0.22
@@ -1638,7 +1638,7 @@ AC_ARG_ENABLE(poppler,
if test "x$enable_poppler" != "xno" ; then
PKG_CHECK_MODULES(POPPLER,
- [poppler >= $POPPLER_REQUIRED],
+ [poppler-glib >= $POPPLER_REQUIRED],
[have_poppler=yes],
[have_poppler=no])
@@ -1646,7 +1646,7 @@ if test "x$enable_poppler" != "xno" ; then
AC_SUBST(POPPLER_LIBS)
if test "x$have_poppler" = "xyes"; then
- AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler])
+ AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler-glib])
fi
else
have_poppler="no (disabled)"
@@ -1654,7 +1654,7 @@ fi
if test "x$enable_poppler" = "xyes"; then
if test "x$have_poppler" != "xyes"; then
- AC_MSG_ERROR([Couldn't find poppler >= $POPPLER_REQUIRED.])
+ AC_MSG_ERROR([Couldn't find poppler-glib >= $POPPLER_REQUIRED.])
fi
fi
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 5c83b28..e2027a9 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -236,8 +236,8 @@ libextract_msoffice_xml_la_LIBADD = \
$(TRACKER_EXTRACT_MODULES_LIBS)
# PDF
-libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
-libextract_pdf_la_CXXFLAGS = \
+libextract_pdf_la_SOURCES = tracker-extract-pdf.c
+libextract_pdf_la_CFLAGS = \
$(TRACKER_EXTRACT_MODULES_CFLAGS) \
$(POPPLER_CFLAGS)
libextract_pdf_la_LDFLAGS = $(module_flags)
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
new file mode 100644
index 0000000..c9ee8f7
--- /dev/null
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -0,0 +1,594 @@
+/*
+ * Copyright (C) 2006, Jamie McCracken <jamiemcc gnome org>
+ * Copyright (C) 2008-2009, Nokia <ivan frade nokia com>
+ * Copyright (C) 2010, Amit Aggarwal <amitcs06 gmail com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include <glib.h>
+#include <glib/poppler.h>
+
+#include <libtracker-common/tracker-date-time.h>
+#include <libtracker-common/tracker-utils.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-main.h"
+
+/* String fields collected for one PDF document. extract_pdf() keeps two
+ * instances: one filled from the Poppler document properties and one
+ * holding the values merged with the XMP metadata. write_pdf_data()
+ * turns an instance into SPARQL statements.
+ */
+typedef struct {
+ gchar *title;
+ gchar *subject;
+ gchar *creation_date; /* built from the Poppler "creation-date" property */
+ gchar *author;
+ gchar *date; /* value written as nie:contentCreated */
+ gchar *keywords;
+} PDFData;
+
+/* Forward declaration: extract_pdf() is referenced by the table below
+ * before it is defined.
+ */
+static void extract_pdf (const gchar *uri,
+ TrackerSparqlBuilder *preupdate,
+ TrackerSparqlBuilder *metadata);
+
+/* Extractor table handed out by tracker_extract_get_data(): it pairs
+ * "application/pdf" with extract_pdf() and ends with a { NULL, NULL } entry.
+ */
+static TrackerExtractData data[] = {
+ { "application/pdf", extract_pdf },
+ { NULL, NULL }
+};
+
+/**
+ * read_toc:
+ * @index: outline iterator to walk, or %NULL; it is freed before returning
+ * @toc: location of the string the outline text is appended to; a new
+ *       #GString is allocated when *@toc is %NULL
+ *
+ * Visits every entry on the level @index points at, appends the textual
+ * fields of the entry's action to *@toc and then recurses into the
+ * entry's children.
+ */
+static void
+read_toc (PopplerIndexIter  *index,
+          GString          **toc)
+{
+	if (!index) {
+		return;
+	}
+
+	if (!*toc) {
+		*toc = g_string_new ("");
+	}
+
+	do {
+		PopplerAction *action;
+		PopplerIndexIter *iter;
+
+		action = poppler_index_iter_get_action (index);
+
+		if (action) {
+			switch (action->type) {
+			case POPPLER_ACTION_GOTO_DEST: {
+				PopplerActionGotoDest *ag = (PopplerActionGotoDest*) action;
+				PopplerDest *agd = ag->dest;
+
+				if (!tracker_is_empty_string (ag->title)) {
+					g_string_append_printf (*toc, "%s ", ag->title);
+				}
+
+				/* Do not assume a destination is always attached */
+				if (agd && !tracker_is_empty_string (agd->named_dest)) {
+					g_string_append_printf (*toc, "%s ", agd->named_dest);
+				}
+
+				break;
+			}
+
+			case POPPLER_ACTION_LAUNCH: {
+				PopplerActionLaunch *al = (PopplerActionLaunch*) action;
+
+				if (!tracker_is_empty_string (al->title)) {
+					g_string_append_printf (*toc, "%s ", al->title);
+				}
+
+				if (!tracker_is_empty_string (al->file_name)) {
+					g_string_append_printf (*toc, "%s ", al->file_name);
+				}
+
+				if (!tracker_is_empty_string (al->params)) {
+					g_string_append_printf (*toc, "%s ", al->params);
+				}
+
+				break;
+			}
+
+			case POPPLER_ACTION_URI: {
+				PopplerActionUri *au = (PopplerActionUri*) action;
+
+				if (!tracker_is_empty_string (au->uri)) {
+					g_string_append_printf (*toc, "%s ", au->uri);
+				}
+
+				break;
+			}
+
+			case POPPLER_ACTION_NAMED: {
+				PopplerActionNamed *an = (PopplerActionNamed*) action;
+
+				if (!tracker_is_empty_string (an->title)) {
+					g_string_append_printf (*toc, "%s, ", an->title);
+				}
+
+				if (!tracker_is_empty_string (an->named_dest)) {
+					g_string_append_printf (*toc, "%s ", an->named_dest);
+				}
+
+				break;
+			}
+
+			case POPPLER_ACTION_MOVIE: {
+				PopplerActionMovie *am = (PopplerActionMovie*) action;
+
+				if (!tracker_is_empty_string (am->title)) {
+					g_string_append_printf (*toc, "%s ", am->title);
+				}
+
+				break;
+			}
+
+			default:
+				/* Nothing textual to record for the other
+				 * action types (none, unknown, goto-remote,
+				 * rendition, OCG state, ...).
+				 */
+				break;
+			}
+
+			/* The iterator hands out a copy of the action which
+			 * the caller has to release.
+			 */
+			poppler_action_free (action);
+		}
+
+		/* Children are visited even when the entry itself carries
+		 * no action; read_toc() copes with a NULL child iterator.
+		 */
+		iter = poppler_index_iter_get_child (index);
+		read_toc (iter, toc);
+	} while (poppler_index_iter_next (index));
+
+	poppler_index_iter_free (index);
+}
+
+/* Collects the text of the document outline and, when that text is not
+ * empty, stores it in @metadata as nfo:tableOfContents.
+ */
+static void
+read_outline (PopplerDocument      *document,
+              TrackerSparqlBuilder *metadata)
+{
+	GString *contents = NULL;
+	PopplerIndexIter *root;
+
+	root = poppler_index_iter_new (document);
+
+	if (root == NULL) {
+		return;
+	}
+
+	/* read_toc() releases the iterator it is given */
+	read_toc (root, &contents);
+
+	if (contents == NULL) {
+		return;
+	}
+
+	if (contents->len != 0) {
+		tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
+		tracker_sparql_builder_object_unvalidated (metadata, contents->str);
+	}
+
+	g_string_free (contents, TRUE);
+}
+
+/**
+ * insert_keywords:
+ * @metadata: builder receiving the statements
+ * @keywords: keyword list; the buffer is modified in place
+ *
+ * Splits @keywords on ',', ';' and ' ' and adds one nao:hasTag blank node
+ * (a nao:Tag with a nao:prefLabel) per token. Everything up to and
+ * including the first '"' is skipped, and a trailing '"' is dropped.
+ * A %NULL or empty @keywords adds nothing.
+ */
+static void
+insert_keywords (TrackerSparqlBuilder *metadata,
+                 gchar                *keywords)
+{
+	char *saveptr, *p;
+	size_t len;
+
+	if (!keywords) {
+		return;
+	}
+
+	/* Start right after the first quote, if there is one */
+	p = strchr (keywords, '"');
+
+	if (p) {
+		keywords = p + 1;
+	}
+
+	/* Only look at the last character when there is one: with an
+	 * empty string, keywords[len - 1] would index before the start.
+	 */
+	len = strlen (keywords);
+	if (len > 0 && keywords[len - 1] == '"') {
+		keywords[len - 1] = '\0';
+	}
+
+	for (p = strtok_r (keywords, ",; ", &saveptr);
+	     p;
+	     p = strtok_r (NULL, ",; ", &saveptr)) {
+		tracker_sparql_builder_predicate (metadata, "nao:hasTag");
+
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nao:Tag");
+
+		tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
+		tracker_sparql_builder_object_unvalidated (metadata, p);
+
+		tracker_sparql_builder_object_blank_close (metadata);
+	}
+}
+
+/**
+ * extract_content:
+ * @document: document to read the text from
+ * @n_bytes: maximum number of bytes of page text to collect
+ *
+ * Appends the text of the pages, in order, until all pages were read,
+ * the @n_bytes budget is used up or the time limit has elapsed. A space
+ * is appended after each page whose text validated.
+ *
+ * Returns: a newly allocated string (possibly empty); free with g_free().
+ */
+static gchar *
+extract_content (PopplerDocument *document,
+                 gsize            n_bytes)
+{
+	/* Upper bound, in seconds, on the time spent reading pages */
+	const gdouble max_seconds = 5.0;
+	gint n_pages, i = 0;
+	GString *string;
+	GTimer *timer;
+
+	n_pages = poppler_document_get_n_pages (document);
+	string = g_string_new ("");
+	timer = g_timer_new ();
+
+	while (i < n_pages && n_bytes > 0 &&
+	       g_timer_elapsed (timer, NULL) < max_seconds) {
+		PopplerPage *page;
+		/* Starts at 0 so the budget stays untouched should the
+		 * validator leave it unset on failure.
+		 */
+		gsize written_bytes = 0;
+		gchar *text;
+
+		page = poppler_document_get_page (document, i);
+		i++;
+
+		if (!page) {
+			continue;
+		}
+
+		/* The text is an allocated copy, so the page can be
+		 * released right away; this also covers the no-text case,
+		 * which used to leak the page.
+		 */
+		text = poppler_page_get_text (page);
+		g_object_unref (page);
+
+		if (!text) {
+			continue;
+		}
+
+		if (tracker_text_validate_utf8 (text,
+		                                MIN (strlen (text), n_bytes),
+		                                &string,
+		                                &written_bytes)) {
+			g_string_append_c (string, ' ');
+		}
+
+		n_bytes -= written_bytes;
+
+		g_free (text);
+	}
+
+	g_timer_destroy (timer);
+
+	return g_string_free (string, FALSE);
+}
+
+/**
+ * write_pdf_data:
+ * @data: values to write; empty or %NULL fields are skipped
+ * @metadata: builder receiving the statements
+ *
+ * Writes title, subject, author (as an nco:Contact blank node), creation
+ * date and keywords. The date comes from @data.date and falls back to
+ * @data.creation_date, so a structure that only carries the Poppler
+ * creation date still yields nie:contentCreated.
+ */
+static void
+write_pdf_data (PDFData               data,
+                TrackerSparqlBuilder *metadata)
+{
+	const gchar *created;
+
+	if (!tracker_is_empty_string (data.title)) {
+		tracker_sparql_builder_predicate (metadata, "nie:title");
+		tracker_sparql_builder_object_unvalidated (metadata, data.title);
+	}
+
+	if (!tracker_is_empty_string (data.subject)) {
+		tracker_sparql_builder_predicate (metadata, "nie:subject");
+		tracker_sparql_builder_object_unvalidated (metadata, data.subject);
+	}
+
+	if (!tracker_is_empty_string (data.author)) {
+		tracker_sparql_builder_predicate (metadata, "nco:creator");
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nco:Contact");
+		tracker_sparql_builder_predicate (metadata, "nco:fullname");
+		tracker_sparql_builder_object_unvalidated (metadata, data.author);
+		tracker_sparql_builder_object_blank_close (metadata);
+	}
+
+	/* Prefer the merged date; otherwise use the raw creation date */
+	created = !tracker_is_empty_string (data.date) ? data.date : data.creation_date;
+
+	if (!tracker_is_empty_string (created)) {
+		tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
+		tracker_sparql_builder_object_unvalidated (metadata, created);
+	}
+
+	if (!tracker_is_empty_string (data.keywords)) {
+		insert_keywords (metadata, data.keywords);
+	}
+}
+
+/**
+ * extract_pdf:
+ * @uri: URI of the PDF file
+ * @preupdate: not used by this extractor
+ * @metadata: builder receiving the extracted statements
+ *
+ * Opens @uri with Poppler and writes the document type, the document
+ * properties (merged with the XMP packet when one is present and parses),
+ * the page count, the plain text content and the outline.
+ *
+ * An encrypted document is only marked as nfo:isContentEncrypted; any
+ * other load failure is logged and nothing is written.
+ */
+static void
+extract_pdf (const gchar          *uri,
+             TrackerSparqlBuilder *preupdate,
+             TrackerSparqlBuilder *metadata)
+{
+	TrackerConfig *config;
+	GTime creation_date = 0;
+	GError *error = NULL;
+	TrackerXmpData *xd = NULL;
+	PDFData pd = { 0 }; /* actual data */
+	PDFData md = { 0 }; /* for merging */
+	PopplerDocument *document;
+	gchar *xml = NULL;
+	gchar *content;
+	guint n_bytes;
+
+	g_type_init ();
+
+	document = poppler_document_new_from_file (uri, NULL, &error);
+
+	if (error) {
+		/* Match domain and code: a code from another error domain
+		 * must not be mistaken for "encrypted".
+		 */
+		if (g_error_matches (error, POPPLER_ERROR, POPPLER_ERROR_ENCRYPTED)) {
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+			tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+			tracker_sparql_builder_object_boolean (metadata, TRUE);
+		} else {
+			g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+			           uri,
+			           error->message ? error->message : "no error given");
+		}
+
+		/* Released on both paths; the encrypted path used to leak it */
+		g_error_free (error);
+		return;
+	}
+
+	if (!document) {
+		g_warning ("Could not create PopplerDocument from uri:'%s', "
+		           "NULL returned without an error",
+		           uri);
+		return;
+	}
+
+	tracker_sparql_builder_predicate (metadata, "a");
+	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+
+	g_object_get (document,
+	              "title", &pd.title,
+	              "author", &pd.author,
+	              "subject", &pd.subject,
+	              "keywords", &pd.keywords,
+	              "creation-date", &creation_date,
+	              "metadata", &xml,
+	              NULL);
+
+	if (creation_date > 0) {
+		pd.creation_date = tracker_date_to_string ((time_t) creation_date);
+	}
+
+	if (xml &&
+	    (xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) {
+		/* The casts here are well understood and known.
+		 * The md fields alias strings owned by pd or xd and are
+		 * therefore never freed on their own.
+		 */
+		md.title = (gchar *) tracker_coalesce_strip (4, pd.title, xd->title, xd->title2, xd->pdf_title);
+		md.subject = (gchar *) tracker_coalesce_strip (2, pd.subject, xd->subject);
+		md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
+		md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);
+
+		write_pdf_data (md, metadata);
+
+		if (xd->keywords) {
+			insert_keywords (metadata, xd->keywords);
+		}
+
+		if (xd->pdf_keywords) {
+			insert_keywords (metadata, xd->pdf_keywords);
+		}
+
+		if (pd.keywords) {
+			insert_keywords (metadata, pd.keywords);
+		}
+
+		if (xd->publisher) {
+			tracker_sparql_builder_predicate (metadata, "nco:publisher");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->publisher);
+			tracker_sparql_builder_object_blank_close (metadata);
+		}
+
+		if (xd->type) {
+			tracker_sparql_builder_predicate (metadata, "dc:type");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->type);
+		}
+
+		if (xd->format) {
+			tracker_sparql_builder_predicate (metadata, "dc:format");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->format);
+		}
+
+		if (xd->identifier) {
+			tracker_sparql_builder_predicate (metadata, "dc:identifier");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->identifier);
+		}
+
+		if (xd->source) {
+			tracker_sparql_builder_predicate (metadata, "dc:source");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->source);
+		}
+
+		if (xd->language) {
+			tracker_sparql_builder_predicate (metadata, "dc:language");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->language);
+		}
+
+		if (xd->relation) {
+			tracker_sparql_builder_predicate (metadata, "dc:relation");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->relation);
+		}
+
+		if (xd->coverage) {
+			tracker_sparql_builder_predicate (metadata, "dc:coverage");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->coverage);
+		}
+
+		if (xd->license) {
+			tracker_sparql_builder_predicate (metadata, "nie:license");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->license);
+		}
+
+		if (xd->make || xd->model) {
+			gchar *camera;
+
+			/* Merge make and model unless the model string
+			 * already contains the make.
+			 */
+			if ((xd->make == NULL || xd->model == NULL) ||
+			    (xd->make && xd->model && strstr (xd->model, xd->make) == NULL)) {
+				camera = tracker_merge_const (" ", 2, xd->make, xd->model);
+			} else {
+				camera = g_strdup (xd->model);
+			}
+
+			tracker_sparql_builder_predicate (metadata, "nmm:camera");
+			tracker_sparql_builder_object_unvalidated (metadata, camera);
+			g_free (camera);
+		}
+
+		if (xd->orientation) {
+			tracker_sparql_builder_predicate (metadata, "nfo:orientation");
+			tracker_sparql_builder_object (metadata, xd->orientation);
+		}
+
+		if (xd->rights) {
+			tracker_sparql_builder_predicate (metadata, "nie:copyright");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->rights);
+		}
+
+		if (xd->white_balance) {
+			tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance");
+			tracker_sparql_builder_object (metadata, xd->white_balance);
+		}
+
+		if (xd->fnumber) {
+			gdouble value;
+
+			value = g_strtod (xd->fnumber, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:fnumber");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		if (xd->flash) {
+			tracker_sparql_builder_predicate (metadata, "nmm:flash");
+			tracker_sparql_builder_object (metadata, xd->flash);
+		}
+
+		if (xd->focal_length) {
+			gdouble value;
+
+			value = g_strtod (xd->focal_length, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:focalLength");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		/* Question: Shouldn't xd->Artist be merged with md.author instead? */
+
+		if (xd->artist || xd->contributor) {
+			const gchar *artist;
+
+			artist = tracker_coalesce_strip (2, xd->artist, xd->contributor);
+			tracker_sparql_builder_predicate (metadata, "nco:contributor");
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "nco:Contact");
+			tracker_sparql_builder_predicate (metadata, "nco:fullname");
+			tracker_sparql_builder_object_unvalidated (metadata, artist);
+			tracker_sparql_builder_object_blank_close (metadata);
+		}
+
+		if (xd->exposure_time) {
+			gdouble value;
+
+			value = g_strtod (xd->exposure_time, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:exposureTime");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		if (xd->iso_speed_ratings) {
+			gdouble value;
+
+			value = g_strtod (xd->iso_speed_ratings, NULL);
+			tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed");
+			tracker_sparql_builder_object_double (metadata, value);
+		}
+
+		if (xd->description) {
+			tracker_sparql_builder_predicate (metadata, "nie:description");
+			tracker_sparql_builder_object_unvalidated (metadata, xd->description);
+		}
+
+		if (xd->metering_mode) {
+			tracker_sparql_builder_predicate (metadata, "nmm:meteringMode");
+			tracker_sparql_builder_object (metadata, xd->metering_mode);
+		}
+
+		if (xd->address || xd->country || xd->city) {
+			tracker_sparql_builder_predicate (metadata, "mlo:location");
+
+			tracker_sparql_builder_object_blank_open (metadata);
+			tracker_sparql_builder_predicate (metadata, "a");
+			tracker_sparql_builder_object (metadata, "mlo:GeoPoint");
+
+			if (xd->address) {
+				tracker_sparql_builder_predicate (metadata, "mlo:address");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->address);
+			}
+
+			if (xd->state) {
+				tracker_sparql_builder_predicate (metadata, "mlo:state");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->state);
+			}
+
+			if (xd->city) {
+				tracker_sparql_builder_predicate (metadata, "mlo:city");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->city);
+			}
+
+			if (xd->country) {
+				tracker_sparql_builder_predicate (metadata, "mlo:country");
+				tracker_sparql_builder_object_unvalidated (metadata, xd->country);
+			}
+
+			tracker_sparql_builder_object_blank_close (metadata);
+		}
+
+		tracker_xmp_free (xd);
+	} else {
+		/* So if we are here we have NO XMP data and we just
+		 * write what we know from Poppler.
+		 */
+		write_pdf_data (pd, metadata);
+	}
+
+	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
+	tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
+
+	config = tracker_main_get_config ();
+	n_bytes = tracker_config_get_max_bytes (config);
+	content = extract_content (document, n_bytes);
+
+	if (content) {
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+		g_free (content);
+	}
+
+	read_outline (document, metadata);
+
+	g_free (xml);
+	g_free (pd.keywords);
+	g_free (pd.title);
+	g_free (pd.subject);
+	g_free (pd.creation_date);
+	g_free (pd.author);
+	g_free (pd.date);
+
+	g_object_unref (document);
+}
+
+/* Module entry point: returns the first element of this file's
+ * extractor table.
+ */
+TrackerExtractData *
+tracker_extract_get_data (void)
+{
+	return &data[0];
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]