[tracker/pdfmem-for-master] tracker-extract: Ported the PDF extractor to not use poppler-glib
- From: Philip Van Hoof <pvanhoof src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/pdfmem-for-master] tracker-extract: Ported the PDF extractor to not use poppler-glib
- Date: Thu, 29 Apr 2010 13:45:28 +0000 (UTC)
commit 6f2513bd9b5fd178377c76890c17ccbbf9b1997e
Author: Philip Van Hoof <philip codeminded be>
Date: Thu Apr 29 14:43:20 2010 +0200
tracker-extract: Ported the PDF extractor to not use poppler-glib
Poppler's GLib bindings don't make it possible to use a TextOutputDev
for poppler_page_get_text when there *is* Cairo support. This isn't good for
us because that means that Cairo surfaces are needlessly made for each image
embedded in the PDF, wasting resources.
configure.ac | 40 +-
src/tracker-extract/Makefile.am | 8 +-
...acker-extract-pdf.c => tracker-extract-pdf.cpp} | 427 +++++++++++++++-----
3 files changed, 345 insertions(+), 130 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 84319f5..77e60ff 100644
--- a/configure.ac
+++ b/configure.ac
@@ -143,7 +143,7 @@ UPOWER_REQUIRED=0.9.0
GDKPIXBUF_REQUIRED=2.12.0
QUILL_REQUIRED=1.0.0
UNAC_REQUIRED=1.0.0
-POPPLER_GLIB_REQUIRED=0.4.5
+POPPLER_REQUIRED=0.12.2
CAIRO_REQUIRED=1.0
GDK_REQUIRED=1.0
LIBVORBIS_REQUIRED=0.22
@@ -1095,37 +1095,37 @@ if test x$enable_unzip_psgz_files != "xno"; then
fi
##################################################################
-# Check for poppler's glib bingings
+# Check for poppler
##################################################################
-AC_ARG_ENABLE(poppler-glib,
- AS_HELP_STRING([--enable-poppler-glib],
+AC_ARG_ENABLE(poppler,
+ AS_HELP_STRING([--enable-poppler],
[enable extractor for PDF data [[default=auto]]]),,
- [enable_poppler_glib=auto])
+ [enable_poppler=auto])
-if test "x$enable_poppler_glib" != "xno" ; then
- PKG_CHECK_MODULES(POPPLER_GLIB,
- [poppler-glib >= $POPPLER_GLIB_REQUIRED],
- [have_poppler_glib=yes],
- [have_poppler_glib=no])
+if test "x$enable_poppler" != "xno" ; then
+ PKG_CHECK_MODULES(POPPLER,
+ [poppler >= $POPPLER_REQUIRED],
+ [have_poppler=yes],
+ [have_poppler=no])
- AC_SUBST(POPPLER_GLIB_CFLAGS)
- AC_SUBST(POPPLER_GLIB_LIBS)
+ AC_SUBST(POPPLER_CFLAGS)
+ AC_SUBST(POPPLER_LIBS)
- if test "x$have_poppler_glib" = "xyes"; then
- AC_DEFINE(HAVE_POPPLER_GLIB, [], [Define if we have poppler-glib])
+ if test "x$have_poppler" = "xyes"; then
+ AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler])
fi
else
- have_poppler_glib="no (disabled)"
+ have_poppler="no (disabled)"
fi
-if test "x$enable_poppler_glib" = "xyes"; then
- if test "x$have_poppler_glib" != "xyes"; then
- AC_MSG_ERROR([Couldn't find poppler-glib >= $POPPLER_GLIB_REQUIRED.])
+if test "x$enable_poppler" = "xyes"; then
+ if test "x$have_poppler" != "xyes"; then
+ AC_MSG_ERROR([Couldn't find poppler >= $POPPLER_REQUIRED.])
fi
fi
-AM_CONDITIONAL(HAVE_POPPLER_GLIB, test "x$have_poppler_glib" = "xyes")
+AM_CONDITIONAL(HAVE_POPPLER, test "x$have_poppler" = "xyes")
##################################################################
# Check for libexif
@@ -1770,7 +1770,7 @@ Metadata Extractors:
Support libstreamanalyzer: $have_libstreamanalyzer
Support PNG: yes
- Support PDF: $have_poppler_glib
+ Support PDF: $have_poppler
Support JPEG: $have_libjpeg (xmp: $have_exempi, exif: $have_libexif, iptc: $have_libiptcdata)
Support TIFF: $have_libtiff (xmp: $have_exempi, exif: yes, iptc: $have_libiptcdata)
Support Vorbis (ogg/etc): $have_libvorbis
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index eede0c1..37454ba 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -20,7 +20,7 @@ INCLUDES = \
$(LIBGSF_CFLAGS) \
$(LIBXML2_CFLAGS) \
$(LIBPNG_CFLAGS) \
- $(POPPLER_GLIB_CFLAGS) \
+ $(POPPLER_CFLAGS) \
$(GSTREAMER_CFLAGS) \
$(XINE_CFLAGS) \
$(TOTEM_PL_PARSER_CFLAGS)
@@ -73,7 +73,7 @@ modules_LTLIBRARIES += \
libextract-oasis.la
endif
-if HAVE_POPPLER_GLIB
+if HAVE_POPPLER
modules_LTLIBRARIES += libextract-pdf.la
endif
@@ -214,12 +214,12 @@ libextract_msoffice_la_LIBADD = \
$(GCOV_LIBS)
# PDF
-libextract_pdf_la_SOURCES = tracker-extract-pdf.c
+libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
libextract_pdf_la_LDFLAGS = $(module_flags)
libextract_pdf_la_LIBADD = \
$(top_builddir)/src/libtracker-extract/libtracker-extract-@TRACKER_API_VERSION@.la \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
- $(POPPLER_GLIB_LIBS) \
+ $(POPPLER_LIBS) \
$(GLIB2_LIBS) \
$(GCOV_LIBS)
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.cpp
similarity index 57%
rename from src/tracker-extract/tracker-extract-pdf.c
rename to src/tracker-extract/tracker-extract-pdf.cpp
index 8c25984..50366c3 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -24,7 +24,17 @@
#include <string.h>
#include <glib.h>
-#include <poppler.h>
+
+/* Poppler includes */
+#include <GlobalParams.h>
+#include <PDFDoc.h>
+#include <Outline.h>
+#include <ErrorCodes.h>
+#include <UnicodeMap.h>
+#include <PDFDocEncoding.h>
+#include <TextOutputDev.h>
+#include <Gfx.h>
+#include <Link.h>
#include <libtracker-common/tracker-date-time.h>
#include <libtracker-common/tracker-utils.h>
@@ -51,124 +61,185 @@ static TrackerExtractData data[] = {
{ NULL, NULL }
};
+/**
+ * Philip ported this from a poppler-glib based version to a C++ libpoppler
+ * version because the TextOutputDev allows us to extract text and metadata much
+ * faster than the default CairoOutputDev that poppler-glib uses when it is
+ * compiled with Cairo support. Regrettably, the output device cannot be selected
+ * at runtime in the poppler-glib bindings. Apologies to the GObject/GLib fans. */
+
+static gchar *
+unicode_to_char (Unicode *unicode,
+ int len)
+{
+ static UnicodeMap *uMap = NULL;
+ if (uMap == NULL) {
+ GooString *enc = new GooString("UTF-8");
+ uMap = globalParams->getUnicodeMap(enc);
+ uMap->incRefCnt ();
+ delete enc;
+ }
+
+ GooString gstr;
+ gchar buf[8]; /* 8 is enough for mapping a Unicode char to a string */
+ int i, n;
+
+ for (i = 0; i < len; ++i) {
+ n = uMap->mapUnicode(unicode[i], buf, sizeof(buf));
+ gstr.append(buf, n);
+ }
+
+ return g_strdup (gstr.getCString ());
+}
+
static void
-read_toc (PopplerIndexIter *index,
- GString **toc)
+read_toc (GooList *items,
+ GString **toc)
{
- if (!index) {
+ guint length, i;
+
+ if (!items)
return;
- }
if (!*toc) {
*toc = g_string_new ("");
}
- do {
- PopplerAction *action;
- PopplerIndexIter *iter;
+ length = items->getLength ();
+
+ for (i = 0; i < length; i++) {
+ OutlineItem *item;
+ LinkAction *link_action;
+
+ item = (OutlineItem *) items->get (i);
- action = poppler_index_iter_get_action (index);
+ link_action = item->getAction ();
- if (!action) {
+ if (!link_action) {
continue;
}
- switch (action->type) {
- case POPPLER_ACTION_GOTO_DEST: {
- PopplerActionGotoDest *ag = (PopplerActionGotoDest*) action;
- PopplerDest *agd = ag->dest;
-
- if (!tracker_is_empty_string (ag->title)) {
- g_string_append_printf (*toc, "%s ", ag->title);
+ switch (link_action->getKind()) {
+ case actionGoTo: {
+ guint title_length = item->getTitleLength ();
+ LinkGoTo *gto = dynamic_cast <LinkGoTo *> (link_action);
+ GooString *named_dest = gto->getNamedDest ();
+
+ if (title_length > 0) {
+ gchar *str = unicode_to_char (item->getTitle(),
+ title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
}
- if (!tracker_is_empty_string (agd->named_dest)) {
- g_string_append_printf (*toc, "%s ", agd->named_dest);
- }
+ if (named_dest)
+ g_string_append_printf (*toc, "%s ", named_dest->getCString ());
break;
}
- case POPPLER_ACTION_LAUNCH: {
- PopplerActionLaunch *al = (PopplerActionLaunch*) action;
+ case actionLaunch: {
+ guint title_length = item->getTitleLength ();
+ LinkLaunch *lan = dynamic_cast <LinkLaunch *> (link_action);
+ GooString *filen, *param;
- if (!tracker_is_empty_string (al->title)) {
- g_string_append_printf (*toc, "%s ", al->title);
- }
+ filen = lan->getFileName();
+ param = lan->getParams();
- if (!tracker_is_empty_string (al->file_name)) {
- g_string_append_printf (*toc, "%s ", al->file_name);
+ if (title_length > 0) {
+ gchar *str = unicode_to_char (item->getTitle(),
+ title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
}
- if (!tracker_is_empty_string (al->params)) {
- g_string_append_printf (*toc, "%s ", al->params);
- }
+ if (filen)
+ g_string_append_printf (*toc, "%s ", filen->getCString ());
+
+ if (param)
+ g_string_append_printf (*toc, "%s ", param->getCString ());
break;
}
- case POPPLER_ACTION_URI: {
- PopplerActionUri *au = (PopplerActionUri*) action;
+ case actionURI: {
+ LinkURI *uri = dynamic_cast <LinkURI *> (link_action);
+ GooString *muri;
- if (!tracker_is_empty_string (au->uri)) {
- g_string_append_printf (*toc, "%s ", au->uri);
- }
+ muri = uri->getURI();
+
+ if (muri)
+ g_string_append_printf (*toc, "%s ", muri->getCString ());
break;
}
- case POPPLER_ACTION_NAMED: {
- PopplerActionNamed *an = (PopplerActionNamed*) action;
+ case actionNamed: {
+ guint title_length = item->getTitleLength ();
+ LinkNamed *named = dynamic_cast <LinkNamed *> (link_action);
+ GooString *named_dest = named->getName ();
- if (!tracker_is_empty_string (an->title)) {
- g_string_append_printf (*toc, "%s, ", an->title);
+ if (title_length > 0) {
+ gchar *str = unicode_to_char (item->getTitle(),
+ title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
}
- if (!tracker_is_empty_string (an->named_dest)) {
- g_string_append_printf (*toc, "%s ", an->named_dest);
- }
+ if (named_dest)
+ g_string_append_printf (*toc, "%s ", named_dest->getCString ());
break;
}
- case POPPLER_ACTION_MOVIE: {
- PopplerActionNamed *am = (PopplerActionNamed*) action;
+ case actionMovie: {
+ guint title_length = item->getTitleLength ();
- if (!tracker_is_empty_string (am->title)) {
- g_string_append_printf (*toc, "%s ", am->title);
+ if (title_length > 0) {
+ gchar *str = unicode_to_char (item->getTitle(),
+ title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
}
break;
}
- case POPPLER_ACTION_NONE:
- case POPPLER_ACTION_UNKNOWN:
- case POPPLER_ACTION_GOTO_REMOTE:
+ case actionRendition:
+ case actionSound:
+ case actionJavaScript:
+ case actionUnknown:
+ case actionGoToR:
/* Do nothing */
break;
}
- iter = poppler_index_iter_get_child (index);
- read_toc (iter, toc);
- } while (poppler_index_iter_next (index));
+ if (item->hasKids ())
+ read_toc (item->getKids (), toc);
+ }
- poppler_index_iter_free (index);
}
static void
-read_outline (PopplerDocument *document,
+read_outline (PDFDoc *document,
TrackerSparqlBuilder *metadata)
{
- PopplerIndexIter *index;
+ Outline *outline;
GString *toc = NULL;
+ GooList *items;
- index = poppler_index_iter_new (document);
+ outline = document->getOutline();
- if (!index) {
+ if (!outline) {
return;
}
- read_toc (index, &toc);
+ items = outline->getItems ();
+
+ if (items == NULL)
+ return;
+
+ read_toc (items, &toc);
if (toc) {
if (toc->len > 0) {
@@ -217,38 +288,87 @@ insert_keywords (TrackerSparqlBuilder *metadata,
}
}
+static void
+page_get_size (Page *page,
+ gdouble *width,
+ gdouble *height)
+{
+ gdouble page_width, page_height;
+ gint rotate;
+
+ rotate = page->getRotate ();
+
+ if (rotate == 90 || rotate == 270) {
+ page_height = page->getCropWidth ();
+ page_width = page->getCropHeight ();
+ } else {
+ page_width = page->getCropWidth ();
+ page_height = page->getCropHeight ();
+ }
+
+ if (width != NULL)
+ *width = page_width;
+ if (height != NULL)
+ *height = page_height;
+}
+
static gchar *
-extract_content (PopplerDocument *document,
- guint n_words)
+extract_content (PDFDoc *document,
+ guint n_words)
{
- PopplerPage *page;
- PopplerRectangle rect;
+ Page *page;
+ Catalog *catalog;
GString *string;
gint n_pages, i, words;
- gchar *text, *t;
+ gchar *t;
- n_pages = poppler_document_get_n_pages (document);
+ n_pages = document->getNumPages();
string = g_string_new ("");
words = i = 0;
+ catalog = document->getCatalog();
while (i < n_pages && words < n_words) {
- gint normalized_words;
-
- page = poppler_document_get_page (document, i);
+ guint normalized_words = 0;
+ Gfx *gfx;
+ GooString *sel_text;
+ TextOutputDev *text_dev;
+ PDFRectangle pdf_selection;
+ gdouble height = 0, width = 0;
+
+ page = catalog->getPage (i + 1);
i++;
- rect.x1 = rect.y1 = 0;
- poppler_page_get_size (page, &rect.x2, &rect.y2);
+ text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+ gfx = page->createGfx (text_dev,
+ 72.0, 72.0, 0,
+ gFalse, /* useMediaBox */
+ gTrue, /* Crop */
+ -1, -1, -1, -1,
+ gFalse, /* printing */
+ catalog,
+ NULL, NULL, NULL, NULL);
+
+ page->display(gfx);
+ text_dev->endPage();
- text = poppler_page_get_text (page, POPPLER_SELECTION_WORD, &rect);
- t = tracker_text_normalize (text, n_words - words, &normalized_words);
+ page_get_size (page, &width, &height);
+
+ pdf_selection.x1 = 0;
+ pdf_selection.y1 = 0;
+ pdf_selection.x2 = width;
+ pdf_selection.y2 = height;
+
+ sel_text = text_dev->getSelectionText (&pdf_selection, selectionStyleWord);
+
+ t = tracker_text_normalize (sel_text->getCString (), n_words - words, &normalized_words);
words += normalized_words;
g_string_append (string, t);
- g_free (text);
g_free (t);
- g_object_unref (page);
+
+ delete gfx;
+ delete text_dev;
}
return g_string_free (string, FALSE);
@@ -288,41 +408,133 @@ write_pdf_data (PDFData data,
}
}
+
+static PDFDoc*
+poppler_document_new_pdf_from_file (const char *uri,
+ const char *password)
+{
+ PDFDoc *newDoc;
+ GooString *filename_g;
+ GooString *password_g;
+ gchar *filename;
+
+ if (!globalParams) {
+ globalParams = new GlobalParams();
+ }
+
+ filename = g_filename_from_uri (uri, NULL, NULL);
+ if (!filename)
+ return NULL;
+
+ filename_g = new GooString (filename);
+ g_free (filename);
+
+ password_g = NULL;
+ if (password != NULL) {
+ if (g_utf8_validate (password, -1, NULL)) {
+ gchar *password_latin;
+
+ password_latin = g_convert (password, -1,
+ "ISO-8859-1",
+ "UTF-8",
+ NULL, NULL, NULL);
+ password_g = new GooString (password_latin);
+ g_free (password_latin);
+ } else {
+ password_g = new GooString (password);
+ }
+ }
+
+ newDoc = new PDFDoc(filename_g, password_g, password_g);
+ delete password_g;
+
+ return newDoc;
+}
+
+static gchar*
+info_dict_get_string (Dict *info_dict, const gchar *key)
+{
+ Object obj;
+ GooString *goo_value;
+ gchar *result;
+
+ if (!info_dict->lookup ((gchar *)key, &obj)->isString ()) {
+ obj.free ();
+ return NULL;
+ }
+
+ goo_value = obj.getString ();
+
+ if (goo_value->hasUnicodeMarker()) {
+ result = g_convert (goo_value->getCString () + 2,
+ goo_value->getLength () - 2,
+ "UTF-8", "UTF-16BE", NULL, NULL, NULL);
+ } else {
+ int len;
+ gunichar *ucs4_temp;
+ int i;
+
+ len = goo_value->getLength ();
+ ucs4_temp = g_new (gunichar, len + 1);
+ for (i = 0; i < len; ++i) {
+ ucs4_temp[i] = pdfDocEncoding[(unsigned char)goo_value->getChar(i)];
+ }
+ ucs4_temp[i] = 0;
+ result = g_ucs4_to_utf8 (ucs4_temp, -1, NULL, NULL, NULL);
+ g_free (ucs4_temp);
+ }
+
+ obj.free ();
+
+ return result;
+}
+
static void
extract_pdf (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
TrackerFTSConfig *fts_config;
- GTime creation_date;
- GError *error = NULL;
TrackerXmpData *xd = NULL;
PDFData pd = { 0 }; /* actual data */
PDFData md = { 0 }; /* for merging */
- PopplerDocument *document;
+ PDFDoc *document;
gchar *xml = NULL;
gchar *content;
guint n_words;
+ Object obj;
+ Catalog *catalog;
g_type_init ();
- document = poppler_document_new_from_file (uri, NULL, &error);
-
- if (error) {
- if (error->code == POPPLER_ERROR_ENCRYPTED) {
- tracker_sparql_builder_predicate (metadata, "a");
- tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+ document = poppler_document_new_pdf_from_file (uri, NULL);
- tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
- tracker_sparql_builder_object_boolean (metadata, TRUE);
- return;
- } else {
- g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
- uri,
- error->message ? error->message : "no error given");
+ if (!document->isOk()) {
+ int fopen_errno;
+ switch (document->getErrorCode()) {
+ case errEncrypted:
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+ tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+ tracker_sparql_builder_object_boolean (metadata, TRUE);
+ break;
+ case errBadCatalog:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', Failed to read the document catalog", uri);
+ break;
+ case errDamaged:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', PDF document is damaged", uri);
+ break;
+ case errOpenFile:
+ fopen_errno = document->getFopenErrno();
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+ uri, g_strerror (fopen_errno));
+ break;
+ default:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', no error given", uri);
+ break;
}
- g_error_free (error);
+ delete document;
return;
}
@@ -336,21 +548,24 @@ extract_pdf (const gchar *uri,
tracker_sparql_builder_predicate (metadata, "a");
tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
- g_object_get (document,
- "title", &pd.title,
- "author", &pd.author,
- "subject", &pd.subject,
- "keywords", &pd.keywords,
- "creation-date", &creation_date,
- NULL);
-
- /* metadata property not present in older poppler versions */
- if (g_object_class_find_property (G_OBJECT_GET_CLASS (document), "metadata")) {
- g_object_get (document, "metadata", &xml, NULL);
+ document->getDocInfo (&obj);
+ if (obj.isDict ()) {
+ Dict *info_dict = obj.getDict();
+ pd.title = info_dict_get_string (info_dict, "Title");
+ pd.author = info_dict_get_string (info_dict, "Author");
+ pd.subject = info_dict_get_string (info_dict, "Subject");
+ pd.keywords = info_dict_get_string (info_dict, "Keywords");
+ pd.creation_date = info_dict_get_string (info_dict, "CreationDate");
}
-
- if (creation_date > 0) {
- pd.creation_date = tracker_date_to_string ((time_t) creation_date);
+ obj.free ();
+
+ catalog = document->getCatalog ();
+ if (catalog && catalog->isOk ()) {
+ GooString *s = catalog->readMetadata ();
+ if ( s != NULL ) {
+ xml = s->getCString();
+ delete s;
+ }
}
if (xml) {
@@ -569,7 +784,7 @@ extract_pdf (const gchar *uri,
}
tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
- tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
+ tracker_sparql_builder_object_int64 (metadata, document->getNumPages());
fts_config = tracker_main_get_fts_config ();
n_words = tracker_fts_config_get_max_words_to_index (fts_config);
@@ -583,7 +798,7 @@ extract_pdf (const gchar *uri,
read_outline (document, metadata);
- g_object_unref (document);
+ delete document;
}
TrackerExtractData *
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]