[tracker/pdfmem] tracker-extract: Ported the PDF extractor to not use poppler-glib
- From: Philip Van Hoof <pvanhoof src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/pdfmem] tracker-extract: Ported the PDF extractor to not use poppler-glib
- Date: Thu, 29 Apr 2010 12:46:11 +0000 (UTC)
commit fc700d6b020645286af54ffbe9c4f027e2d07134
Author: Philip Van Hoof <philip codeminded be>
Date: Thu Apr 29 14:43:20 2010 +0200
tracker-extract: Ported the PDF extractor to not use poppler-glib
Poppler's GLib bindings don't make it possible to use a TextOutputDev
for poppler_page_get_text when there *is* Cairo support. This isn't good for
us because that means that Cairo surfaces are needlessly made for each image
embedded in the PDF, wasting resources.
This commit doesn't yet have the TOC code ported (read_toc is commented out for now). This will happen later today.
Hold on tight.
configure.ac | 40 +-
src/tracker-extract/Makefile.am | 8 +-
...acker-extract-pdf.c => tracker-extract-pdf.cpp} | 367 +++++++++++++++-----
3 files changed, 305 insertions(+), 110 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 84319f5..77e60ff 100644
--- a/configure.ac
+++ b/configure.ac
@@ -143,7 +143,7 @@ UPOWER_REQUIRED=0.9.0
GDKPIXBUF_REQUIRED=2.12.0
QUILL_REQUIRED=1.0.0
UNAC_REQUIRED=1.0.0
-POPPLER_GLIB_REQUIRED=0.4.5
+POPPLER_REQUIRED=0.12.2
CAIRO_REQUIRED=1.0
GDK_REQUIRED=1.0
LIBVORBIS_REQUIRED=0.22
@@ -1095,37 +1095,37 @@ if test x$enable_unzip_psgz_files != "xno"; then
fi
##################################################################
-# Check for poppler's glib bingings
+# Check for poppler
##################################################################
-AC_ARG_ENABLE(poppler-glib,
- AS_HELP_STRING([--enable-poppler-glib],
+AC_ARG_ENABLE(poppler,
+ AS_HELP_STRING([--enable-poppler],
[enable extractor for PDF data [[default=auto]]]),,
- [enable_poppler_glib=auto])
+ [enable_poppler=auto])
-if test "x$enable_poppler_glib" != "xno" ; then
- PKG_CHECK_MODULES(POPPLER_GLIB,
- [poppler-glib >= $POPPLER_GLIB_REQUIRED],
- [have_poppler_glib=yes],
- [have_poppler_glib=no])
+if test "x$enable_poppler" != "xno" ; then
+ PKG_CHECK_MODULES(POPPLER,
+ [poppler >= $POPPLER_REQUIRED],
+ [have_poppler=yes],
+ [have_poppler=no])
- AC_SUBST(POPPLER_GLIB_CFLAGS)
- AC_SUBST(POPPLER_GLIB_LIBS)
+ AC_SUBST(POPPLER_CFLAGS)
+ AC_SUBST(POPPLER_LIBS)
- if test "x$have_poppler_glib" = "xyes"; then
- AC_DEFINE(HAVE_POPPLER_GLIB, [], [Define if we have poppler-glib])
+ if test "x$have_poppler" = "xyes"; then
+ AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler])
fi
else
- have_poppler_glib="no (disabled)"
+ have_poppler="no (disabled)"
fi
-if test "x$enable_poppler_glib" = "xyes"; then
- if test "x$have_poppler_glib" != "xyes"; then
- AC_MSG_ERROR([Couldn't find poppler-glib >= $POPPLER_GLIB_REQUIRED.])
+if test "x$enable_poppler" = "xyes"; then
+ if test "x$have_poppler" != "xyes"; then
+ AC_MSG_ERROR([Couldn't find poppler >= $POPPLER_REQUIRED.])
fi
fi
-AM_CONDITIONAL(HAVE_POPPLER_GLIB, test "x$have_poppler_glib" = "xyes")
+AM_CONDITIONAL(HAVE_POPPLER, test "x$have_poppler" = "xyes")
##################################################################
# Check for libexif
@@ -1770,7 +1770,7 @@ Metadata Extractors:
Support libstreamanalyzer: $have_libstreamanalyzer
Support PNG: yes
- Support PDF: $have_poppler_glib
+ Support PDF: $have_poppler
Support JPEG: $have_libjpeg (xmp: $have_exempi, exif: $have_libexif, iptc: $have_libiptcdata)
Support TIFF: $have_libtiff (xmp: $have_exempi, exif: yes, iptc: $have_libiptcdata)
Support Vorbis (ogg/etc): $have_libvorbis
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index eede0c1..37454ba 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -20,7 +20,7 @@ INCLUDES = \
$(LIBGSF_CFLAGS) \
$(LIBXML2_CFLAGS) \
$(LIBPNG_CFLAGS) \
- $(POPPLER_GLIB_CFLAGS) \
+ $(POPPLER_CFLAGS) \
$(GSTREAMER_CFLAGS) \
$(XINE_CFLAGS) \
$(TOTEM_PL_PARSER_CFLAGS)
@@ -73,7 +73,7 @@ modules_LTLIBRARIES += \
libextract-oasis.la
endif
-if HAVE_POPPLER_GLIB
+if HAVE_POPPLER
modules_LTLIBRARIES += libextract-pdf.la
endif
@@ -214,12 +214,12 @@ libextract_msoffice_la_LIBADD = \
$(GCOV_LIBS)
# PDF
-libextract_pdf_la_SOURCES = tracker-extract-pdf.c
+libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
libextract_pdf_la_LDFLAGS = $(module_flags)
libextract_pdf_la_LIBADD = \
$(top_builddir)/src/libtracker-extract/libtracker-extract- TRACKER_API_VERSION@.la \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
- $(POPPLER_GLIB_LIBS) \
+ $(POPPLER_LIBS) \
$(GLIB2_LIBS) \
$(GCOV_LIBS)
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.cpp
similarity index 64%
rename from src/tracker-extract/tracker-extract-pdf.c
rename to src/tracker-extract/tracker-extract-pdf.cpp
index 8c25984..24b20f2 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -24,7 +24,23 @@
#include <string.h>
#include <glib.h>
-#include <poppler.h>
+
+/* Poppler includes*/
+#include <goo/GooList.h>
+#include <splash/SplashBitmap.h>
+#include <GlobalParams.h>
+#include <PDFDoc.h>
+#include <Outline.h>
+#include <ErrorCodes.h>
+#include <UnicodeMap.h>
+#include <GfxState.h>
+#include <SplashOutputDev.h>
+#include <Stream.h>
+#include <FontInfo.h>
+#include <PDFDocEncoding.h>
+#include <OptionalContent.h>
+#include <TextOutputDev.h>
+#include <Gfx.h>
#include <libtracker-common/tracker-date-time.h>
#include <libtracker-common/tracker-utils.h>
@@ -51,45 +67,75 @@ static TrackerExtractData data[] = {
{ NULL, NULL }
};
-static void
-read_toc (PopplerIndexIter *index,
- GString **toc)
+static const GooString
+unicode_to_char (Unicode *unicode, int len)
{
- if (!index) {
- return;
+ static UnicodeMap *uMap = NULL;
+ GooString gstr;
+ gchar buf[8]; /* 8 is enough for mapping an unicode char to a string */
+ int i, n;
+
+ if (uMap == NULL) {
+ GooString *enc = new GooString ("UTF-8");
+ uMap = globalParams->getUnicodeMap (enc);
+ uMap->incRefCnt ();
+ delete enc;
+ }
+
+ for (i = 0; i < len; ++i) {
+ n = uMap->mapUnicode (unicode[i], buf, sizeof (buf));
+ gstr.append(buf, n);
}
+ return gstr;
+}
+/*
+static void
+read_toc (GooList *items,
+ GString **toc)
+{
+ guint length, i;
+
if (!*toc) {
*toc = g_string_new ("");
}
- do {
- PopplerAction *action;
- PopplerIndexIter *iter;
+ length = items->getLength ();
+
+ for (i = 0; i < length; i++) {
+ OutlineItem *item;
+ LinkAction *action;
- action = poppler_index_iter_get_action (index);
+ item = (OutlineItem *) items->get (i);
- if (!action) {
+ link_action = item->getAction ();
+
+ if (!link_action) {
continue;
}
- switch (action->type) {
- case POPPLER_ACTION_GOTO_DEST: {
- PopplerActionGotoDest *ag = (PopplerActionGotoDest*) action;
- PopplerDest *agd = ag->dest;
-
- if (!tracker_is_empty_string (ag->title)) {
- g_string_append_printf (*toc, "%s ", ag->title);
+ switch (link_action->getKind()) {
+ case actionGoTo: {
+ guint title_length = item->getTitleLength ();
+ LinkGoto *gto = dynamic_cast <LinkGoTo *> (link_action);
+ GooString *named_dest = gto->getNamedDest ();
+ const gchar *ndest = named_dest->getCString ();
+
+ if (title_length > 0) {
+ GooString gstr;
+ gstr = unicode_to_char (item->getTitle(),
+ title_length);
+ g_string_append_printf (*toc, "%s ", gstr.getCString ());
}
- if (!tracker_is_empty_string (agd->named_dest)) {
- g_string_append_printf (*toc, "%s ", agd->named_dest);
+ if (!tracker_is_empty_string (ndest)) {
+ g_string_append_printf (*toc, "%s ", ndest);
}
break;
}
- case POPPLER_ACTION_LAUNCH: {
+ case actionLaunch: {
PopplerActionLaunch *al = (PopplerActionLaunch*) action;
if (!tracker_is_empty_string (al->title)) {
@@ -107,7 +153,7 @@ read_toc (PopplerIndexIter *index,
break;
}
- case POPPLER_ACTION_URI: {
+ case actionURI: {
PopplerActionUri *au = (PopplerActionUri*) action;
if (!tracker_is_empty_string (au->uri)) {
@@ -117,7 +163,7 @@ read_toc (PopplerIndexIter *index,
break;
}
- case POPPLER_ACTION_NAMED: {
+ case actionNamed: {
PopplerActionNamed *an = (PopplerActionNamed*) action;
if (!tracker_is_empty_string (an->title)) {
@@ -131,7 +177,7 @@ read_toc (PopplerIndexIter *index,
break;
}
- case POPPLER_ACTION_MOVIE: {
+ case actionMovie: {
PopplerActionNamed *am = (PopplerActionNamed*) action;
if (!tracker_is_empty_string (am->title)) {
@@ -141,34 +187,39 @@ read_toc (PopplerIndexIter *index,
break;
}
- case POPPLER_ACTION_NONE:
- case POPPLER_ACTION_UNKNOWN:
- case POPPLER_ACTION_GOTO_REMOTE:
- /* Do nothing */
+ case actionNone:
+ case actionUnknown:
+ case actionGoToR:
+ * Do nothing *
break;
}
- iter = poppler_index_iter_get_child (index);
- read_toc (iter, toc);
- } while (poppler_index_iter_next (index));
+ if (item->hasKids ())
+ read_toc (item->getKids (), toc);
+ }
- poppler_index_iter_free (index);
}
-
+*/
static void
-read_outline (PopplerDocument *document,
+read_outline (PDFDoc *document,
TrackerSparqlBuilder *metadata)
{
- PopplerIndexIter *index;
+ Outline *outline;
GString *toc = NULL;
+ GooList *items;
- index = poppler_index_iter_new (document);
+ outline = document->getOutline();
- if (!index) {
+ if (!outline) {
return;
}
- read_toc (index, &toc);
+ items = outline->getItems ();
+
+ if (items == NULL)
+ return;
+
+// read_toc (items, &toc);
if (toc) {
if (toc->len > 0) {
@@ -217,38 +268,87 @@ insert_keywords (TrackerSparqlBuilder *metadata,
}
}
+/*
+ * Reports the size of the page's crop box as it is displayed: when the
+ * page is rotated by 90 or 270 degrees, width and height are exchanged.
+ *
+ * Either output pointer may be NULL when that dimension is not wanted.
+ */
+static void
+page_get_size (Page    *page,
+               gdouble *width,
+               gdouble *height)
+{
+	const gint    rotation = page->getRotate ();
+	const gdouble crop_w   = page->getCropWidth ();
+	const gdouble crop_h   = page->getCropHeight ();
+	const bool    sideways = (rotation == 90 || rotation == 270);
+
+	if (width != NULL) {
+		*width = sideways ? crop_h : crop_w;
+	}
+
+	if (height != NULL) {
+		*height = sideways ? crop_w : crop_h;
+	}
+}
+
static gchar *
-extract_content (PopplerDocument *document,
- guint n_words)
+extract_content (PDFDoc *document,
+ guint n_words)
{
- PopplerPage *page;
- PopplerRectangle rect;
+ Page *page;
+ Catalog *catalog;
GString *string;
gint n_pages, i, words;
- gchar *text, *t;
+ gchar *t;
- n_pages = poppler_document_get_n_pages (document);
+ n_pages = document->getNumPages();
string = g_string_new ("");
words = i = 0;
+ catalog = document->getCatalog();
while (i < n_pages && words < n_words) {
- gint normalized_words;
-
- page = poppler_document_get_page (document, i);
+ guint normalized_words = 0;
+ Gfx *gfx;
+ GooString *sel_text;
+ TextOutputDev *text_dev;
+ PDFRectangle pdf_selection;
+ gdouble height = 0, width = 0;
+
+ page = catalog->getPage (i + 1);
i++;
- rect.x1 = rect.y1 = 0;
- poppler_page_get_size (page, &rect.x2, &rect.y2);
+ text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+ gfx = page->createGfx (text_dev,
+ 72.0, 72.0, 0,
+ gFalse, /* useMediaBox */
+ gTrue, /* Crop */
+ -1, -1, -1, -1,
+ gFalse, /* printing */
+ catalog,
+ NULL, NULL, NULL, NULL);
+
+ page->display(gfx);
+ text_dev->endPage();
+
+ page_get_size (page, &width, &height);
- text = poppler_page_get_text (page, POPPLER_SELECTION_WORD, &rect);
- t = tracker_text_normalize (text, n_words - words, &normalized_words);
+ pdf_selection.x1 = 0;
+ pdf_selection.y1 = 0;
+ pdf_selection.x2 = width;
+ pdf_selection.y2 = height;
+
+ sel_text = text_dev->getSelectionText (&pdf_selection, selectionStyleWord);
+
+ t = tracker_text_normalize (sel_text->getCString (), n_words - words, &normalized_words);
words += normalized_words;
g_string_append (string, t);
- g_free (text);
g_free (t);
- g_object_unref (page);
+
+ delete gfx;
+ delete text_dev;
}
return g_string_free (string, FALSE);
@@ -288,41 +388,133 @@ write_pdf_data (PDFData data,
}
}
+
+/*
+ * Opens the PDF referenced by `uri` using poppler's core PDFDoc class.
+ *
+ * `uri` must be convertible to a local path by g_filename_from_uri().
+ * `password` may be NULL; when given, the same string is handed to PDFDoc
+ * for both of its password arguments. A password that is valid UTF-8 is
+ * converted to ISO-8859-1 first; if it is not valid UTF-8, or the
+ * conversion fails, its bytes are passed through unchanged.
+ *
+ * Returns a newly allocated PDFDoc that the caller must delete, or NULL
+ * when `uri` cannot be converted to a filename. A non-NULL result is not
+ * necessarily a usable document: the caller still has to check isOk().
+ */
+static PDFDoc*
+poppler_document_new_pdf_from_file (const char *uri,
+                                    const char *password)
+{
+	PDFDoc *newDoc;
+	GooString *filename_g;
+	GooString *password_g = NULL;
+	gchar *filename;
+
+	/* poppler needs its global parameter object before any document is opened */
+	if (!globalParams) {
+		globalParams = new GlobalParams();
+	}
+
+	filename = g_filename_from_uri (uri, NULL, NULL);
+	if (!filename) {
+		return NULL;
+	}
+
+	filename_g = new GooString (filename);
+	g_free (filename);
+
+	if (password != NULL) {
+		gchar *password_latin = NULL;
+
+		if (g_utf8_validate (password, -1, NULL)) {
+			/* Yields NULL when a character has no ISO-8859-1 equivalent */
+			password_latin = g_convert (password, -1,
+			                            "ISO-8859-1",
+			                            "UTF-8",
+			                            NULL, NULL, NULL);
+		}
+
+		/* Never build a GooString from NULL: fall back to the raw bytes
+		 * when the conversion was skipped or failed. */
+		password_g = new GooString (password_latin != NULL ? password_latin : password);
+		g_free (password_latin);
+	}
+
+	/* filename_g is deliberately not deleted here.
+	 * NOTE(review): presumably PDFDoc takes ownership of it -- confirm
+	 * against the PDFDoc constructor of the poppler version in use. */
+	newDoc = new PDFDoc(filename_g, password_g, password_g);
+	delete password_g;
+
+	return newDoc;
+}
+
+/*
+ * Fetches the entry `key` from `info_dict` and returns it as a newly
+ * allocated UTF-8 string, to be released with g_free().
+ *
+ * Returns NULL when the entry is not a string. A value that carries a
+ * Unicode marker is converted from UTF-16BE, skipping its first two
+ * bytes; any other value is translated byte by byte through the
+ * pdfDocEncoding table.
+ */
+static gchar*
+info_dict_get_string (Dict *info_dict, const gchar *key)
+{
+	Object entry;
+	gchar *utf8 = NULL;
+
+	if (info_dict->lookup ((gchar *)key, &entry)->isString ()) {
+		GooString *raw = entry.getString ();
+
+		if (raw->hasUnicodeMarker()) {
+			/* "+ 2" / "- 2" skip the two marker bytes */
+			utf8 = g_convert (raw->getCString () + 2,
+			                  raw->getLength () - 2,
+			                  "UTF-8", "UTF-16BE", NULL, NULL, NULL);
+		} else {
+			const int n_bytes = raw->getLength ();
+			gunichar *code_points = g_new (gunichar, n_bytes + 1);
+			int pos = 0;
+
+			while (pos < n_bytes) {
+				code_points[pos] = pdfDocEncoding[(unsigned char)raw->getChar(pos)];
+				pos++;
+			}
+			/* g_ucs4_to_utf8() is called with -1, so terminate the array */
+			code_points[pos] = 0;
+
+			utf8 = g_ucs4_to_utf8 (code_points, -1, NULL, NULL, NULL);
+			g_free (code_points);
+		}
+	}
+
+	/* Release the looked-up object on every path, string or not */
+	entry.free ();
+
+	return utf8;
+}
+
static void
extract_pdf (const gchar *uri,
TrackerSparqlBuilder *preupdate,
TrackerSparqlBuilder *metadata)
{
TrackerFTSConfig *fts_config;
- GTime creation_date;
- GError *error = NULL;
TrackerXmpData *xd = NULL;
PDFData pd = { 0 }; /* actual data */
PDFData md = { 0 }; /* for merging */
- PopplerDocument *document;
+ PDFDoc *document;
gchar *xml = NULL;
gchar *content;
guint n_words;
+ Object obj;
+ Catalog *catalog;
g_type_init ();
- document = poppler_document_new_from_file (uri, NULL, &error);
-
- if (error) {
- if (error->code == POPPLER_ERROR_ENCRYPTED) {
- tracker_sparql_builder_predicate (metadata, "a");
- tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+ document = poppler_document_new_pdf_from_file (uri, NULL);
- tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
- tracker_sparql_builder_object_boolean (metadata, TRUE);
- return;
- } else {
- g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
- uri,
- error->message ? error->message : "no error given");
+ if (!document->isOk()) {
+ int fopen_errno;
+ switch (document->getErrorCode()) {
+ case errEncrypted:
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+ tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+ tracker_sparql_builder_object_boolean (metadata, TRUE);
+ break;
+ case errBadCatalog:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', Failed to read the document catalog", uri);
+ break;
+ case errDamaged:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', PDF document is damaged", uri);
+ break;
+ case errOpenFile:
+ fopen_errno = document->getFopenErrno();
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+ uri, g_strerror (fopen_errno));
+ break;
+ default:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', no error given", uri);
+ break;
}
- g_error_free (error);
+ delete document;
return;
}
@@ -336,21 +528,24 @@ extract_pdf (const gchar *uri,
tracker_sparql_builder_predicate (metadata, "a");
tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
- g_object_get (document,
- "title", &pd.title,
- "author", &pd.author,
- "subject", &pd.subject,
- "keywords", &pd.keywords,
- "creation-date", &creation_date,
- NULL);
-
- /* metadata property not present in older poppler versions */
- if (g_object_class_find_property (G_OBJECT_GET_CLASS (document), "metadata")) {
- g_object_get (document, "metadata", &xml, NULL);
+ document->getDocInfo (&obj);
+ if (obj.isDict ()) {
+ Dict *info_dict = obj.getDict();
+ pd.title = info_dict_get_string (info_dict, "Title");
+ pd.author = info_dict_get_string (info_dict, "Author");
+ pd.subject = info_dict_get_string (info_dict, "Subject");
+ pd.keywords = info_dict_get_string (info_dict, "Keywords");
+ pd.creation_date = info_dict_get_string (info_dict, "CreationDate");
}
-
- if (creation_date > 0) {
- pd.creation_date = tracker_date_to_string ((time_t) creation_date);
+ obj.free ();
+
+ catalog = document->getCatalog ();
+ if (catalog && catalog->isOk ()) {
+ GooString *s = catalog->readMetadata ();
+ if ( s != NULL ) {
+ xml = s->getCString();
+ delete s;
+ }
}
if (xml) {
@@ -569,7 +764,7 @@ extract_pdf (const gchar *uri,
}
tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
- tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
+ tracker_sparql_builder_object_int64 (metadata, document->getNumPages());
fts_config = tracker_main_get_fts_config ();
n_words = tracker_fts_config_get_max_words_to_index (fts_config);
@@ -583,7 +778,7 @@ extract_pdf (const gchar *uri,
read_outline (document, metadata);
- g_object_unref (document);
+ delete document;
}
TrackerExtractData *
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]