[tracker/pdfmem] tracker-extract: Ported the PDF extractor to not use poppler-glib



commit fc700d6b020645286af54ffbe9c4f027e2d07134
Author: Philip Van Hoof <philip codeminded be>
Date:   Thu Apr 29 14:43:20 2010 +0200

    tracker-extract: Ported the PDF extractor to not use poppler-glib
    
    Poppler's GLib bindings don't make it possible to use a TextOutputDev
    for poppler_page_get_text when there *is* Cairo support. This isn't good for
    us because that means that Cairo surfaces are needlessly made for each image
    embedded in the PDF, wasting resources.
    
    This commit doesn't yet have the TOC code ported. This will happen later today.
    
    Hold on tight

 configure.ac                                       |   40 +-
 src/tracker-extract/Makefile.am                    |    8 +-
 ...acker-extract-pdf.c => tracker-extract-pdf.cpp} |  367 +++++++++++++++-----
 3 files changed, 305 insertions(+), 110 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index 84319f5..77e60ff 100644
--- a/configure.ac
+++ b/configure.ac
@@ -143,7 +143,7 @@ UPOWER_REQUIRED=0.9.0
 GDKPIXBUF_REQUIRED=2.12.0
 QUILL_REQUIRED=1.0.0
 UNAC_REQUIRED=1.0.0
-POPPLER_GLIB_REQUIRED=0.4.5
+POPPLER_REQUIRED=0.12.2
 CAIRO_REQUIRED=1.0
 GDK_REQUIRED=1.0
 LIBVORBIS_REQUIRED=0.22
@@ -1095,37 +1095,37 @@ if test x$enable_unzip_psgz_files != "xno"; then
 fi
 
 ##################################################################
-# Check for poppler's glib bingings
+# Check for poppler
 ##################################################################
 
-AC_ARG_ENABLE(poppler-glib,
-	      AS_HELP_STRING([--enable-poppler-glib],
+AC_ARG_ENABLE(poppler,
+	      AS_HELP_STRING([--enable-poppler],
 	      		     [enable extractor for PDF data [[default=auto]]]),,
-	      [enable_poppler_glib=auto])
+	      [enable_poppler=auto])
 
-if test "x$enable_poppler_glib" != "xno" ; then
-   PKG_CHECK_MODULES(POPPLER_GLIB,
-   		     [poppler-glib >= $POPPLER_GLIB_REQUIRED],
-		     [have_poppler_glib=yes],
-		     [have_poppler_glib=no])
+if test "x$enable_poppler" != "xno" ; then
+   PKG_CHECK_MODULES(POPPLER,
+   		     [poppler >= $POPPLER_REQUIRED],
+		     [have_poppler=yes],
+		     [have_poppler=no])
 
-   AC_SUBST(POPPLER_GLIB_CFLAGS)
-   AC_SUBST(POPPLER_GLIB_LIBS)
+   AC_SUBST(POPPLER_CFLAGS)
+   AC_SUBST(POPPLER_LIBS)
 
-   if test "x$have_poppler_glib" = "xyes"; then
-      AC_DEFINE(HAVE_POPPLER_GLIB, [], [Define if we have poppler-glib])
+   if test "x$have_poppler" = "xyes"; then
+      AC_DEFINE(HAVE_POPPLER, [], [Define if we have poppler])
    fi
 else
-   have_poppler_glib="no  (disabled)"
+   have_poppler="no  (disabled)"
 fi
 
-if test "x$enable_poppler_glib" = "xyes"; then
-   if test "x$have_poppler_glib" != "xyes"; then
-      AC_MSG_ERROR([Couldn't find poppler-glib >= $POPPLER_GLIB_REQUIRED.])
+if test "x$enable_poppler" = "xyes"; then
+   if test "x$have_poppler" != "xyes"; then
+      AC_MSG_ERROR([Couldn't find poppler >= $POPPLER_REQUIRED.])
    fi
 fi
 
-AM_CONDITIONAL(HAVE_POPPLER_GLIB, test "x$have_poppler_glib" = "xyes")
+AM_CONDITIONAL(HAVE_POPPLER, test "x$have_poppler" = "xyes")
 
 ##################################################################
 # Check for libexif
@@ -1770,7 +1770,7 @@ Metadata Extractors:
 
 	Support libstreamanalyzer:		$have_libstreamanalyzer
 	Support PNG:				yes
-	Support PDF:				$have_poppler_glib
+	Support PDF:				$have_poppler
 	Support JPEG:				$have_libjpeg (xmp: $have_exempi, exif: $have_libexif, iptc: $have_libiptcdata)
 	Support TIFF:				$have_libtiff (xmp: $have_exempi, exif: yes, iptc: $have_libiptcdata)
 	Support Vorbis (ogg/etc):		$have_libvorbis
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index eede0c1..37454ba 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -20,7 +20,7 @@ INCLUDES = 								\
 	$(LIBGSF_CFLAGS) 						\
 	$(LIBXML2_CFLAGS) 						\
 	$(LIBPNG_CFLAGS) 						\
-	$(POPPLER_GLIB_CFLAGS) 						\
+	$(POPPLER_CFLAGS) 						\
 	$(GSTREAMER_CFLAGS) 						\
 	$(XINE_CFLAGS) 							\
 	$(TOTEM_PL_PARSER_CFLAGS)
@@ -73,7 +73,7 @@ modules_LTLIBRARIES += 							\
 	libextract-oasis.la
 endif
 
-if HAVE_POPPLER_GLIB
+if HAVE_POPPLER
 modules_LTLIBRARIES += libextract-pdf.la
 endif
 
@@ -214,12 +214,12 @@ libextract_msoffice_la_LIBADD = 					\
 	$(GCOV_LIBS)
 
 # PDF
-libextract_pdf_la_SOURCES = tracker-extract-pdf.c
+libextract_pdf_la_SOURCES = tracker-extract-pdf.cpp
 libextract_pdf_la_LDFLAGS = $(module_flags)
 libextract_pdf_la_LIBADD = 						\
 	$(top_builddir)/src/libtracker-extract/libtracker-extract-@TRACKER_API_VERSION@.la \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la	\
-	$(POPPLER_GLIB_LIBS)						\
+	$(POPPLER_LIBS)							\
 	$(GLIB2_LIBS)							\
 	$(GCOV_LIBS)
 
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.cpp
similarity index 64%
rename from src/tracker-extract/tracker-extract-pdf.c
rename to src/tracker-extract/tracker-extract-pdf.cpp
index 8c25984..24b20f2 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -24,7 +24,23 @@
 #include <string.h>
 
 #include <glib.h>
-#include <poppler.h>
+
+/* Poppler includes */
+#include <goo/GooList.h>
+#include <splash/SplashBitmap.h>
+#include <GlobalParams.h>
+#include <PDFDoc.h>
+#include <Outline.h>
+#include <ErrorCodes.h>
+#include <UnicodeMap.h>
+#include <GfxState.h>
+#include <SplashOutputDev.h>
+#include <Stream.h>
+#include <FontInfo.h>
+#include <PDFDocEncoding.h>
+#include <OptionalContent.h>
+#include <TextOutputDev.h>
+#include <Gfx.h>
 
 #include <libtracker-common/tracker-date-time.h>
 #include <libtracker-common/tracker-utils.h>
@@ -51,45 +67,75 @@ static TrackerExtractData data[] = {
 	{ NULL, NULL }
 };
 
-static void
-read_toc (PopplerIndexIter  *index,
-          GString          **toc)
+static const GooString
+unicode_to_char (Unicode *unicode, int len) /* map len code points from unicode[] into one GooString */
 {
-	if (!index) {
-		return;
+	static UnicodeMap *uMap = NULL; /* looked up on the first call, then reused */
+	GooString gstr;
+	gchar buf[8]; /* 8 is enough for mapping a Unicode char to a string */
+	int i, n;
+
+	if (uMap == NULL) {
+		GooString *enc = new GooString ("UTF-8");
+		uMap = globalParams->getUnicodeMap (enc); /* NOTE(review): assumes globalParams already exists - confirm */
+		uMap->incRefCnt ();
+		delete enc;
+	}
+
+	for (i = 0; i < len; ++i) {
+		n = uMap->mapUnicode (unicode[i], buf, sizeof (buf)); /* presumably the number of bytes put in buf */
+		gstr.append(buf, n);
 	}
 
+	return gstr;
+}
+/*
+static void
+read_toc (GooList  *items,
+          GString **toc)
+{
+	guint length, i;
+
 	if (!*toc) {
 		*toc = g_string_new ("");
 	}
 
-	do {
-		PopplerAction *action;
-		PopplerIndexIter *iter;
+	length = items->getLength ();
+
+	for (i = 0; i < length; i++) {
+		OutlineItem *item;
+		LinkAction *action;
 
-		action = poppler_index_iter_get_action (index);
+		item = (OutlineItem *) items->get (i);
 
-		if (!action) {
+		link_action = item->getAction ();
+
+		if (!link_action) {
 			continue;
 		}
 
-		switch (action->type) {
-			case POPPLER_ACTION_GOTO_DEST: {
-				PopplerActionGotoDest *ag = (PopplerActionGotoDest*) action;
-				PopplerDest *agd = ag->dest;
-
-				if (!tracker_is_empty_string (ag->title)) {
-					g_string_append_printf (*toc, "%s ", ag->title);
+		switch (link_action->getKind()) {
+			case actionGoTo: {
+				guint title_length = item->getTitleLength ();
+				LinkGoto *gto = dynamic_cast <LinkGoTo *> (link_action);
+				GooString *named_dest = gto->getNamedDest ();
+				const gchar *ndest = named_dest->getCString ();
+
+				if (title_length > 0) {
+					GooString gstr;
+					gstr = unicode_to_char (item->getTitle(),
+					                        title_length);
+					g_string_append_printf (*toc, "%s ", gstr.getCString ());
 				}
 
-				if (!tracker_is_empty_string (agd->named_dest)) {
-					g_string_append_printf (*toc, "%s ", agd->named_dest);
+				if (!tracker_is_empty_string (ndest)) {
+					g_string_append_printf (*toc, "%s ", ndest);
 				}
 
 				break;
 			}
 
-			case POPPLER_ACTION_LAUNCH: {
+			case actionLaunch: {
 				PopplerActionLaunch *al = (PopplerActionLaunch*) action;
 
 				if (!tracker_is_empty_string (al->title)) {
@@ -107,7 +153,7 @@ read_toc (PopplerIndexIter  *index,
 				break;
 			}
 
-			case POPPLER_ACTION_URI: {
+			case actionURI: {
 				PopplerActionUri *au = (PopplerActionUri*) action;
 
 				if (!tracker_is_empty_string (au->uri)) {
@@ -117,7 +163,7 @@ read_toc (PopplerIndexIter  *index,
 				break;
 			}
 
-			case POPPLER_ACTION_NAMED: {
+			case actionNamed: {
 				PopplerActionNamed *an = (PopplerActionNamed*) action;
 
 				if (!tracker_is_empty_string (an->title)) {
@@ -131,7 +177,7 @@ read_toc (PopplerIndexIter  *index,
 				break;
 			}
 
-			case POPPLER_ACTION_MOVIE: {
+			case actionMovie: {
 				PopplerActionNamed *am = (PopplerActionNamed*) action;
 
 				if (!tracker_is_empty_string (am->title)) {
@@ -141,34 +187,39 @@ read_toc (PopplerIndexIter  *index,
 				break;
 			}
 
-			case POPPLER_ACTION_NONE:
-			case POPPLER_ACTION_UNKNOWN:
-			case POPPLER_ACTION_GOTO_REMOTE:
-				/* Do nothing */
+			case actionNone:
+			case actionUnknown:
+			case actionGoToR:
+				* Do nothing *
 				break;
 		}
 
-		iter = poppler_index_iter_get_child (index);
-		read_toc (iter, toc);
-	} while (poppler_index_iter_next (index));
+		if (item->hasKids ())
+			read_toc (item->getKids (), toc);
+	}
 
-	poppler_index_iter_free (index);
 }
-
+*/
 static void
-read_outline (PopplerDocument      *document,
+read_outline (PDFDoc               *document,
               TrackerSparqlBuilder *metadata)
 {
-	PopplerIndexIter *index;
+	Outline *outline;
 	GString *toc = NULL;
+	GooList *items;
 
-	index = poppler_index_iter_new (document);
+	outline = document->getOutline();
 
-	if (!index) {
+	if (!outline) {
 		return;
 	}
 
-	read_toc (index, &toc);
+	items = outline->getItems ();
+
+	if (items == NULL)
+		return;
+
+//	read_toc (items, &toc);
 
 	if (toc) {
 		if (toc->len > 0) {
@@ -217,38 +268,87 @@ insert_keywords (TrackerSparqlBuilder *metadata,
 	}
 }
 
+/*
+ * Report the crop-box size of @page as it is displayed: the crop width and
+ * height are swapped when the page is rotated by 90 or 270 degrees.
+ * Either output pointer may be NULL; that dimension is then not written.
+ */
+static void
+page_get_size (Page    *page,
+               gdouble *width,
+               gdouble *height)
+{
+	const gint rotation = page->getRotate ();
+	const gdouble crop_w = page->getCropWidth ();
+	const gdouble crop_h = page->getCropHeight ();
+	const gboolean sideways = (rotation == 90 || rotation == 270);
+
+	if (width != NULL) {
+		*width = sideways ? crop_h : crop_w;
+	}
+
+	if (height != NULL) {
+		*height = sideways ? crop_w : crop_h;
+	}
+}
+
 static gchar *
-extract_content (PopplerDocument *document,
-                 guint            n_words)
+extract_content (PDFDoc *document,
+                 guint   n_words)
 {
-	PopplerPage *page;
-	PopplerRectangle rect;
+	Page *page;
+	Catalog *catalog;
 	GString *string;
 	gint n_pages, i, words;
-	gchar *text, *t;
+	gchar *t;
 
-	n_pages = poppler_document_get_n_pages (document);
+	n_pages = document->getNumPages();
 	string = g_string_new ("");
 	words = i = 0;
+	catalog = document->getCatalog();
 
 	while (i < n_pages && words < n_words) {
-		gint normalized_words;
-
-		page = poppler_document_get_page (document, i);
+		guint normalized_words = 0;
+		Gfx *gfx;
+		GooString *sel_text;
+		TextOutputDev *text_dev;
+		PDFRectangle pdf_selection;
+		gdouble height = 0, width = 0;
+
+		page = catalog->getPage (i + 1);
 		i++;
 
-		rect.x1 = rect.y1 = 0;
-		poppler_page_get_size (page, &rect.x2, &rect.y2);
+		text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
+		gfx = page->createGfx (text_dev,
+		                       72.0, 72.0, 0,
+		                       gFalse, /* useMediaBox */
+		                       gTrue, /* Crop */
+		                       -1, -1, -1, -1,
+		                       gFalse, /* printing */
+		                       catalog,
+		                       NULL, NULL, NULL, NULL);
+
+		page->display(gfx);
+		text_dev->endPage();
+
+		page_get_size (page, &width, &height);
 
-		text = poppler_page_get_text (page, POPPLER_SELECTION_WORD, &rect);
-		t = tracker_text_normalize (text, n_words - words, &normalized_words);
+		pdf_selection.x1 = 0;
+		pdf_selection.y1 = 0;
+		pdf_selection.x2 = width;
+		pdf_selection.y2 = height;
+
+		sel_text = text_dev->getSelectionText (&pdf_selection, selectionStyleWord);
+
+		t = tracker_text_normalize (sel_text->getCString (), n_words - words, &normalized_words);
 
 		words += normalized_words;
 		g_string_append (string, t);
 
-		g_free (text);
 		g_free (t);
-		g_object_unref (page);
+
+		delete gfx;
+		delete text_dev;
 	}
 
 	return g_string_free (string, FALSE);
@@ -288,41 +388,133 @@ write_pdf_data (PDFData               data,
 	}
 }
 
+
+/*
+ * Open the PDF at @uri (a file URI) with an optional UTF-8 @password.
+ * Returns NULL when @uri has no local filename; otherwise a new PDFDoc that
+ * the caller must check with isOk() and delete.
+ */
+static PDFDoc*
+poppler_document_new_pdf_from_file (const char  *uri,
+                                    const char  *password)
+{
+	PDFDoc *newDoc;
+	GooString *filename_g;
+	GooString *password_g = NULL;
+	gchar *filename;
+
+	if (!globalParams) {
+		globalParams = new GlobalParams();
+	}
+
+	filename = g_filename_from_uri (uri, NULL, NULL);
+	if (!filename)
+		return NULL;
+
+	/* filename_g is handed to the PDFDoc below and not deleted here */
+	filename_g = new GooString (filename);
+	g_free (filename);
+
+	if (password != NULL) {
+		gchar *latin = NULL;
+
+		if (g_utf8_validate (password, -1, NULL))
+			latin = g_convert (password, -1, "ISO-8859-1", "UTF-8", NULL, NULL, NULL);
+		/* Not UTF-8, or not representable in Latin-1: use the raw bytes */
+		password_g = new GooString (latin ? latin : password);
+		g_free (latin);
+	}
+
+	newDoc = new PDFDoc(filename_g, password_g, password_g);
+	delete password_g;
+
+	return newDoc;
+}
+
+/*
+ * Return the string stored under @key in @info_dict as newly allocated UTF-8,
+ * or NULL if the entry is not a string or the conversion fails. Values with a
+ * Unicode marker are converted from UTF-16BE; others go through pdfDocEncoding.
+ */
+static gchar*
+info_dict_get_string (Dict *info_dict, const gchar *key)
+{
+	Object entry;
+	gchar *utf8;
+
+	if (!info_dict->lookup ((gchar *)key, &entry)->isString ()) {
+		entry.free ();
+		return NULL;
+	}
+
+	GooString *raw = entry.getString ();
+
+	if (!raw->hasUnicodeMarker()) {
+		int pos, n_bytes = raw->getLength ();
+		gunichar *code_points = g_new (gunichar, n_bytes + 1);
+
+		for (pos = 0; pos < n_bytes; pos++)
+			code_points[pos] = pdfDocEncoding[(unsigned char)raw->getChar(pos)];
+		code_points[pos] = 0;
+
+		utf8 = g_ucs4_to_utf8 (code_points, -1, NULL, NULL, NULL);
+		g_free (code_points);
+	} else {
+		/* Skip the two marker bytes before converting */
+		utf8 = g_convert (raw->getCString () + 2, raw->getLength () - 2,
+		                  "UTF-8", "UTF-16BE", NULL, NULL, NULL);
+	}
+
+	entry.free ();
+	return utf8;
+}
+
 static void
 extract_pdf (const gchar          *uri,
              TrackerSparqlBuilder *preupdate,
              TrackerSparqlBuilder *metadata)
 {
 	TrackerFTSConfig *fts_config;
-	GTime creation_date;
-	GError *error = NULL;
 	TrackerXmpData *xd = NULL;
 	PDFData pd = { 0 }; /* actual data */
 	PDFData md = { 0 }; /* for merging */
-	PopplerDocument *document;
+	PDFDoc *document;
 	gchar *xml = NULL;
 	gchar *content;
 	guint n_words;
+	Object obj;
+	Catalog *catalog;
 
 	g_type_init ();
 
-	document = poppler_document_new_from_file (uri, NULL, &error);
-
-	if (error) {
-		if (error->code == POPPLER_ERROR_ENCRYPTED) {
-			tracker_sparql_builder_predicate (metadata, "a");
-			tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+	document = poppler_document_new_pdf_from_file (uri, NULL);
 
-			tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
-			tracker_sparql_builder_object_boolean (metadata, TRUE);
-			return;
-		} else {
-			g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
-			           uri,
-			           error->message ? error->message : "no error given");
+	if (!document) { /* NULL: uri could not be mapped to a local filename */
+		g_warning ("Couldn't create PopplerDocument from uri:'%s', not a local file URI", uri);
+		return;
+	} else if (!document->isOk()) {
+		switch (document->getErrorCode()) {
+			case errEncrypted:
+				tracker_sparql_builder_predicate (metadata, "a");
+				tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+				tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+				tracker_sparql_builder_object_boolean (metadata, TRUE);
+				break;
+			case errBadCatalog:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', Failed to read the document catalog", uri);
+				break;
+			case errDamaged:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', PDF document is damaged", uri);
+				break;
+			case errOpenFile:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', %s", uri, g_strerror (document->getFopenErrno()));
+				break;
+			default:
+				g_warning ("Couldn't create PopplerDocument from uri:'%s', no error given", uri);
+				break;
 		}
 
-		g_error_free (error);
+		delete document;
 		return;
 	}
 
@@ -336,21 +528,24 @@ extract_pdf (const gchar          *uri,
 	tracker_sparql_builder_predicate (metadata, "a");
 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
 
-	g_object_get (document,
-	              "title", &pd.title,
-	              "author", &pd.author,
-	              "subject", &pd.subject,
-	              "keywords", &pd.keywords,
-	              "creation-date", &creation_date,
-	              NULL);
-
-	/* metadata property not present in older poppler versions */
-	if (g_object_class_find_property (G_OBJECT_GET_CLASS (document), "metadata")) {
-		g_object_get (document, "metadata", &xml, NULL);
+	document->getDocInfo (&obj);
+	if (obj.isDict ()) {
+		Dict *info_dict = obj.getDict();
+		pd.title = info_dict_get_string (info_dict, "Title");
+		pd.author = info_dict_get_string (info_dict, "Author");
+		pd.subject = info_dict_get_string (info_dict, "Subject");
+		pd.keywords = info_dict_get_string (info_dict, "Keywords");
+		pd.creation_date = info_dict_get_string (info_dict, "CreationDate"); /* NOTE(review): raw Info-dict string, no longer tracker_date_to_string() output - confirm consumers */
 	}
-
-	if (creation_date > 0) {
-		pd.creation_date = tracker_date_to_string ((time_t) creation_date);
+	obj.free ();
+
+	catalog = document->getCatalog ();
+	if (catalog && catalog->isOk ()) {
+		GooString *s = catalog->readMetadata ();
+		if ( s != NULL ) {
+			xml = g_strdup (s->getCString()); /* own copy (g_free it): s is deleted just below */
+			delete s;
+		}
 	}
 
 	if (xml) {
@@ -569,7 +764,7 @@ extract_pdf (const gchar          *uri,
 	}
 
 	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
-	tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
+	tracker_sparql_builder_object_int64 (metadata, document->getNumPages());
 
 	fts_config = tracker_main_get_fts_config ();
 	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
@@ -583,7 +778,7 @@ extract_pdf (const gchar          *uri,
 
 	read_outline (document, metadata);
 
-	g_object_unref (document);
+	delete document;
 }
 
 TrackerExtractData *



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]