[tracker/posix_fadvise: 1/3] tracker-extract: PDF, code clean up
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/posix_fadvise: 1/3] tracker-extract: PDF, code clean up
- Date: Mon, 11 Oct 2010 14:57:00 +0000 (UTC)
commit 52a7c80baa8b141a548bfd6ed9bd0705ac028091
Author: Martyn Russell <martyn lanedo com>
Date: Mon Oct 11 14:11:52 2010 +0100
tracker-extract: PDF, code clean up
src/tracker-extract/tracker-extract-pdf.cpp | 353 +++++++++++++++------------
1 files changed, 197 insertions(+), 156 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.cpp b/src/tracker-extract/tracker-extract-pdf.cpp
index 465b817..3b3fd76 100644
--- a/src/tracker-extract/tracker-extract-pdf.cpp
+++ b/src/tracker-extract/tracker-extract-pdf.cpp
@@ -61,18 +61,21 @@ static TrackerExtractData data[] = {
{ NULL, NULL }
};
-/**
- * Philip ported this from a poppler-glib based version to a C++ libpopler
- * version because the TextOutputDev allows us to extract text and metadata much
- * faster than the default CairoOutputDev that poppler-glib uses in case it got
- * compiled with support for Cairo. Regretfully can't this be selected at
- * runtime in the poppler-glib bindings. Apologies to the GObject/GLib fans. */
+/*
+ * Philip ported this from a poppler-glib based version to a C++
+ * libpoppler version because the TextOutputDev allows us to extract
+ * text and metadata much faster than the default CairoOutputDev that
+ * poppler-glib uses in case it got compiled with support for Cairo.
+ * Regretfully, this can't be selected at runtime in the poppler-glib
+ * bindings. Apologies to the GObject/GLib fans.
+ */
static gchar *
unicode_to_char (Unicode *unicode,
int len)
{
static UnicodeMap *uMap = NULL;
+
if (uMap == NULL) {
GooString *enc = new GooString("UTF-8");
uMap = globalParams->getUnicodeMap(enc);
@@ -85,8 +88,8 @@ unicode_to_char (Unicode *unicode,
int i, n;
for (i = 0; i < len; ++i) {
- n = uMap->mapUnicode(unicode[i], buf, sizeof(buf));
- gstr.append(buf, n);
+ n = uMap->mapUnicode (unicode[i], buf, sizeof(buf));
+ gstr.append (buf, n);
}
return g_strdup (gstr.getCString ());
@@ -98,8 +101,9 @@ read_toc (GooList *items,
{
guint length, i;
- if (!items)
+ if (!items) {
return;
+ }
if (!*toc) {
*toc = g_string_new ("");
@@ -120,114 +124,130 @@ read_toc (GooList *items,
}
switch (link_action->getKind()) {
- case actionGoTo: {
- LinkGoTo *gto = dynamic_cast <LinkGoTo *> (link_action);
-
- if (gto) {
- guint title_length = item->getTitleLength ();
- GooString *named_dest = gto->getNamedDest ();
-
- if (title_length > 0) {
- gchar *str = unicode_to_char (item->getTitle(),
- title_length);
- g_string_append_printf (*toc, "%s ", str);
- g_free (str);
- }
-
- if (named_dest)
- g_string_append_printf (*toc, "%s ", named_dest->getCString ());
- }
+ case actionGoTo: {
+ LinkGoTo *gto = dynamic_cast <LinkGoTo *> (link_action);
- break;
- }
+ if (gto) {
+ GooString *named_dest;
+ guint title_length;
- case actionLaunch: {
- LinkLaunch *lan = dynamic_cast <LinkLaunch *> (link_action);
+ title_length = item->getTitleLength ();
+ named_dest = gto->getNamedDest ();
- if (lan) {
- guint title_length = item->getTitleLength ();
- GooString *filen, *param;
+ if (title_length > 0) {
+ gchar *str;
- filen = lan->getFileName();
- param = lan->getParams();
+ str = unicode_to_char (item->getTitle(), title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
+ }
- if (title_length > 0) {
- gchar *str = unicode_to_char (item->getTitle(),
- title_length);
- g_string_append_printf (*toc, "%s ", str);
- g_free (str);
- }
+ if (named_dest)
+ g_string_append_printf (*toc, "%s ", named_dest->getCString ());
+ }
- if (filen)
- g_string_append_printf (*toc, "%s ", filen->getCString ());
+ break;
+ }
- if (param)
- g_string_append_printf (*toc, "%s ", param->getCString ());
- }
+ case actionLaunch: {
+ LinkLaunch *lan = dynamic_cast <LinkLaunch *> (link_action);
- break;
- }
+ if (lan) {
+ GooString *filen, *param;
+ guint title_length;
- case actionURI: {
- LinkURI *uri = dynamic_cast <LinkURI *> (link_action);
+ filen = lan->getFileName();
+ param = lan->getParams();
+ title_length = item->getTitleLength ();
- if (uri) {
- GooString *muri;
+ if (title_length > 0) {
+ gchar *str;
- muri = uri->getURI();
+ str = unicode_to_char (item->getTitle (), title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
+ }
- if (muri)
- g_string_append_printf (*toc, "%s ", muri->getCString ());
+ if (filen) {
+ g_string_append_printf (*toc, "%s ", filen->getCString ());
}
- break;
+ if (param) {
+ g_string_append_printf (*toc, "%s ", param->getCString ());
+ }
}
- case actionNamed: {
- LinkNamed *named = dynamic_cast <LinkNamed *> (link_action);
+ break;
+ }
- if (named) {
- GooString *named_dest = named->getName ();
- guint title_length = item->getTitleLength ();
+ case actionURI: {
+ LinkURI *uri = dynamic_cast <LinkURI *> (link_action);
- if (title_length > 0) {
- gchar *str = unicode_to_char (item->getTitle(),
- title_length);
- g_string_append_printf (*toc, "%s ", str);
- g_free (str);
- }
+ if (uri) {
+ GooString *muri;
- if (named_dest)
- g_string_append_printf (*toc, "%s ", named_dest->getCString ());
- }
+ muri = uri->getURI ();
- break;
+ if (muri) {
+ g_string_append_printf (*toc, "%s ", muri->getCString ());
+ }
}
- case actionMovie: {
- guint title_length = item->getTitleLength ();
+ break;
+ }
+
+ case actionNamed: {
+ LinkNamed *named = dynamic_cast <LinkNamed *> (link_action);
+
+ if (named) {
+ GooString *named_dest;
+ guint title_length;
+
+ named_dest = named->getName ();
+ title_length = item->getTitleLength ();
if (title_length > 0) {
- gchar *str = unicode_to_char (item->getTitle(),
- title_length);
+ gchar *str;
+
+ str = unicode_to_char (item->getTitle(), title_length);
g_string_append_printf (*toc, "%s ", str);
g_free (str);
}
- break;
+ if (named_dest) {
+ g_string_append_printf (*toc, "%s ", named_dest->getCString ());
+ }
+ }
+
+ break;
+ }
+
+ case actionMovie: {
+ guint title_length = item->getTitleLength ();
+
+ if (title_length > 0) {
+ gchar *str;
+
+ str = unicode_to_char (item->getTitle (), title_length);
+ g_string_append_printf (*toc, "%s ", str);
+ g_free (str);
}
- case actionRendition:
- case actionSound:
- case actionJavaScript:
- case actionUnknown:
- case actionGoToR:
- /* Do nothing */
- break;
+ break;
}
- if (item->hasKids ())
+ case actionRendition:
+ case actionSound:
+ case actionJavaScript:
+ case actionUnknown:
+ case actionGoToR:
+ /* Do nothing */
+ break;
+ }
+
+ if (item->hasKids ()) {
read_toc (item->getKids (), toc);
+ }
}
}
@@ -240,7 +260,7 @@ read_outline (PDFDoc *document,
GString *toc = NULL;
GooList *items;
- outline = document->getOutline();
+ outline = document->getOutline ();
if (!outline) {
return;
@@ -248,19 +268,22 @@ read_outline (PDFDoc *document,
items = outline->getItems ();
- if (items == NULL)
+ if (items == NULL) {
return;
+ }
read_toc (items, &toc);
- if (toc) {
- if (toc->len > 0) {
- tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
- tracker_sparql_builder_object_unvalidated (metadata, toc->str);
- }
+ if (!toc) {
+ return;
+ }
- g_string_free (toc, TRUE);
+ if (toc->len > 0) {
+ tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
+ tracker_sparql_builder_object_unvalidated (metadata, toc->str);
}
+
+ g_string_free (toc, TRUE);
}
@@ -269,23 +292,26 @@ page_get_size (Page *page,
gdouble *width,
gdouble *height)
{
- gdouble page_width, page_height;
- gint rotate;
-
- rotate = page->getRotate ();
-
- if (rotate == 90 || rotate == 270) {
- page_height = page->getCropWidth ();
- page_width = page->getCropHeight ();
- } else {
- page_width = page->getCropWidth ();
- page_height = page->getCropHeight ();
- }
-
- if (width != NULL)
- *width = page_width;
- if (height != NULL)
- *height = page_height;
+ gdouble page_width, page_height;
+ gint rotate;
+
+ rotate = page->getRotate ();
+
+ if (rotate == 90 || rotate == 270) {
+ page_height = page->getCropWidth ();
+ page_width = page->getCropHeight ();
+ } else {
+ page_width = page->getCropWidth ();
+ page_height = page->getCropHeight ();
+ }
+
+ if (width != NULL) {
+ *width = page_width;
+ }
+
+ if (height != NULL) {
+ *height = page_height;
+ }
}
static gchar *
@@ -321,15 +347,15 @@ extract_content (PDFDoc *document,
text_dev = new TextOutputDev (NULL, gTrue, gFalse, gFalse);
gfx = page->createGfx (text_dev,
72.0, 72.0, 0,
- gFalse, /* useMediaBox */
- gTrue, /* Crop */
+ gFalse, /* useMediaBox */
+ gTrue, /* Crop */
-1, -1, -1, -1,
- gFalse, /* printing */
+ gFalse, /* printing */
catalog,
NULL, NULL, NULL, NULL);
- page->display(gfx);
- text_dev->endPage();
+ page->display (gfx);
+ text_dev->endPage ();
page_get_size (page, &width, &height);
@@ -398,10 +424,9 @@ write_pdf_data (PDFData data,
}
}
-
-static PDFDoc*
-poppler_document_new_pdf_from_file (const char *uri,
- const char *password)
+static PDFDoc *
+new_pdf_from_file (const char *uri,
+ const char *password)
{
PDFDoc *newDoc;
GooString *filename_g;
@@ -409,12 +434,13 @@ poppler_document_new_pdf_from_file (const char *uri,
gchar *filename;
if (!globalParams) {
- globalParams = new GlobalParams();
+ globalParams = new GlobalParams ();
}
filename = g_filename_from_uri (uri, NULL, NULL);
- if (!filename)
+ if (!filename) {
return NULL;
+ }
filename_g = new GooString (filename);
g_free (filename);
@@ -424,10 +450,14 @@ poppler_document_new_pdf_from_file (const char *uri,
if (g_utf8_validate (password, -1, NULL)) {
gchar *password_latin;
- password_latin = g_convert (password, -1,
+ password_latin = g_convert (password,
+ -1,
"ISO-8859-1",
"UTF-8",
- NULL, NULL, NULL);
+ NULL,
+ NULL,
+ NULL);
+
password_g = new GooString (password_latin);
g_free (password_latin);
} else {
@@ -435,20 +465,21 @@ poppler_document_new_pdf_from_file (const char *uri,
}
}
- newDoc = new PDFDoc(filename_g, password_g, password_g);
+ newDoc = new PDFDoc (filename_g, password_g, password_g);
delete password_g;
return newDoc;
}
-static gchar*
-info_dict_get_string (Dict *info_dict, const gchar *key)
+static gchar *
+info_dict_get_string (Dict *info_dict,
+ const gchar *key)
{
Object obj;
GooString *goo_value;
gchar *result;
- if (!info_dict->lookup ((gchar *)key, &obj)->isString ()) {
+ if (!info_dict->lookup ((gchar*) key, &obj)->isString ()) {
obj.free ();
return NULL;
}
@@ -458,7 +489,11 @@ info_dict_get_string (Dict *info_dict, const gchar *key)
if (goo_value->hasUnicodeMarker()) {
result = g_convert (goo_value->getCString () + 2,
goo_value->getLength () - 2,
- "UTF-8", "UTF-16BE", NULL, NULL, NULL);
+ "UTF-8",
+ "UTF-16BE",
+ NULL,
+ NULL,
+ NULL);
} else {
int len;
gunichar *ucs4_temp;
@@ -466,9 +501,11 @@ info_dict_get_string (Dict *info_dict, const gchar *key)
len = goo_value->getLength ();
ucs4_temp = g_new (gunichar, len + 1);
+
for (i = 0; i < len; ++i) {
- ucs4_temp[i] = pdfDocEncoding[(unsigned char)goo_value->getChar(i)];
+ ucs4_temp[i] = pdfDocEncoding[(unsigned char) goo_value->getChar (i)];
}
+
ucs4_temp[i] = 0;
result = g_ucs4_to_utf8 (ucs4_temp, -1, NULL, NULL, NULL);
g_free (ucs4_temp);
@@ -498,7 +535,7 @@ extract_pdf (const gchar *uri,
g_type_init ();
- document = poppler_document_new_pdf_from_file (uri, NULL);
+ document = new_pdf_from_file (uri, NULL);
if (!document) {
g_warning ("Could not create PopplerDocument from uri:'%s', "
@@ -507,29 +544,30 @@ extract_pdf (const gchar *uri,
return;
}
- if (!document->isOk()) {
+ if (!document->isOk ()) {
int fopen_errno;
+
switch (document->getErrorCode()) {
- case errEncrypted:
- tracker_sparql_builder_predicate (metadata, "a");
- tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
- tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
- tracker_sparql_builder_object_boolean (metadata, TRUE);
- break;
- case errBadCatalog:
- g_warning ("Couldn't create PopplerDocument from uri:'%s', Failed to read the document catalog", uri);
- break;
- case errDamaged:
- g_warning ("Couldn't create PopplerDocument from uri:'%s', PDF document is damaged", uri);
- break;
- case errOpenFile:
- fopen_errno = document->getFopenErrno();
- g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
- uri, g_strerror (fopen_errno));
- break;
- default:
- g_warning ("Couldn't create PopplerDocument from uri:'%s', no error given", uri);
- break;
+ case errEncrypted:
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
+ tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
+ tracker_sparql_builder_object_boolean (metadata, TRUE);
+ break;
+ case errBadCatalog:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', Failed to read the document catalog", uri);
+ break;
+ case errDamaged:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', PDF document is damaged", uri);
+ break;
+ case errOpenFile:
+ fopen_errno = document->getFopenErrno ();
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
+ uri, g_strerror (fopen_errno));
+ break;
+ default:
+ g_warning ("Couldn't create PopplerDocument from uri:'%s', no error given", uri);
+ break;
}
delete document;
@@ -542,7 +580,9 @@ extract_pdf (const gchar *uri,
document->getDocInfo (&obj);
if (obj.isDict ()) {
gchar *creation_date;
- Dict *info_dict = obj.getDict();
+ Dict *info_dict;
+
+ info_dict = obj.getDict();
pd.title = info_dict_get_string (info_dict, "Title");
pd.author = info_dict_get_string (info_dict, "Author");
pd.subject = info_dict_get_string (info_dict, "Subject");
@@ -557,18 +597,19 @@ extract_pdf (const gchar *uri,
catalog = document->getCatalog ();
if (catalog && catalog->isOk ()) {
- GooString *s = catalog->readMetadata ();
- if (s != NULL) {
+ GooString *str = catalog->readMetadata ();
+
+ if (str != NULL) {
const gchar *xml;
- xml = s->getCString();
+ xml = str->getCString ();
xd = tracker_xmp_new (xml, strlen (xml), uri);
if (!xd) {
xd = g_new0 (TrackerXmpData, 1);
}
- delete s;
+ delete str;
/* The casts here are well understood and known */
md.title = (gchar *) tracker_coalesce_strip (4, pd.title, xd->title, xd->title2, xd->pdf_title);
@@ -797,7 +838,7 @@ extract_pdf (const gchar *uri,
for (i = 0; i < keywords->len; i++) {
gchar *p;
- p = (gchar *) g_ptr_array_index (keywords, i);
+ p = (gchar*) g_ptr_array_index (keywords, i);
tracker_sparql_builder_predicate (metadata, "nao:hasTag");
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]