[tracker/miner-web: 44/77] Fixes GB#609075, Adding support for pdf extractor to extract the index data from the pdf files
- From: Adrien Bustany <abustany src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/miner-web: 44/77] Fixes GB#609075, Adding support for pdf extractor to extract the index data from the pdf files
- Date: Wed, 3 Mar 2010 12:52:24 +0000 (UTC)
commit 066210f8769bb4a6084f16a763b23cac5f969c63
Author: Amin Jain <ext-amit 1 jain nokia com>
Date: Thu Feb 25 15:28:21 2010 +0000
Fixes GB#609075, Adding support for pdf extractor to extract the index data from the pdf files
src/tracker-extract/tracker-extract-pdf.c | 132 +++++++++++++++++++++++++++++
1 files changed, 132 insertions(+), 0 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index 32490d1..b651765 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -1,6 +1,7 @@
/*
* Copyright (C) 2006, Mr Jamie McCracken (jamiemcc gnome org)
* Copyright (C) 2008-2009, Nokia
+ * Copyright (C) 2010, Amit Aggarwal (amitcs06 gmail com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -56,6 +57,135 @@ static TrackerExtractData data[] = {
};
+/*
+ * read_toc:
+ * @index: outline iterator to walk. May be NULL, in which case the function
+ *         returns immediately without touching @toc. Otherwise the iterator
+ *         is released with poppler_index_iter_free() before returning, so
+ *         the caller must not use it afterwards.
+ * @toc: in/out text buffer. When @index is non-NULL and *@toc is NULL, an
+ *       empty GString is created and stored in *@toc. Every non-empty text
+ *       field found on the outline actions is appended, each followed by
+ *       a separator.
+ *
+ * Visits each entry at the level @index points to and recurses into the
+ * children of each entry.
+ */
static void
+read_toc (PopplerIndexIter  *index,
+          GString          **toc)
+{
+	if (!index) {
+		return;
+	}
+
+	if (!*toc) {
+		*toc = g_string_new ("");
+	}
+
+	do {
+		PopplerAction *action;
+		PopplerIndexIter *iter;
+
+		action = poppler_index_iter_get_action (index);
+
+		if (!action) {
+			/* In a do/while, `continue` jumps to the loop
+			 * condition, so the iterator still advances; the
+			 * children of this entry are not visited.
+			 */
+			continue;
+		}
+
+		switch (action->type) {
+		case POPPLER_ACTION_GOTO_DEST: {
+			PopplerActionGotoDest *ag = (PopplerActionGotoDest*) action;
+			PopplerDest *agd = ag->dest;
+
+			if (!tracker_is_empty_string (ag->title)) {
+				g_string_append_printf (*toc, "%s ", ag->title);
+			}
+
+			/* Guard against an action that carries no destination */
+			if (agd != NULL &&
+			    !tracker_is_empty_string (agd->named_dest)) {
+				g_string_append_printf (*toc, "%s ", agd->named_dest);
+			}
+
+			break;
+		}
+
+		case POPPLER_ACTION_LAUNCH: {
+			PopplerActionLaunch *al = (PopplerActionLaunch*) action;
+
+			if (!tracker_is_empty_string (al->title)) {
+				g_string_append_printf (*toc, "%s ", al->title);
+			}
+
+			if (!tracker_is_empty_string (al->file_name)) {
+				g_string_append_printf (*toc, "%s ", al->file_name);
+			}
+
+			if (!tracker_is_empty_string (al->params)) {
+				g_string_append_printf (*toc, "%s ", al->params);
+			}
+
+			break;
+		}
+
+		case POPPLER_ACTION_URI: {
+			PopplerActionUri *au = (PopplerActionUri*) action;
+
+			if (!tracker_is_empty_string (au->uri)) {
+				g_string_append_printf (*toc, "%s ", au->uri);
+			}
+
+			break;
+		}
+
+		case POPPLER_ACTION_NAMED: {
+			PopplerActionNamed *an = (PopplerActionNamed*) action;
+
+			if (!tracker_is_empty_string (an->title)) {
+				g_string_append_printf (*toc, "%s, ", an->title);
+			}
+
+			if (!tracker_is_empty_string (an->named_dest)) {
+				g_string_append_printf (*toc, "%s ", an->named_dest);
+			}
+
+			break;
+		}
+
+		case POPPLER_ACTION_MOVIE: {
+			/* Use the member type that matches the action type */
+			PopplerActionMovie *am = (PopplerActionMovie*) action;
+
+			if (!tracker_is_empty_string (am->title)) {
+				g_string_append_printf (*toc, "%s ", am->title);
+			}
+
+			break;
+		}
+
+		case POPPLER_ACTION_NONE:
+		case POPPLER_ACTION_UNKNOWN:
+		case POPPLER_ACTION_GOTO_REMOTE:
+		default:
+			/* Nothing to index; `default` also covers action
+			 * types not listed above.
+			 */
+			break;
+		}
+
+		/* The action returned by poppler_index_iter_get_action() is
+		 * a copy owned by the caller, so release it here to avoid
+		 * leaking one action per outline entry.
+		 */
+		poppler_action_free (action);
+
+		/* A NULL child is handled by the guard at the top of the
+		 * recursive call, which also frees a non-NULL child.
+		 */
+		iter = poppler_index_iter_get_child (index);
+		read_toc (iter, toc);
+	} while (poppler_index_iter_next (index));
+
+	poppler_index_iter_free (index);
+}
+
+/*
+ * read_outline:
+ * @document: PDF document whose outline is read.
+ * @metadata: SPARQL builder that receives the result.
+ *
+ * Collects the outline text through read_toc() and, when any text was
+ * gathered, emits it as the object of an nfo:tableOfContents predicate.
+ * Nothing is emitted when the document yields no index iterator.
+ */
+static void
+read_outline (PopplerDocument      *document,
+              TrackerSparqlBuilder *metadata)
+{
+	GString *outline_text = NULL;
+	PopplerIndexIter *root;
+
+	root = poppler_index_iter_new (document);
+
+	if (root == NULL) {
+		return;
+	}
+
+	/* read_toc() frees the iterator it is given */
+	read_toc (root, &outline_text);
+
+	if (outline_text == NULL) {
+		return;
+	}
+
+	if (outline_text->len != 0) {
+		tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
+		tracker_sparql_builder_object_unvalidated (metadata, outline_text->str);
+	}
+
+	g_string_free (outline_text, TRUE);
+}
+
+static void
insert_keywords (TrackerSparqlBuilder *metadata,
gchar *keywords)
{
@@ -466,6 +596,8 @@ extract_pdf (const gchar *uri,
g_free (content);
}
+ read_outline (document, metadata);
+
g_object_unref (document);
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]