[tracker/poppler-glib: 4/16] tracker-extract, pdf: Port 'Avoid duplicate tags'
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/poppler-glib: 4/16] tracker-extract, pdf: Port 'Avoid duplicate tags'
- Date: Wed, 2 Mar 2011 14:53:06 +0000 (UTC)
commit 9a95de9db9f75681c6eff7b34561a61471bb8098
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue Feb 22 11:08:57 2011 +0100
tracker-extract,pdf: Port 'Avoid duplicate tags'
Original commit ID: 39d7aa2ac913a10195a3f2ceacfa54a9be930247
src/tracker-extract/tracker-extract-pdf.c | 78 +++++++++++------------------
1 files changed, 30 insertions(+), 48 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index 391471a..ca99212 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -182,43 +182,6 @@ read_outline (PopplerDocument *document,
}
}
-static void
-insert_keywords (TrackerSparqlBuilder *metadata,
- gchar *keywords)
-{
- char *saveptr, *p;
- size_t len;
-
- p = keywords;
- keywords = strchr (keywords, '"');
-
- if (keywords) {
- keywords++;
- } else {
- keywords = p;
- }
-
- len = strlen (keywords);
- if (keywords[len - 1] == '"') {
- keywords[len - 1] = '\0';
- }
-
- for (p = strtok_r (keywords, ",;", &saveptr);
- p;
- p = strtok_r (NULL, ",;", &saveptr)) {
- tracker_sparql_builder_predicate (metadata, "nao:hasTag");
-
- tracker_sparql_builder_object_blank_open (metadata);
- tracker_sparql_builder_predicate (metadata, "a");
- tracker_sparql_builder_object (metadata, "nao:Tag");
-
- tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
- tracker_sparql_builder_object_unvalidated (metadata, p);
-
- tracker_sparql_builder_object_blank_close (metadata);
- }
-}
-
static gchar *
extract_content (PopplerDocument *document,
gsize n_bytes)
@@ -231,7 +194,8 @@ extract_content (PopplerDocument *document,
string = g_string_new ("");
timer = g_timer_new ();
- while (i < n_pages && n_bytes > 0 &&
+ while (i < n_pages &&
+ n_bytes > 0 &&
g_timer_elapsed (timer, NULL) < 5) {
PopplerPage *page;
gsize written_bytes;
@@ -266,7 +230,8 @@ extract_content (PopplerDocument *document,
static void
write_pdf_data (PDFData data,
- TrackerSparqlBuilder *metadata)
+ TrackerSparqlBuilder *metadata,
+ GPtrArray *keywords)
{
if (!tracker_is_empty_string (data.title)) {
tracker_sparql_builder_predicate (metadata, "nie:title");
@@ -294,7 +259,7 @@ write_pdf_data (PDFData data,
}
if (!tracker_is_empty_string (data.keywords)) {
- insert_keywords (metadata, data.keywords);
+ tracker_keywords_parse (keywords, data.keywords);
}
}
@@ -313,6 +278,8 @@ extract_pdf (const gchar *uri,
gchar *xml = NULL;
gchar *content;
guint n_bytes;
+ GPtrArray *keywords;
+ guint i;
g_type_init ();
@@ -359,6 +326,8 @@ extract_pdf (const gchar *uri,
pd.creation_date = tracker_date_to_string ((time_t) creation_date);
}
+ keywords = g_ptr_array_new ();
+
if (xml &&
(xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) {
/* The casts here are well understood and known */
@@ -367,18 +336,14 @@ extract_pdf (const gchar *uri,
md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);
- write_pdf_data (md, metadata);
+ write_pdf_data (md, metadata, keywords);
if (xd->keywords) {
- insert_keywords (metadata, xd->keywords);
+ tracker_keywords_parse (keywords, xd->keywords);
}
if (xd->pdf_keywords) {
- insert_keywords (metadata, xd->pdf_keywords);
- }
-
- if (pd.keywords) {
- insert_keywords (metadata, pd.keywords);
+ tracker_keywords_parse (keywords, xd->pdf_keywords);
}
if (xd->publisher) {
@@ -558,8 +523,25 @@ extract_pdf (const gchar *uri,
/* So if we are here we have NO XMP data and we just
* write what we know from Poppler.
*/
- write_pdf_data (pd, metadata);
+ write_pdf_data (pd, metadata, keywords);
+ }
+
+ for (i = 0; i < keywords->len; i++) {
+ gchar *p;
+
+ p = (gchar *) g_ptr_array_index (keywords, i);
+
+ tracker_sparql_builder_predicate (metadata, "nao:hasTag");
+
+ tracker_sparql_builder_object_blank_open (metadata);
+ tracker_sparql_builder_predicate (metadata, "a");
+ tracker_sparql_builder_object (metadata, "nao:Tag");
+ tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
+ tracker_sparql_builder_object_unvalidated (metadata, p);
+ tracker_sparql_builder_object_blank_close (metadata);
+ g_free (p);
}
+ g_ptr_array_free (keywords, TRUE);
tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]