[tracker/poppler-glib: 4/16] tracker-extract, pdf: Port 'Avoid duplicate tags'



commit 9a95de9db9f75681c6eff7b34561a61471bb8098
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue Feb 22 11:08:57 2011 +0100

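    tracker-extract,pdf: Port 'Avoid duplicate tags'
    
    Original commit ID: 39d7aa2ac913a10195a3f2ceacfa54a9be930247
    
    The static insert_keywords() helper is removed. Keyword strings from
    the PDF info and from XMP are now passed to tracker_keywords_parse(),
    which collects them into a single GPtrArray, and the nao:hasTag
    statements are written in one loop in extract_pdf() after both the
    XMP and the non-XMP paths have run.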

 src/tracker-extract/tracker-extract-pdf.c |   78 +++++++++++------------------
 1 files changed, 30 insertions(+), 48 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index 391471a..ca99212 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -182,43 +182,6 @@ read_outline (PopplerDocument      *document,
 	}
 }
 
-static void
-insert_keywords (TrackerSparqlBuilder *metadata,
-                 gchar                *keywords)
-{
-	char *saveptr, *p;
-	size_t len;
-
-	p = keywords;
-	keywords = strchr (keywords, '"');
-
-	if (keywords) {
-		keywords++;
-	} else {
-		keywords = p;
-	}
-
-	len = strlen (keywords);
-	if (keywords[len - 1] == '"') {
-		keywords[len - 1] = '\0';
-	}
-
-	for (p = strtok_r (keywords, ",;", &saveptr);
-	     p;
-	     p = strtok_r (NULL, ",;", &saveptr)) {
-		tracker_sparql_builder_predicate (metadata, "nao:hasTag");
-
-		tracker_sparql_builder_object_blank_open (metadata);
-		tracker_sparql_builder_predicate (metadata, "a");
-		tracker_sparql_builder_object (metadata, "nao:Tag");
-
-		tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
-		tracker_sparql_builder_object_unvalidated (metadata, p);
-
-		tracker_sparql_builder_object_blank_close (metadata);
-	}
-}
-
 static gchar *
 extract_content (PopplerDocument *document,
                  gsize            n_bytes)
@@ -231,7 +194,8 @@ extract_content (PopplerDocument *document,
 	string = g_string_new ("");
 	timer = g_timer_new ();
 
-	while (i < n_pages && n_bytes > 0 &&
+	while (i < n_pages &&
+	       n_bytes > 0 &&
 	       g_timer_elapsed (timer, NULL) < 5) {
 		PopplerPage *page;
 		gsize written_bytes;
@@ -266,7 +230,8 @@ extract_content (PopplerDocument *document,
 
 static void
 write_pdf_data (PDFData               data,
-                TrackerSparqlBuilder *metadata)
+                TrackerSparqlBuilder *metadata,
+                GPtrArray            *keywords)
 {
 	if (!tracker_is_empty_string (data.title)) {
 		tracker_sparql_builder_predicate (metadata, "nie:title");
@@ -294,7 +259,7 @@ write_pdf_data (PDFData               data,
 	}
 
 	if (!tracker_is_empty_string (data.keywords)) {
-		insert_keywords (metadata, data.keywords);
+		tracker_keywords_parse (keywords, data.keywords);
 	}
 }
 
@@ -313,6 +278,8 @@ extract_pdf (const gchar          *uri,
 	gchar *xml = NULL;
 	gchar *content;
 	guint n_bytes;
+	GPtrArray *keywords;
+	guint i;
 
 	g_type_init ();
 
@@ -359,6 +326,8 @@ extract_pdf (const gchar          *uri,
 		pd.creation_date = tracker_date_to_string ((time_t) creation_date);
 	}
 
+	keywords = g_ptr_array_new ();
+
 	if (xml &&
 	    (xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) {
 		/* The casts here are well understood and known */
@@ -367,18 +336,14 @@ extract_pdf (const gchar          *uri,
 		md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
 		md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);
 
-		write_pdf_data (md, metadata);
+		write_pdf_data (md, metadata, keywords);
 
 		if (xd->keywords) {
-			insert_keywords (metadata, xd->keywords);
+			tracker_keywords_parse (keywords, xd->keywords);
 		}
 
 		if (xd->pdf_keywords) {
-			insert_keywords (metadata, xd->pdf_keywords);
-		}
-
-		if (pd.keywords) {
-			insert_keywords (metadata, pd.keywords);
+			tracker_keywords_parse (keywords, xd->pdf_keywords);
 		}
 
 		if (xd->publisher) {
@@ -558,8 +523,25 @@ extract_pdf (const gchar          *uri,
 		/* So if we are here we have NO XMP data and we just
 		 * write what we know from Poppler.
 		 */
-		write_pdf_data (pd, metadata);
+		write_pdf_data (pd, metadata, keywords);
+	}
+
+	for (i = 0; i < keywords->len; i++) {
+		gchar *p;
+
+		p = (gchar *) g_ptr_array_index (keywords, i);
+
+		tracker_sparql_builder_predicate (metadata, "nao:hasTag");
+
+		tracker_sparql_builder_object_blank_open (metadata);
+		tracker_sparql_builder_predicate (metadata, "a");
+		tracker_sparql_builder_object (metadata, "nao:Tag");
+		tracker_sparql_builder_predicate (metadata, "nao:prefLabel");
+		tracker_sparql_builder_object_unvalidated (metadata, p);
+		tracker_sparql_builder_object_blank_close (metadata);
+		g_free (p);
 	}
+	g_ptr_array_free (keywords, TRUE);
 
 	tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
 	tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]