[tracker/chunck-pdf] Experimental



commit d4e4493fea310917b45f5cba5e3bbc95958d7274
Author: Philip Van Hoof <philip codeminded be>
Date:   Fri Nov 18 14:45:30 2011 +0100

    Experimental

 src/tracker-extract/tracker-extract-pdf.c |   49 +++++++++++++++++++----------
 1 files changed, 32 insertions(+), 17 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index af31369..ce162f0 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -203,32 +203,47 @@ extract_content (PopplerDocument *document,
 	       g_timer_elapsed (timer, NULL) < 5) {
 		PopplerPage *page;
 		gsize written_bytes;
-		gchar *text;
+			   gchar *text;
+		PopplerRectangle rectangle = {0, 0, 0, 0};
+		double height = 0, piece;
+		gint part;
 
 		page = poppler_document_get_page (document, i);
 		i++;
 
-		text = poppler_page_get_text (page);
+		poppler_page_get_size (page, &rectangle.x2, &height);
+		piece = height / 10;
 
-		if (!text) {
-			g_object_unref (page);
-			continue;
-		}
+		for (part = 1; part <= 10 && g_timer_elapsed (timer, NULL) < 105; part++) {
 
-		if (tracker_text_validate_utf8 (text,
-		                                MIN (strlen (text), remaining_bytes),
-		                                &string,
-		                                &written_bytes)) {
-			g_string_append_c (string, ' ');
-		}
+			rectangle.y1 = piece * (part - 1);
+			rectangle.y2 = piece * part;
+
+
+			text = poppler_page_get_selected_text (page, POPPLER_SELECTION_GLYPH, &rectangle);
+
+			g_print ("from %f to %f took %fs\n", rectangle.y1, rectangle.y2, g_timer_elapsed (timer, NULL));
 
-		remaining_bytes -= written_bytes;
+			if (!text) {
+				continue;
+			}
+
+			if (tracker_text_validate_utf8 (text,
+			                                MIN (strlen (text), remaining_bytes),
+			                                &string,
+			                                &written_bytes)) {
+				g_string_append_c (string, ' ');
+			}
 
-		g_debug ("Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
-		         "%" G_GSIZE_FORMAT " bytes remaining",
-		         written_bytes, i, remaining_bytes);
+			remaining_bytes -= written_bytes;
+
+			g_debug ("Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
+			         "%" G_GSIZE_FORMAT " bytes remaining",
+			         written_bytes, i, remaining_bytes);
+
+			g_free (text);
+		}
 
-		g_free (text);
 		g_object_unref (page);
 	}
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]