[tracker] Make the PDF extractor use tracker_text_normalize().
- From: Carlos Garnacho <carlosg src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [tracker] Make the PDF extractor use tracker_text_normalize().
- Date: Thu, 8 Oct 2009 16:20:28 +0000 (UTC)
commit 6f7a65e2e026384ee739cc37fd56e2a9612287d9
Author: Carlos Garnacho <carlos lanedo com>
Date: Thu Oct 8 18:16:12 2009 +0200
Make the PDF extractor use tracker_text_normalize().
src/tracker-extract/tracker-extract-pdf.c | 39 ++++++-----------------------
1 files changed, 8 insertions(+), 31 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index b8cf817..8517b78 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -94,51 +94,28 @@ extract_content (PopplerDocument *document,
GString *string;
gint n_pages, i, words;
gchar *text, *t;
- gboolean in_break = TRUE;
- gunichar ch;
n_pages = poppler_document_get_n_pages (document);
string = g_string_new ("");
- words = 0;
- i = 0;
+ words = i = 0;
while (i < n_pages && words < n_words) {
+ gint normalized_words;
+
page = poppler_document_get_page (document, i);
i++;
rect.x1 = rect.y1 = 0;
poppler_page_get_size (page, &rect.x2, &rect.y2);
- text = t = poppler_page_get_text (page, POPPLER_SELECTION_WORD, &rect);
-
- while ((ch = g_utf8_get_char_validated (t, -1)) > 0) {
- GUnicodeType type;
-
- type = g_unichar_type (ch);
-
- if (type == G_UNICODE_LOWERCASE_LETTER ||
- type == G_UNICODE_MODIFIER_LETTER ||
- type == G_UNICODE_OTHER_LETTER ||
- type == G_UNICODE_TITLECASE_LETTER ||
- type == G_UNICODE_UPPERCASE_LETTER) {
- /* Append regular chars */
- g_string_append_unichar (string, ch);
- in_break = FALSE;
- } else if (!in_break) {
- /* Non-regular char found, treat as word break */
- g_string_append_c (string, ' ');
- in_break = TRUE;
- words++;
-
- if (words > n_words) {
- break;
- }
- }
-
- t = g_utf8_find_next_char (t, NULL);
- }
+ text = poppler_page_get_text (page, POPPLER_SELECTION_WORD, &rect);
+ t = tracker_text_normalize (text, n_words - words, &normalized_words);
+
+ words += normalized_words;
+ g_string_append (string, t);
g_free (text);
+ g_free (t);
}
return g_string_free (string, FALSE);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]