[tracker/tracker-1.0] tracker-extract-oasis: Continue extracting if we find embedded tabs + line breaks



commit 3e40515b6a5961dbf3f1183f8b49572eb1776095
Author: Karl Relton <karllinuxtest relton ntlworld com>
Date:   Thu Jul 10 11:01:00 2014 +0100

    tracker-extract-oasis: Continue extracting if we find embedded tabs + line breaks
    
    The following patch improves the oasis extractor on odt documents so
    that it keeps extracting plain text content even when there are embedded
    space (text:s), tab (text:tab) and line-break (text:line-break) XML tags.
    Without this patch the extractor stops when such a tag is encountered,
    and typically resumes at the next paragraph or style/format change. This
    means extractable text is missed.

 src/tracker-extract/tracker-extract-oasis.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 70e4492..658b140 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -395,7 +395,10 @@ xml_start_element_handler_content (GMarkupParseContext  *context,
                    (g_ascii_strcasecmp (element_name, "text:h") == 0) ||
                    (g_ascii_strcasecmp (element_name, "text:a") == 0) ||
                    (g_ascii_strcasecmp (element_name, "text:span") == 0) ||
-                   (g_ascii_strcasecmp (element_name, "table:table-cell")) == 0) {
+                   (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) ||
+                   (g_ascii_strcasecmp (element_name, "text:s") == 0) ||
+                   (g_ascii_strcasecmp (element_name, "text:tab") == 0) ||
+                   (g_ascii_strcasecmp (element_name, "text:line-break") == 0)) {
                        data->current = ODT_TAG_TYPE_WORD_TEXT;
                } else {
                        data->current = -1;
@@ -436,7 +439,13 @@ xml_end_element_handler_content (GMarkupParseContext  *context,
 {
        ODTContentParseInfo *data = user_data;
 
-       data->current = -1;
+       /* Don't stop processing if it was a so-called 'empty' tag (e.g. <text:tab/>) */
+       if (!((g_ascii_strcasecmp (element_name, "text:s") == 0)   ||
+             (g_ascii_strcasecmp (element_name, "text:tab") == 0) ||
+             (g_ascii_strcasecmp (element_name, "text:line-break") == 0))) {
+               data->current = -1;
+       }
+
 }
 
 static void


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]