[tracker/tracker-1.0] tracker-extract-oasis: Continue extracting if we find embedded tabs + line breaks
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/tracker-1.0] tracker-extract-oasis: Continue extracting if we find embedded tabs + line breaks
- Date: Thu, 10 Jul 2014 13:36:28 +0000 (UTC)
commit 3e40515b6a5961dbf3f1183f8b49572eb1776095
Author: Karl Relton <karllinuxtest relton ntlworld com>
Date: Thu Jul 10 11:01:00 2014 +0100
tracker-extract-oasis: Continue extracting if we find embedded tabs + line breaks
This patch improves the OASIS extractor for ODT documents so
that it keeps extracting plain-text content even when the text contains
embedded space, tab and line-break XML tags (<text:s/>, <text:tab/>,
<text:line-break/>). Without this patch, the extractor stops when
such a tag is encountered and typically resumes only at the next paragraph
or style/format change, which means extractable text is missed.
src/tracker-extract/tracker-extract-oasis.c | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 70e4492..658b140 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -395,7 +395,10 @@ xml_start_element_handler_content (GMarkupParseContext *context,
(g_ascii_strcasecmp (element_name, "text:h") == 0) ||
(g_ascii_strcasecmp (element_name, "text:a") == 0) ||
(g_ascii_strcasecmp (element_name, "text:span") == 0) ||
- (g_ascii_strcasecmp (element_name, "table:table-cell")) == 0) {
+ (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:s") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:tab") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:line-break") == 0)) {
data->current = ODT_TAG_TYPE_WORD_TEXT;
} else {
data->current = -1;
@@ -436,7 +439,13 @@ xml_end_element_handler_content (GMarkupParseContext *context,
{
ODTContentParseInfo *data = user_data;
- data->current = -1;
+ /* Don't stop processing if it was a so-called 'empty' tag (e.g. <text:tab/>) */
+ if (!((g_ascii_strcasecmp (element_name, "text:s") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:tab") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:line-break") == 0))) {
+ data->current = -1;
+ }
+
}
static void
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]