Re: [Tracker] PATCH: v2 Simplified and improved extraction of oasis text files.
- From: Philip Van Hoof <philip codeminded be>
- To: Karl Relton <karllinuxtest relton ntlworld com>
- Cc: tracker-list gnome org
- Subject: Re: [Tracker] PATCH: v2 Simplified and improved extraction of oasis text files.
- Date: Wed, 23 May 2012 15:55:07 +0200
Hi team,
Is somebody picking up patch review of this stuff by Karl?
PS: I'm very busy lately with a variety of things, but if nobody does, I can
look into this in a few weeks. Hopefully, for the sake of Karl's efforts,
somebody else will do a review before that.
Kind regards,
Philip
On Sat, 2012-04-14 at 13:44 +0100, Karl Relton wrote:
My first patch inadvertently set the wrong tag type - this patch has that corrected.
----
As per thread starting at
http://mail.gnome.org/archives/tracker-list/2012-April/msg00012.html
here is a proposed patch that simplifies (and improves) the indexing of
oasis text files (.odt files). With this patch you get a lot more of the
content indexed on a typical file saved by LibreOffice, and so such files are
far more likely to show up in searches.
Karl
--- tracker-0.14.0.orig/src/tracker-extract/tracker-extract-oasis.c 2012-04-09 13:31:04.132949981 +0100
+++ tracker-0.14.0/src/tracker-extract/tracker-extract-oasis.c 2012-04-09 19:13:15.553943645 +0100
@@ -59,7 +59,6 @@ typedef struct {
typedef struct {
ODTTagType current;
- gboolean styles_present;
ODTFileType file_type;
GString *content;
gulong bytes_pending;
@@ -128,7 +127,6 @@ extract_oasis_content (const gchar
/* Create parse info */
info.current = ODT_TAG_TYPE_UNKNOWN;
info.file_type = file_type;
- info.styles_present = FALSE;
info.content = g_string_new ("");
info.bytes_pending = total_bytes;
@@ -391,45 +389,12 @@ xml_start_element_handler_content (GMark
switch (data->file_type) {
case FILE_TYPE_ODT:
- if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
- data->styles_present = TRUE;
- } else if (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
- } else if (g_ascii_strcasecmp (element_name, "text:p") == 0) {
- if (data->styles_present) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
- break;
- }
-
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
- continue;
- }
-
- if ((g_ascii_strcasecmp (*v, "title-article") == 0) ||
- (g_ascii_strcasecmp (*v, "para-padding") == 0) ||
- (g_ascii_strcasecmp (*v, "para-screen") == 0)) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
- }
- }
- } else if (g_ascii_strcasecmp (element_name, "text:h") == 0) {
- for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
- if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
- continue;
- }
-
- if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
- }
- }
- } else if (g_ascii_strcasecmp (element_name, "text:span") == 0) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
- } else if ((g_ascii_strcasecmp (element_name, "text:a") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:s") == 0)) {
- data->current = ODT_TAG_TYPE_WORD_TEXT;
+ if ((g_ascii_strcasecmp (element_name, "text:p") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:h") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:a") == 0) ||
+ (g_ascii_strcasecmp (element_name, "text:span") == 0) ||
+ (g_ascii_strcasecmp (element_name, "table:table-cell")) == 0) {
+ data->current = ODT_TAG_TYPE_WORD_TEXT;
} else {
data->current = -1;
}
@@ -461,23 +426,8 @@ xml_end_element_handler_content (GMarkup
{
ODTContentParseInfo *data = user_data;
- switch (data->file_type) {
- case FILE_TYPE_ODT:
- if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
- (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
- data->styles_present = FALSE;
- }
- break;
- default:
- break;
- }
+ data->current = -1;
- if ((g_ascii_strcasecmp (element_name, "text:a") != 0) &&
- (g_ascii_strcasecmp (element_name, "text:s") != 0)) {
- data->current = -1;
- }
}
static void
_______________________________________________
tracker-list mailing list
tracker-list gnome org
http://mail.gnome.org/mailman/listinfo/tracker-list
--
Philip Van Hoof
Software developer
Codeminded BVBA - http://codeminded.be
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]