Re: [Tracker] PATCH: v2 Simplified and improved extraction of oasis text files.



Never mind. It looks like mail delivery of the Tracker ML was turned off
for me; because of that, I didn't see Martyn's replies.

On Wed, 2012-05-23 at 15:55 +0200, Philip Van Hoof wrote:
Hi team,

Is somebody picking up patch review of this stuff by Karl?

PS: I've been very busy lately with a variety of things, but if nobody does, I can
look into this in a few weeks. Hopefully, for the sake of Karl's efforts, somebody
else will do a review before that.

Kind regards,

Philip

On Sat, 2012-04-14 at 13:44 +0100, Karl Relton wrote:
My first patch inadvertently set the wrong tag type - this patch has that corrected.

----

As per thread starting at
http://mail.gnome.org/archives/tracker-list/2012-April/msg00012.html

here is a proposed patch that simplifies (and improves) the indexing of
oasis text files (.odt files). With this patch you get a lot more of the
content indexed from typical files saved by LibreOffice, and so they are
far more likely to show up in searches.

Karl

--- tracker-0.14.0.orig/src/tracker-extract/tracker-extract-oasis.c 2012-04-09 13:31:04.132949981 +0100
+++ tracker-0.14.0/src/tracker-extract/tracker-extract-oasis.c      2012-04-09 19:13:15.553943645 +0100
@@ -59,7 +59,6 @@ typedef struct {
 
 typedef struct {
    ODTTagType current;
-   gboolean styles_present;
    ODTFileType file_type;
    GString *content;
    gulong bytes_pending;
@@ -128,7 +127,6 @@ extract_oasis_content (const gchar
    /* Create parse info */
    info.current = ODT_TAG_TYPE_UNKNOWN;
    info.file_type = file_type;
-   info.styles_present = FALSE;
    info.content = g_string_new ("");
    info.bytes_pending = total_bytes;
 
@@ -391,45 +389,12 @@ xml_start_element_handler_content (GMark
 
    switch (data->file_type) {
    case FILE_TYPE_ODT:
-           if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
-               (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
-               (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
-               (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
-                   data->styles_present = TRUE;
-           } else if (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) {
-                   data->current = ODT_TAG_TYPE_WORD_TEXT;
-           } else if (g_ascii_strcasecmp (element_name, "text:p") == 0) {
-                   if (data->styles_present) {
-                           data->current = ODT_TAG_TYPE_WORD_TEXT;
-                           break;
-                   }
-
-                   for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-                           if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
-                                   continue;
-                           }
-
-                           if ((g_ascii_strcasecmp (*v, "title-article") == 0) ||
-                               (g_ascii_strcasecmp (*v, "para-padding") == 0) ||
-                               (g_ascii_strcasecmp (*v, "para-screen") == 0)) {
-                                   data->current = ODT_TAG_TYPE_WORD_TEXT;
-                           }
-                   }
-           } else if (g_ascii_strcasecmp (element_name, "text:h") == 0) {
-                   for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
-                           if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
-                                   continue;
-                           }
-
-                           if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
-                                   data->current = ODT_TAG_TYPE_WORD_TEXT;
-                           }
-                   }
-           } else if (g_ascii_strcasecmp (element_name, "text:span") == 0) {
-                   data->current = ODT_TAG_TYPE_WORD_TEXT;
-           } else if ((g_ascii_strcasecmp (element_name, "text:a") == 0) ||
-                      (g_ascii_strcasecmp (element_name, "text:s") == 0)) {
-                   data->current = ODT_TAG_TYPE_WORD_TEXT;
+           if ((g_ascii_strcasecmp (element_name, "text:p") == 0) ||
+               (g_ascii_strcasecmp (element_name, "text:h") == 0) ||
+               (g_ascii_strcasecmp (element_name, "text:a") == 0) ||
+               (g_ascii_strcasecmp (element_name, "text:span") == 0) ||
+               (g_ascii_strcasecmp (element_name, "table:table-cell")) == 0) {
+                   data->current = ODT_TAG_TYPE_WORD_TEXT;
            } else {
                    data->current = -1;
            }
@@ -461,23 +426,8 @@ xml_end_element_handler_content (GMarkup
 {
    ODTContentParseInfo *data = user_data;
 
-   switch (data->file_type) {
-   case FILE_TYPE_ODT:
-           if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
-               (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
-               (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
-               (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
-                   data->styles_present = FALSE;
-           }
-           break;
-   default:
-           break;
-   }
+   data->current = -1;
 
-   if ((g_ascii_strcasecmp (element_name, "text:a") != 0) &&
-       (g_ascii_strcasecmp (element_name, "text:s") != 0)) {
-           data->current = -1;
-   }
 }
 
 static void



_______________________________________________
tracker-list mailing list
tracker-list gnome org
http://mail.gnome.org/mailman/listinfo/tracker-list



-- 


Philip Van Hoof
Software developer
Codeminded BVBA - http://codeminded.be




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]