[tracker-miners/sam/extract-empty-text-files] tracker-extract: Process small / empty text files



commit 0f67ca52a52eb634cafbb7defbc7b28e1c8c710c
Author: Sam Thursfield <sam afuera me uk>
Date:   Wed Feb 12 22:45:31 2020 +0100

    tracker-extract: Process small / empty text files
    
    Previously, small or empty text files were ignored by the extractor.
    This could cause inconsistent results in the database: for example,
    if the user truncated a regular text file to 0 bytes, the old
    content would remain in the store.
    
    We now always update the nie:plainTextContent field for small and
    0-byte files.

 src/tracker-extract/meson.build            |  2 +-
 src/tracker-extract/tracker-extract-text.c | 24 ++++++++++-----
 src/tracker-extract/tracker-extract.h      |  3 +-
 src/tracker-extract/tracker-read.c         | 48 +++++++++++++-----------------
 src/tracker-extract/tracker-read.h         |  8 +++--
 5 files changed, 45 insertions(+), 40 deletions(-)
---
diff --git a/src/tracker-extract/meson.build b/src/tracker-extract/meson.build
index b8ab6baaa..aa9abaf04 100644
--- a/src/tracker-extract/meson.build
+++ b/src/tracker-extract/meson.build
@@ -125,7 +125,7 @@ foreach module : modules
   shared_module(name, sources,
     c_args: tracker_c_args,
     dependencies: [tracker_extract_dep] + dependencies,
-    include_directories: configinc,
+    include_directories: [configinc, commoninc],
     install: true,
     install_dir: tracker_extract_modules_dir)
 
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 8c6daebcb..80712f889 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -37,11 +37,13 @@
 #include <libtracker-extract/tracker-extract.h>
 
 #include "tracker-main.h"
+#include "tracker-extract.h"
 #include "tracker-read.h"
 
 static gchar *
-get_file_content (GFile *file,
-                  gsize  n_bytes)
+get_file_content (GFile   *file,
+                  gsize    n_bytes,
+                  GError **error)
 {
        gchar *text, *uri, *path;
        int fd;
@@ -54,9 +56,8 @@ get_file_content (GFile *file,
        fd = tracker_file_open_fd (path);
 
        if (fd == -1) {
-               g_message ("Could not open file '%s': %s",
-                          uri,
-                          g_strerror (errno));
+               g_set_error (error, TRACKER_EXTRACT_ERROR, TRACKER_EXTRACT_ERROR_IO_ERROR,
+                            "Could not open file '%s': %s", uri, g_strerror (errno));
                g_free (uri);
                g_free (path);
                return NULL;
@@ -68,7 +69,7 @@ get_file_content (GFile *file,
        /* Read up to n_bytes from stream. Output is always, always valid UTF-8,
         * this function closes the FD.
         */
-       text = tracker_read_text_from_fd (fd, n_bytes);
+       text = tracker_read_text_from_fd (fd, n_bytes, error);
        g_free (uri);
        g_free (path);
 
@@ -81,13 +82,18 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
        TrackerResource *metadata;
        TrackerConfig *config;
        gchar *content = NULL;
+       GError *error = NULL;
 
        config = tracker_main_get_config ();
 
-       content = get_file_content (tracker_extract_info_get_file (info), tracker_config_get_max_bytes (config));
+       content = get_file_content (tracker_extract_info_get_file (info),
+                                   tracker_config_get_max_bytes (config),
+                                   &error);
 
-       if (content == NULL) {
+       if (error != NULL) {
                /* An error occurred, perhaps the file was deleted. */
+               g_message ("Error extracting content: %s", error->message);
+               g_error_free (error);
                return FALSE;
        }
 
@@ -98,6 +104,8 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
        if (content) {
                tracker_resource_set_string (metadata, "nie:plainTextContent", content);
                g_free (content);
+       } else {
+               tracker_resource_set_string (metadata, "nie:plainTextContent", "");
        }
 
        tracker_extract_info_set_resource (info, metadata);
diff --git a/src/tracker-extract/tracker-extract.h b/src/tracker-extract/tracker-extract.h
index 06aae9578..3fb532bfc 100644
--- a/src/tracker-extract/tracker-extract.h
+++ b/src/tracker-extract/tracker-extract.h
@@ -43,7 +43,8 @@ typedef struct TrackerExtractClass TrackerExtractClass;
 
 typedef enum {
        TRACKER_EXTRACT_ERROR_NO_MIMETYPE,
-       TRACKER_EXTRACT_ERROR_NO_EXTRACTOR
+       TRACKER_EXTRACT_ERROR_NO_EXTRACTOR,
+       TRACKER_EXTRACT_ERROR_IO_ERROR,
 } TrackerExtractError;
 
 struct TrackerExtract {
diff --git a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
index 6de9b677b..da8540bfa 100644
--- a/src/tracker-extract/tracker-read.c
+++ b/src/tracker-extract/tracker-read.c
@@ -29,6 +29,7 @@
 #include <libtracker-extract/tracker-extract.h>
 
 #include "tracker-read.h"
+#include "tracker-extract.h"
 
 /* Size of the buffer to use when reading, in bytes */
 #define BUFFER_SIZE 65535
@@ -107,20 +108,13 @@ process_chunk (const gchar  *read_bytes,
         * case where we read 10 bytes in and it is just one
         * line with no '\n'. Once we have confirmed this we
         * check that the buffer has a '\n' to make sure the
-        * file is worth indexing. Similarly if the file has
-        * <= 3 bytes then we drop it.
+        * file is worth indexing.
         *
         * NOTE: We may have non-UTF8 content read (say,
         * UTF-16LE), so we can't rely on methods which assume
         * NUL-terminated strings, as g_strstr_len().
         */
        if (s->len == 0) {
-               if (read_size <= 3) {
-                       g_debug ("  File has less than 3 characters in it, "
-                                "not indexing file");
-                       return FALSE;
-               }
-
                if (read_size == buffer_size) {
                        const gchar *i;
                        gboolean eol_found = FALSE;
@@ -159,7 +153,8 @@ process_chunk (const gchar  *read_bytes,
 }
 
 static gchar *
-process_whole_string (GString  *s)
+process_whole_string (GString  *s,
+                      GError  **error)
 {
        gchar *utf8 = NULL;
        gsize  utf8_len = 0;
@@ -169,7 +164,7 @@ process_whole_string (GString  *s)
         * Windows OS. We will only accept text files in UTF-16 which come
         * with a proper BOM. */
        if (s->len > 2) {
-               GError *error = NULL;
+               GError *inner_error = NULL;
 
                if (memcmp (s->str, "\xFF\xFE", 2) == 0) {
                        g_debug ("String comes in UTF-16LE, converting");
@@ -179,7 +174,7 @@ process_whole_string (GString  *s)
                                          "UTF-16LE",
                                          NULL,
                                          &utf8_len,
-                                         &error);
+                                         &inner_error);
 
                } else if (memcmp (s->str, "\xFE\xFF", 2) == 0) {
                        g_debug ("String comes in UTF-16BE, converting");
@@ -189,13 +184,11 @@ process_whole_string (GString  *s)
                                          "UTF-16BE",
                                          NULL,
                                          &utf8_len,
-                                         &error);
+                                         &inner_error);
                }
 
-               if (error) {
-                       g_warning ("Couldn't convert string from UTF-16 to UTF-8...: %s",
-                                  error->message);
-                       g_error_free (error);
+               if (inner_error) {
+                       g_propagate_error (error, inner_error);
                        g_string_free (s, TRUE);
                        return NULL;
                }
@@ -264,7 +257,8 @@ process_whole_string (GString  *s)
  **/
 gchar *
 tracker_read_text_from_stream (GInputStream *stream,
-                               gsize         max_bytes)
+                               gsize         max_bytes,
+                               GError      **error)
 {
        GString *s = NULL;
        gsize n_bytes_remaining = max_bytes;
@@ -282,7 +276,7 @@ tracker_read_text_from_stream (GInputStream *stream,
         */
        while (n_bytes_remaining > 0) {
                gchar buf[BUFFER_SIZE];
-               GError *error = NULL;
+               GError *inner_error = NULL;
                gsize n_bytes_read;
 
                /* Read bytes from stream */
@@ -291,10 +285,8 @@ tracker_read_text_from_stream (GInputStream *stream,
                                              MIN (BUFFER_SIZE, n_bytes_remaining),
                                              &n_bytes_read,
                                              NULL,
-                                             &error)) {
-                       g_message ("Error reading from stream: '%s'",
-                                  error->message);
-                       g_error_free (error);
+                                             &inner_error)) {
+                       g_propagate_error (error, inner_error);
                        break;
                }
 
@@ -309,7 +301,7 @@ tracker_read_text_from_stream (GInputStream *stream,
        }
 
        /* Validate UTF-8 if something was read, and return it */
-       return s ? process_whole_string (s) : NULL;
+       return s ? process_whole_string (s, error) : NULL;
 }
 
 
@@ -327,15 +319,17 @@ tracker_read_text_from_stream (GInputStream *stream,
  * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
  **/
 gchar *
-tracker_read_text_from_fd (gint  fd,
-                           gsize max_bytes)
+tracker_read_text_from_fd (gint     fd,
+                           gsize    max_bytes,
+                           GError **error)
 {
        FILE *fz;
        GString *s;
        gsize n_bytes_remaining = max_bytes;
 
        if ((fz = fdopen (fd, "r")) == NULL) {
-               g_warning ("Cannot read from FD... could not extract text");
+               g_set_error (error, TRACKER_EXTRACT_ERROR, TRACKER_EXTRACT_ERROR_IO_ERROR,
+                            "Cannot read from file so could not extract text.");
                close (fd);
                return NULL;
        }
@@ -378,5 +372,5 @@ tracker_read_text_from_fd (gint  fd,
        fclose (fz);
 
        /* Validate UTF-8 if something was read, and return it */
-       return process_whole_string (s);
+       return process_whole_string (s, error);
 }
diff --git a/src/tracker-extract/tracker-read.h b/src/tracker-extract/tracker-read.h
index 513801916..919d661e9 100644
--- a/src/tracker-extract/tracker-read.h
+++ b/src/tracker-extract/tracker-read.h
@@ -26,10 +26,12 @@
 G_BEGIN_DECLS
 
 gchar *tracker_read_text_from_stream (GInputStream *stream,
-                                      gsize         max_bytes);
+                                      gsize         max_bytes,
+                                      GError      **error);
 
-gchar *tracker_read_text_from_fd (gint  fd,
-                                  gsize max_bytes);
+gchar *tracker_read_text_from_fd (gint     fd,
+                                  gsize    max_bytes,
+                                  GError **error);
 
 G_END_DECLS
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]