[tracker-miners/sam/extract-empty-text-files] tracker-extract: Process small / empty text files
- From: Sam Thursfield <sthursfield src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker-miners/sam/extract-empty-text-files] tracker-extract: Process small / empty text files
- Date: Wed, 12 Feb 2020 21:48:05 +0000 (UTC)
commit 0f67ca52a52eb634cafbb7defbc7b28e1c8c710c
Author: Sam Thursfield <sam afuera me uk>
Date: Wed Feb 12 22:45:31 2020 +0100
tracker-extract: Process small / empty text files
Previously small or empty text files were ignored by the extractor.
This could cause inconsistent results in the database. For example,
if the user truncated a regular text file to 0 bytes, the old
content would remain in the store.
We now always update the nie:plainTextContent field for small and
0-byte files.
src/tracker-extract/meson.build | 2 +-
src/tracker-extract/tracker-extract-text.c | 24 ++++++++++-----
src/tracker-extract/tracker-extract.h | 3 +-
src/tracker-extract/tracker-read.c | 48 +++++++++++++-----------------
src/tracker-extract/tracker-read.h | 8 +++--
5 files changed, 45 insertions(+), 40 deletions(-)
---
diff --git a/src/tracker-extract/meson.build b/src/tracker-extract/meson.build
index b8ab6baaa..aa9abaf04 100644
--- a/src/tracker-extract/meson.build
+++ b/src/tracker-extract/meson.build
@@ -125,7 +125,7 @@ foreach module : modules
shared_module(name, sources,
c_args: tracker_c_args,
dependencies: [tracker_extract_dep] + dependencies,
- include_directories: configinc,
+ include_directories: [configinc, commoninc],
install: true,
install_dir: tracker_extract_modules_dir)
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 8c6daebcb..80712f889 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -37,11 +37,13 @@
#include <libtracker-extract/tracker-extract.h>
#include "tracker-main.h"
+#include "tracker-extract.h"
#include "tracker-read.h"
static gchar *
-get_file_content (GFile *file,
- gsize n_bytes)
+get_file_content (GFile *file,
+ gsize n_bytes,
+ GError **error)
{
gchar *text, *uri, *path;
int fd;
@@ -54,9 +56,8 @@ get_file_content (GFile *file,
fd = tracker_file_open_fd (path);
if (fd == -1) {
- g_message ("Could not open file '%s': %s",
- uri,
- g_strerror (errno));
+ g_set_error (error, TRACKER_EXTRACT_ERROR, TRACKER_EXTRACT_ERROR_IO_ERROR,
+ "Could not open file '%s': %s", uri, g_strerror (errno));
g_free (uri);
g_free (path);
return NULL;
@@ -68,7 +69,7 @@ get_file_content (GFile *file,
/* Read up to n_bytes from stream. Output is always, always valid UTF-8,
* this function closes the FD.
*/
- text = tracker_read_text_from_fd (fd, n_bytes);
+ text = tracker_read_text_from_fd (fd, n_bytes, error);
g_free (uri);
g_free (path);
@@ -81,13 +82,18 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
TrackerResource *metadata;
TrackerConfig *config;
gchar *content = NULL;
+ GError *error = NULL;
config = tracker_main_get_config ();
- content = get_file_content (tracker_extract_info_get_file (info), tracker_config_get_max_bytes (config));
+ content = get_file_content (tracker_extract_info_get_file (info),
+ tracker_config_get_max_bytes (config),
+ &error);
- if (content == NULL) {
+ if (error != NULL) {
/* An error occurred, perhaps the file was deleted. */
+ g_message ("Error extracting content: %s", error->message);
+ g_error_free (error);
return FALSE;
}
@@ -98,6 +104,8 @@ tracker_extract_get_metadata (TrackerExtractInfo *info)
if (content) {
tracker_resource_set_string (metadata, "nie:plainTextContent", content);
g_free (content);
+ } else {
+ tracker_resource_set_string (metadata, "nie:plainTextContent", "");
}
tracker_extract_info_set_resource (info, metadata);
diff --git a/src/tracker-extract/tracker-extract.h b/src/tracker-extract/tracker-extract.h
index 06aae9578..3fb532bfc 100644
--- a/src/tracker-extract/tracker-extract.h
+++ b/src/tracker-extract/tracker-extract.h
@@ -43,7 +43,8 @@ typedef struct TrackerExtractClass TrackerExtractClass;
typedef enum {
TRACKER_EXTRACT_ERROR_NO_MIMETYPE,
- TRACKER_EXTRACT_ERROR_NO_EXTRACTOR
+ TRACKER_EXTRACT_ERROR_NO_EXTRACTOR,
+ TRACKER_EXTRACT_ERROR_IO_ERROR,
} TrackerExtractError;
struct TrackerExtract {
diff --git a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
index 6de9b677b..da8540bfa 100644
--- a/src/tracker-extract/tracker-read.c
+++ b/src/tracker-extract/tracker-read.c
@@ -29,6 +29,7 @@
#include <libtracker-extract/tracker-extract.h>
#include "tracker-read.h"
+#include "tracker-extract.h"
/* Size of the buffer to use when reading, in bytes */
#define BUFFER_SIZE 65535
@@ -107,20 +108,13 @@ process_chunk (const gchar *read_bytes,
* case where we read 10 bytes in and it is just one
* line with no '\n'. Once we have confirmed this we
* check that the buffer has a '\n' to make sure the
- * file is worth indexing. Similarly if the file has
- * <= 3 bytes then we drop it.
+ * file is worth indexing.
*
* NOTE: We may have non-UTF8 content read (say,
* UTF-16LE), so we can't rely on methods which assume
* NUL-terminated strings, as g_strstr_len().
*/
if (s->len == 0) {
- if (read_size <= 3) {
- g_debug (" File has less than 3 characters in it, "
- "not indexing file");
- return FALSE;
- }
-
if (read_size == buffer_size) {
const gchar *i;
gboolean eol_found = FALSE;
@@ -159,7 +153,8 @@ process_chunk (const gchar *read_bytes,
}
static gchar *
-process_whole_string (GString *s)
+process_whole_string (GString *s,
+ GError **error)
{
gchar *utf8 = NULL;
gsize utf8_len = 0;
@@ -169,7 +164,7 @@ process_whole_string (GString *s)
* Windows OS. We will only accept text files in UTF-16 which come
* with a proper BOM. */
if (s->len > 2) {
- GError *error = NULL;
+ GError *inner_error = NULL;
if (memcmp (s->str, "\xFF\xFE", 2) == 0) {
g_debug ("String comes in UTF-16LE, converting");
@@ -179,7 +174,7 @@ process_whole_string (GString *s)
"UTF-16LE",
NULL,
&utf8_len,
- &error);
+ &inner_error);
} else if (memcmp (s->str, "\xFE\xFF", 2) == 0) {
g_debug ("String comes in UTF-16BE, converting");
@@ -189,13 +184,11 @@ process_whole_string (GString *s)
"UTF-16BE",
NULL,
&utf8_len,
- &error);
+ &inner_error);
}
- if (error) {
- g_warning ("Couldn't convert string from UTF-16 to UTF-8...: %s",
- error->message);
- g_error_free (error);
+ if (inner_error) {
+ g_propagate_error (error, inner_error);
g_string_free (s, TRUE);
return NULL;
}
@@ -264,7 +257,8 @@ process_whole_string (GString *s)
**/
gchar *
tracker_read_text_from_stream (GInputStream *stream,
- gsize max_bytes)
+ gsize max_bytes,
+ GError **error)
{
GString *s = NULL;
gsize n_bytes_remaining = max_bytes;
@@ -282,7 +276,7 @@ tracker_read_text_from_stream (GInputStream *stream,
*/
while (n_bytes_remaining > 0) {
gchar buf[BUFFER_SIZE];
- GError *error = NULL;
+ GError *inner_error = NULL;
gsize n_bytes_read;
/* Read bytes from stream */
@@ -291,10 +285,8 @@ tracker_read_text_from_stream (GInputStream *stream,
MIN (BUFFER_SIZE, n_bytes_remaining),
&n_bytes_read,
NULL,
- &error)) {
- g_message ("Error reading from stream: '%s'",
- error->message);
- g_error_free (error);
+ &inner_error)) {
+ g_propagate_error (error, inner_error);
break;
}
@@ -309,7 +301,7 @@ tracker_read_text_from_stream (GInputStream *stream,
}
/* Validate UTF-8 if something was read, and return it */
- return s ? process_whole_string (s) : NULL;
+ return s ? process_whole_string (s, error) : NULL;
}
@@ -327,15 +319,17 @@ tracker_read_text_from_stream (GInputStream *stream,
* Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
**/
gchar *
-tracker_read_text_from_fd (gint fd,
- gsize max_bytes)
+tracker_read_text_from_fd (gint fd,
+ gsize max_bytes,
+ GError **error)
{
FILE *fz;
GString *s;
gsize n_bytes_remaining = max_bytes;
if ((fz = fdopen (fd, "r")) == NULL) {
- g_warning ("Cannot read from FD... could not extract text");
+ g_set_error (error, TRACKER_EXTRACT_ERROR, TRACKER_EXTRACT_ERROR_IO_ERROR,
+ "Cannot read from file so could not extract text.");
close (fd);
return NULL;
}
@@ -378,5 +372,5 @@ tracker_read_text_from_fd (gint fd,
fclose (fz);
/* Validate UTF-8 if something was read, and return it */
- return process_whole_string (s);
+ return process_whole_string (s, error);
}
diff --git a/src/tracker-extract/tracker-read.h b/src/tracker-extract/tracker-read.h
index 513801916..919d661e9 100644
--- a/src/tracker-extract/tracker-read.h
+++ b/src/tracker-extract/tracker-read.h
@@ -26,10 +26,12 @@
G_BEGIN_DECLS
gchar *tracker_read_text_from_stream (GInputStream *stream,
- gsize max_bytes);
+ gsize max_bytes,
+ GError **error);
-gchar *tracker_read_text_from_fd (gint fd,
- gsize max_bytes);
+gchar *tracker_read_text_from_fd (gint fd,
+ gsize max_bytes,
+ GError **error);
G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index] [Date Index] [Author Index]