[tracker/extractor-remove-word-counting-review] Avoid the use of GIOChannels
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/extractor-remove-word-counting-review] Avoid the use of GIOChannels
- Date: Tue, 18 May 2010 13:36:40 +0000 (UTC)
commit 08539b4500f21233612aa8a44940efb7b75d4160
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue May 18 13:57:13 2010 +0200
Avoid the use of GIOChannels
src/libtracker-extract/tracker-utils.c | 2 +-
src/tracker-extract/Makefile.am | 4 +-
src/tracker-extract/tracker-extract-oasis.c | 39 ++--
src/tracker-extract/tracker-extract-text.c | 34 +--
src/tracker-extract/tracker-iochannel.c | 223 ---------------
src/tracker-extract/tracker-read.c | 284 ++++++++++++++++++++
.../{tracker-iochannel.h => tracker-read.h} | 18 +-
7 files changed, 328 insertions(+), 276 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
index 21a2f42..7f291b3 100644
--- a/src/libtracker-extract/tracker-utils.c
+++ b/src/libtracker-extract/tracker-utils.c
@@ -360,7 +360,7 @@ tracker_text_normalize (const gchar *text,
/**
* tracker_text_validate_utf8:
* @text: the text to validate
- * @text_len: length of @text, or -1 if NULL-terminated
+ * @text_len: length of @text, or -1 if NUL-terminated
* @str: the string where to place the validated UTF-8 characters, or %NULL if
* not needed.
* @valid_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index be80110..d512f08 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -321,8 +321,8 @@ tracker_extract_SOURCES = \
tracker-dbus.h \
tracker-extract.c \
tracker-extract.h \
- tracker-iochannel.c \
- tracker-iochannel.h \
+ tracker-read.c \
+ tracker-read.h \
tracker-main.c \
tracker-main.h \
tracker-albumart-generic.h
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 725b125..76985fa 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -24,7 +24,7 @@
#include "tracker-main.h"
#include "tracker-gsf.h"
-#include "tracker-iochannel.h"
+#include "tracker-read.h"
#include <unistd.h>
@@ -74,11 +74,11 @@ static gchar *
extract_oasis_content (const gchar *uri,
gsize n_bytes)
{
+ GError *error = NULL;
const gchar *argv[4];
gchar *text = NULL;
gchar *path;
- GIOChannel *channel;
- GPid pid;
+ gint fd;
/* Newly allocated string with the file path */
path = g_filename_from_uri (uri, NULL, NULL);
@@ -94,26 +94,27 @@ extract_oasis_content (const gchar *uri,
argv[0], argv[1], argv[2], n_bytes);
/* Fork & spawn */
- if (tracker_spawn_async_with_channels (argv,
- 10,
- &pid,
- NULL,
- &channel,
- NULL)) {
- /* Read up to n_bytes from stream */
- text = tracker_iochannel_read_text (channel,
- n_bytes,
- FALSE,
- TRUE);
-
- /* Close spawned PID */
- g_spawn_close_pid (pid);
+ if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
+ (gchar **)argv,
+ NULL,
+ G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
+ tracker_spawn_child_func,
+ GINT_TO_POINTER (10),
+ NULL,
+ NULL,
+ &fd,
+ NULL,
+ &error)) {
+ g_warning ("Spawning failed, could not extract text from '%s': %s",
+ path, error ? error->message : NULL);
+ g_clear_error (&error);
+ } else {
+ /* Read up to n_bytes from FD (also closes FD) */
+ text = tracker_read_text_from_fd (fd, n_bytes, FALSE);
}
g_free (path);
- /* Note: Channel already closed and unrefed */
-
return text;
}
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index f9303e3..d28656a 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -27,7 +27,7 @@
#include <libtracker-extract/tracker-extract.h>
#include "tracker-main.h"
-#include "tracker-iochannel.h"
+#include "tracker-read.h"
#define TRY_LOCALE_TO_UTF8_CONVERSION 0
@@ -45,46 +45,34 @@ static gchar *
get_file_content (const gchar *uri,
gsize n_bytes)
{
- GIOChannel *channel;
+ GFile *file;
+ GFileInputStream *stream;
GError *error = NULL;
gchar *text;
- gchar *filename;
/* Get filename from URI */
- filename = g_filename_from_uri (uri, NULL, &error);
- if (error) {
- g_message ("Could not get filename from URI '%s': %s",
- uri,
- error->message);
- g_error_free (error);
-
- return NULL;
- }
-
- /* New channel from the given file */
- channel = g_io_channel_new_file (filename, "r", &error);
+ file = g_file_new_for_uri (uri);
+ stream = g_file_read (file, NULL, &error);
if (error) {
g_message ("Could not read file '%s': %s",
uri,
error->message);
g_error_free (error);
- g_free (filename);
+ g_object_unref (file);
return NULL;
}
- g_free (filename);
-
g_debug (" Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
uri, n_bytes);
/* Read up to n_bytes from stream */
- text = tracker_iochannel_read_text (channel,
- n_bytes,
- TRY_LOCALE_TO_UTF8_CONVERSION,
- TRUE);
+ text = tracker_read_text_from_stream (G_INPUT_STREAM (stream),
+ n_bytes,
+ TRY_LOCALE_TO_UTF8_CONVERSION);
- /* Note: Channel already closed and unrefed */
+ g_object_unref (stream);
+ g_object_unref (file);
return text;
}
diff --git a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
new file mode 100644
index 0000000..deee6e9
--- /dev/null
+++ b/src/tracker-extract/tracker-read.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <string.h>
+#include <unistd.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-read.h"
+
+/* Size in bytes of the local buffer used for each read from a stream or FD */
+#define BUFFER_SIZE 65535
+
+/* Maximum number of read retries, to avoid infinite loops.
+ * NOTE(review): not referenced by the readers in this file; confirm it is still needed */
+#define MAX_RETRIES 5
+
+/* Converts @s in place from locale encoding to UTF-8; @s is left untouched on failure. Returns @s. */
+static GString *
+get_string_in_locale (GString *s)
+{
+ GError *error = NULL;
+ gchar *str;
+ gsize bytes_read;
+ gsize bytes_written;
+
+ str = g_locale_to_utf8 (s->str,
+ s->len,
+ &bytes_read,
+ &bytes_written,
+ &error);
+ if (error) {
+ g_debug (" Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
+ bytes_read,
+ bytes_written);
+ g_message ("Could not convert string from locale to UTF-8, %s",
+ error->message);
+ g_error_free (error);
+ g_free (str); /* conversion failed: s keeps its original bytes */
+ } else {
+ g_string_assign (s, str); /* s takes a copy, so str is freed below */
+ g_free (str);
+ }
+
+ return s;
+}
+
+
+/* Appends a read chunk to *s and updates *remaining_size; returns %TRUE to keep reading, %FALSE to stop */
+static gboolean
+process_chunk (const gchar *read_bytes,
+ gsize read_size,
+ gsize buffer_size,
+ gsize *remaining_size,
+ GString **s)
+{
+ /* Nothing was read this time (e.g. end of input), halt loop */
+ if (read_size == 0) {
+ return FALSE;
+ }
+
+ /* While nothing has been accumulated yet (*s is NULL),
+ * decide whether the content is worth indexing at all:
+ *  - a chunk that fills the whole buffer (read_size ==
+ *    buffer_size) but contains no '\n' halts the read;
+ *  - a chunk of 2 bytes or fewer halts the read.
+ * Requiring a full buffer for the newline check keeps
+ * short single-line inputs (e.g. 10 bytes with no '\n')
+ * from being rejected. In both cases nothing is appended
+ * and *s stays NULL.
+ */
+ if (*s == NULL) {
+ if (read_size == buffer_size &&
+ g_strstr_len (read_bytes, read_size, "\n") == NULL) {
+ g_debug (" No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
+ "not indexing file",
+ read_size);
+ return FALSE;
+ } else if (read_size <= 2) {
+ g_debug (" File has less than 3 characters in it, "
+ "not indexing file");
+ return FALSE;
+ }
+ }
+
+ /* Update remaining bytes (callers never request more than what remains) */
+ *remaining_size -= read_size;
+
+ g_debug (" Read "
+ "%" G_GSSIZE_FORMAT " bytes this time, "
+ "%" G_GSIZE_FORMAT " bytes remaining",
+ read_size,
+ *remaining_size);
+
+ /* Append the chunk (not NUL-terminated), creating the GString on first use */
+ *s = (*s ?
+ g_string_append_len (*s, read_bytes, read_size) :
+ g_string_new_len (read_bytes, read_size));
+
+ return TRUE;
+}
+/* Validates @s as UTF-8 (optionally converting from locale), frees the GString and returns its text, or NULL if empty */
+static gchar *
+process_whole_string (GString *s,
+ gboolean try_locale_if_not_utf8)
+{
+ gsize n_valid_utf8_bytes = 0;
+
+ /* Get number of valid UTF-8 bytes found (no validated copy requested) */
+ tracker_text_validate_utf8 (s->str,
+ s->len,
+ NULL,
+ &n_valid_utf8_bytes);
+
+ /* A valid UTF-8 file will be that where all read bytes are valid,
+ * with a margin of 3 bytes for the last UTF-8 character which might
+ * have been cut. */
+ if (try_locale_if_not_utf8 &&
+ s->len - n_valid_utf8_bytes > 3) {
+ /* Not UTF-8: try converting from the locale encoding.
+ * NOTE(review): if conversion fails s is left as-is, not truncated to valid UTF-8 */
+ s = get_string_in_locale (s);
+ } else if (n_valid_utf8_bytes < s->len) {
+ g_debug (" Truncating to last valid UTF-8 character "
+ "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
+ n_valid_utf8_bytes,
+ s->len);
+ s = g_string_truncate (s, n_valid_utf8_bytes);
+ }
+
+ if (s->len < 1) {
+ g_string_free (s, TRUE);
+ return NULL;
+ }
+
+ return g_string_free (s, FALSE);
+}
+
+/**
+ * tracker_read_text_from_stream:
+ * @stream: input stream to read from
+ * @max_bytes: max number of bytes to read from @stream
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ * convert from locale-encoding to UTF-8
+ *
+ * Reads up to @max_bytes from @stream, and validates the read text as proper
+ * UTF-8. The stream is neither closed nor unreffed by this function.
+ *
+ * Returns: newly-allocated NUL-terminated UTF-8 string with the read text, or %NULL if none.
+ **/
+gchar *
+tracker_read_text_from_stream (GInputStream *stream,
+ gsize max_bytes,
+ gboolean try_locale_if_not_utf8)
+{
+ GString *s = NULL;
+ gsize n_bytes_remaining = max_bytes;
+
+ g_return_val_if_fail (stream, NULL);
+ g_return_val_if_fail (max_bytes > 0, NULL);
+
+ /* Reading in chunks of BUFFER_SIZE
+ * Loop is halted whenever one of these conditions is met:
+ * a) Read bytes reached the maximum allowed (max_bytes)
+ * b) No more bytes to read
+ * c) Error reading (text read so far is still processed)
+ * d) First chunk has less than 3 bytes
+ * e) First chunk fills BUFFER_SIZE bytes with no '\n'
+ */
+ while (n_bytes_remaining > 0) {
+ gchar buf[BUFFER_SIZE];
+ GError *error = NULL;
+ gsize n_bytes_read;
+
+ /* Read up to one buffer of bytes, capped to what remains */
+ if (!g_input_stream_read_all (stream,
+ buf,
+ MIN (BUFFER_SIZE, n_bytes_remaining),
+ &n_bytes_read,
+ NULL,
+ &error)) {
+ g_message ("Error reading from stream: '%s'",
+ error->message);
+ g_error_free (error);
+ break;
+ }
+
+ /* Process read bytes, and halt loop if needed */
+ if (!process_chunk (buf,
+ n_bytes_read,
+ BUFFER_SIZE,
+ &n_bytes_remaining,
+ &s)) {
+ break;
+ }
+ }
+
+ /* Validate UTF-8 if something was accumulated, and return it (else NULL) */
+ return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+}
+
+
+/**
+ * tracker_read_text_from_fd:
+ * @fd: input fd to read from
+ * @max_bytes: max number of bytes to read from @fd
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ * convert from locale-encoding to UTF-8
+ *
+ * Reads up to @max_bytes from @fd, and validates the read text as proper
+ * UTF-8. Closes @fd when done (but not if the @max_bytes check fails).
+ *
+ * Returns: newly-allocated NUL-terminated UTF-8 string with the read text, or %NULL if none.
+ **/
+gchar *
+tracker_read_text_from_fd (gint fd,
+ gsize max_bytes,
+ gboolean try_locale_if_not_utf8)
+{
+ FILE *fz;
+ GString *s = NULL;
+ gsize n_bytes_remaining = max_bytes;
+
+ g_return_val_if_fail (max_bytes > 0, NULL);
+
+ if ((fz = fdopen (fd, "r")) == NULL) {
+ g_warning ("Cannot read from FD... could not extract text");
+ close (fd);
+ return NULL;
+ }
+
+ /* Reading in chunks of BUFFER_SIZE
+ * Loop is halted whenever one of these conditions is met:
+ * a) Read bytes reached the maximum allowed (max_bytes)
+ * b) No more bytes to read
+ * c) Error reading
+ * d) First chunk has less than 3 bytes
+ * e) First chunk fills BUFFER_SIZE bytes with no '\n'
+ */
+ while (n_bytes_remaining > 0) {
+ gchar buf[BUFFER_SIZE];
+ gsize n_bytes_read;
+
+ /* Read up to one buffer of bytes, capped to what remains */
+ n_bytes_read = fread (buf,
+ 1,
+ MIN (BUFFER_SIZE, n_bytes_remaining),
+ fz);
+
+ /* Process read bytes, and halt loop if needed */
+ if (!process_chunk (buf,
+ n_bytes_read,
+ BUFFER_SIZE,
+ &n_bytes_remaining,
+ &s)) {
+ break;
+ }
+ }
+
+ /* Close the stdio stream; this also closes the underlying FD */
+ fclose (fz);
+
+ /* Validate UTF-8 if something was accumulated, and return it (else NULL) */
+ return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+}
diff --git a/src/tracker-extract/tracker-iochannel.h b/src/tracker-extract/tracker-read.h
similarity index 64%
rename from src/tracker-extract/tracker-iochannel.h
rename to src/tracker-extract/tracker-read.h
index 3985e22..a1617a3 100644
--- a/src/tracker-extract/tracker-iochannel.h
+++ b/src/tracker-extract/tracker-read.h
@@ -17,20 +17,22 @@
* Boston, MA 02110-1301, USA.
*/
-#ifndef __TRACKER_IOCHANNEL_H__
-#define __TRACKER_IOCHANNEL_H__
+#ifndef __TRACKER_READ_H__
+#define __TRACKER_READ_H__
#include <glib.h>
-#include <gio/gio.h>
G_BEGIN_DECLS
-gchar *tracker_iochannel_read_text (GIOChannel *channel,
- gsize max_bytes,
- gboolean try_locale_if_not_utf8,
- gboolean close_channel);
+gchar *tracker_read_text_from_stream (GInputStream *stream,
+ gsize max_bytes,
+ gboolean try_locale_if_not_utf8);
+
+gchar *tracker_read_text_from_fd (gint fd,
+ gsize max_bytes,
+ gboolean try_locale_if_not_utf8);
G_END_DECLS
-#endif /* __TRACKER_IOCHANNEL_H__ */
+#endif /* __TRACKER_READ_H__ */
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]