[tracker/extractor-remove-word-counting-review] Avoid the use of GIOChannels



commit 08539b4500f21233612aa8a44940efb7b75d4160
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 18 13:57:13 2010 +0200

    Avoid the use of GIOChannels

 src/libtracker-extract/tracker-utils.c             |    2 +-
 src/tracker-extract/Makefile.am                    |    4 +-
 src/tracker-extract/tracker-extract-oasis.c        |   39 ++--
 src/tracker-extract/tracker-extract-text.c         |   34 +--
 src/tracker-extract/tracker-iochannel.c            |  223 ---------------
 src/tracker-extract/tracker-read.c                 |  284 ++++++++++++++++++++
 .../{tracker-iochannel.h => tracker-read.h}        |   18 +-
 7 files changed, 328 insertions(+), 276 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
index 21a2f42..7f291b3 100644
--- a/src/libtracker-extract/tracker-utils.c
+++ b/src/libtracker-extract/tracker-utils.c
@@ -360,7 +360,7 @@ tracker_text_normalize (const gchar *text,
 /**
  * tracker_text_validate_utf8:
  * @text: the text to validate
- * @text_len: length of @text, or -1 if NULL-terminated
+ * @text_len: length of @text, or -1 if NUL-terminated
  * @str: the string where to place the validated UTF-8 characters, or %NULL if
  *  not needed.
  * @valid_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index be80110..d512f08 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -321,8 +321,8 @@ tracker_extract_SOURCES = 						\
 	tracker-dbus.h							\
 	tracker-extract.c						\
 	tracker-extract.h						\
-	tracker-iochannel.c						\
-	tracker-iochannel.h						\
+	tracker-read.c							\
+	tracker-read.h							\
 	tracker-main.c							\
 	tracker-main.h							\
 	tracker-albumart-generic.h
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index 725b125..76985fa 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -24,7 +24,7 @@
 
 #include "tracker-main.h"
 #include "tracker-gsf.h"
-#include "tracker-iochannel.h"
+#include "tracker-read.h"
 
 #include <unistd.h>
 
@@ -74,11 +74,11 @@ static gchar *
 extract_oasis_content (const gchar *uri,
                        gsize        n_bytes)
 {
+	GError *error = NULL;
 	const gchar *argv[4];
 	gchar *text = NULL;
 	gchar *path;
-	GIOChannel *channel;
-	GPid pid;
+	gint fd;
 
 	/* Newly allocated string with the file path */
 	path = g_filename_from_uri (uri, NULL, NULL);
@@ -94,26 +94,27 @@ extract_oasis_content (const gchar *uri,
 	         argv[0], argv[1], argv[2], n_bytes);
 
 	/* Fork & spawn */
-	if (tracker_spawn_async_with_channels (argv,
-	                                       10,
-	                                       &pid,
-	                                       NULL,
-	                                       &channel,
-	                                       NULL)) {
-		/* Read up to n_bytes from stream */
-		text = tracker_iochannel_read_text (channel,
-		                                    n_bytes,
-		                                    FALSE,
-		                                    TRUE);
-
-		/* Close spawned PID */
-		g_spawn_close_pid (pid);
+	if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
+	                               (gchar **)argv,
+	                               NULL,
+	                               G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
+	                               tracker_spawn_child_func,
+	                               GINT_TO_POINTER (10),
+	                               NULL,
+	                               NULL,
+	                               &fd,
+	                               NULL,
+	                               &error)) {
+		g_warning ("Spawning failed, could not extract text from '%s': %s",
+		           path, error ? error->message : NULL);
+		g_clear_error (&error);
+	} else {
+		/* Read up to n_bytes from FD (also closes FD) */
+		text = tracker_read_text_from_fd (fd, n_bytes, FALSE);
 	}
 
 	g_free (path);
 
-	/* Note: Channel already closed and unrefed */
-
 	return text;
 }
 
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index f9303e3..d28656a 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -27,7 +27,7 @@
 #include <libtracker-extract/tracker-extract.h>
 
 #include "tracker-main.h"
-#include "tracker-iochannel.h"
+#include "tracker-read.h"
 
 #define  TRY_LOCALE_TO_UTF8_CONVERSION 0
 
@@ -45,46 +45,34 @@ static gchar *
 get_file_content (const gchar *uri,
                   gsize        n_bytes)
 {
-	GIOChannel *channel;
+	GFile *file;
+	GFileInputStream  *stream;
 	GError     *error = NULL;
 	gchar      *text;
-	gchar      *filename;
 
 	/* Get filename from URI */
-	filename = g_filename_from_uri (uri, NULL, &error);
-	if (error) {
-		g_message ("Could not get filename from URI '%s': %s",
-		           uri,
-		           error->message);
-		g_error_free (error);
-
-		return NULL;
-	}
-
-	/* New channel from the given file */
-	channel = g_io_channel_new_file (filename, "r", &error);
+	file = g_file_new_for_uri (uri);
+	stream = g_file_read (file, NULL, &error);
 	if (error) {
 		g_message ("Could not read file '%s': %s",
 		           uri,
 		           error->message);
 		g_error_free (error);
-		g_free (filename);
+		g_object_unref (file);
 
 		return NULL;
 	}
 
-	g_free (filename);
-
 	g_debug ("  Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
 	         uri, n_bytes);
 
 	/* Read up to n_bytes from stream */
-	text = tracker_iochannel_read_text (channel,
-	                                    n_bytes,
-	                                    TRY_LOCALE_TO_UTF8_CONVERSION,
-	                                    TRUE);
+	text = tracker_read_text_from_stream (G_INPUT_STREAM (stream),
+	                                      n_bytes,
+	                                      TRY_LOCALE_TO_UTF8_CONVERSION);
 
-	/* Note: Channel already closed and unrefed */
+	g_object_unref (stream);
+	g_object_unref (file);
 
 	return text;
 }
diff --git a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
new file mode 100644
index 0000000..deee6e9
--- /dev/null
+++ b/src/tracker-extract/tracker-read.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan frade nokia com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include <string.h>
+#include <unistd.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-read.h"
+
+/* Size of the buffer to use when reading from the stream or FD, in bytes */
+#define BUFFER_SIZE 65535
+
+/* Maximum number of read retries, to avoid infinite loops.
+ *  NOTE(review): not referenced by the readers in this file; confirm it is still needed */
+#define MAX_RETRIES 5
+
+/* Converts @s in place from the locale encoding to UTF-8; returns @s (unchanged on failure) */
+static GString *
+get_string_in_locale (GString *s)
+{
+	GError *error = NULL;
+	gchar *str;
+	gsize bytes_read;
+	gsize bytes_written;
+
+	str = g_locale_to_utf8 (s->str,
+	                        s->len,
+	                        &bytes_read,
+	                        &bytes_written,
+	                        &error);
+	if (error) {
+		g_debug ("  Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
+		         bytes_read,
+		         bytes_written);
+		g_message ("Could not convert string from locale to UTF-8, %s",
+		           error->message);
+		g_error_free (error);
+		g_free (str);
+	} else {
+		g_string_assign (s, str); /* copies str into s, so str can be freed */
+		g_free (str);
+	}
+
+	return s;
+}
+
+
+/* Appends one read chunk to *s; returns %TRUE if reading should continue, %FALSE otherwise */
+static gboolean
+process_chunk (const gchar  *read_bytes,
+               gsize         read_size,
+               gsize         buffer_size,
+               gsize        *remaining_size,
+               GString     **s)
+{
+	/* If no more bytes to read, halt loop */
+	if (read_size == 0) {
+		return FALSE;
+	}
+
+	/* First of all, check if this is the first chunk read
+	 * from the stream (*s is still NULL until a chunk has
+	 * been accepted below). If that first chunk filled the
+	 * whole buffer and still has no '\n' in it, the file is
+	 * not considered worth indexing. The newline check is
+	 * skipped for a partially filled buffer, otherwise a
+	 * short file of, say, 10 bytes in a single line with no
+	 * '\n' would be rejected as well. Similarly, if the
+	 * first chunk has less than 3 bytes then we drop it.
+	 */
+	if (*s == NULL) {
+		if (read_size == buffer_size &&
+		    g_strstr_len (read_bytes, read_size, "\n") == NULL) {
+			g_debug ("  No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
+			         "not indexing file",
+			         read_size);
+			return FALSE;
+		} else if (read_size <= 2) {
+			g_debug ("  File has less than 3 characters in it, "
+			         "not indexing file");
+			return FALSE;
+		}
+	}
+
+	/* Update remaining bytes (callers request at most *remaining_size) */
+	*remaining_size -= read_size;
+
+	g_debug ("  Read "
+	         "%" G_GSSIZE_FORMAT " bytes this time, "
+	         "%" G_GSIZE_FORMAT " bytes remaining",
+	         read_size,
+	         *remaining_size);
+
+	/* Append the read bytes (not NUL-terminated, hence the explicit length) */
+	*s = (*s ?
+	      g_string_append_len (*s, read_bytes, read_size) :
+	      g_string_new_len (read_bytes, read_size));
+
+	return TRUE;
+}
+
+static gchar *
+process_whole_string (GString  *s,
+                      gboolean  try_locale_if_not_utf8)
+{
+	gsize n_valid_utf8_bytes = 0;
+
+	/* Takes ownership of @s. Get number of valid UTF-8 bytes found */
+	tracker_text_validate_utf8 (s->str,
+	                            s->len,
+	                            NULL,
+	                            &n_valid_utf8_bytes);
+
+	/* A valid UTF-8 file will be that where all read bytes are valid,
+	 *  with a margin of 3 bytes for the last UTF-8 character which might
+	 *  have been cut. */
+	if (try_locale_if_not_utf8 &&
+	    s->len - n_valid_utf8_bytes > 3) {
+		/* If not UTF-8, try to get contents in locale encoding
+		 *  (returns valid UTF-8) */
+		s = get_string_in_locale (s);
+	} else if (n_valid_utf8_bytes < s->len) {
+		g_debug ("  Truncating to last valid UTF-8 character "
+		         "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
+		         n_valid_utf8_bytes,
+		         s->len);
+		s = g_string_truncate (s, n_valid_utf8_bytes);
+	}
+
+	if (s->len < 1) { /* nothing usable left: free everything */
+		g_string_free (s, TRUE);
+		return NULL;
+	}
+
+	return g_string_free (s, FALSE); /* caller now owns the character data */
+}
+
+/**
+ * tracker_read_text_from_stream:
+ * @stream: input stream to read from
+ * @max_bytes: max number of bytes to read from @stream
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ *   convert from locale-encoding to UTF-8
+ *
+ * Reads up to @max_bytes from @stream, and validates the read text as proper
+ *  UTF-8. The caller keeps ownership of @stream; it is not closed here.
+ *
+ * Returns: newly-allocated NUL-terminated UTF-8 string, or %NULL if no text.
+ **/
+gchar *
+tracker_read_text_from_stream (GInputStream *stream,
+                               gsize       max_bytes,
+                               gboolean    try_locale_if_not_utf8)
+{
+	GString *s = NULL;
+	gsize n_bytes_remaining = max_bytes;
+
+	g_return_val_if_fail (stream, NULL);
+	g_return_val_if_fail (max_bytes > 0, NULL);
+
+	/* Reading in chunks of BUFFER_SIZE
+	 *   Loop is halted whenever one of these conditions is met:
+	 *     a) Read bytes reached the maximum allowed (max_bytes)
+	 *     b) No more bytes to read
+	 *     c) Error reading
+	 *     d) Stream has less than 3 bytes
+	 *     e) Stream has a single line of BUFFER_SIZE bytes with no EOL
+	 */
+	while (n_bytes_remaining > 0) {
+		gchar buf[BUFFER_SIZE];
+		GError *error = NULL;
+		gsize n_bytes_read;
+
+		/* Read bytes from stream (bytes read before an error are discarded) */
+		if (!g_input_stream_read_all (stream,
+		                              buf,
+		                              MIN (BUFFER_SIZE, n_bytes_remaining),
+		                              &n_bytes_read,
+		                              NULL,
+		                              &error)) {
+			g_message ("Error reading from stream: '%s'",
+			           error->message);
+			g_error_free (error);
+			break;
+		}
+
+		/* Process read bytes, and halt loop if needed */
+		if (!process_chunk (buf,
+		                    n_bytes_read,
+		                    BUFFER_SIZE,
+		                    &n_bytes_remaining,
+		                    &s)) {
+			break;
+		}
+	}
+
+	/* Validate UTF-8 if something was read, and return it */
+	return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+}
+
+
+/**
+ * tracker_read_text_from_fd:
+ * @fd: input fd to read from
+ * @max_bytes: max number of bytes to read from @fd
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ *   convert from locale-encoding to UTF-8
+ *
+ * Reads up to @max_bytes from @fd, and validates the read text as proper
+ *  UTF-8. Will also properly close the FD when it finishes.
+ *
+ * Returns: newly-allocated NUL-terminated UTF-8 string, or %NULL if no text.
+ **/
+gchar *
+tracker_read_text_from_fd (gint     fd,
+                           gsize    max_bytes,
+                           gboolean try_locale_if_not_utf8)
+{
+	FILE *fz;
+	GString *s = NULL;
+	gsize n_bytes_remaining = max_bytes;
+	/* NOTE(review): @fd is left open if the precondition below fails */
+	g_return_val_if_fail (max_bytes > 0, NULL);
+
+	if ((fz = fdopen (fd, "r")) == NULL) {
+		g_warning ("Cannot read from FD... could not extract text");
+		close (fd);
+		return NULL;
+	}
+
+	/* Reading in chunks of BUFFER_SIZE
+	 *   Loop is halted whenever one of these conditions is met:
+	 *     a) Read bytes reached the maximum allowed (max_bytes)
+	 *     b) No more bytes to read
+	 *     c) Error reading
+	 *     d) Stream has less than 3 bytes
+	 *     e) Stream has a single line of BUFFER_SIZE bytes with no EOL
+	 */
+	while (n_bytes_remaining > 0) {
+		gchar buf[BUFFER_SIZE];
+		gsize n_bytes_read;
+
+		/* Read bytes; a zero-byte read (EOF or error) halts the loop below */
+		n_bytes_read = fread (buf,
+		                      1,
+		                      MIN (BUFFER_SIZE, n_bytes_remaining),
+		                      fz);
+
+		/* Process read bytes, and halt loop if needed */
+		if (!process_chunk (buf,
+		                    n_bytes_read,
+		                    BUFFER_SIZE,
+		                    &n_bytes_remaining,
+		                    &s)) {
+			break;
+		}
+	}
+
+	/* Close the stream here, which also closes the underlying FD */
+	fclose (fz);
+
+	/* Validate UTF-8 if something was read, and return it */
+	return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+}
diff --git a/src/tracker-extract/tracker-iochannel.h b/src/tracker-extract/tracker-read.h
similarity index 64%
rename from src/tracker-extract/tracker-iochannel.h
rename to src/tracker-extract/tracker-read.h
index 3985e22..a1617a3 100644
--- a/src/tracker-extract/tracker-iochannel.h
+++ b/src/tracker-extract/tracker-read.h
@@ -17,20 +17,22 @@
  * Boston, MA  02110-1301, USA.
  */
 
-#ifndef __TRACKER_IOCHANNEL_H__
-#define __TRACKER_IOCHANNEL_H__
+#ifndef __TRACKER_READ_H__
+#define __TRACKER_READ_H__
 
 #include <glib.h>
-#include <gio/gio.h>
 
 G_BEGIN_DECLS
 
-gchar *tracker_iochannel_read_text (GIOChannel *channel,
-                                    gsize       max_bytes,
-                                    gboolean    try_locale_if_not_utf8,
-                                    gboolean    close_channel);
+gchar *tracker_read_text_from_stream (GInputStream *stream,
+                                      gsize         max_bytes,
+                                      gboolean      try_locale_if_not_utf8);
+
+gchar *tracker_read_text_from_fd (gint     fd,
+                                  gsize    max_bytes,
+                                  gboolean try_locale_if_not_utf8);
 
 G_END_DECLS
 
-#endif /* __TRACKER_IOCHANNEL_H__ */
+#endif /* __TRACKER_READ_H__ */
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]