[tracker/extractor-remove-word-counting-review: 11/14] Reuse the same code for text and oasis/contents



commit 82316f6ffa2c64d1784cfd192e033635480e47f6
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Tue May 11 14:21:50 2010 +0200

    Reuse the same code for text and oasis/contents

 src/tracker-extract/Makefile.am             |    1 +
 src/tracker-extract/tracker-extract-oasis.c |   88 ++++------------
 src/tracker-extract/tracker-extract-text.c  |  110 +++++---------------
 src/tracker-extract/tracker-istream.c       |  152 +++++++++++++++++++++++----
 src/tracker-extract/tracker-istream.h       |    6 +-
 5 files changed, 185 insertions(+), 172 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 0d3390c..038e2d7 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -328,6 +328,7 @@ tracker_extract_SOURCES = 						\
 	tracker-albumart-generic.h
 
 tracker_extract_LDADD = 						\
+	$(top_builddir)/src/libtracker-extract/libtracker-extract-@TRACKER_API_VERSION@.la \
 	$(top_builddir)/src/libtracker-client/libtracker-client-@TRACKER_API_VERSION@.la \
 	$(top_builddir)/src/libtracker-miner/libtracker-miner-@TRACKER_API_VERSION@.la \
 	$(top_builddir)/src/libtracker-common/libtracker-common.la	\
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index da21440..c8ead97 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -24,6 +24,7 @@
 
 #include "tracker-main.h"
 #include "tracker-gsf.h"
+#include "tracker-istream.h"
 
 #include <unistd.h>
 
@@ -69,19 +70,15 @@ static TrackerExtractData extract_data[] = {
 	{ NULL, NULL }
 };
 
-
-#define ODT_BUFFER_SIZE            8193  /* bytes */
-
 static gchar *
 extract_oasis_content (const gchar *uri,
                        gsize        n_bytes)
 {
-	const gchar *argv[4];
-	gint fdz;
-	FILE *fz;
-	GError *error = NULL;
-	gchar *text = NULL;
-	gchar *path;
+	const gchar  *argv[4];
+	gchar        *text = NULL;
+	gchar        *path;
+	GIOChannel   *channel;
+	GPid         pid;
 
 	/* Newly allocated string with the file path */
 	path = g_filename_from_uri (uri, NULL, NULL);
@@ -97,67 +94,26 @@ extract_oasis_content (const gchar *uri,
 	         argv[0], argv[1], argv[2], n_bytes);
 
 	/* Fork & spawn */
-	if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
-	                               (gchar **)argv,
-	                               NULL,
-	                               G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
-	                               tracker_spawn_child_func,
-	                               GINT_TO_POINTER (10),
-	                               NULL,
-	                               NULL,
-	                               &fdz,
-	                               NULL,
-	                               &error)) {
-		g_warning ("Spawning failed, could not extract text from '%s': %s",
-		           path, error ? error->message : NULL);
-		g_clear_error (&error);
-	}
-	/* Open file descriptor for reading */
-	else if ((fz = fdopen (fdz, "r")) == NULL) {
-		g_warning ("Cannot read child's output... could not extract "
-		           "text from '%s'", path);
-		close (fdz);
-	}
-	/* Start buffered reading... */
-	else {
-		unsigned char buf[ODT_BUFFER_SIZE];
-		size_t r, bytes_remaining;
-		GString *validated = NULL;
-
-		bytes_remaining = n_bytes;
-
-		/* Reading in chunks of ODT_BUFFER_SIZE -1 (8192)
-		 *   Loop is halted whenever one of this conditions is met:
-		 *     a) Read bytes reached the maximum allowed (n_bytes)
-		 *     b) No more bytes to read
-		 */
-		while ((bytes_remaining > 0) &&
-		       (r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) {
-			gsize len_to_validate;
-
-			len_to_validate = MIN (bytes_remaining, r);
-
-			tracker_text_validate_utf8 (buf,
-			                            len_to_validate,
-			                            &validated,
-			                            NULL);
-
-			/* Note that in this case we shouldn't add a whitespace
-			 * separator between chunks read */
-
-			/* Update remaining */
-			bytes_remaining -= len_to_validate;
-		}
-
-		/* fclose() the stream, no need to close() the original FD */
-		fclose (fz);
-
-		/* Set final normalized contents to return */
-		text = g_string_free (validated, FALSE);
+	if (tracker_spawn_async_with_channels (argv,
+	                                       10,
+	                                       &pid,
+	                                       NULL,
+	                                       &channel,
+	                                       NULL)) {
+		/* Read up to n_bytes from stream */
+		text = tracker_iochannel_read_text (channel,
+		                                    n_bytes,
+		                                    FALSE,
+		                                    TRUE);
+
+		/* Close spawned PID */
+		g_spawn_close_pid (pid);
 	}
 
 	g_free (path);
 
+	/* Note: Channel already closed and unrefed */
+
 	return text;
 }
 
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 09a7340..12c3ec8 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -29,7 +29,7 @@
 #include "tracker-main.h"
 #include "tracker-istream.h"
 
-#undef  TRY_LOCALE_TO_UTF8_CONVERSION
+#define  TRY_LOCALE_TO_UTF8_CONVERSION 0
 
 static void extract_text (const gchar          *uri,
                           TrackerSparqlBuilder *preupdate,
@@ -40,109 +40,53 @@ static TrackerExtractData data[] = {
 	{ NULL, NULL }
 };
 
-#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
-
-static GString *
-get_file_in_locale (GString *s)
-{
-	GError *error = NULL;
-	gchar  *str;
-	gsize   bytes_read;
-	gsize   bytes_written;
-
-	str = g_locale_to_utf8 (s->str,
-	                        s->len,
-	                        &bytes_read,
-	                        &bytes_written,
-	                        &error);
-	if (error) {
-		g_debug ("  Conversion to UTF-8 read %d bytes, wrote %d bytes",
-		         bytes_read,
-		         bytes_written);
-		g_message ("Could not convert file from locale to UTF-8, %s",
-		           error->message);
-		g_error_free (error);
-		g_free (str);
-	} else {
-		g_string_assign (s, str);
-		g_free (str);
-	}
-
-	return s;
-}
-
-#endif /* TRY_LOCALE_TO_UTF8_CONVERSION */
 
 static gchar *
 get_file_content (const gchar *uri,
                   gsize        n_bytes)
 {
-	GFile            *file;
-	GFileInputStream *stream;
-	GError           *error = NULL;
-	GString          *s;
-	gsize             n_valid_utf8_bytes = 0;
-
-	file = g_file_new_for_uri (uri);
-	stream = g_file_read (file, NULL, &error);
+	GIOChannel *channel;
+	GError     *error = NULL;
+	gchar      *text;
+	gchar      *filename;
 
+	/* Get filename from URI */
+	filename = g_filename_from_uri (uri, NULL, &error);
 	if (error) {
-		g_message ("Could not read file:'%s', %s",
+		g_message ("Could not get filename from URI '%s': %s",
 		           uri,
 		           error->message);
 		g_error_free (error);
-		g_object_unref (file);
 
 		return NULL;
 	}
 
-	g_debug ("  Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
-	         uri, n_bytes);
-
-	/* Read up to n_bytes from stream */
-	s = tracker_istream_read_text (G_INPUT_STREAM (stream),
-	                               n_bytes);
+	/* New channel from the given file */
+	channel = g_io_channel_new_file (filename, "r", &error);
+	if (error) {
+		g_message ("Could not read file '%s': %s",
+		           uri,
+		           error->message);
+		g_error_free (error);
+		g_free (filename);
 
-	/* If nothing really read, return here */
-	if (!s) {
-		g_object_unref (stream);
-		g_object_unref (file);
 		return NULL;
 	}
 
-	/* Get number of valid UTF-8 bytes found */
-	tracker_text_validate_utf8 (s->str,
-	                            s->len,
-	                            NULL,
-	                            &n_valid_utf8_bytes);
-
-#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
-	/* A valid UTF-8 file will be that where all read bytes are valid,
-	 *  with a margin of 3 bytes for the last UTF-8 character which might
-	 *  have been cut. */
-	if (s->len - n_valid_utf8_bytes > 3) {
-		/* If not UTF-8, try to get contents in locale encoding
-		 *  (returns valid UTF-8) */
-		s = get_file_in_locale (s);
-	} else
-#endif  /* TRY_LOCALE_TO_UTF8_CONVERSION */
-	if (n_valid_utf8_bytes < s->len) {
-		g_debug ("  Truncating to last valid UTF-8 character "
-		         "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
-		         n_valid_utf8_bytes,
-		         s->len);
-		s = g_string_truncate (s, n_valid_utf8_bytes);
-	}
+	g_free (filename);
 
-	g_object_unref (stream);
-	g_object_unref (file);
+	g_debug ("  Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
+	         uri, n_bytes);
 
-	if (s->len < 1) {
-		g_string_free (s, TRUE);
-		return NULL;
-	}
+	/* Read up to n_bytes from stream */
+	text = tracker_iochannel_read_text (channel,
+	                                    n_bytes,
+	                                    TRY_LOCALE_TO_UTF8_CONVERSION,
+	                                    TRUE);
+
+	/* Note: Channel already closed and unrefed */
 
-	return g_string_free (s, FALSE);
+	return text;
 }
 
 static void
diff --git a/src/tracker-extract/tracker-istream.c b/src/tracker-extract/tracker-istream.c
index 2d75373..135a2af 100644
--- a/src/tracker-extract/tracker-istream.c
+++ b/src/tracker-extract/tracker-istream.c
@@ -22,54 +22,121 @@
 #include <glib.h>
 #include <gio/gio.h>
 
+#include <libtracker-extract/tracker-extract.h>
+
 #include "tracker-istream.h"
 
-#define BUFFER_SIZE 65535    /* bytes */
+/* Size of the buffer to use when reading from the GIOChannel, in bytes */
+#define BUFFER_SIZE 65535
+
+/* Maximum number of retries if the GIOChannel is G_IO_STATUS_AGAIN,
+ *  to avoid infinite loops */
+#define MAX_RETRIES 5
+
+
+
+static GString *
+get_string_in_locale (GString *s)
+{
+	GError *error = NULL;
+	gchar  *str;
+	gsize   bytes_read;
+	gsize   bytes_written;
+
+	str = g_locale_to_utf8 (s->str,
+	                        s->len,
+	                        &bytes_read,
+	                        &bytes_written,
+	                        &error);
+	if (error) {
+		g_debug ("  Conversion to UTF-8 read %d bytes, wrote %d bytes",
+		         bytes_read,
+		         bytes_written);
+		g_message ("Could not convert string from locale to UTF-8, %s",
+		           error->message);
+		g_error_free (error);
+		g_free (str);
+	} else {
+		g_string_assign (s, str);
+		g_free (str);
+	}
+
+	return s;
+}
 
-GString *
-tracker_istream_read_text (GInputStream  *stream,
-                           gsize          max_bytes)
+/**
+ * tracker_iochannel_read_text:
+ * @channel: input channel to read from
+ * @max_bytes: max number of bytes to read from @channel
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ *   convert from locale-encoding to UTF-8
+ * @close_channel: if %TRUE, @channel will be shut down and unreferenced
+ *
+ * Reads up to @max_bytes from @channel, and validates the read text as proper
+ *  UTF-8.
+ *
+ * Returns: newly-allocated NIL-terminated UTF-8 string with the read text.
+ **/
+gchar *
+tracker_iochannel_read_text (GIOChannel *channel,
+                             gsize       max_bytes,
+                             gboolean    try_locale_if_not_utf8,
+                             gboolean    close_channel)
 {
 	GString *s = NULL;
-	guchar   buf[BUFFER_SIZE];
 	gsize    n_bytes_remaining;
-	GError  *error = NULL;
+	guint    n_retries = MAX_RETRIES;
 
-	g_return_val_if_fail (stream, NULL);
+	g_return_val_if_fail (channel, NULL);
 	g_return_val_if_fail (max_bytes > 0, NULL);
 
+	/* We don't want to assume that the input data is in UTF-8, as it
+	 *  may be in locale's encoding */
+	g_io_channel_set_encoding (channel, NULL, NULL);
+
 	/* Reading in chunks of BUFFER_SIZE
 	 *   Loop is halted whenever one of this conditions is met:
 	 *     a) Read bytes reached the maximum allowed (max_bytes)
 	 *     b) No more bytes to read
 	 *     c) Error reading
-	 *     d) File has less than 3 bytes
-	 *     e) File has a single line of BUFFER_SIZE bytes with no EOL
+	 *     d) Stream has less than 3 bytes
+	 *     e) Stream has a single line of BUFFER_SIZE bytes with no EOL
+	 *     f) Maximum number of read retries reached
 	 */
 	n_bytes_remaining = max_bytes;
-	while (n_bytes_remaining > 0) {
-		gssize bytes_read;
+	while (n_bytes_remaining > 0 &&
+	       n_retries > 0) {
+		gchar      buf[BUFFER_SIZE];
+		GError    *error = NULL;
+		gssize     bytes_read;
+		GIOStatus  status;
 
-		/* Read n_bytes_remaining or BUFFER_SIZE bytes */
-		bytes_read = g_input_stream_read (stream,
+		/* Try to read from channel */
+		status = g_io_channel_read_chars (channel,
 		                                  buf,
 		                                  MIN (BUFFER_SIZE, n_bytes_remaining),
-		                                  NULL,
+		                                  &bytes_read,
 		                                  &error);
 
 		/* If any error reading, halt the loop */
 		if (error) {
-			g_message ("Error reading from stream: '%s'",
+			g_message ("Error reading from iochannel: '%s'",
 			           error->message);
 			g_error_free (error);
 			break;
 		}
 
 		/* If no more bytes to read, halt loop */
-		if(bytes_read == 0) {
+		if (bytes_read == 0 || status == G_IO_STATUS_EOF) {
 			break;
 		}
 
+		/* If we are requested to retry, then retry */
+		if (status == G_IO_STATUS_AGAIN) {
+			n_retries--;
+			continue;
+		}
+
 		/* First of all, check if this is the first time we
 		 * have tried to read the stream up to the BUFFER_SIZE
 		 * limit. Then make sure that we read the maximum size
@@ -104,11 +171,54 @@ tracker_istream_read_text (GInputStream  *stream,
 		         n_bytes_remaining);
 
 		/* Append non-NIL terminated bytes */
-		s = (s == NULL ?
-		     g_string_new_len (buf, bytes_read) :
-		     g_string_append_len (s, buf, bytes_read));
+		s = (s ?
+		     g_string_append_len (s, buf, bytes_read) :
+		     g_string_new_len (buf, bytes_read));
 	}
 
-	/* Return whatever we got... */
-	return s;
+	/* Validate UTF-8 if something was read */
+	if (s) {
+		gsize n_valid_utf8_bytes = 0;
+
+		/* Get number of valid UTF-8 bytes found */
+		tracker_text_validate_utf8 (s->str,
+		                            s->len,
+		                            NULL,
+		                            &n_valid_utf8_bytes);
+
+		/* A valid UTF-8 file will be that where all read bytes are valid,
+		 *  with a margin of 3 bytes for the last UTF-8 character which might
+		 *  have been cut. */
+		if (try_locale_if_not_utf8 &&
+		    s->len - n_valid_utf8_bytes > 3) {
+			/* If not UTF-8, try to get contents in locale encoding
+			 *  (returns valid UTF-8) */
+			s = get_string_in_locale (s);
+		} else if (n_valid_utf8_bytes < s->len) {
+			g_debug ("  Truncating to last valid UTF-8 character "
+			         "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
+			         n_valid_utf8_bytes,
+			         s->len);
+			s = g_string_truncate (s, n_valid_utf8_bytes);
+		}
+
+		if (s->len < 1) {
+			g_string_free (s, TRUE);
+			s = NULL;
+		}
+	}
+
+	/* Properly close channel if requested to do so */
+	if (close_channel) {
+		GError *error = NULL;
+		g_io_channel_shutdown (channel, TRUE, &error);
+		if (error) {
+			g_message ("Couldn't properly shutdown channel: '%s'",
+			           error->message);
+			g_error_free (error);
+		}
+		g_io_channel_unref (channel);
+	}
+
+	return s ? g_string_free (s, FALSE) : NULL;
 }
diff --git a/src/tracker-extract/tracker-istream.h b/src/tracker-extract/tracker-istream.h
index f155dd2..0bf8fee 100644
--- a/src/tracker-extract/tracker-istream.h
+++ b/src/tracker-extract/tracker-istream.h
@@ -25,8 +25,10 @@
 
 G_BEGIN_DECLS
 
-GString *tracker_istream_read_text (GInputStream  *stream,
-                                    gsize          max_bytes);
+gchar *tracker_iochannel_read_text (GIOChannel *channel,
+                                    gsize       max_bytes,
+                                    gboolean    try_locale_if_not_utf8,
+                                    gboolean    close_channel);
 
 G_END_DECLS
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]