[tracker/extractor-remove-word-counting-review: 11/14] Reuse the same code for text and oasis/contents
- From: Martyn James Russell <mr src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker/extractor-remove-word-counting-review: 11/14] Reuse the same code for text and oasis/contents
- Date: Tue, 18 May 2010 10:36:10 +0000 (UTC)
commit 82316f6ffa2c64d1784cfd192e033635480e47f6
Author: Aleksander Morgado <aleksander lanedo com>
Date: Tue May 11 14:21:50 2010 +0200
Reuse the same code for text and oasis/contents
src/tracker-extract/Makefile.am | 1 +
src/tracker-extract/tracker-extract-oasis.c | 88 ++++------------
src/tracker-extract/tracker-extract-text.c | 110 +++++---------------
src/tracker-extract/tracker-istream.c | 152 +++++++++++++++++++++++----
src/tracker-extract/tracker-istream.h | 6 +-
5 files changed, 185 insertions(+), 172 deletions(-)
---
diff --git a/src/tracker-extract/Makefile.am b/src/tracker-extract/Makefile.am
index 0d3390c..038e2d7 100644
--- a/src/tracker-extract/Makefile.am
+++ b/src/tracker-extract/Makefile.am
@@ -328,6 +328,7 @@ tracker_extract_SOURCES = \
tracker-albumart-generic.h
tracker_extract_LDADD = \
+ $(top_builddir)/src/libtracker-extract/libtracker-extract-@TRACKER_API_VERSION@.la \
$(top_builddir)/src/libtracker-client/libtracker-client-@TRACKER_API_VERSION@.la \
$(top_builddir)/src/libtracker-miner/libtracker-miner-@TRACKER_API_VERSION@.la \
$(top_builddir)/src/libtracker-common/libtracker-common.la \
diff --git a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
index da21440..c8ead97 100644
--- a/src/tracker-extract/tracker-extract-oasis.c
+++ b/src/tracker-extract/tracker-extract-oasis.c
@@ -24,6 +24,7 @@
#include "tracker-main.h"
#include "tracker-gsf.h"
+#include "tracker-istream.h"
#include <unistd.h>
@@ -69,19 +70,15 @@ static TrackerExtractData extract_data[] = {
{ NULL, NULL }
};
-
-#define ODT_BUFFER_SIZE 8193 /* bytes */
-
static gchar *
extract_oasis_content (const gchar *uri,
gsize n_bytes)
{
- const gchar *argv[4];
- gint fdz;
- FILE *fz;
- GError *error = NULL;
- gchar *text = NULL;
- gchar *path;
+ const gchar *argv[4];
+ gchar *text = NULL;
+ gchar *path;
+ GIOChannel *channel;
+ GPid pid;
/* Newly allocated string with the file path */
path = g_filename_from_uri (uri, NULL, NULL);
@@ -97,67 +94,26 @@ extract_oasis_content (const gchar *uri,
argv[0], argv[1], argv[2], n_bytes);
/* Fork & spawn */
- if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
- (gchar **)argv,
- NULL,
- G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
- tracker_spawn_child_func,
- GINT_TO_POINTER (10),
- NULL,
- NULL,
- &fdz,
- NULL,
- &error)) {
- g_warning ("Spawning failed, could not extract text from '%s': %s",
- path, error ? error->message : NULL);
- g_clear_error (&error);
- }
- /* Open file descriptor for reading */
- else if ((fz = fdopen (fdz, "r")) == NULL) {
- g_warning ("Cannot read child's output... could not extract "
- "text from '%s'", path);
- close (fdz);
- }
- /* Start buffered reading... */
- else {
- unsigned char buf[ODT_BUFFER_SIZE];
- size_t r, bytes_remaining;
- GString *validated = NULL;
-
- bytes_remaining = n_bytes;
-
- /* Reading in chunks of ODT_BUFFER_SIZE -1 (8192)
- * Loop is halted whenever one of this conditions is met:
- * a) Read bytes reached the maximum allowed (n_bytes)
- * b) No more bytes to read
- */
- while ((bytes_remaining > 0) &&
- (r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) {
- gsize len_to_validate;
-
- len_to_validate = MIN (bytes_remaining, r);
-
- tracker_text_validate_utf8 (buf,
- len_to_validate,
- &validated,
- NULL);
-
- /* Note that in this case we shouldn't add a whitespace
- * separator between chunks read */
-
- /* Update remaining */
- bytes_remaining -= len_to_validate;
- }
-
- /* fclose() the stream, no need to close() the original FD */
- fclose (fz);
-
- /* Set final normalized contents to return */
- text = g_string_free (validated, FALSE);
+ if (tracker_spawn_async_with_channels (argv,
+ 10,
+ &pid,
+ NULL,
+ &channel,
+ NULL)) {
+ /* Read up to n_bytes from stream */
+ text = tracker_iochannel_read_text (channel,
+ n_bytes,
+ FALSE,
+ TRUE);
+
+ /* Close spawned PID */
+ g_spawn_close_pid (pid);
}
g_free (path);
+ /* Note: Channel already closed and unrefed */
+
return text;
}
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 09a7340..12c3ec8 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -29,7 +29,7 @@
#include "tracker-main.h"
#include "tracker-istream.h"
-#undef TRY_LOCALE_TO_UTF8_CONVERSION
+#define TRY_LOCALE_TO_UTF8_CONVERSION 0
static void extract_text (const gchar *uri,
TrackerSparqlBuilder *preupdate,
@@ -40,109 +40,53 @@ static TrackerExtractData data[] = {
{ NULL, NULL }
};
-#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
-
-static GString *
-get_file_in_locale (GString *s)
-{
- GError *error = NULL;
- gchar *str;
- gsize bytes_read;
- gsize bytes_written;
-
- str = g_locale_to_utf8 (s->str,
- s->len,
- &bytes_read,
- &bytes_written,
- &error);
- if (error) {
- g_debug (" Conversion to UTF-8 read %d bytes, wrote %d bytes",
- bytes_read,
- bytes_written);
- g_message ("Could not convert file from locale to UTF-8, %s",
- error->message);
- g_error_free (error);
- g_free (str);
- } else {
- g_string_assign (s, str);
- g_free (str);
- }
-
- return s;
-}
-
-#endif /* TRY_LOCALE_TO_UTF8_CONVERSION */
static gchar *
get_file_content (const gchar *uri,
gsize n_bytes)
{
- GFile *file;
- GFileInputStream *stream;
- GError *error = NULL;
- GString *s;
- gsize n_valid_utf8_bytes = 0;
-
- file = g_file_new_for_uri (uri);
- stream = g_file_read (file, NULL, &error);
+ GIOChannel *channel;
+ GError *error = NULL;
+ gchar *text;
+ gchar *filename;
+ /* Get filename from URI */
+ filename = g_filename_from_uri (uri, NULL, &error);
if (error) {
- g_message ("Could not read file:'%s', %s",
+ g_message ("Could not get filename from URI '%s': %s",
uri,
error->message);
g_error_free (error);
- g_object_unref (file);
return NULL;
}
- g_debug (" Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
- uri, n_bytes);
-
- /* Read up to n_bytes from stream */
- s = tracker_istream_read_text (G_INPUT_STREAM (stream),
- n_bytes);
+ /* New channel from the given file */
+ channel = g_io_channel_new_file (filename, "r", &error);
+ if (error) {
+ g_message ("Could not read file '%s': %s",
+ uri,
+ error->message);
+ g_error_free (error);
+ g_free (filename);
- /* If nothing really read, return here */
- if (!s) {
- g_object_unref (stream);
- g_object_unref (file);
return NULL;
}
- /* Get number of valid UTF-8 bytes found */
- tracker_text_validate_utf8 (s->str,
- s->len,
- NULL,
- &n_valid_utf8_bytes);
-
-#ifdef TRY_LOCALE_TO_UTF8_CONVERSION
- /* A valid UTF-8 file will be that where all read bytes are valid,
- * with a margin of 3 bytes for the last UTF-8 character which might
- * have been cut. */
- if (s->len - n_valid_utf8_bytes > 3) {
- /* If not UTF-8, try to get contents in locale encoding
- * (returns valid UTF-8) */
- s = get_file_in_locale (s);
- } else
-#endif /* TRY_LOCALE_TO_UTF8_CONVERSION */
- if (n_valid_utf8_bytes < s->len) {
- g_debug (" Truncating to last valid UTF-8 character "
- "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
- n_valid_utf8_bytes,
- s->len);
- s = g_string_truncate (s, n_valid_utf8_bytes);
- }
+ g_free (filename);
- g_object_unref (stream);
- g_object_unref (file);
+ g_debug (" Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
+ uri, n_bytes);
- if (s->len < 1) {
- g_string_free (s, TRUE);
- return NULL;
- }
+ /* Read up to n_bytes from stream */
+ text = tracker_iochannel_read_text (channel,
+ n_bytes,
+ TRY_LOCALE_TO_UTF8_CONVERSION,
+ TRUE);
+
+ /* Note: Channel already closed and unrefed */
- return g_string_free (s, FALSE);
+ return text;
}
static void
diff --git a/src/tracker-extract/tracker-istream.c b/src/tracker-extract/tracker-istream.c
index 2d75373..135a2af 100644
--- a/src/tracker-extract/tracker-istream.c
+++ b/src/tracker-extract/tracker-istream.c
@@ -22,54 +22,121 @@
#include <glib.h>
#include <gio/gio.h>
+#include <libtracker-extract/tracker-extract.h>
+
#include "tracker-istream.h"
-#define BUFFER_SIZE 65535 /* bytes */
+/* Size of the buffer to use when reading from the GIOChannel, in bytes */
+#define BUFFER_SIZE 65535
+
+/* Maximum number of retries if the GIOChannel is G_IO_STATUS_AGAIN,
+ * to avoid infinite loops */
+#define MAX_RETRIES 5
+
+
+
+static GString *
+get_string_in_locale (GString *s)
+{
+ GError *error = NULL;
+ gchar *str;
+ gsize bytes_read;
+ gsize bytes_written;
+
+ str = g_locale_to_utf8 (s->str,
+ s->len,
+ &bytes_read,
+ &bytes_written,
+ &error);
+ if (error) {
+ g_debug (" Conversion to UTF-8 read %d bytes, wrote %d bytes",
+ bytes_read,
+ bytes_written);
+ g_message ("Could not convert string from locale to UTF-8, %s",
+ error->message);
+ g_error_free (error);
+ g_free (str);
+ } else {
+ g_string_assign (s, str);
+ g_free (str);
+ }
+
+ return s;
+}
-GString *
-tracker_istream_read_text (GInputStream *stream,
- gsize max_bytes)
+/**
+ * tracker_iochannel_read_text:
+ * @channel: input channel to read from
+ * @max_bytes: max number of bytes to read from @channel
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ * convert from locale-encoding to UTF-8
+ * @close_channel: if %TRUE, @channel will be shut down and unreffed
+ *
+ * Reads up to @max_bytes from @channel, and validates the read text as proper
+ * UTF-8.
+ *
+ * Returns: newly-allocated NIL-terminated UTF-8 string with the read text, or %NULL if no text was read.
+ **/
+gchar *
+tracker_iochannel_read_text (GIOChannel *channel,
+ gsize max_bytes,
+ gboolean try_locale_if_not_utf8,
+ gboolean close_channel)
{
GString *s = NULL;
- guchar buf[BUFFER_SIZE];
gsize n_bytes_remaining;
- GError *error = NULL;
+ guint n_retries = MAX_RETRIES;
- g_return_val_if_fail (stream, NULL);
+ g_return_val_if_fail (channel, NULL);
g_return_val_if_fail (max_bytes > 0, NULL);
+ /* We don't want to assume that the input data is in UTF-8, as it
+ * may be in locale's encoding */
+ g_io_channel_set_encoding (channel, NULL, NULL);
+
/* Reading in chunks of BUFFER_SIZE
* Loop is halted whenever one of this conditions is met:
* a) Read bytes reached the maximum allowed (max_bytes)
* b) No more bytes to read
* c) Error reading
- * d) File has less than 3 bytes
- * e) File has a single line of BUFFER_SIZE bytes with no EOL
+ * d) Stream has less than 3 bytes
+ * e) Stream has a single line of BUFFER_SIZE bytes with no EOL
+ * f) Maximum number of read retries reached
*/
n_bytes_remaining = max_bytes;
- while (n_bytes_remaining > 0) {
- gssize bytes_read;
+ while (n_bytes_remaining > 0 &&
+ n_retries > 0) {
+ gchar buf[BUFFER_SIZE];
+ GError *error = NULL;
+ gssize bytes_read;
+ GIOStatus status;
- /* Read n_bytes_remaining or BUFFER_SIZE bytes */
- bytes_read = g_input_stream_read (stream,
+ /* Try to read from channel */
+ status = g_io_channel_read_chars (channel,
buf,
MIN (BUFFER_SIZE, n_bytes_remaining),
- NULL,
+ &bytes_read,
&error);
/* If any error reading, halt the loop */
if (error) {
- g_message ("Error reading from stream: '%s'",
+ g_message ("Error reading from iochannel: '%s'",
error->message);
g_error_free (error);
break;
}
/* If no more bytes to read, halt loop */
- if(bytes_read == 0) {
+ if (bytes_read == 0 || status == G_IO_STATUS_EOF) {
break;
}
+ /* If we are requested to retry, then retry */
+ if (status == G_IO_STATUS_AGAIN) {
+ n_retries--;
+ continue;
+ }
+
/* First of all, check if this is the first time we
* have tried to read the stream up to the BUFFER_SIZE
* limit. Then make sure that we read the maximum size
@@ -104,11 +171,54 @@ tracker_istream_read_text (GInputStream *stream,
n_bytes_remaining);
/* Append non-NIL terminated bytes */
- s = (s == NULL ?
- g_string_new_len (buf, bytes_read) :
- g_string_append_len (s, buf, bytes_read));
+ s = (s ?
+ g_string_append_len (s, buf, bytes_read) :
+ g_string_new_len (buf, bytes_read));
}
- /* Return whatever we got... */
- return s;
+ /* Validate UTF-8 if something was read */
+ if (s) {
+ gsize n_valid_utf8_bytes = 0;
+
+ /* Get number of valid UTF-8 bytes found */
+ tracker_text_validate_utf8 (s->str,
+ s->len,
+ NULL,
+ &n_valid_utf8_bytes);
+
+ /* A valid UTF-8 file will be that where all read bytes are valid,
+ * with a margin of 3 bytes for the last UTF-8 character which might
+ * have been cut. */
+ if (try_locale_if_not_utf8 &&
+ s->len - n_valid_utf8_bytes > 3) {
+ /* If not UTF-8, try to get contents in locale encoding
+ * (returns valid UTF-8) */
+ s = get_string_in_locale (s);
+ } else if (n_valid_utf8_bytes < s->len) {
+ g_debug (" Truncating to last valid UTF-8 character "
+ "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
+ n_valid_utf8_bytes,
+ s->len);
+ s = g_string_truncate (s, n_valid_utf8_bytes);
+ }
+
+ if (s->len < 1) {
+ g_string_free (s, TRUE);
+ s = NULL;
+ }
+ }
+
+ /* Properly close channel if requested to do so */
+ if (close_channel) {
+ GError *error = NULL;
+ g_io_channel_shutdown (channel, TRUE, &error);
+ if (error) {
+ g_message ("Couldn't properly shutdown channel: '%s'",
+ error->message);
+ g_error_free (error);
+ }
+ g_io_channel_unref (channel);
+ }
+
+ return s ? g_string_free (s, FALSE) : NULL;
}
diff --git a/src/tracker-extract/tracker-istream.h b/src/tracker-extract/tracker-istream.h
index f155dd2..0bf8fee 100644
--- a/src/tracker-extract/tracker-istream.h
+++ b/src/tracker-extract/tracker-istream.h
@@ -25,8 +25,10 @@
G_BEGIN_DECLS
-GString *tracker_istream_read_text (GInputStream *stream,
- gsize max_bytes);
+gchar *tracker_iochannel_read_text (GIOChannel *channel,
+ gsize max_bytes,
+ gboolean try_locale_if_not_utf8,
+ gboolean close_channel);
G_END_DECLS
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]