[tracker] tracker-extract, txt: Support text files encoded in UTF-16 if BOM available
- From: Aleksander Morgado <aleksm src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tracker] tracker-extract, txt: Support text files encoded in UTF-16 if BOM available
- Date: Thu, 16 Dec 2010 13:58:00 +0000 (UTC)
commit b440ae43509c7e703cd63250e04454e011249987
Author: Aleksander Morgado <aleksander lanedo com>
Date: Thu Dec 16 14:53:34 2010 +0100
tracker-extract,txt: Support text files encoded in UTF-16 if BOM available
Fixes NB#212116
[NEWS]
src/tracker-extract/tracker-extract-text.c | 6 +-
src/tracker-extract/tracker-read.c | 140 +++++++++++++++++++++-------
2 files changed, 110 insertions(+), 36 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 4065117..c987225 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -42,8 +42,8 @@ static TrackerExtractData data[] = {
static gchar *
-get_file_content (const gchar *uri,
- gsize n_bytes)
+get_file_content (const gchar *uri,
+ gsize n_bytes)
{
GFile *file;
GFileInputStream *stream;
@@ -71,7 +71,7 @@ get_file_content (const gchar *uri,
g_debug (" Starting to read '%s' up to %" G_GSIZE_FORMAT " bytes...",
uri, n_bytes);
- /* Read up to n_bytes from stream */
+ /* Read up to n_bytes from stream. Output is always, always valid UTF-8 */
text = tracker_read_text_from_stream (G_INPUT_STREAM (stream),
n_bytes,
TRY_LOCALE_TO_UTF8_CONVERSION);
diff --git a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
index f786c09..3959d3c 100644
--- a/src/tracker-extract/tracker-read.c
+++ b/src/tracker-extract/tracker-read.c
@@ -33,19 +33,21 @@
/* Size of the buffer to use when reading, in bytes */
#define BUFFER_SIZE 65535
-static GString *
-get_string_in_locale (GString *s)
+static gchar *
+get_string_in_locale (const gchar *locale_str,
+ gsize locale_str_len,
+ gsize *utf8_len)
{
GError *error = NULL;
- gchar *str;
- gsize bytes_read;
- gsize bytes_written;
-
- str = g_locale_to_utf8 (s->str,
- s->len,
- &bytes_read,
- &bytes_written,
- &error);
+ gchar *utf8_str;
+ gsize bytes_read = 0;
+ gsize bytes_written = 0;
+
+ utf8_str = g_locale_to_utf8 (locale_str,
+ locale_str_len,
+ &bytes_read,
+ &bytes_written,
+ &error);
if (error) {
g_debug (" Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
bytes_read,
@@ -53,13 +55,12 @@ get_string_in_locale (GString *s)
g_message ("Could not convert string from locale to UTF-8, %s",
error->message);
g_error_free (error);
- g_free (str);
- } else {
- g_string_assign (s, str);
- g_free (str);
+ g_free (utf8_str);
+ return NULL;
}
- return s;
+ *utf8_len = bytes_written;
+ return utf8_str;
}
@@ -85,19 +86,38 @@ process_chunk (const gchar *read_bytes,
* check that the buffer has a '\n' to make sure the
* file is worth indexing. Similarly if the file has
* <= 3 bytes then we drop it.
+ *
+ * NOTE: We may have non-UTF8 content read (say,
+ * UTF-16LE), so we can't rely on methods which assume
+ * NUL-terminated strings, as g_strstr_len().
*/
if (*s == NULL) {
- if (read_size == buffer_size &&
- g_strstr_len (read_bytes, read_size, "\n") == NULL) {
- g_debug (" No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
- "not indexing file",
- read_size);
- return FALSE;
- } else if (read_size <= 2) {
+ if (read_size <= 3) {
g_debug (" File has less than 3 characters in it, "
"not indexing file");
return FALSE;
}
+
+ if (read_size == buffer_size) {
+ const gchar *i;
+ gboolean eol_found = FALSE;
+
+ i = read_bytes;
+ while (i != &read_bytes[read_size - 1]) {
+ if (*i == '\n') {
+ eol_found = TRUE;
+ break;
+ }
+ i++;
+ }
+
+ if (!eol_found) {
+ g_debug (" No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
+ "not indexing file",
+ read_size);
+ return FALSE;
+ }
+ }
}
/* Update remaining bytes */
@@ -121,11 +141,54 @@ static gchar *
process_whole_string (GString *s,
gboolean try_locale_if_not_utf8)
{
+ gchar *utf8 = NULL;
+ gsize utf8_len = 0;
gsize n_valid_utf8_bytes = 0;
+ /* Support also UTF-16 encoded text files, as the ones generated in
+ * Windows OS. We will only accept text files in UTF-16 which come
+ * with a proper BOM. */
+ if (s->len > 2) {
+ GError *error = NULL;
+
+ if (memcmp (s->str, "\xFF\xFE", 2) == 0) {
+ g_debug ("String comes in UTF-16LE, converting");
+ utf8 = g_convert (&(s->str[2]),
+ s->len - 2,
+ "UTF-8",
+ "UTF-16LE",
+ NULL,
+ &utf8_len,
+ &error);
+
+ } else if (memcmp (s->str, "\xFE\xFF", 2) == 0) {
+ g_debug ("String comes in UTF-16BE, converting");
+ utf8 = g_convert (&(s->str[2]),
+ s->len - 2,
+ "UTF-8",
+ "UTF-16BE",
+ NULL,
+ &utf8_len,
+ &error);
+ }
+
+ if (error) {
+ g_warning ("Couldn't convert string from UTF-16 to UTF-8...: %s",
+ error->message);
+ g_error_free (error);
+ g_string_free (s, TRUE);
+ return NULL;
+ }
+ }
+
+ if (!utf8) {
+ utf8_len = s->len;
+ utf8 = g_string_free (s, FALSE);
+ }
+
/* Get number of valid UTF-8 bytes found */
- tracker_text_validate_utf8 (s->str,
- s->len,
+ tracker_text_validate_utf8 (utf8,
+ utf8_len,
NULL,
&n_valid_utf8_bytes);
@@ -133,24 +196,35 @@ process_whole_string (GString *s,
* with a margin of 3 bytes for the last UTF-8 character which might
* have been cut. */
if (try_locale_if_not_utf8 &&
- s->len - n_valid_utf8_bytes > 3) {
+ utf8_len - n_valid_utf8_bytes > 3) {
+ gchar *from_locale_str;
+ gsize from_locale_str_len;
+
/* If not UTF-8, try to get contents in locale encoding
* (returns valid UTF-8) */
- s = get_string_in_locale (s);
- } else if (n_valid_utf8_bytes < s->len) {
+ from_locale_str = get_string_in_locale (utf8,
+ utf8_len,
+ &from_locale_str_len);
+ g_free (utf8);
+ if (!from_locale_str)
+ return NULL;
+ utf8 = from_locale_str;
+ utf8_len = from_locale_str_len;
+ } else if (n_valid_utf8_bytes < utf8_len) {
g_debug (" Truncating to last valid UTF-8 character "
"(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
n_valid_utf8_bytes,
- s->len);
- s = g_string_truncate (s, n_valid_utf8_bytes);
+ utf8_len);
+ utf8[n_valid_utf8_bytes] = '\0';
+ utf8_len = n_valid_utf8_bytes;
}
- if (s->len < 1) {
- g_string_free (s, TRUE);
+ if (utf8_len < 1) {
+ g_free (utf8);
return NULL;
}
- return g_string_free (s, FALSE);
+ return utf8;
}
/**
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]