[tracker/tracker-0.12] tracker-extract, text: try to extract text from files in typical windows charsets



commit a8e9a0c1781ced8bcbf3f2072504ac81b0ac2337
Author: Aleksander Morgado <aleksander lanedo com>
Date:   Wed Dec 14 12:51:45 2011 +0100

    tracker-extract,text: try to extract text from files in typical windows charsets
    
    If the input file is not valid UTF-8, we now try:
     * UTF-16, if NUL bytes are found in the string (windows-1252 and locale
       encodings are not expected to have NUL bytes within the string). In
       this case no other encoding is tried.
     * Otherwise, if the locale encoding is not UTF-8, try with the locale
       encoding.
     * If the locale encoding is UTF-8, or converting from the locale
       encoding failed, try with windows-1252.
    
    Fixes GB#655383.

 src/tracker-extract/tracker-extract-text.c |    6 +-
 src/tracker-extract/tracker-read.c         |  117 ++++++++++++++++-----------
 src/tracker-extract/tracker-read.h         |    9 +--
 3 files changed, 73 insertions(+), 59 deletions(-)
---
diff --git a/src/tracker-extract/tracker-extract-text.c b/src/tracker-extract/tracker-extract-text.c
index 6a16c12..8fdb881 100644
--- a/src/tracker-extract/tracker-extract-text.c
+++ b/src/tracker-extract/tracker-extract-text.c
@@ -41,8 +41,6 @@
 #include "tracker-main.h"
 #include "tracker-read.h"
 
-#define TRY_LOCALE_TO_UTF8_CONVERSION 0
-
 static gchar *
 get_file_content (GFile *file,
                   gsize  n_bytes)
@@ -80,9 +78,7 @@ get_file_content (GFile *file,
 	/* Read up to n_bytes from stream. Output is always, always valid UTF-8,
 	 * this function closes the FD.
 	 */
-	text = tracker_read_text_from_fd (fd,
-	                                  n_bytes,
-	                                  TRY_LOCALE_TO_UTF8_CONVERSION);
+	text = tracker_read_text_from_fd (fd, n_bytes);
 	g_free (uri);
 	g_free (path);
 
diff --git a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
index ea4be82..c9087ef 100644
--- a/src/tracker-extract/tracker-read.c
+++ b/src/tracker-extract/tracker-read.c
@@ -34,36 +34,59 @@
 #define BUFFER_SIZE 65535
 
 static gchar *
-get_string_in_locale (const gchar *locale_str,
-                      gsize        locale_str_len,
-                      gsize       *utf8_len)
+get_string_from_guessed_encoding (const gchar *str,
+                                  gsize        str_len,
+                                  gsize       *utf8_len)
 {
-	GError *error = NULL;
-	gchar  *utf8_str;
-	gsize   bytes_read = 0;
-	gsize   bytes_written = 0;
-
-	utf8_str = g_locale_to_utf8 (locale_str,
-	                             locale_str_len,
-	                             &bytes_read,
-	                             &bytes_written,
-	                             &error);
-	if (error) {
-		g_debug ("  Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
-		         bytes_read,
-		         bytes_written);
-		g_message ("Could not convert string from locale to UTF-8, %s",
-		           error->message);
-		g_error_free (error);
+	const gchar *current = NULL;
+
+	/* If we have embedded NULs try UTF-16 directly */
+	if (memchr (str, '\0', str_len))
+		current = "UTF-16";
+	/* If locale charset is UTF-8, try with windows-1252.
+	 * NOTE: g_get_charset() returns TRUE if locale charset is UTF-8 */
+	else if (g_get_charset (&current))
+		current = "windows-1252";
+
+	while (current) {
+		gchar *utf8_str;
+		gsize bytes_read = 0;
+		gsize bytes_written = 0;
+
+		utf8_str = g_convert (str,
+		                      str_len,
+		                      "UTF-8",
+		                      current,
+		                      &bytes_read,
+		                      &bytes_written,
+		                      NULL);
+		if (utf8_str &&
+		    str_len == bytes_read) {
+			g_debug ("Converted %" G_GSIZE_FORMAT " bytes in '%s' codeset "
+			         "to %" G_GSIZE_FORMAT " bytes in UTF-8",
+			         bytes_read,
+			         current,
+			         bytes_written);
+			*utf8_len = bytes_written;
+			return utf8_str;
+		}
 		g_free (utf8_str);
-		return NULL;
+
+		g_debug ("Text not in '%s' encoding", current);
+
+		if (!strcmp (current, "windows-1252") ||
+		    !strcmp (current, "UTF-16"))
+			/* If we tried windows-1252 or UTF-16, don't try anything else */
+			current = NULL;
+		else
+			/* If we tried a locale encoding and didn't work, retry with
+			 * windows-1252 */
+			current = "windows-1252";
 	}
 
-	*utf8_len = bytes_written;
-	return utf8_str;
+	return NULL;
 }
 
-
 /* Returns %TRUE if read operation should continue, %FALSE otherwise */
 static gboolean
 process_chunk (const gchar  *read_bytes,
@@ -138,8 +161,7 @@ process_chunk (const gchar  *read_bytes,
 }
 
 static gchar *
-process_whole_string (GString  *s,
-                      gboolean  try_locale_if_not_utf8)
+process_whole_string (GString  *s)
 {
 	gchar *utf8 = NULL;
 	gsize  utf8_len = 0;
@@ -195,21 +217,20 @@ process_whole_string (GString  *s,
 	/* A valid UTF-8 file will be that where all read bytes are valid,
 	 *  with a margin of 3 bytes for the last UTF-8 character which might
 	 *  have been cut. */
-	if (try_locale_if_not_utf8 &&
-	    utf8_len - n_valid_utf8_bytes > 3) {
-		gchar *from_locale_str;
-		gsize  from_locale_str_len;
+	if (utf8_len - n_valid_utf8_bytes > 3) {
+		gchar *from_guessed_str;
+		gsize  from_guessed_str_len;
 
-		/* If not UTF-8, try to get contents in locale encoding
+		/* If not UTF-8, try to get contents in guessed encoding
 		 *  (returns valid UTF-8) */
-		from_locale_str = get_string_in_locale (utf8,
-		                                        utf8_len,
-		                                        &from_locale_str_len);
+		from_guessed_str = get_string_from_guessed_encoding (utf8,
+		                                                     utf8_len,
+		                                                     &from_guessed_str_len);
 		g_free (utf8);
-		if (!from_locale_str)
+		if (!from_guessed_str)
 			return NULL;
-		utf8 = from_locale_str;
-		utf8_len = from_locale_str_len;
+		utf8 = from_guessed_str;
+		utf8_len = from_guessed_str_len;
 	} else if (n_valid_utf8_bytes < utf8_len) {
 		g_debug ("  Truncating to last valid UTF-8 character "
 		         "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
@@ -231,18 +252,18 @@ process_whole_string (GString  *s,
  * tracker_read_text_from_stream:
  * @stream: input stream to read from
  * @max_bytes: max number of bytes to read from @stream
- * @try_locale_if_not_utf8: if the the text read is not valid UTF-8, try to
- *   convert from locale-encoding to UTF-8
  *
  * Reads up to @max_bytes from @stream, and validates the read text as proper
  *  UTF-8.
  *
+ * If the input text is not UTF-8 it will also try to decode it based on the
+ * current locale, or windows-1252, or UTF-16.
+ *
  * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
  **/
 gchar *
 tracker_read_text_from_stream (GInputStream *stream,
-                               gsize       max_bytes,
-                               gboolean    try_locale_if_not_utf8)
+                               gsize         max_bytes)
 {
 	GString *s = NULL;
 	gsize n_bytes_remaining = max_bytes;
@@ -287,7 +308,7 @@ tracker_read_text_from_stream (GInputStream *stream,
 	}
 
 	/* Validate UTF-8 if something was read, and return it */
-	return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+	return s ? process_whole_string (s) : NULL;
 }
 
 
@@ -295,18 +316,18 @@ tracker_read_text_from_stream (GInputStream *stream,
  * tracker_read_text_from_fd:
  * @fd: input fd to read from
  * @max_bytes: max number of bytes to read from @fd
- * @try_locale_if_not_utf8: if the the text read is not valid UTF-8, try to
- *   convert from locale-encoding to UTF-8
  *
  * Reads up to @max_bytes from @fd, and validates the read text as proper
  *  UTF-8. Will also properly close the FD when finishes.
  *
+ * If the input text is not UTF-8 it will also try to decode it based on the
+ * current locale, or windows-1252, or UTF-16.
+ *
  * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
  **/
 gchar *
-tracker_read_text_from_fd (gint     fd,
-                           gsize    max_bytes,
-                           gboolean try_locale_if_not_utf8)
+tracker_read_text_from_fd (gint  fd,
+                           gsize max_bytes)
 {
 	FILE *fz;
 	GString *s = NULL;
@@ -355,5 +376,5 @@ tracker_read_text_from_fd (gint     fd,
 	fclose (fz);
 
 	/* Validate UTF-8 if something was read, and return it */
-	return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+	return s ? process_whole_string (s) : NULL;
 }
diff --git a/src/tracker-extract/tracker-read.h b/src/tracker-extract/tracker-read.h
index 90b12d6..5138019 100644
--- a/src/tracker-extract/tracker-read.h
+++ b/src/tracker-extract/tracker-read.h
@@ -26,14 +26,11 @@
 G_BEGIN_DECLS
 
 gchar *tracker_read_text_from_stream (GInputStream *stream,
-                                      gsize         max_bytes,
-                                      gboolean      try_locale_if_not_utf8);
+                                      gsize         max_bytes);
 
-gchar *tracker_read_text_from_fd (gint     fd,
-                                  gsize    max_bytes,
-                                  gboolean try_locale_if_not_utf8);
+gchar *tracker_read_text_from_fd (gint  fd,
+                                  gsize max_bytes);
 
 G_END_DECLS
 
 #endif /* __TRACKER_READ_H__ */
-



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]