[gedit] Merge smart converter in the document output stream.



commit 85279adad605df29244253227541d0db45b98308
Author: Ignacio Casal Quinteiro <icq gnome org>
Date:   Sun Nov 21 12:18:28 2010 +0100

    Merge smart converter in the document output stream.
    
    With a separate converter we cannot get enough information about
    where a conversion error was produced. Because of this we are merging
    the converter into the document output stream, so that we can do some
    escaping in the future.

 gedit/Makefile.am                     |    2 -
 gedit/gedit-document-loader.c         |   44 ++--
 gedit/gedit-document-output-stream.c  |  381 +++++++++++++++++++++++++++++-
 gedit/gedit-document-output-stream.h  |    8 +-
 gedit/gedit-smart-charset-converter.c |  425 ---------------------------------
 gedit/gedit-smart-charset-converter.h |   68 ------
 tests/Makefile.am                     |    6 +-
 tests/document-output-stream.c        |  243 +++++++++++++++++++-
 tests/smart-converter.c               |  354 ---------------------------
 9 files changed, 642 insertions(+), 889 deletions(-)
---
diff --git a/gedit/Makefile.am b/gedit/Makefile.am
index 3d9f3c5..2bd7d20 100644
--- a/gedit/Makefile.am
+++ b/gedit/Makefile.am
@@ -123,7 +123,6 @@ NOINST_H_FILES =			\
 	gedit-rounded-frame.h		\
 	gedit-session.h			\
 	gedit-settings.h		\
-	gedit-smart-charset-converter.h	\
 	gedit-status-combo-box.h	\
 	gedit-style-scheme-manager.h	\
 	gedit-tab-label.h		\
@@ -207,7 +206,6 @@ libgedit_c_files =			\
 	gedit-rounded-frame.c		\
 	gedit-session.c			\
 	gedit-settings.c		\
-	gedit-smart-charset-converter.c	\
 	gedit-statusbar.c		\
 	gedit-status-combo-box.c	\
 	gedit-style-scheme-manager.c	\
diff --git a/gedit/gedit-document-loader.c b/gedit/gedit-document-loader.c
index 391edd4..b3b65e9 100644
--- a/gedit/gedit-document-loader.c
+++ b/gedit/gedit-document-loader.c
@@ -40,7 +40,6 @@
 
 #include "gedit-document-loader.h"
 #include "gedit-document-output-stream.h"
-#include "gedit-smart-charset-converter.h"
 #include "gedit-debug.h"
 #include "gedit-metadata-manager.h"
 #include "gedit-utils.h"
@@ -118,7 +117,6 @@ struct _GeditDocumentLoaderPrivate
 	GCancellable 	         *cancellable;
 	GInputStream	         *stream;
 	GOutputStream            *output;
-	GeditSmartCharsetConverter *converter;
 
 	gchar                     buffer[READ_CHUNK_SIZE];
 
@@ -225,12 +223,6 @@ gedit_document_loader_dispose (GObject *object)
 		priv->output = NULL;
 	}
 
-	if (priv->converter != NULL)
-	{
-		g_object_unref (priv->converter);
-		priv->converter = NULL;
-	}
-
 	if (priv->error != NULL)
 	{
 		g_error_free (priv->error);
@@ -628,7 +620,7 @@ async_read_cb (GInputStream *stream,
 		g_output_stream_flush (loader->priv->output, NULL, NULL);
 
 		loader->priv->auto_detected_encoding =
-			gedit_smart_charset_converter_get_guessed (loader->priv->converter);
+			gedit_document_output_stream_get_guessed (GEDIT_DOCUMENT_OUTPUT_STREAM (loader->priv->output));
 
 		loader->priv->auto_detected_newline_type =
 			gedit_document_output_stream_detect_newline_type (GEDIT_DOCUMENT_OUTPUT_STREAM (loader->priv->output));
@@ -636,7 +628,7 @@ async_read_cb (GInputStream *stream,
 		/* Check if we needed some fallback char, if so, check if there was
 		   a previous error and if not set a fallback used error */
 		/* FIXME Uncomment this when we want to manage conversion fallback */
-		/*if ((gedit_smart_charset_converter_get_num_fallbacks (loader->priv->converter) != 0) &&
+		/*if ((gedit_document_output_stream_get_num_fallbacks (GEDIT_DOCUMENT_OUTPUT_STREAM (loader->priv->output)) != 0) &&
 		    loader->priv->error == NULL)
 		{
 			g_set_error_literal (&loader->priv->error,
@@ -721,19 +713,6 @@ start_stream_read (AsyncData *async)
 	loader = async->loader;
 	info = loader->priv->info;
 
-	/* Get the candidate encodings */
-	if (loader->priv->encoding == NULL)
-	{
-		candidate_encodings = get_candidate_encodings (loader);
-	}
-	else
-	{
-		candidate_encodings = g_slist_prepend (NULL, (gpointer)loader->priv->encoding);
-	}
-
-	loader->priv->converter = gedit_smart_charset_converter_new (candidate_encodings);
-	g_slist_free (candidate_encodings);
-
 	if (g_file_info_has_attribute (info, G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE))
 	{
 		const gchar *content_type = g_file_info_get_content_type (info);
@@ -756,12 +735,23 @@ start_stream_read (AsyncData *async)
 	}
 
 	g_object_unref (loader->priv->stream);
-	loader->priv->stream = g_converter_input_stream_new (base_stream,
-	                                                     G_CONVERTER (loader->priv->converter));
-	g_object_unref (base_stream);
+	loader->priv->stream = base_stream;
+
+	/* Get the candidate encodings */
+	if (loader->priv->encoding == NULL)
+	{
+		candidate_encodings = get_candidate_encodings (loader);
+	}
+	else
+	{
+		candidate_encodings = g_slist_prepend (NULL, (gpointer)loader->priv->encoding);
+	}
 
 	/* Output stream */
-	loader->priv->output = gedit_document_output_stream_new (loader->priv->document);
+	loader->priv->output = gedit_document_output_stream_new (loader->priv->document,
+	                                                         candidate_encodings);
+
+	g_slist_free (candidate_encodings);
 
 	/* start reading */
 	read_file_chunk (async);
diff --git a/gedit/gedit-document-output-stream.c b/gedit/gedit-document-output-stream.c
index e0338a0..7aa25a3 100644
--- a/gedit/gedit-document-output-stream.c
+++ b/gedit/gedit-document-output-stream.c
@@ -26,7 +26,9 @@
 #include <glib.h>
 #include <glib/gi18n.h>
 #include <gio/gio.h>
+#include <errno.h>
 #include "gedit-document-output-stream.h"
+#include "gedit-debug.h"
 
 /* NOTE: never use async methods on this stream, the stream is just
  * a wrapper around GtkTextBuffer api so that we can use GIO Stream
@@ -48,6 +50,16 @@ struct _GeditDocumentOutputStreamPrivate
 	gchar *buffer;
 	gsize buflen;
 
+	/* Encoding detection */
+	GIConv iconv;
+	GCharsetConverter *charset_conv;
+
+	GSList *encodings;
+	GSList *current_encoding;
+
+	guint is_utf8 : 1;
+	guint use_first : 1;
+
 	guint is_initialized : 1;
 	guint is_closed : 1;
 };
@@ -115,11 +127,32 @@ gedit_document_output_stream_get_property (GObject    *object,
 }
 
 static void
+gedit_document_output_stream_dispose (GObject *object)
+{
+	GeditDocumentOutputStream *stream = GEDIT_DOCUMENT_OUTPUT_STREAM (object);
+
+	if (stream->priv->iconv != NULL)
+	{
+		g_iconv_close (stream->priv->iconv);
+		stream->priv->iconv = NULL;
+	}
+
+	if (stream->priv->charset_conv != NULL)
+	{
+		g_object_unref (stream->priv->charset_conv);
+		stream->priv->charset_conv = NULL;
+	}
+
+	G_OBJECT_CLASS (gedit_document_output_stream_parent_class)->dispose (object);
+}
+
+static void
 gedit_document_output_stream_finalize (GObject *object)
 {
 	GeditDocumentOutputStream *stream = GEDIT_DOCUMENT_OUTPUT_STREAM (object);
 
 	g_free (stream->priv->buffer);
+	g_slist_free (stream->priv->encodings);
 
 	G_OBJECT_CLASS (gedit_document_output_stream_parent_class)->finalize (object);
 }
@@ -154,6 +187,7 @@ gedit_document_output_stream_class_init (GeditDocumentOutputStreamClass *klass)
 
 	object_class->get_property = gedit_document_output_stream_get_property;
 	object_class->set_property = gedit_document_output_stream_set_property;
+	object_class->dispose = gedit_document_output_stream_dispose;
 	object_class->finalize = gedit_document_output_stream_finalize;
 	object_class->constructed = gedit_document_output_stream_constructed;
 
@@ -181,8 +215,196 @@ gedit_document_output_stream_init (GeditDocumentOutputStream *stream)
 	stream->priv->buffer = NULL;
 	stream->priv->buflen = 0;
 
+	stream->priv->charset_conv = NULL;
+	stream->priv->encodings = NULL;
+	stream->priv->current_encoding = NULL;
+
 	stream->priv->is_initialized = FALSE;
 	stream->priv->is_closed = FALSE;
+	stream->priv->is_utf8 = FALSE;
+	stream->priv->use_first = FALSE;
+}
+
+static const GeditEncoding *
+get_encoding (GeditDocumentOutputStream *stream)
+{
+	if (stream->priv->current_encoding == NULL)
+	{
+		stream->priv->current_encoding = stream->priv->encodings;
+	}
+	else
+	{
+		stream->priv->current_encoding = g_slist_next (stream->priv->current_encoding);
+	}
+
+	if (stream->priv->current_encoding != NULL)
+	{
+		return (const GeditEncoding *)stream->priv->current_encoding->data;
+	}
+
+	return NULL;
+}
+
+static gboolean
+try_convert (GCharsetConverter *converter,
+             const void        *inbuf,
+             gsize              inbuf_size)
+{
+	GError *err;
+	gsize bytes_read, nread;
+	gsize bytes_written, nwritten;
+	GConverterResult res;
+	gchar *out;
+	gboolean ret;
+	gsize out_size;
+
+	if (inbuf == NULL || inbuf_size == 0)
+	{
+		return FALSE;
+	}
+
+	err = NULL;
+	nread = 0;
+	nwritten = 0;
+	out_size = inbuf_size * 4;
+	out = g_malloc (out_size);
+
+	do
+	{
+		res = g_converter_convert (G_CONVERTER (converter),
+		                           (gchar *)inbuf + nread,
+		                           inbuf_size - nread,
+		                           (gchar *)out + nwritten,
+		                           out_size - nwritten,
+		                           G_CONVERTER_INPUT_AT_END,
+		                           &bytes_read,
+		                           &bytes_written,
+		                           &err);
+
+		nread += bytes_read;
+		nwritten += bytes_written;
+	} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR && err == NULL);
+
+	if (err != NULL)
+	{
+		if (err->code == G_CONVERT_ERROR_PARTIAL_INPUT)
+		{
+			/* FIXME We can get partial input while guessing the
+			   encoding because we just take some amount of text
+			   to guess from. */
+			ret = TRUE;
+		}
+		else
+		{
+			ret = FALSE;
+		}
+
+		g_error_free (err);
+	}
+	else
+	{
+		ret = TRUE;
+	}
+
+	/* FIXME: Check the remainder? */
+	if (ret == TRUE && !g_utf8_validate (out, nwritten, NULL))
+	{
+		ret = FALSE;
+	}
+
+	g_free (out);
+
+	return ret;
+}
+
+static GCharsetConverter *
+guess_encoding (GeditDocumentOutputStream *stream,
+		const void                *inbuf,
+		gsize                      inbuf_size)
+{
+	GCharsetConverter *conv = NULL;
+
+	if (inbuf == NULL || inbuf_size == 0)
+	{
+		stream->priv->is_utf8 = TRUE;
+		return NULL;
+	}
+
+	if (stream->priv->encodings != NULL &&
+	    stream->priv->encodings->next == NULL)
+	{
+		stream->priv->use_first = TRUE;
+	}
+
+	/* We just check the first block */
+	while (TRUE)
+	{
+		const GeditEncoding *enc;
+
+		if (conv != NULL)
+		{
+			g_object_unref (conv);
+			conv = NULL;
+		}
+
+		/* We get an encoding from the list */
+		enc = get_encoding (stream);
+
+		/* if it is NULL we didn't guess anything */
+		if (enc == NULL)
+		{
+			break;
+		}
+
+		gedit_debug_message (DEBUG_UTILS, "trying charset: %s",
+				     gedit_encoding_get_charset (stream->priv->current_encoding->data));
+
+		if (enc == gedit_encoding_get_utf8 ())
+		{
+			gsize remainder;
+			const gchar *end;
+			
+			if (g_utf8_validate (inbuf, inbuf_size, &end) ||
+			    stream->priv->use_first)
+			{
+				stream->priv->is_utf8 = TRUE;
+				break;
+			}
+
+			/* Check if the end is less than one char */
+			remainder = inbuf_size - (end - (gchar *)inbuf);
+			if (remainder < 6)
+			{
+				stream->priv->is_utf8 = TRUE;
+				break;
+			}
+
+			continue;
+		}
+
+		conv = g_charset_converter_new ("UTF-8",
+						gedit_encoding_get_charset (enc),
+						NULL);
+
+		/* If we tried all encodings we use the first one */
+		if (stream->priv->use_first)
+		{
+			break;
+		}
+
+		/* Try to convert */
+		if (try_convert (conv, inbuf, inbuf_size))
+		{
+			break;
+		}
+	}
+
+	if (conv != NULL)
+	{
+		g_converter_reset (G_CONVERTER (conv));
+	}
+
+	return conv;
 }
 
 static GeditDocumentNewlineType
@@ -216,10 +438,17 @@ get_newline_type (GtkTextIter *end)
 }
 
 GOutputStream *
-gedit_document_output_stream_new (GeditDocument *doc)
+gedit_document_output_stream_new (GeditDocument *doc,
+                                  GSList        *candidate_encodings)
 {
-	return G_OUTPUT_STREAM (g_object_new (GEDIT_TYPE_DOCUMENT_OUTPUT_STREAM,
-					      "document", doc, NULL));
+	GeditDocumentOutputStream *stream;
+
+	stream = g_object_new (GEDIT_TYPE_DOCUMENT_OUTPUT_STREAM,
+	                       "document", doc, NULL);
+
+	stream->priv->encodings = g_slist_copy (candidate_encodings);
+
+	return G_OUTPUT_STREAM (stream);
 }
 
 GeditDocumentNewlineType
@@ -244,6 +473,38 @@ gedit_document_output_stream_detect_newline_type (GeditDocumentOutputStream *str
 	return type;
 }
 
+const GeditEncoding *
+gedit_document_output_stream_get_guessed (GeditDocumentOutputStream *stream)
+{
+	g_return_val_if_fail (GEDIT_IS_DOCUMENT_OUTPUT_STREAM (stream), NULL);
+
+	if (stream->priv->current_encoding != NULL)
+	{
+		return (const GeditEncoding *)stream->priv->current_encoding->data;
+	}
+	else if (stream->priv->is_utf8 || !stream->priv->is_initialized)
+	{
+		/* If it is not initialized we assume that we are trying to convert
+		   the empty string */
+		return gedit_encoding_get_utf8 ();
+	}
+
+	return NULL;
+}
+
+guint
+gedit_document_output_stream_get_num_fallbacks (GeditDocumentOutputStream *stream)
+{
+	g_return_val_if_fail (GEDIT_IS_DOCUMENT_OUTPUT_STREAM (stream), FALSE);
+
+	if (stream->priv->charset_conv == NULL)
+	{
+		return FALSE;
+	}
+
+	return g_charset_converter_get_num_fallbacks (stream->priv->charset_conv) != 0;
+}
+
 /* If the last char is a newline, remove it from the buffer (otherwise
    GtkTextView shows it as an empty line). See bug #324942. */
 static void
@@ -297,6 +558,7 @@ gedit_document_output_stream_write (GOutputStream            *stream,
 	const gchar *end;
 	gsize nvalid;
 	gboolean valid;
+	gsize remainder;
 
 	if (g_cancellable_set_error_if_cancelled (cancellable, error))
 	{
@@ -307,6 +569,55 @@ gedit_document_output_stream_write (GOutputStream            *stream,
 
 	if (!ostream->priv->is_initialized)
 	{
+		ostream->priv->charset_conv = guess_encoding (ostream, buffer, count);
+
+		/* If we still have no converter and the input is not UTF-8,
+		   none of the candidate encodings matched */
+		if (ostream->priv->charset_conv == NULL &&
+		    !ostream->priv->is_utf8)
+		{
+			/* FIXME: Add a different domain when we kill gedit_convert */
+			g_set_error_literal (error, GEDIT_DOCUMENT_ERROR,
+					     GEDIT_DOCUMENT_ERROR_ENCODING_AUTO_DETECTION_FAILED,
+					     _("It is not possible to detect the encoding automatically"));
+			return -1;
+		}
+
+		/* Do not initialize iconv if we are not going to convert anything */
+		if (!ostream->priv->is_utf8)
+		{
+			gchar *from_charset;
+
+			/* Initialize iconv */
+			g_object_get (G_OBJECT (ostream->priv->charset_conv),
+				      "from-charset", &from_charset,
+				      NULL);
+
+			ostream->priv->iconv = g_iconv_open ("UTF-8", from_charset);
+
+			if (ostream->priv->iconv == (GIConv)-1)
+			{
+				if (errno == EINVAL)
+				{
+					g_set_error (error, G_IO_ERROR, G_IO_ERROR_NOT_SUPPORTED,
+						     _("Conversion from character set '%s' to 'UTF-8' is not supported"),
+						     from_charset);
+				}
+				else
+				{
+					g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
+						     _("Could not open converter from '%s' to 'UTF-8'"),
+						     from_charset);
+				}
+
+				g_free (from_charset);
+
+				return -1;
+			}
+
+			g_free (from_charset);
+		}
+
 		/* Init the undoable action */
 		gtk_source_buffer_begin_not_undoable_action (GTK_SOURCE_BUFFER (ostream->priv->doc));
 
@@ -339,14 +650,74 @@ gedit_document_output_stream_write (GOutputStream            *stream,
 		len = count;
 	}
 
+	if (!ostream->priv->is_utf8)
+	{
+		gchar *conv_text;
+		gsize conv_read;
+		gsize conv_written;
+		GError *err = NULL;
+
+		if (ostream->priv->iconv == NULL)
+		{
+			g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_NOT_INITIALIZED,
+			                     _("Invalid object, not initialized"));
+
+			if (freetext)
+			{
+				g_free (text);
+			}
+
+			return -1;
+		}
+
+		/* Reaching this point means the text is not UTF-8, so convert it
+		   with the iconv descriptor opened during initialization */
+		conv_text = g_convert_with_iconv (text,
+		                                  len,
+		                                  ostream->priv->iconv,
+		                                  &conv_read,
+		                                  &conv_written,
+		                                  &err);
+
+		if (freetext)
+		{
+			g_free (text);
+		}
+
+		if (err != NULL)
+		{
+			remainder = len - conv_read;
+
+			/* Store the partial char for the next conversion */
+			if (err->code == G_CONVERT_ERROR_ILLEGAL_SEQUENCE && 
+			    remainder < MAX_UNICHAR_LEN &&
+			    (g_utf8_get_char_validated (text + conv_read, remainder) == (gunichar)-2))
+			{
+				ostream->priv->buffer = g_strndup (text + conv_read, remainder);
+				ostream->priv->buflen = remainder;
+			}
+			else
+			{
+				/* Something went wrong with the conversion,
+				   propagate the error and finish */
+				g_propagate_error (error, err);
+				g_free (conv_text);
+
+				return -1;
+			}
+		}
+
+		text = conv_text;
+		len = conv_written;
+		freetext = TRUE;
+	}
+
 	/* validate */
 	valid = g_utf8_validate (text, len, &end);
 	nvalid = end - text;
 
 	if (!valid)
 	{
-		gsize remainder;
-
 		remainder = len - nvalid;
 
 		if ((remainder < MAX_UNICHAR_LEN) &&
diff --git a/gedit/gedit-document-output-stream.h b/gedit/gedit-document-output-stream.h
index 9bada3c..eadd4c3 100644
--- a/gedit/gedit-document-output-stream.h
+++ b/gedit/gedit-document-output-stream.h
@@ -26,6 +26,7 @@
 
 #include <gio/gio.h>
 #include "gedit-document.h"
+#include "gedit-encodings.h"
 
 G_BEGIN_DECLS
 
@@ -55,10 +56,15 @@ struct _GeditDocumentOutputStreamClass
 
 GType			 gedit_document_output_stream_get_type		(void) G_GNUC_CONST;
 
-GOutputStream		*gedit_document_output_stream_new		(GeditDocument *doc);
+GOutputStream		*gedit_document_output_stream_new		(GeditDocument *doc,
+									 GSList        *candidate_encodings);
 
 GeditDocumentNewlineType gedit_document_output_stream_detect_newline_type (GeditDocumentOutputStream *stream);
 
+const GeditEncoding	*gedit_document_output_stream_get_guessed	(GeditDocumentOutputStream *stream);
+
+guint			 gedit_document_output_stream_get_num_fallbacks	(GeditDocumentOutputStream *stream);
+
 G_END_DECLS
 
 #endif /* __GEDIT_DOCUMENT_OUTPUT_STREAM_H__ */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f559525..27c5db8 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -3,11 +3,7 @@ INCLUDES = -g -I$(top_srcdir) -I$(top_srcdir)/gedit $(GEDIT_DEBUG_FLAGS) $(GEDIT
 noinst_PROGRAMS = $(TEST_PROGS) $(TEST_GTK_PROGS)
 progs_ldadd     = $(top_builddir)/gedit/libgedit.la
 
-TEST_PROGS			= smart-converter
-smart_converter_SOURCES		= smart-converter.c
-smart_converter_LDADD		= $(progs_ldadd)
-
-TEST_PROGS			+= document-input-stream
+TEST_PROGS			= document-input-stream
 document_input_stream_SOURCES	= document-input-stream.c
 document_input_stream_LDADD	= $(progs_ldadd)
 
diff --git a/tests/document-output-stream.c b/tests/document-output-stream.c
index eb9acf7..591a434 100644
--- a/tests/document-output-stream.c
+++ b/tests/document-output-stream.c
@@ -40,9 +40,11 @@ test_consecutive_write (const gchar *inbuf,
 	GError *err = NULL;
 	gchar *b;
 	GeditDocumentNewlineType type;
+	GSList *encodings = NULL;
 
 	doc = gedit_document_new ();
-	out = gedit_document_output_stream_new (doc);
+	encodings = g_slist_prepend (encodings, (gpointer)gedit_encoding_get_utf8 ());
+	out = gedit_document_output_stream_new (doc, encodings);
 
 	n = 0;
 
@@ -127,9 +129,11 @@ test_boundary ()
 	GOutputStream *out;
 	gint line_count;
 	GError *err = NULL;
+	GSList *encodings = NULL;
 
 	doc = gedit_document_new ();
-	out = gedit_document_output_stream_new (doc);
+	encodings = g_slist_prepend (encodings, (gpointer)gedit_encoding_get_utf8 ());
+	out = gedit_document_output_stream_new (doc, encodings);
 
 	g_output_stream_write (out, "\r", 1, NULL, NULL);
 	g_output_stream_write (out, "\n", 1, NULL, NULL);
@@ -148,6 +152,237 @@ test_boundary ()
 	g_object_unref (out);
 }
 
+/* SMART CONVERSION */
+
+#define TEXT_TO_CONVERT "this is some text to make the tests"
+#define TEXT_TO_GUESS "hello \xe6\x96\x87 world"
+
+static void
+print_hex (gchar *ptr, gint len)
+{
+	gint i;
+
+	for (i = 0; i < len; ++i)
+	{
+		g_printf ("\\x%02x", (unsigned char)ptr[i]);
+	}
+
+	g_printf ("\n");
+}
+
+static gchar *
+get_encoded_text (const gchar         *text,
+                  gsize                nread,
+                  const GeditEncoding *to,
+                  const GeditEncoding *from,
+                  gsize               *bytes_written_aux,
+                  gboolean             care_about_error)
+{
+	GCharsetConverter *converter;
+	gchar *out, *out_aux;
+	gsize bytes_read, bytes_read_aux;
+	gsize bytes_written;
+	GConverterResult res;
+	GError *err;
+
+	converter = g_charset_converter_new (gedit_encoding_get_charset (to),
+					     gedit_encoding_get_charset (from),
+					     NULL);
+
+	out = g_malloc (200);
+	out_aux = g_malloc (200);
+	err = NULL;
+	bytes_read_aux = 0;
+	*bytes_written_aux = 0;
+
+	if (nread == -1)
+	{
+		nread = strlen (text);
+	}
+
+	do
+	{
+		res = g_converter_convert (G_CONVERTER (converter),
+		                           text + bytes_read_aux,
+		                           nread,
+		                           out_aux,
+		                           200,
+		                           G_CONVERTER_INPUT_AT_END,
+		                           &bytes_read,
+		                           &bytes_written,
+		                           &err);
+		memcpy (out + *bytes_written_aux, out_aux, bytes_written);
+		bytes_read_aux += bytes_read;
+		*bytes_written_aux += bytes_written;
+		nread -= bytes_read;
+	} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR);
+
+	if (care_about_error)
+	{
+		g_assert_no_error (err);
+	}
+	else if (err)
+	{
+		g_printf ("** You don't care, but there was an error: %s", err->message);
+		return NULL;
+	}
+
+	out[*bytes_written_aux] = '\0';
+
+	if (!g_utf8_validate (out, *bytes_written_aux, NULL) && !care_about_error)
+	{
+		if (!care_about_error)
+		{
+			return NULL;
+		}
+		else
+		{
+			g_assert_not_reached ();
+		}
+	}
+
+	return out;
+}
+
+static GSList *
+get_all_encodings ()
+{
+	GSList *encs = NULL;
+	gint i = 0;
+
+	while (TRUE)
+	{
+		const GeditEncoding *enc;
+
+		enc = gedit_encoding_get_from_index (i);
+
+		if (enc == NULL)
+			break;
+
+		encs = g_slist_prepend (encs, (gpointer)enc);
+		i++;
+	}
+
+	return encs;
+}
+
+static gchar *
+do_test (const gchar *test_in,
+         const gchar *enc,
+         GSList      *encodings,
+         gsize        nread,
+         const GeditEncoding **guessed)
+{
+	GeditDocument *doc;
+	GOutputStream *out;
+	GError *err = NULL;
+	GtkTextIter start, end;
+	gchar *text;
+
+	if (enc != NULL)
+	{
+		encodings = NULL;
+		encodings = g_slist_prepend (encodings, (gpointer)gedit_encoding_get_from_charset (enc));
+	}
+
+	doc = gedit_document_new ();
+	encodings = g_slist_prepend (encodings, (gpointer)gedit_encoding_get_utf8 ());
+	out = gedit_document_output_stream_new (doc, encodings);
+
+	g_output_stream_write (out, test_in, nread, NULL, &err);
+	g_assert_no_error (err);
+
+	g_output_stream_flush (out, NULL, &err);
+	g_assert_no_error (err);
+
+	g_output_stream_close (out, NULL, &err);
+	g_assert_no_error (err);
+
+	if (guessed != NULL)
+		*guessed = gedit_document_output_stream_get_guessed (GEDIT_DOCUMENT_OUTPUT_STREAM (out));
+
+	gtk_text_buffer_get_bounds (GTK_TEXT_BUFFER (doc), &start, &end);
+	text = gtk_text_buffer_get_text (GTK_TEXT_BUFFER (doc),
+	                                 &start,
+	                                 &end,
+	                                 FALSE);
+
+	g_object_unref (doc);
+	g_object_unref (out);
+
+	return text;
+}
+
+static void
+test_utf8_utf8 ()
+{
+	gchar *aux;
+
+	aux = do_test (TEXT_TO_CONVERT, "UTF-8", NULL, strlen (TEXT_TO_CONVERT), NULL);
+	g_assert_cmpstr (aux, ==, TEXT_TO_CONVERT);
+
+	aux = do_test ("foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz", "UTF-8", NULL, 18, NULL);
+	g_assert_cmpstr (aux, ==, "foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz");
+
+	aux = do_test ("foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz", "UTF-8", NULL, 12, NULL);
+	g_assert_cmpstr (aux, ==, "foobar\xc3\xa8\xc3\xa8\xc3\xa8");
+
+	/* FIXME: Use the utf8 stream for a fallback? */
+	//do_test_with_error ("\xef\xbf\xbezzzzzz", encs, G_IO_ERROR_FAILED);
+}
+
+static void
+test_empty_conversion ()
+{
+	const GeditEncoding *guessed;
+	gchar *out;
+	GSList *encodings = NULL;
+
+	/* testing the case of an empty file and list of encodings with no
+	   utf-8. In this case, the smart converter cannot determine the right
+	   encoding (because there is no input), but should still default to
+	   utf-8 for the detection */
+	encodings = g_slist_prepend (encodings, (gpointer)gedit_encoding_get_from_charset ("UTF-16"));
+	encodings = g_slist_prepend (encodings, (gpointer)gedit_encoding_get_from_charset ("ISO-8859-15"));
+
+	out = do_test ("", NULL, encodings, 0, &guessed);
+
+	g_assert_cmpstr (out, ==, "");
+
+	g_assert (guessed == gedit_encoding_get_utf8 ());
+}
+
+static void
+test_guessed ()
+{
+	GSList *encs = NULL;
+	gchar *aux, *aux2, *fail;
+	gsize aux_len, fail_len;
+	const GeditEncoding *guessed;
+
+	aux = get_encoded_text (TEXT_TO_GUESS, -1,
+	                        gedit_encoding_get_from_charset ("UTF-16"),
+	                        gedit_encoding_get_from_charset ("UTF-8"),
+	                        &aux_len,
+	                        TRUE);
+
+	fail = get_encoded_text (aux, aux_len,
+	                         gedit_encoding_get_from_charset ("UTF-8"),
+	                         gedit_encoding_get_from_charset ("ISO-8859-15"),
+	                         &fail_len,
+	                         FALSE);
+
+	g_assert (fail == NULL);
+
+	/* ISO-8859-15 should fail */
+	encs = g_slist_append (encs, (gpointer)gedit_encoding_get_from_charset ("ISO-8859-15"));
+	encs = g_slist_append (encs, (gpointer)gedit_encoding_get_from_charset ("UTF-16"));
+
+	aux2 = do_test (aux, NULL, encs, aux_len, &guessed);
+
+	g_assert (guessed == gedit_encoding_get_from_charset ("UTF-16"));
+}
+
 int main (int   argc,
           char *argv[])
 {
@@ -161,6 +396,10 @@ int main (int   argc,
 	g_test_add_func ("/document-output-stream/big-char", test_big_char);
 	g_test_add_func ("/document-output-stream/test-boundary", test_boundary);
 
+	g_test_add_func ("/document-output-stream/smart conversion: utf8-utf8", test_utf8_utf8);
+	g_test_add_func ("/document-output-stream/smart conversion: guessed", test_guessed);
+	g_test_add_func ("/document-output-stream/smart conversion: empty", test_empty_conversion);
+
 	return g_test_run ();
 }
 /* ex:ts=8:noet: */



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]