Patch: add detection of alternative multibyte encodings for cases where we fail to read the streams.



	Hi,

	This patch adds support for trying to guess other encodings
(specifically GBK) if we fail to parse a message using the standard
procedure.

	We have basically two changes. The first is very simple, as the
current support was already able to read headers in UTF-8 in this
situation: I added GBK to the array of encodings tried when guessing the
charset of headers.

	The other part is more complex. Our charset filter now has some
encoding fallbacks (UTF-8 and GBK) for the case where it cannot parse
the text properly.

	What's this patch intended for? It adds support for reading
messages from some weird servers that send messages in unencapsulated
GBK encoding. Oh, and this problem is also present in Evolution.

-- 
Jose Dapena Paz <jdapena igalia com>
Igalia
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 3488)
+++ ChangeLog	(working copy)
@@ -1,3 +1,12 @@
+2008-03-13  Jose Dapena Paz  <jdapena igalia com>
+
+	* libtinymail-camel/camel-lite/camel/camel-mime-utils.c:
+	(decode_8bit) Add GBK as a charset to test in case we fail to
+	parse a header.
+	* libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.[ch]:
+	Add support for multibyte fallbacks in case parsing with the
+	specified charset fails. Currently we provide UTF-8 and GBK.
+
 2008-03-12  Jose Dapena Paz  <jdapena igalia com>
 
 	* libtinymail-gnome-desktop/tny-gnome-device.c:
Index: libtinymail-camel/camel-lite/camel/camel-mime-utils.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-utils.c	(revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-utils.c	(working copy)
@@ -908,7 +908,7 @@
 static char *
 decode_8bit (const char *text, size_t len, const char *default_charset)
 {
-	const char *charsets[4] = { "UTF-8", NULL, NULL, NULL };
+	const char *charsets[4] = { "UTF-8", "GBK", NULL, NULL };
 	size_t inleft, outleft, outlen, rc, min, n;
 	const char *locale_charset, *best;
 	char *out, *outbuf;
Index: libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c	(revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c	(working copy)
@@ -41,6 +41,8 @@
 
 static CamelMimeFilterClass *camel_mime_filter_charset_parent;
 
+const char *best_charset[4] = { "UTF-8", "GBK", NULL, NULL };
+
 CamelType
 camel_mime_filter_charset_get_type (void)
 {
@@ -121,15 +123,28 @@
 					outbuf = mf->outbuf + converted;
 					outleft = mf->outsize - converted;
 				} else if (errno == EILSEQ) {
-					/*
-					 * EILSEQ An invalid multibyte sequence has been  encountered
-					 *        in the input.
-					 *
-					 * What we do here is eat the invalid bytes in the sequence and continue
-					 */
-
-					inbuf++;
-					inleft--;
+					if ((charset->best == NULL) || (*(charset->best+1) != NULL)) {
+						e_iconv_close (charset->ic);
+						if (charset->best == NULL) {
+							charset->best = (char **) best_charset;
+						} else {
+							charset->best ++;
+						}
+						charset->ic = e_iconv_open (charset->to, *(charset->best));
+						if (charset->ic == (iconv_t) -1) {
+							charset->ic = e_iconv_open (charset->to, "UTF-8");
+						}
+					} else {
+						/*
+						 * EILSEQ An invalid multibyte sequence has been  encountered
+						 *        in the input.
+						 *
+						 * What we do here is eat the invalid bytes in the sequence and continue
+						 */
+						
+						inbuf++;
+						inleft--;
+					}
 				} else if (errno == EINVAL) {
 					/*
 					 * EINVAL  An  incomplete  multibyte sequence has been encoun-
@@ -187,6 +202,18 @@
 				break;
 
 			if (errno == EILSEQ) {
+				if ((charset->best == NULL) || (*(charset->best+1) != NULL)) {
+					e_iconv_close (charset->ic);
+					if (charset->best == NULL) {
+						charset->best = (char **) best_charset;
+					} else {
+						charset->best ++;
+					}
+					charset->ic = e_iconv_open (charset->to, *(charset->best));
+					if (charset->ic == (iconv_t) -1) {
+						charset->ic = e_iconv_open (charset->to, "UTF-8");
+					}
+				} else {
 				/*
 				 * EILSEQ An invalid multibyte sequence has been  encountered
 				 *        in the input.
@@ -194,8 +221,9 @@
 				 * What we do here is eat the invalid bytes in the sequence and continue
 				 */
 
-				inbuf++;
-				inleft--;
+					inbuf++;
+					inleft--;
+				}
 			} else {
 				/* unknown error condition */
 				goto noop;
@@ -274,7 +302,12 @@
 	new = CAMEL_MIME_FILTER_CHARSET (camel_object_new (camel_mime_filter_charset_get_type ()));
 
 	new->ic = e_iconv_open (to_charset, from_charset);
+	new->best = NULL;
 	if (new->ic == (iconv_t) -1) {
+		new->best = (char **) best_charset;
+		new->ic = e_iconv_open (to_charset, best_charset[0]);
+	}
+	if (new->ic == (iconv_t) -1) {
 		w(g_warning ("Cannot create charset conversion from %s to %s: %s",
 			     from_charset ? from_charset : "(null)",
 			     to_charset ? to_charset : "(null)",
Index: libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h	(revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h	(working copy)
@@ -41,6 +41,7 @@
 	iconv_t ic;
 	char *from;
 	char *to;
+	char **best;
 };
 
 struct _CamelMimeFilterCharsetClass {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]