Re: [Evolution-hackers] Patch: add detection of multibyte alternative encodings for cases we fail to read the streams.



Hey Jose,

I added Jeffrey, Matthew and the Evolution-hackers mailing list in CC.

I will be waiting for an answer from Jeffrey as I have no personal
expertise in the decoding of header fields.

Please also try to get this patch into upstream Camel if Jeffrey
approves it.


On Thu, 2008-03-13 at 11:05 +0100, Jose Dapena Paz wrote:
> 	Hi,
> 
> 	This patch adds support for trying to guess other encodings
> (specifically GBK) if we fail to parse a message using the standard
> procedure.
> 
> 	We have basically two changes. One is very simple as the current
> support was already able to read headers in UTF-8 in this situation. I
> added GBK to the array of supported encodings for guessing in headers.
> 
> 	The other part is more complex. Our charset filter now has some
> encoding fallbacks (UTF-8 and GBK) for the case where it cannot
> properly parse the text.
> 
> 	What's this patch intended for? It's for adding support for reading
> messages from some weird servers that send the messages with
> unencapsulated GBK encoding. Oh, and this problem is also present in
> evolution.


-- 
Philip Van Hoof, freelance software developer
home: me at pvanhoof dot be 
gnome: pvanhoof at gnome dot org 
http://pvanhoof.be/blog
http://codeminded.be



Index: ChangeLog
===================================================================
--- ChangeLog	(revision 3488)
+++ ChangeLog	(working copy)
@@ -1,3 +1,12 @@
+2008-03-13  Jose Dapena Paz  <jdapena igalia com>
+
+	* libtinymail-camel/camel-lite/camel/camel-mime-utils.c:
+	(decode_8bit) Add GBK as a charset to test in case we fail to
+	parse a header.
+	* libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.[ch]:
+	Add support for multibyte fallbacks in case parsing with the
+	specified charset fails. Currently we provide UTF-8 and GBK.
+
 2008-03-12  Jose Dapena Paz  <jdapena igalia com>
 
 	* libtinymail-gnome-desktop/tny-gnome-device.c:
Index: libtinymail-camel/camel-lite/camel/camel-mime-utils.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-utils.c	(revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-utils.c	(working copy)
@@ -908,7 +908,7 @@
 static char *
 decode_8bit (const char *text, size_t len, const char *default_charset)
 {
-	const char *charsets[4] = { "UTF-8", NULL, NULL, NULL };
+	const char *charsets[4] = { "UTF-8", "GBK", NULL, NULL };
 	size_t inleft, outleft, outlen, rc, min, n;
 	const char *locale_charset, *best;
 	char *out, *outbuf;
Index: libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c	(revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c	(working copy)
@@ -41,6 +41,8 @@
 
 static CamelMimeFilterClass *camel_mime_filter_charset_parent;
 
+const char *best_charset[4] = { "UTF-8", "GBK", NULL, NULL };
+
 CamelType
 camel_mime_filter_charset_get_type (void)
 {
@@ -121,15 +123,28 @@
 					outbuf = mf->outbuf + converted;
 					outleft = mf->outsize - converted;
 				} else if (errno == EILSEQ) {
-					/*
-					 * EILSEQ An invalid multibyte sequence has been  encountered
-					 *        in the input.
-					 *
-					 * What we do here is eat the invalid bytes in the sequence and continue
-					 */
-
-					inbuf++;
-					inleft--;
+					if ((charset->best == NULL) || (*(charset->best+1) != NULL)) {
+						e_iconv_close (charset->ic);
+						if (charset->best == NULL) {
+							charset->best = (char **) best_charset;
+						} else {
+							charset->best ++;
+						}
+						charset->ic = e_iconv_open (charset->to, *(charset->best));
+						if (charset->ic == (iconv_t) -1) {
+							charset->ic = e_iconv_open (charset->to, "UTF-8");
+						}
+					} else {
+						/*
+						 * EILSEQ An invalid multibyte sequence has been  encountered
+						 *        in the input.
+						 *
+						 * What we do here is eat the invalid bytes in the sequence and continue
+						 */
+						
+						inbuf++;
+						inleft--;
+					}
 				} else if (errno == EINVAL) {
 					/*
 					 * EINVAL  An  incomplete  multibyte sequence has been encoun-
@@ -187,6 +202,18 @@
 				break;
 
 			if (errno == EILSEQ) {
+				if ((charset->best == NULL) || (*(charset->best+1) != NULL)) {
+					e_iconv_close (charset->ic);
+					if (charset->best == NULL) {
+						charset->best = (char **) best_charset;
+					} else {
+						charset->best ++;
+					}
+					charset->ic = e_iconv_open (charset->to, *(charset->best));
+					if (charset->ic == (iconv_t) -1) {
+						charset->ic = e_iconv_open (charset->to, "UTF-8");
+					}
+				} else {
 				/*
 				 * EILSEQ An invalid multibyte sequence has been  encountered
 				 *        in the input.
@@ -194,8 +221,9 @@
 				 * What we do here is eat the invalid bytes in the sequence and continue
 				 */
 
-				inbuf++;
-				inleft--;
+					inbuf++;
+					inleft--;
+				}
 			} else {
 				/* unknown error condition */
 				goto noop;
@@ -274,7 +302,12 @@
 	new = CAMEL_MIME_FILTER_CHARSET (camel_object_new (camel_mime_filter_charset_get_type ()));
 
 	new->ic = e_iconv_open (to_charset, from_charset);
+	new->best = NULL;
 	if (new->ic == (iconv_t) -1) {
+		new->best = (char **) best_charset;
+		new->ic = e_iconv_open (to_charset, best_charset[0]);
+	}
+	if (new->ic == (iconv_t) -1) {
 		w(g_warning ("Cannot create charset conversion from %s to %s: %s",
 			     from_charset ? from_charset : "(null)",
 			     to_charset ? to_charset : "(null)",
Index: libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h	(revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h	(working copy)
@@ -41,6 +41,7 @@
 	iconv_t ic;
 	char *from;
 	char *to;
+	char **best;
 };
 
 struct _CamelMimeFilterCharsetClass {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]