Re: [Evolution-hackers] Patch: add detection of multibyte alternative encodings for cases we fail to read the streams.
- From: Philip Van Hoof <spam pvanhoof be>
- To: Jose Dapena Paz <jdapena igalia com>
- Cc: Evolution Hackers <evolution-hackers gnome org>, tinymail-devel-list <tinymail-devel-list gnome org>
- Subject: Re: [Evolution-hackers] Patch: add detection of multibyte alternative encodings for cases we fail to read the streams.
- Date: Thu, 13 Mar 2008 15:38:47 +0100
Hey Jose,
I added Jeffrey, Matthew and the Evolution-hackers mailing list in CC.
I will be waiting for an answer from Jeffrey as I have no personal
expertise in the decoding of header fields.
Please also try to get this patch into upstream-camel if Jeffrey approves
it.
On Thu, 2008-03-13 at 11:05 +0100, Jose Dapena Paz wrote:
> Hi,
>
> This patch adds support for trying to guess other encodings
> (specifically GBK) if we fail to parse a message using the standard
> procedure.
>
> We have basically two changes. One is very simple as the current
> support was already able to read headers in UTF-8 in this situation. I
> added GBK to the array of supported encodings for guessing in headers.
>
> The other part is more complex. Our charset filter now has some
> encoding fallbacks (UTF-8 and GBK) for the case where it cannot
> properly parse the text.
>
> What's this patch intended for? It's for adding support for reading
> messages from some weird servers that send the messages with
> unencapsulated GBK encoding. Oh, and this problem is also present in
> evolution.
--
Philip Van Hoof, freelance software developer
home: me at pvanhoof dot be
gnome: pvanhoof at gnome dot org
http://pvanhoof.be/blog
http://codeminded.be
Index: ChangeLog
===================================================================
--- ChangeLog (revision 3488)
+++ ChangeLog (working copy)
@@ -1,3 +1,12 @@
+2008-03-13 Jose Dapena Paz <jdapena igalia com>
+
+ * libtinymail-camel/camel-lite/camel/camel-mime-utils.c:
+ (decode_8bit) Add GBK as a charset to test in case we fail to
+ parse a header.
+ * libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.[ch]:
+ Add support for multibyte fallbacks in case parsing with the
+ specified charset fails. Currently we provide UTF-8 and GBK.
+
2008-03-12 Jose Dapena Paz <jdapena igalia com>
* libtinymail-gnome-desktop/tny-gnome-device.c:
Index: libtinymail-camel/camel-lite/camel/camel-mime-utils.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-utils.c (revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-utils.c (working copy)
@@ -908,7 +908,7 @@
static char *
decode_8bit (const char *text, size_t len, const char *default_charset)
{
- const char *charsets[4] = { "UTF-8", NULL, NULL, NULL };
+ const char *charsets[4] = { "UTF-8", "GBK", NULL, NULL };
size_t inleft, outleft, outlen, rc, min, n;
const char *locale_charset, *best;
char *out, *outbuf;
Index: libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c (revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.c (working copy)
@@ -41,6 +41,8 @@
static CamelMimeFilterClass *camel_mime_filter_charset_parent;
+const char *best_charset[4] = { "UTF-8", "GBK", NULL, NULL };
+
CamelType
camel_mime_filter_charset_get_type (void)
{
@@ -121,15 +123,28 @@
outbuf = mf->outbuf + converted;
outleft = mf->outsize - converted;
} else if (errno == EILSEQ) {
- /*
- * EILSEQ An invalid multibyte sequence has been encountered
- * in the input.
- *
- * What we do here is eat the invalid bytes in the sequence and continue
- */
-
- inbuf++;
- inleft--;
+ if ((charset->best == NULL) || (*(charset->best+1) != NULL)) {
+ e_iconv_close (charset->ic);
+ if (charset->best == NULL) {
+ charset->best = (char **) best_charset;
+ } else {
+ charset->best ++;
+ }
+ charset->ic = e_iconv_open (charset->to, *(charset->best));
+ if (charset->ic == (iconv_t) -1) {
+ charset->ic = e_iconv_open (charset->to, "UTF-8");
+ }
+ } else {
+ /*
+ * EILSEQ An invalid multibyte sequence has been encountered
+ * in the input.
+ *
+ * What we do here is eat the invalid bytes in the sequence and continue
+ */
+
+ inbuf++;
+ inleft--;
+ }
} else if (errno == EINVAL) {
/*
* EINVAL An incomplete multibyte sequence has been encoun-
@@ -187,6 +202,18 @@
break;
if (errno == EILSEQ) {
+ if ((charset->best == NULL) || (*(charset->best+1) != NULL)) {
+ e_iconv_close (charset->ic);
+ if (charset->best == NULL) {
+ charset->best = (char **) best_charset;
+ } else {
+ charset->best ++;
+ }
+ charset->ic = e_iconv_open (charset->to, *(charset->best));
+ if (charset->ic == (iconv_t) -1) {
+ charset->ic = e_iconv_open (charset->to, "UTF-8");
+ }
+ } else {
/*
* EILSEQ An invalid multibyte sequence has been encountered
* in the input.
@@ -194,8 +221,9 @@
* What we do here is eat the invalid bytes in the sequence and continue
*/
- inbuf++;
- inleft--;
+ inbuf++;
+ inleft--;
+ }
} else {
/* unknown error condition */
goto noop;
@@ -274,7 +302,12 @@
new = CAMEL_MIME_FILTER_CHARSET (camel_object_new (camel_mime_filter_charset_get_type ()));
new->ic = e_iconv_open (to_charset, from_charset);
+ new->best = NULL;
if (new->ic == (iconv_t) -1) {
+ new->best = (char **) best_charset;
+ new->ic = e_iconv_open (to_charset, best_charset[0]);
+ }
+ if (new->ic == (iconv_t) -1) {
w(g_warning ("Cannot create charset conversion from %s to %s: %s",
from_charset ? from_charset : "(null)",
to_charset ? to_charset : "(null)",
Index: libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h (revision 3488)
+++ libtinymail-camel/camel-lite/camel/camel-mime-filter-charset.h (working copy)
@@ -41,6 +41,7 @@
iconv_t ic;
char *from;
char *to;
+ char **best;
};
struct _CamelMimeFilterCharsetClass {
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]