Re: [Evolution-hackers] [patch] fixed incorrect rfc2047 decode for CJK header
- From: Philip Van Hoof <spam pvanhoof be>
- To: jacky <gtkdict yahoo com cn>
- Cc: tinymail-devel-list <tinymail-devel-list gnome org>, evolution-hackers gnome org
- Subject: Re: [Evolution-hackers] [patch] fixed incorrect rfc2047 decode for CJK header
- Date: Sun, 23 Dec 2007 18:22:00 +0100
Hey Jacky,
This is a port of your patch to Tinymail's camel-lite.
On Sun, 2007-12-23 at 23:09 +0800, jacky wrote:
> Hi, all.
>
> The rfc2047 decoder in libcamel cannot decode some
> CJK headers correctly. Although some of them do not
> conform to the RFC, I need to decode them correctly,
> and I think that if Evolution can display these emails
> correctly, more people will like it.
>
> So I wrote a new rfc2047 decoder, and it's in the
> patch. With the patch, libcamel can decode CJK headers
> correctly and Evolution can display CJK headers
> correctly now. I have tested it on my mailbox. My mailbox
> has 2000 emails which were sent by evolution,
> thunderbird, outlook, outlook express, foxmail, open
> webmail, yahoo, gmail, lotus notes, etc. Without this
> patch, almost 20% of these emails can't be decoded and
> displayed correctly; with this patch, 99% of these
> emails can be decoded and displayed correctly.
>
> And I found that attachments with CJK names can't be
> recognised and displayed by outlook / outlook express
> / foxmail. This is because these email clients do not
> support RFC2184. Evolution always uses the RFC2184 encoding
> method to encode attachment names, so an email with a
> CJK-named attachment can't be displayed in outlook /
> outlook express / foxmail. In thunderbird, you can set
> the option "mail.strictly_mime.parm_folding" to 0 or 1
> to use the RFC2047 encoding method to encode attachment
> names. Can we add a similar option?
>
> Best regards.
>
>
> ___________________________________________________________
> 雅虎邮箱传递新年祝福,个性贺卡送亲朋!
> http://cn.mail.yahoo.com/gc/index.html?entry=5&souce=mail_mailletter_tagline
> _______________________________________________ Evolution-hackers mailing list Evolution-hackers gnome org http://mail.gnome.org/mailman/listinfo/evolution-hackers
--
Philip Van Hoof, freelance software developer
home: me at pvanhoof dot be
gnome: pvanhoof at gnome dot org
http://pvanhoof.be/blog
http://codeminded.be
Index: libtinymail-camel/camel-lite/camel/camel-mime-utils.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-utils.c (revision 3190)
+++ libtinymail-camel/camel-lite/camel/camel-mime-utils.c (working copy)
@@ -821,125 +821,207 @@
*in = inptr;
}
+static void
+print_hex (unsigned char *data, size_t len)
+{
+ size_t i, x;
+ unsigned char *p = data;
+ char high, low;
+
+ x = 0;
+ printf ("%04u ", x);
+ for (i = 0; i < len; i++) {
+ high = *p >> 4;
+ high = (high<10) ? high + '0' : high + 'a' - 10;
+
+ low = *p & 0x0f;
+ low = (low<10) ? low + '0' : low + 'a' - 10;
+
+ printf ("0x%c%c ", high, low);
+
+ p++;
+ x++;
+ if (i % 8 == 7) {
+ printf ("\n%04u ", x);
+ }
+ }
+ printf ("\n");
+}
+
+static size_t
+conv_to_utf8 (const char *encname, char *in, size_t inlen, char *out, size_t outlen)
+{
+ char *charset, *inbuf, *outbuf;
+ iconv_t ic;
+ size_t inbuf_len, outbuf_len, ret;
+
+ charset = (char *) e_iconv_charset_name (encname);
+
+ ic = e_iconv_open ("UTF-8", charset);
+ if (ic == (iconv_t) -1) {
+ printf ("e_iconv_open() error\n");
+ return (size_t)-1;
+ }
+
+ inbuf = in;
+ inbuf_len = inlen;
+
+ outbuf = out;
+ outbuf_len = outlen;
+
+ ret = e_iconv (ic, (const char **) &inbuf, &inbuf_len, &outbuf, &outbuf_len);
+ if (ret == (size_t)-1) {
+ printf ("e_iconv() error! source charset is %s, target charset is %s\n", charset, "UTF-8");
+ printf ("converted %u bytes, but last %u bytes can't convert!!\n", inlen - inbuf_len, inbuf_len);
+ printf ("source data:\n");
+ print_hex (in, inlen);
+
+ *outbuf = '\0';
+ printf ("target string is \"%s\"\n", out);
+
+ return (size_t)-1;
+ }
+
+ ret = outlen - outbuf_len;
+ out[ret] = '\0';
+
+ e_iconv_close (ic);
+
+ return ret;
+}
+
/* decode rfc 2047 encoded string segment */
+#define DECWORD_LEN 1024
+#define UTF8_DECWORD_LEN 2048
+
static char *
rfc2047_decode_word(const char *in, size_t len)
{
- const char *inptr = in+2;
- const char *inend = in+len-2;
- const char *inbuf;
- const char *charset;
- char *encname, *p;
- int tmplen;
- size_t ret;
- char *decword = NULL;
- char *decoded = NULL;
- char *outbase = NULL;
- char *outbuf;
- size_t inlen, outlen;
- gboolean retried = FALSE;
- iconv_t ic;
- int idx = 0;
+ char prev_charset[32], curr_charset[32];
+ char encode;
+ char *start, *inptr, *inend;
+ char decword[DECWORD_LEN], utf8_decword[UTF8_DECWORD_LEN];
+ char *decword_ptr, *utf8_decword_ptr;
+ size_t inlen, outlen, ret;
d(printf("rfc2047: decoding '%.*s'\n", len, in));
+ prev_charset[0] = curr_charset[0] = '\0';
+
+ decword_ptr = decword;
+ utf8_decword_ptr = utf8_decword;
+
/* quick check to see if this could possibly be a real encoded word */
-
- if (len < 8 || !(in[0] == '=' && in[1] == '?')) {
+ if (len < 8
+ || !(in[0] == '=' && in[1] == '?'
+ && in[len-1] == '=' && in[len-2] == '?')) {
d(printf("invalid\n"));
return NULL;
}
- /* skip past the charset to the encoding type */
- inptr = memchr (inptr, '?', inend-inptr);
- if (inptr != NULL && inptr < inend + 2 && inptr[2] == '?') {
- d(printf("found ?, encoding is '%c'\n", inptr[0]));
- inptr++;
- tmplen = inend-inptr-2;
- decword = g_alloca (tmplen); /* this will always be more-than-enough room */
- switch(toupper(inptr[0])) {
- case 'Q':
- inlen = quoted_decode((const unsigned char *) inptr+2, tmplen, (unsigned char *) decword);
- break;
- case 'B': {
- int state = 0;
- unsigned int save = 0;
+ inptr = (char *) in;
+ inend = (char *) (in + len);
+ outlen = sizeof(utf8_decword);
- inlen = camel_base64_decode_step((unsigned char *) inptr+2, tmplen, (unsigned char *) decword, &state, &save);
- /* if state != 0 then error? */
- break;
- }
- default:
- /* uhhh, unknown encoding type - probably an invalid encoded word string */
+ while (inptr < inend) {
+ /* begin */
+ inptr = memchr (inptr, '?', inend-inptr);
+ if (!inptr || *(inptr-1) != '=') {
return NULL;
}
- d(printf("The encoded length = %d\n", inlen));
- if (inlen > 0) {
- /* yuck, all this snot is to setup iconv! */
- tmplen = inptr - in - 3;
- encname = g_alloca (tmplen + 1);
- memcpy (encname, in + 2, tmplen);
- encname[tmplen] = '\0';
- /* rfc2231 updates rfc2047 encoded words...
- * The ABNF given in RFC 2047 for encoded-words is:
- * encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
- * This specification changes this ABNF to:
- * encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
- */
+ inptr++;
- /* trim off the 'language' part if it's there... */
- p = strchr (encname, '*');
- if (p)
- *p = '\0';
+ /* charset */
+ start = inptr;
+ inptr = memchr (inptr, '?', inend-inptr);
+ if (!inptr) {
+ return NULL;
+ }
+ strncpy (curr_charset, start, inptr-start); /* maybe overflow */
+ curr_charset[inptr-start] = '\0';
+ if (prev_charset[0] == '\0') { /* first charset in multi encode words */
+ strcpy (prev_charset, curr_charset);
+ }
+ d(printf ("curr_charset = %s\n", curr_charset));
- charset = e_iconv_charset_name (encname);
+ /* if (charset.perv != charset.curr) iconv perv to utf8 */
+ if (prev_charset[0] != '\0' && strcmp(prev_charset, curr_charset)) {
+ inlen = decword_ptr - decword;
+ ret = conv_to_utf8 (prev_charset, decword, inlen, utf8_decword_ptr, outlen);
+ if (ret == (size_t)-1) {
+ printf ("conv_to_utf8() error!\n");
+ return NULL;
+ }
- inbuf = decword;
+ utf8_decword_ptr += ret;
+ outlen = outlen - ret;
- outlen = inlen * 6 + 16;
- outbase = g_alloca (outlen);
- outbuf = outbase;
+ decword_ptr = decword; /* reset decword_ptr */
+ strcpy (prev_charset, curr_charset);
+ }
- retry:
- ic = e_iconv_open ("UTF-8", charset);
- if (ic != (iconv_t) -1) {
- ret = e_iconv (ic, &inbuf, &inlen, &outbuf, &outlen);
- if (ret != (size_t) -1) {
- e_iconv (ic, NULL, 0, &outbuf, &outlen);
- *outbuf = 0;
- decoded = g_strdup (outbase);
- } else {
- perror ("iconv");
- e_iconv (ic, NULL, 0, &outbuf, &outlen);
- *outbuf = 0;
- decoded = g_strdup (outbase);
- /* decoded = g_strdup (inbuf); */
- }
+ /* encode */
+ inptr++;
+ encode = *inptr;
+ inptr++;
+ if (*inptr != '?') {
+ return NULL;
+ }
- e_iconv_close (ic);
- } else {
- w(g_warning ("Cannot decode charset, header display may be corrupt: %s: %s",
- charset, strerror (errno)));
+ /* text */
+ inptr++;
+ start = inptr;
+ inptr = memchr (inptr, '?', inend-inptr);
+ if (!inptr || *(inptr+1) != '=') {
+ return NULL;
+ }
- if (!retried) {
- charset = e_iconv_locale_charset ();
- if (!charset)
- charset = "iso-8859-1";
+ /* decode */
+ switch(encode) {
- retried = TRUE;
- goto retry;
- }
+ case 'Q':
+ case 'q':
+ inlen = quoted_decode(start, inptr-start, decword_ptr);
+ break;
+ case 'B':
+ case 'b':
+ {
+ int state = 0;
+ unsigned int save = 0;
- /* we return the encoded word here because we've got to return valid utf8 */
- decoded = g_strndup (in, inlen);
+ inlen = camel_base64_decode_step(start, inptr-start, decword_ptr, &state, &save);
+ /* if state != 0 then error? */
}
+ break;
+ default:
+ /* uhhh, unknown encoding type - probably an invalid encoded word string */
+ return NULL;
}
+ d(printf("The encoded length = %d\n", inlen));
+ if (inlen > 0) {
+ decword_ptr += inlen;
+ } else {
+ return NULL;
+ }
+
+ inptr += 2; /* skip '?=' */
+ } /* end of "while (inptr < inend)" */
+
+ /* at last, iconv to utf8 */
+ inlen = decword_ptr - decword;
+ ret = conv_to_utf8 (curr_charset, decword, inlen, utf8_decword_ptr, outlen);
+ if (ret == (size_t)-1) {
+ printf ("conv_to_utf8() error!\n");
+ return NULL;
}
- d(printf("decoded '%s'\n", decoded));
+ utf8_decword_ptr += ret;
+ *utf8_decword_ptr = '\0';
- return decoded;
+ d(printf("decoded '%s'\n", utf8_decword));
+
+ return strdup (utf8_decword);
}
/* ok, a lot of mailers are BROKEN, and send iso-latin1 encoded
@@ -1014,66 +1096,209 @@
return str;
}
+typedef enum {
+ BEGIN,
+ BEGIN_SPACE,
+ NOENCODED_WORD,
+ ENCODED_WORD_CHARSET,
+ ENCODED_WORD_ENCODED_TEXT,
+ ENCODED_WORD_END,
+ ENCODED_WORD_END_SPACE,
+ END
+} StatsType;
+
/* decodes a simple text, rfc822 + rfc2047 */
static char *
header_decode_text (const char *in, size_t inlen, int ctext, const char *default_charset)
{
GString *out;
- const char *inptr, *inend, *start, *chunk, *locale_charset;
- GString *(* append) (GString *, const char *, gssize);
+ StatsType stats;
+ const char *inptr, *inend, *start, *locale_charset;;
char *dword = NULL;
- guint32 mask;
locale_charset = e_iconv_locale_charset ();
- if (ctext) {
- mask = (CAMEL_MIME_IS_SPECIAL | CAMEL_MIME_IS_SPACE | CAMEL_MIME_IS_CTRL);
- append = append_quoted_pair;
- } else {
- mask = (CAMEL_MIME_IS_LWSP);
- append = g_string_append_len;
- }
-
out = g_string_new ("");
inptr = in;
inend = inptr + inlen;
- chunk = NULL;
- while (inptr < inend) {
- start = inptr;
- while (inptr < inend && camel_mime_is_type (*inptr, mask))
- inptr++;
+ stats = BEGIN;
- if (inptr == inend) {
- append (out, start, inptr - start);
+ /* we'll get multi encoded word, and then decode them! */
+ while (stats != END) {
+ switch (stats) {
+ case BEGIN:
+ if (isspace(*inptr)) {
+ stats = BEGIN_SPACE;
+ start = inptr;
+ } else if (*inptr == '=' && *(inptr+1) == '?') {
+ stats = ENCODED_WORD_CHARSET;
+ start = inptr;
+ inptr++;
+ } else if (*inptr == '\0') {
+ stats = END;
+ } else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+ stats = NOENCODED_WORD;
+ start = inptr;
+ }
break;
- } else if (dword == NULL) {
- append (out, start, inptr - start);
- } else {
- chunk = start;
- }
- start = inptr;
- while (inptr < inend && !camel_mime_is_type (*inptr, mask))
- inptr++;
+ case BEGIN_SPACE:
+ if (isspace(*inptr)) {
+ /* do nothing */
+ } else if (*inptr == '=' && *(inptr+1) == '?') {
+ stats = ENCODED_WORD_CHARSET;
+ start = inptr;
+ inptr++;
+ } else if (*inptr == '\0') {
+ stats = END;
+ } else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+ stats = NOENCODED_WORD;
+ start = inptr;
+ }
+ break;
- dword = rfc2047_decode_word(start, inptr-start);
- if (dword) {
- g_string_append(out, dword);
- g_free(dword);
- } else {
- if (!chunk)
- chunk = start;
+ case NOENCODED_WORD:
+ if (isspace(*inptr)) {
+ /* do nothing */
+ } else if (*inptr == '=' && *(inptr+1) == '?') {
+ if ((default_charset == NULL || !append_8bit (out, start, inptr - start, default_charset))
+ && (locale_charset == NULL || !append_8bit (out, start, inptr - start, locale_charset)))
+ append_latin1 (out, start, inptr - start);
- if ((default_charset == NULL || !append_8bit (out, chunk, inptr-chunk, default_charset))
- && (locale_charset == NULL || !append_8bit(out, chunk, inptr-chunk, locale_charset))) {
+ stats = ENCODED_WORD_CHARSET;
+ start = inptr;
+ inptr++;
+ } else if (*inptr == '\0') {
+ inptr--;
+ while (isspace(*inptr)) {
+ inptr--;
+ }
+ if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+ && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+ append_latin1 (out, start, inptr - start);
-
- append_latin1(out, chunk, inptr-chunk);
+ stats = END;
+ } else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+ /* do nothing */
}
+ break;
+
+ case ENCODED_WORD_CHARSET:
+ if (isspace (*inptr)) {
+ stats = NOENCODED_WORD;
+ } else if (*inptr == '?') {
+ inptr++;
+ if ((*inptr == 'Q' || *inptr == 'q'
+ || *inptr == 'B' || *inptr == 'b')
+ && *(inptr+1) == '?') {
+ inptr++;
+ stats = ENCODED_WORD_ENCODED_TEXT;
+ } else {
+ stats = NOENCODED_WORD;
+ }
+ } else if (*inptr == '\0') {
+ if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+ && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+ append_latin1 (out, start, inptr - start);
+
+ stats = END;
+ } else if (isgraph(*inptr)) {
+ /* do nothing */
+ } else {
+ /* impossible */
+ }
+ break;
+
+ case ENCODED_WORD_ENCODED_TEXT:
+ if (isspace (*inptr)) {
+ stats = NOENCODED_WORD; /* maybe do nothing */
+ } else if (*inptr == '?' && *(inptr+1) == '=') {
+ /* we will decode it in stats ENCODED_WORD_END */
+ stats = ENCODED_WORD_END;
+ inptr++;
+ } else if (*inptr == '\0') {
+ if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+ && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+ append_latin1 (out, start, inptr - start);
+
+ stats = END;
+ } else if (isgraph(*inptr)) {
+ /* do nothing */
+ } else {
+ /* impossible */
+ }
+ break;
+
+ case ENCODED_WORD_END:
+ if (isspace(*inptr)) {
+ /* fix some buggy mail clients */
+ stats = ENCODED_WORD_END_SPACE;
+ } else if (*inptr == '=' && *(inptr+1) == '?') {
+ stats = ENCODED_WORD_CHARSET;
+ inptr++;
+ } else {
+ dword = rfc2047_decode_word (start, inptr - start);
+ if (dword) {
+ g_string_append (out, dword);
+ g_free (dword);
+ } else {
+ if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+ && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+ append_latin1 (out, start, inptr - start);
+ }
+
+ if (*inptr == '\0') {
+ stats = END;
+ } else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+ start = inptr;
+ stats = NOENCODED_WORD;
+ }
+ }
+ break;
+
+ case ENCODED_WORD_END_SPACE:
+ if (isspace(*inptr)) {
+ /* do nothing */
+ } else if (*inptr == '=' && *(inptr+1) == '?') {
+ /* yes, combine two encoded words */
+ stats = ENCODED_WORD_CHARSET;
+ inptr++;
+ } else {
+ if (*inptr == '\0') {
+ stats = END;
+ } else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+ stats = NOENCODED_WORD;
+ }
+
+ inptr--;
+ while (isspace(*inptr)) {
+ inptr--;
+ }
+ inptr++;
+
+ dword = rfc2047_decode_word (start, inptr - start);
+ if (dword) {
+ g_string_append (out, dword);
+ g_free (dword);
+ } else {
+ if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+ && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+ append_latin1 (out, start, inptr - start);
+ }
+
+ if (stats == NOENCODED_WORD) {
+ start = inptr;
+ }
+ }
+ break;
+
+ default:
+ /* impossible */
+ break;
}
- chunk = NULL;
+ inptr++;
}
dword = out->str;
Index: libtinymail-camel/camel-lite/libedataserver/e-iconv.c
===================================================================
--- libtinymail-camel/camel-lite/libedataserver/e-iconv.c (revision 3190)
+++ libtinymail-camel/camel-lite/libedataserver/e-iconv.c (working copy)
@@ -143,15 +143,16 @@
{ "euckr-0", "EUC-KR" },
{ "5601", "EUC-KR" },
{ "zh_TW-euc", "EUC-TW" },
- { "zh_CN.euc", "gb2312" },
+ { "zh_CN.euc", "GBK" },
{ "zh_TW-big5", "BIG5" },
- { "euc-cn", "gb2312" },
+ { "euc-cn", "GBK" },
{ "big5-0", "BIG5" },
{ "big5.eten-0", "BIG5" },
{ "big5hkscs-0", "BIG5HKSCS" },
- { "gb2312-0", "gb2312" },
- { "gb2312.1980-0", "gb2312" },
- { "gb-2312", "gb2312" },
+ { "gb2312", "GBK" },
+ { "gb2312-0", "GBK" },
+ { "gb2312.1980-0", "GBK" },
+ { "gb-2312", "GBK" },
{ "gb18030-0", "gb18030" },
{ "gbk-0", "GBK" },
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]