Re: [Evolution-hackers] [patch] fixed incorrect rfc2047 decode for CJK header



Hey Jacky,

This is a port of your patch to Tinymail's camel-lite

On Sun, 2007-12-23 at 23:09 +0800, jacky wrote:
> Hi, all.
> 
> The rfc2047 decoder in libcamel can not decode some
> CJK header correctly. Although some of them are not
> correspond to RFC, but I need to decode it correctly
> and I thought if evolution can display there email
> correctly more people like it.
> 
> So I write a new rfc2047 decoder, and it's in the
> patch. With the patch, libcamel can decode CJK header
> correctly and evolution can display CJK header
> correctly now. I had test it in my mailbox. My mailbox
> has 2000 emails which were sent by evolution,
> thunderbird, outlook, outlook express, foxmail, open
> webmail, yahoo, gmail, lotus notes, etc. Without this
> patch, almost 20% of there emails can't be decoded and
> displayed correctly, with this patch, 99% of there
> emails can be decoded and displayed correctly.
> 
> And I found that the attachment with CJK name can't be
> recognised and displayed by outlook / outlook express
> / foxmail. This is because there email clients do not
> support RFC2184. Evolution always use RFC2184 encode
> mothod to encode attachment name, so the email with
> CJK named attachment can't display in outlook /
> outlook express / foxmail. In thunderbird, you can set
> the option "mail.strictly_mime.parm_folding" to 0 or 1
> for using RFC2047 encode mothod to encode attachment
> name. Can we add a similar option?
> 
> Best regards.
> 
> 
>       ___________________________________________________________ 
> 雅虎邮箱传递新年祝福,个性贺卡送亲朋! 
> http://cn.mail.yahoo.com/gc/index.html?entry=5&souce=mail_mailletter_tagline
> _______________________________________________ Evolution-hackers mailing list Evolution-hackers gnome org http://mail.gnome.org/mailman/listinfo/evolution-hackers
-- 
Philip Van Hoof, freelance software developer
home: me at pvanhoof dot be 
gnome: pvanhoof at gnome dot org 
http://pvanhoof.be/blog
http://codeminded.be



Index: libtinymail-camel/camel-lite/camel/camel-mime-utils.c
===================================================================
--- libtinymail-camel/camel-lite/camel/camel-mime-utils.c	(revision 3190)
+++ libtinymail-camel/camel-lite/camel/camel-mime-utils.c	(working copy)
@@ -821,125 +821,207 @@
 	*in = inptr;
 }
 
+static void
+print_hex (unsigned char *data, size_t len)
+{
+	size_t i, x;
+	unsigned char *p = data;
+	char high, low;
+
+	x = 0;
+	printf ("%04u    ", x);
+	for (i = 0; i < len; i++) {
+		high = *p >> 4;
+		high = (high<10) ? high + '0' : high + 'a' - 10;
+
+		low = *p & 0x0f;
+		low = (low<10) ? low + '0' : low + 'a' - 10;
+
+		printf ("0x%c%c  ", high, low);
+
+		p++;
+		x++;
+		if (i % 8 == 7) {
+			printf ("\n%04u    ", x);
+		}
+	}
+	printf ("\n");
+}
+
+static size_t
+conv_to_utf8 (const char *encname, char *in, size_t inlen, char *out, size_t outlen)
+{
+	char *charset, *inbuf, *outbuf;
+	iconv_t ic;
+	size_t inbuf_len, outbuf_len, ret;
+
+	charset = (char *) e_iconv_charset_name (encname);
+
+	ic = e_iconv_open ("UTF-8", charset);
+	if (ic == (iconv_t) -1) {
+		printf ("e_iconv_open() error\n");
+		return (size_t)-1;
+	}
+
+	inbuf = in;
+	inbuf_len = inlen;
+
+	outbuf = out;
+	outbuf_len = outlen;
+
+	ret = e_iconv (ic, (const char **) &inbuf, &inbuf_len, &outbuf, &outbuf_len);
+	if (ret == (size_t)-1) {
+		printf ("e_iconv() error! source charset is %s, target charset is %s\n", charset, "UTF-8");
+		printf ("converted %u bytes, but last %u bytes can't convert!!\n", inlen - inbuf_len, inbuf_len);
+		printf ("source data:\n");
+		print_hex (in, inlen);
+
+		*outbuf = '\0';
+		printf ("target string is \"%s\"\n", out);
+
+		return (size_t)-1;
+	}
+
+	ret = outlen - outbuf_len;
+	out[ret] = '\0';
+
+	e_iconv_close (ic);
+
+	return ret;
+}
+
 /* decode rfc 2047 encoded string segment */
+#define DECWORD_LEN 1024
+#define UTF8_DECWORD_LEN 2048
+
 static char *
 rfc2047_decode_word(const char *in, size_t len)
 {
-	const char *inptr = in+2;
-	const char *inend = in+len-2;
-	const char *inbuf;
-	const char *charset;
-	char *encname, *p;
-	int tmplen;
-	size_t ret;
-	char *decword = NULL;
-	char *decoded = NULL;
-	char *outbase = NULL;
-	char *outbuf;
-	size_t inlen, outlen;
-	gboolean retried = FALSE;
-	iconv_t ic;
-	int idx = 0;
+	char prev_charset[32], curr_charset[32];
+	char encode;
+	char *start, *inptr, *inend;
+	char decword[DECWORD_LEN], utf8_decword[UTF8_DECWORD_LEN];
+	char *decword_ptr, *utf8_decword_ptr;
+	size_t inlen, outlen, ret;
 
 	d(printf("rfc2047: decoding '%.*s'\n", len, in));
 
+	prev_charset[0] = curr_charset[0] = '\0';
+
+	decword_ptr = decword;
+	utf8_decword_ptr = utf8_decword;
+
 	/* quick check to see if this could possibly be a real encoded word */
-
-	if (len < 8 || !(in[0] == '=' && in[1] == '?')) {
+	if (len < 8
+	    || !(in[0] == '=' && in[1] == '?'
+		 && in[len-1] == '=' && in[len-2] == '?')) {
 		d(printf("invalid\n"));
 		return NULL;
 	}
 
-	/* skip past the charset to the encoding type */
-	inptr = memchr (inptr, '?', inend-inptr);
-	if (inptr != NULL && inptr < inend + 2 && inptr[2] == '?') {
-		d(printf("found ?, encoding is '%c'\n", inptr[0]));
-		inptr++;
-		tmplen = inend-inptr-2;
-		decword = g_alloca (tmplen); /* this will always be more-than-enough room */
-		switch(toupper(inptr[0])) {
-		case 'Q':
-			inlen = quoted_decode((const unsigned char *) inptr+2, tmplen, (unsigned char *) decword);
-			break;
-		case 'B': {
-			int state = 0;
-			unsigned int save = 0;
+	inptr = (char *) in;
+	inend = (char *) (in + len);
+	outlen = sizeof(utf8_decword);
 
-			inlen = camel_base64_decode_step((unsigned char *) inptr+2, tmplen, (unsigned char *) decword, &state, &save);
-			/* if state != 0 then error? */
-			break;
-		}
-		default:
-			/* uhhh, unknown encoding type - probably an invalid encoded word string */
+	while (inptr < inend) {
+		/* begin */
+		inptr = memchr (inptr, '?', inend-inptr);
+		if (!inptr || *(inptr-1) != '=') {
 			return NULL;
 		}
-		d(printf("The encoded length = %d\n", inlen));
-		if (inlen > 0) {
-			/* yuck, all this snot is to setup iconv! */
-			tmplen = inptr - in - 3;
-			encname = g_alloca (tmplen + 1);
-			memcpy (encname, in + 2, tmplen);
-			encname[tmplen] = '\0';
 
-			/* rfc2231 updates rfc2047 encoded words...
-			 * The ABNF given in RFC 2047 for encoded-words is:
-			 *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
-			 * This specification changes this ABNF to:
-			 *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
-			 */
+		inptr++;
 
-			/* trim off the 'language' part if it's there... */
-			p = strchr (encname, '*');
-			if (p)
-				*p = '\0';
+		/* charset */
+		start = inptr;
+		inptr = memchr (inptr, '?', inend-inptr);
+		if (!inptr) {
+			return NULL;
+		}
+		strncpy (curr_charset, start, inptr-start); /* maybe overflow */
+		curr_charset[inptr-start] = '\0';
+		if (prev_charset[0] == '\0') { /* first charset in multi encode words */
+			strcpy (prev_charset, curr_charset);
+		}
+		d(printf ("curr_charset = %s\n", curr_charset));
 
-			charset = e_iconv_charset_name (encname);
+		/* if (charset.perv != charset.curr) iconv perv to utf8 */
+		if (prev_charset[0] != '\0' && strcmp(prev_charset, curr_charset)) {
+			inlen = decword_ptr - decword;
+			ret = conv_to_utf8 (prev_charset, decword, inlen, utf8_decword_ptr, outlen);
+			if (ret == (size_t)-1) {
+				printf ("conv_to_utf8() error!\n");
+				return NULL;
+			}
 
-			inbuf = decword;
+			utf8_decword_ptr += ret;
+			outlen = outlen - ret;
 
-			outlen = inlen * 6 + 16;
-			outbase = g_alloca (outlen);
-			outbuf = outbase;
+			decword_ptr = decword; /* reset decword_ptr */
+			strcpy (prev_charset, curr_charset);
+		}
 
-		retry:
-			ic = e_iconv_open ("UTF-8", charset);
-			if (ic != (iconv_t) -1) {
-				ret = e_iconv (ic, &inbuf, &inlen, &outbuf, &outlen);
-				if (ret != (size_t) -1) {
-					e_iconv (ic, NULL, 0, &outbuf, &outlen);
-					*outbuf = 0;
-					decoded = g_strdup (outbase);
-				} else {
-					perror ("iconv");
-					e_iconv (ic, NULL, 0, &outbuf, &outlen);
-					*outbuf = 0;
-					decoded = g_strdup (outbase);
-					/* decoded = g_strdup (inbuf); */
-				}
+		/* encode */
+		inptr++;
+		encode = *inptr;
+		inptr++;
+		if (*inptr != '?') {
+			return NULL;
+		}
 
-				e_iconv_close (ic);
-			} else {
-				w(g_warning ("Cannot decode charset, header display may be corrupt: %s: %s",
-					     charset, strerror (errno)));
+		/* text */
+		inptr++;
+		start = inptr;
+		inptr = memchr (inptr, '?', inend-inptr);
+		if (!inptr || *(inptr+1) != '=') {
+			return NULL;
+		}
 
-				if (!retried) {
-					charset = e_iconv_locale_charset ();
-					if (!charset)
-						charset = "iso-8859-1";
+		/* decode */
+		switch(encode) {
 
-					retried = TRUE;
-					goto retry;
-				}
+		case 'Q':
+		case 'q':
+			inlen = quoted_decode(start, inptr-start, decword_ptr);
+			break;
+		case 'B':
+		case 'b':
+			{
+				int state = 0;
+				unsigned int save = 0;
 
-				/* we return the encoded word here because we've got to return valid utf8 */
-				decoded = g_strndup (in, inlen);
+				inlen = camel_base64_decode_step(start, inptr-start, decword_ptr, &state, &save);
+				/* if state != 0 then error? */
 			}
+			break;
+		default:
+			/* uhhh, unknown encoding type - probably an invalid encoded word string */
+			return NULL;
 		}
+		d(printf("The encoded length = %d\n", inlen));
+		if (inlen > 0) {
+			decword_ptr += inlen;
+		} else {
+			return NULL;
+		}
+
+		inptr += 2;	/* skip '?=' */
+	} /* end of "while (inptr < inend)" */
+
+	/* at last, iconv to utf8 */
+	inlen = decword_ptr - decword;
+	ret = conv_to_utf8 (curr_charset, decword, inlen, utf8_decword_ptr, outlen);
+	if (ret == (size_t)-1) {
+		printf ("conv_to_utf8() error!\n");
+		return NULL;
 	}
 
-	d(printf("decoded '%s'\n", decoded));
+	utf8_decword_ptr += ret;
+	*utf8_decword_ptr = '\0';
 
-	return decoded;
+	d(printf("decoded '%s'\n", utf8_decword));
+
+	return strdup (utf8_decword);
 }
 
 /* ok, a lot of mailers are BROKEN, and send iso-latin1 encoded
@@ -1014,66 +1096,209 @@
 	return str;
 }
 
+typedef enum {
+	BEGIN,
+	BEGIN_SPACE,
+	NOENCODED_WORD,
+	ENCODED_WORD_CHARSET,
+	ENCODED_WORD_ENCODED_TEXT,
+	ENCODED_WORD_END,
+	ENCODED_WORD_END_SPACE,
+	END
+} StatsType;
+
 /* decodes a simple text, rfc822 + rfc2047 */
 static char *
 header_decode_text (const char *in, size_t inlen, int ctext, const char *default_charset)
 {
 	GString *out;
-	const char *inptr, *inend, *start, *chunk, *locale_charset;
-	GString *(* append) (GString *, const char *, gssize);
+	StatsType stats;
+	const char *inptr, *inend, *start, *locale_charset;;
 	char *dword = NULL;
-	guint32 mask;
 
 	locale_charset = e_iconv_locale_charset ();
 
-	if (ctext) {
-		mask = (CAMEL_MIME_IS_SPECIAL | CAMEL_MIME_IS_SPACE | CAMEL_MIME_IS_CTRL);
-		append = append_quoted_pair;
-	} else {
-		mask = (CAMEL_MIME_IS_LWSP);
-		append = g_string_append_len;
-	}
-
 	out = g_string_new ("");
 	inptr = in;
 	inend = inptr + inlen;
-	chunk = NULL;
 
-	while (inptr < inend) {
-		start = inptr;
-		while (inptr < inend && camel_mime_is_type (*inptr, mask))
-			inptr++;
+	stats = BEGIN;
 
-		if (inptr == inend) {
-			append (out, start, inptr - start);
+	/* we'll get multi encoded word, and then decode them! */
+	while (stats != END) {
+		switch (stats) {
+		case BEGIN:
+			if (isspace(*inptr)) {
+				stats = BEGIN_SPACE;
+				start = inptr;
+			} else if (*inptr == '=' && *(inptr+1) == '?') {
+				stats = ENCODED_WORD_CHARSET;
+				start = inptr;
+				inptr++;
+			} else if (*inptr == '\0') {
+				stats = END;
+			} else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+				stats = NOENCODED_WORD;
+				start = inptr;
+			}
 			break;
-		} else if (dword == NULL) {
-			append (out, start, inptr - start);
-		} else {
-			chunk = start;
-		}
 
-		start = inptr;
-		while (inptr < inend && !camel_mime_is_type (*inptr, mask))
-			inptr++;
+		case BEGIN_SPACE:
+			if (isspace(*inptr)) {
+				/* do nothing */
+			} else if (*inptr == '=' && *(inptr+1) == '?') {
+				stats = ENCODED_WORD_CHARSET;
+				start = inptr;
+				inptr++;
+			} else if (*inptr == '\0') {
+				stats = END;
+			} else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+				stats = NOENCODED_WORD;
+				start = inptr;
+			}
+			break;
 
-		dword = rfc2047_decode_word(start, inptr-start);
-		if (dword) {
-			g_string_append(out, dword);
-			g_free(dword);
-		} else {
-			if (!chunk)
-				chunk = start;
+		case NOENCODED_WORD:
+			if (isspace(*inptr)) {
+				/* do nothing */
+			} else if (*inptr == '=' && *(inptr+1) == '?') {
+				if ((default_charset == NULL || !append_8bit (out, start, inptr - start, default_charset))
+				      && (locale_charset == NULL || !append_8bit (out, start, inptr - start, locale_charset)))
+					append_latin1 (out, start, inptr - start);
 
-			if ((default_charset == NULL || !append_8bit (out, chunk, inptr-chunk, default_charset))
-			    && (locale_charset == NULL || !append_8bit(out, chunk, inptr-chunk, locale_charset))) {
+				stats = ENCODED_WORD_CHARSET;
+				start = inptr;
+				inptr++;
+			} else if (*inptr == '\0') {
+				inptr--;
+				while (isspace(*inptr)) {
+					inptr--;
+				}
+				if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+						&& (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+					append_latin1 (out, start, inptr - start);
 
-			
-				append_latin1(out, chunk, inptr-chunk);
+				stats = END;
+			} else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+				/* do nothing */
 			}
+			break;
+	  
+		case ENCODED_WORD_CHARSET:
+			if (isspace (*inptr)) {
+				stats = NOENCODED_WORD;
+			} else if (*inptr == '?') {
+				inptr++;
+				if ((*inptr == 'Q' || *inptr == 'q'
+				     || *inptr == 'B' || *inptr == 'b')
+				    && *(inptr+1) == '?') {
+					inptr++;
+					stats = ENCODED_WORD_ENCODED_TEXT;
+				} else {
+					stats = NOENCODED_WORD;
+				}
+			} else if (*inptr == '\0') {
+				if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+				      && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+					append_latin1 (out, start, inptr - start);
+
+				stats = END;
+			} else if (isgraph(*inptr)) {
+				/* do nothing */
+			} else {
+				/* impossible */
+			}
+			break;
+
+		case ENCODED_WORD_ENCODED_TEXT:
+			if (isspace (*inptr)) {
+				stats = NOENCODED_WORD; /* maybe do nothing */
+			} else if (*inptr == '?' && *(inptr+1) == '=') {
+				/* we will decode it in stats ENCODED_WORD_END */
+				stats = ENCODED_WORD_END;
+				inptr++;
+			} else if (*inptr == '\0') {
+				if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+				      && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+					append_latin1 (out, start, inptr - start);
+
+				stats = END;
+			} else if (isgraph(*inptr)) {
+				/* do nothing */
+			} else {
+				/* impossible */
+			}
+			break;
+	  
+		case ENCODED_WORD_END:
+			if (isspace(*inptr)) {
+				/* fix some buggy mail clients */
+				stats = ENCODED_WORD_END_SPACE;
+			} else if (*inptr == '=' && *(inptr+1) == '?') {
+				stats = ENCODED_WORD_CHARSET;
+				inptr++;
+			} else {
+				dword = rfc2047_decode_word (start, inptr - start);
+				if (dword) {
+					g_string_append (out, dword);
+					g_free (dword);
+				} else {
+					if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+					      && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+						append_latin1 (out, start, inptr - start);
+				}
+
+				if (*inptr == '\0') {
+					stats = END;
+				} else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+					start = inptr;
+					stats = NOENCODED_WORD;
+				}
+			}
+			break;
+	  
+		case ENCODED_WORD_END_SPACE:
+			if (isspace(*inptr)) {
+				/* do nothing */
+			} else if (*inptr == '=' && *(inptr+1) == '?') {
+				/* yes, combine two encoded words */
+				stats = ENCODED_WORD_CHARSET;
+				inptr++;
+			} else {
+				if (*inptr == '\0') {
+					stats = END;
+				} else { //if (isgraph(*inptr)) { // we accept multi-byte encode
+					stats = NOENCODED_WORD;
+				}
+
+				inptr--;
+				while (isspace(*inptr)) {
+					inptr--;
+				}
+				inptr++;
+
+				dword = rfc2047_decode_word (start, inptr - start);
+				if (dword) {
+					g_string_append (out, dword);
+					g_free (dword);
+				} else {
+					if ((default_charset == NULL || !append_8bit (out, start, inptr + 1 - start, default_charset))
+					      && (locale_charset == NULL || !append_8bit (out, start, inptr + 1 - start, locale_charset)))
+						append_latin1 (out, start, inptr - start);
+				}
+
+				if (stats == NOENCODED_WORD) {
+					start = inptr;
+				}
+			}
+			break;
+	  
+		default:
+			/* impossible */
+			break;
 		}
 
-		chunk = NULL;
+		inptr++;
 	}
 
 	dword = out->str;
Index: libtinymail-camel/camel-lite/libedataserver/e-iconv.c
===================================================================
--- libtinymail-camel/camel-lite/libedataserver/e-iconv.c	(revision 3190)
+++ libtinymail-camel/camel-lite/libedataserver/e-iconv.c	(working copy)
@@ -143,15 +143,16 @@
 	{ "euckr-0",        "EUC-KR"     },
 	{ "5601",           "EUC-KR"     },
 	{ "zh_TW-euc",      "EUC-TW"     },
-	{ "zh_CN.euc",      "gb2312"     },
+	{ "zh_CN.euc",      "GBK"     },
 	{ "zh_TW-big5",     "BIG5"       },
-	{ "euc-cn",         "gb2312"     },
+	{ "euc-cn",         "GBK"     },
 	{ "big5-0",         "BIG5"       },
 	{ "big5.eten-0",    "BIG5"       },
 	{ "big5hkscs-0",    "BIG5HKSCS"  },
-	{ "gb2312-0",       "gb2312"     },
-	{ "gb2312.1980-0",  "gb2312"     },
-	{ "gb-2312",        "gb2312"     },
+	{ "gb2312",         "GBK"     },
+	{ "gb2312-0",       "GBK"     },
+	{ "gb2312.1980-0",  "GBK"     },
+	{ "gb-2312",        "GBK"     },
 	{ "gb18030-0",      "gb18030"    },
 	{ "gbk-0",          "GBK"        },
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]