Re: [gmime-devel] Issue with header decoding (continues)

From: Jeffrey Stedfast <fejj gnome org>
To: evil legacy <evil legacy gmail com>
Cc: gmime-devel-list gnome org
Subject: Re: [gmime-devel] Issue with header decoding (continues)
Date: Sun, 18 Dec 2011 18:00:28 -0500

The attached patch should fix it if applied to the latest gmime from git master.

A quick test has it passing all of the unit tests (e.g. test-mime), so it's probably good to go.

I don't normally like to land such massive patches in a stable cycle, so if you could test this out on your messages and see how it works in the wild, that'd be great.

This patch should also handle cases where base64 and/or quoted-printable data was split between encoded-word tokens (which addresses another feature request I've gotten a few times now).

Jeff

On 12/18/2011 09:02 AM, evil legacy wrote:

Hi,

I came across another header-decoding problem when dealing with badly split UTF-8 headers, e.g.:

=?utf-8?B?16nXoteV158g15PXldek16cgR0FSTUlOINei150gR1BTINee15XXkdeg15Qg15XXktc=?= =?utf-8?B?nSDXkNeo16DXpyDXkNeV16TXoNeq15kg157XoteV16gg157XqdeV15HXlyE=?='

It looks like someone split a UTF-8 string incorrectly, leaving "half" a character in each part.

g_mime_utils_header_decode_text/phrase splits the header into words and decodes each word separately; since the charset is UTF-8, iconv isn't used and each decoded word is instead validated with this loop:

while (!g_utf8_validate (p, len, (const char **) &p)) {

len = declen - (p - (char *) decoded);

*p = '?';

}

Because the original string was split incorrectly, the 'half' characters are replaced with '?'.
I'm attaching a patch that moves the UTF-8 validation to the end of g_mime_utils_header_decode_text/phrase, where the decoded words have already been combined.

Best Regards
On Sat, Dec 17, 2011 at 6:49 PM, Jeffrey Stedfast <fejj gnome org> wrote:
Hi,

I've just released GMime 2.4.29 and 2.6.2 with your fix (and other similar fixes).

Jeff

On 12/14/2011 01:26 PM, evil legacy wrote:
Hi,

After more debugging, I found that the problem occurs when iconv (cd, NULL, NULL, &outbuf, &outleft) tries to flush the buffer to outbuf but outbuf isn't big enough to hold it.

This little patch to the charset_convert function seems to fix this problem (works for me):

<patch>

diff --git a/gmime/gmime-utils.c b/gmime/gmime-utils.c

index ca32b61..093deee 100644

--- a/gmime/gmime-utils.c

+++ b/gmime/gmime-utils.c

@@ -1553,7 +1553,15 @@ charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size

   }

   } while (inleft > 0);



- iconv (cd, NULL, NULL, &outbuf, &outleft);

+ while (iconv (cd, NULL, NULL, &outbuf, &outleft) == (size_t) -1)

+ if (errno == E2BIG) {

+ outlen += 16;

+ rc = (size_t) (outbuf - out);

+ out = g_realloc (out, outlen + 1);

+ outleft = outlen - rc;

+ outbuf = out + rc;

+ }

+

   *outbuf++ = '\0';



   *outlenp = outlen;

</patch>

Best Regards
_______________________________________________
gmime-devel-list mailing list
gmime-devel-list gnome org
http://mail.gnome.org/mailman/listinfo/gmime-devel-list
--
map{map{$a=unpack"C",$_;map{$c=$a-ord;print$_ x$c and goto"a"if$c>0}("Z",
" ");a:}split//;print"\n"}(q{&[%[%`#[%["},q{&[$[![$[%["[%["},q{&[#[#[#[%[
"[%["},q{&["[%["`#a"},q{[%["a"[([%["},q{[%["[%["[([%["},q{!_#[%["[([%["})

diff --git a/gmime/gmime-encodings.c b/gmime/gmime-encodings.c
index aa54c5f..f052035 100644
--- a/gmime/gmime-encodings.c
+++ b/gmime/gmime-encodings.c
@@ -526,6 +526,7 @@ g_mime_encoding_base64_decode_step (const unsigned char *inbuf, size_t inlen, un
 				*outptr++ = saved >> 16;
 				*outptr++ = saved >> 8;
 				*outptr++ = saved;
+				saved = 0;
 				n = 0;
 				
 				if (npad > 0) {
diff --git a/gmime/gmime-utils.c b/gmime/gmime-utils.c
index 6fc17d6..b8d6140 100644
--- a/gmime/gmime-utils.c
+++ b/gmime/gmime-utils.c
@@ -1713,29 +1713,84 @@ g_mime_utils_decode_8bit (const char *text, size_t len)
 
 
 /* this decodes rfc2047's version of quoted-printable */
-static ssize_t
-quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
+static size_t
+quoted_decode (const unsigned char *in, size_t len, unsigned char *out, int *state, guint32 *save)
 {
 	register const unsigned char *inptr;
 	register unsigned char *outptr;
 	const unsigned char *inend;
 	unsigned char c, c1;
+	size_t need, i;
+	guint32 saved;
+	
+	if (len == 0)
+		return 0;
 	
 	inend = in + len;
 	outptr = out;
-	
 	inptr = in;
+	
+	need = (size_t) *state;
+	saved = *save;
+	
+	if (need > 0) {
+		if (isxdigit ((int) *inptr)) {
+			if (need == 1) {
+				c = g_ascii_toupper ((int) (saved & 0xff));
+				c1 = g_ascii_toupper ((int) *inptr++);
+				saved = 0;
+				need = 0;
+				
+				goto decode;
+			}
+
+			saved = 0;
+			need = 0;
+			
+			goto equals;
+		}
+		
+		/* last encoded-word ended in a malformed quoted-printable sequence */
+		*outptr++ = '=';
+		
+		if (need == 1)
+			*outptr++ = (char) (saved & 0xff);
+		
+		saved = 0;
+		need = 0;
+	}
+	
 	while (inptr < inend) {
 		c = *inptr++;
 		if (c == '=') {
+		equals:
 			if (inend - inptr >= 2) {
-				c = toupper (*inptr++);
-				c1 = toupper (*inptr++);
-				*outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
-					| ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
+				if (isxdigit ((int) inptr[0]) && isxdigit ((int) inptr[1])) {
+					c = g_ascii_toupper (*inptr++);
+					c1 = g_ascii_toupper (*inptr++);
+				decode:
+					*outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
+						| ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
+				} else {
+					/* malformed quoted-printable sequence? */
+					*outptr++ = '=';
+				}
 			} else {
-				/* data was truncated */
-				return -1;
+				/* truncated payload, maybe it was split across encoded-words? */
+				if (inptr < inend) {
+					if (isxdigit ((int) *inptr)) {
+						saved = *inptr;
+						need = 1;
+						break;
+					} else {
+						/* malformed quoted-printable sequence? */
+						*outptr++ = '=';
+					}
+				} else {
+					saved = 0;
+					need = 2;
+					break;
+				}
 			}
 		} else if (c == '_') {
 			/* _'s are an rfc2047 shortcut for encoding spaces */
@@ -1745,68 +1800,72 @@ quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
 		}
 	}
 	
-	return (ssize_t) (outptr - out);
+	*state = (int) need;
+	*save = saved;
+	
+	return (size_t) (outptr - out);
 }
 
 #define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))
 
-static char *
-rfc2047_decode_word (const char *in, size_t inlen)
+typedef struct _rfc2047_token {
+	struct _rfc2047_token *next;
+	const char *charset;
+	const char *text;
+	size_t length;
+	char encoding;
+	char is_8bit;
+} rfc2047_token;
+
+#define rfc2047_token_list_free(tokens) g_slice_free_chain (rfc2047_token, tokens, next)
+#define rfc2047_token_free(token) g_slice_free (rfc2047_token, token)
+
+static rfc2047_token *
+rfc2047_token_new (const char *text, size_t len)
 {
-	const unsigned char *instart = (const unsigned char *) in;
-	const register unsigned char *inptr = instart + 2;
-	const unsigned char *inend = instart + inlen - 2;
-	unsigned char *decoded;
+	rfc2047_token *token;
+	
+	token = g_slice_new0 (rfc2047_token);
+	token->length = len;
+	token->text = text;
+	
+	return token;
+}
+
+static rfc2047_token *
+rfc2047_token_new_encoded_word (const char *word, size_t len)
+{
+	rfc2047_token *token;
+	const char *payload;
 	const char *charset;
-	size_t len, ninval;
-	char *charenc, *p;
-	guint32 save = 0;
-	ssize_t declen;
-	int state = 0;
-	iconv_t cd;
-	char *buf;
+	const char *inptr;
+	char *buf, *lang;
+	char encoding;
+	size_t n;
 	
-	/* skip over the charset */
-	if (!(inptr = memchr (inptr, '?', inend - inptr)) || inptr[2] != '?')
+	/* check that this could even be an encoded-word token */
+	if (len < 7 || strncmp (word, "=?", 2) != 0 || strncmp (word + len - 2, "?=", 2) != 0)
 		return NULL;
 	
-	inptr++;
+	/* skip over '=?' */
+	inptr = word + 2;
+	charset = inptr;
 	
-	switch (*inptr) {
-	case 'B':
-	case 'b':
-		inptr += 2;
-		len = (size_t) (inend - inptr);
-		decoded = g_alloca (len);
-		declen = g_mime_encoding_base64_decode_step (inptr, len, decoded, &state, &save);
-		
-		if (declen == -1) {
-			d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
-			return NULL;
-		}
-		break;
-	case 'Q':
-	case 'q':
-		inptr += 2;
-		len = (size_t) (inend - inptr);
-		decoded = g_alloca (len);
-		declen = quoted_decode (inptr, len, decoded);
-		
-		if (declen == -1) {
-			d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
-			return NULL;
-		}
-		break;
-	default:
-		d(fprintf (stderr, "unknown encoding\n"));
+	if (*charset == '?' || *charset == '*') {
+		/* this would result in an empty charset */
 		return NULL;
 	}
 	
-	len = (inptr - 3) - (instart + 2);
-	charenc = g_alloca (len + 1);
-	memcpy (charenc, in + 2, len);
-	charenc[len] = '\0';
-	charset = charenc;
+	/* skip to the end of the charset */
+	if (!(inptr = memchr (inptr, '?', len - 2)) || inptr[2] != '?')
+		return NULL;
+	
+	/* copy the charset into a buffer */
+	n = (size_t) (inptr - charset);
+	buf = g_alloca (n + 1);
+	memcpy (buf, charset, n);
+	buf[n] = '\0';
+	charset = buf;
 	
 	/* rfc2231 updates rfc2047 encoded words...
 	 * The ABNF given in RFC 2047 for encoded-words is:
@@ -1816,98 +1875,79 @@ rfc2047_decode_word (const char *in, size_t inlen)
 	 */
 	
 	/* trim off the 'language' part if it's there... */
-	if ((p = strchr (charset, '*')))
-		*p = '\0';
+	if ((lang = strchr (charset, '*')))
+		*lang = '\0';
 	
-	/* slight optimization? */
-	if (!g_ascii_strcasecmp (charset, "UTF-8")) {
-		p = (char *) decoded;
-		len = declen;
-		
-		while (!g_utf8_validate (p, len, (const char **) &p)) {
-			len = declen - (p - (char *) decoded);
-			*p = '?';
-		}
-		
-		return g_strndup ((char *) decoded, declen);
-	}
+	/* skip over the '?' */
+	inptr++;
 	
-	if (!charset[0] || (cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
-		w(g_warning ("Cannot convert from %s to UTF-8, header display may "
-			     "be corrupt: %s", charset[0] ? charset : "unspecified charset",
-			     g_strerror (errno)));
-		
-		return g_mime_utils_decode_8bit ((char *) decoded, declen);
-	}
+	/* make sure the first char after the encoding is another '?' */
+	if (inptr[1] != '?')
+		return NULL;
 	
-	len = declen;
-	buf = g_malloc (len + 1);
+	switch (*inptr++) {
+	case 'B': case 'b':
+		encoding = 'B';
+		break;
+	case 'Q': case 'q':
+		encoding = 'Q';
+		break;
+	default:
+		return NULL;
+	}
 	
-	charset_convert (cd, (char *) decoded, declen, &buf, &len, &ninval);
+	/* the payload begins right after the '?' */
+	payload = inptr + 1;
 	
-	g_mime_iconv_close (cd);
+	/* find the end of the payload */
+	inptr = word + len - 2;
 	
-#if w(!)0
-	if (ninval > 0) {
-		g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
-			   "corrupt: %s", declen, decoded, g_strerror (errno));
-	}
-#endif
+	token = rfc2047_token_new (payload, inptr - payload);
+	token->charset = g_mime_charset_iconv_name (charset);
+	token->encoding = encoding;
 	
-	return buf;
+	return token;
 }
 
-
-/**
- * g_mime_utils_header_decode_text:
- * @text: header text to decode
- *
- * Decodes an rfc2047 encoded 'text' header.
- *
- * Note: See g_mime_set_user_charsets() for details on how charset
- * conversion is handled for unencoded 8bit text and/or wrongly
- * specified rfc2047 encoded-word tokens.
- *
- * Returns: a newly allocated UTF-8 string representing the the decoded
- * header.
- **/
-char *
-g_mime_utils_header_decode_text (const char *text)
+static rfc2047_token *
+tokenize_rfc2047_phrase (const char *in, size_t *len)
 {
 	gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
-	register const char *inptr = text;
+	rfc2047_token list, *lwsp, *token, *tail;
+	register const char *inptr = in;
 	gboolean encoded = FALSE;
-	const char *lwsp, *word;
-	size_t nlwsp, n;
+	const char *text, *word;
 	gboolean ascii;
-	char *decoded;
-	GString *out;
-	
-	if (text == NULL)
-		return g_strdup ("");
+	size_t n;
 	
-	out = g_string_sized_new (strlen (text) + 1);
+	tail = (rfc2047_token *) &list;
+	list.next = NULL;
+	lwsp = NULL;
 	
 	while (*inptr != '\0') {
-		lwsp = inptr;
+		text = inptr;
 		while (is_lwsp (*inptr))
 			inptr++;
 		
-		nlwsp = (size_t) (inptr - lwsp);
+		if (inptr > text)
+			lwsp = rfc2047_token_new (text, inptr - text);
+		else
+			lwsp = NULL;
 		
-		if (*inptr != '\0') {
-			word = inptr;
-			ascii = TRUE;
-			
+		word = inptr;
+		if (is_atom (*inptr)) {
 			if (G_UNLIKELY (enable_rfc2047_workarounds)) {
+				/* Make an extra effort to detect and
+				 * separate encoded-word tokens that
+				 * have been merged with other
+				 * words. */
+				
 				if (!strncmp (inptr, "=?", 2)) {
 					inptr += 2;
 					
 					/* skip past the charset (if one is even declared, sigh) */
-					while (*inptr && *inptr != '?') {
-						ascii = ascii && is_ascii (*inptr);
+					while (*inptr && *inptr != '?')
 						inptr++;
-					}
 					
 					/* sanity check encoding type */
 					if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
@@ -1916,16 +1956,12 @@ g_mime_utils_header_decode_text (const char *text)
 					inptr += 3;
 					
 					/* find the end of the rfc2047 encoded word token */
-					while (*inptr && strncmp (inptr, "?=", 2) != 0) {
-						ascii = ascii && is_ascii (*inptr);
+					while (*inptr && strncmp (inptr, "?=", 2) != 0)
 						inptr++;
-					}
 					
 					if (*inptr == '\0') {
 						/* didn't find an end marker... */
 						inptr = word + 2;
-						ascii = TRUE;
-						
 						goto non_rfc2047;
 					}
 					
@@ -1934,117 +1970,112 @@ g_mime_utils_header_decode_text (const char *text)
 				non_rfc2047:
 					/* stop if we encounter a possible rfc2047 encoded
 					 * token even if it's inside another word, sigh. */
-					while (*inptr && !is_lwsp (*inptr) &&
-					       strncmp (inptr, "=?", 2) != 0) {
-						ascii = ascii && is_ascii (*inptr);
+					while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
 						inptr++;
-					}
 				}
 			} else {
-				while (*inptr && !is_lwsp (*inptr)) {
-					ascii = ascii && is_ascii (*inptr);
+				while (is_atom (*inptr))
 					inptr++;
-				}
 			}
 			
 			n = (size_t) (inptr - word);
-			if (is_rfc2047_encoded_word (word, n)) {
-				if ((decoded = rfc2047_decode_word (word, n))) {
-					/* rfc2047 states that you must ignore all
-					 * whitespace between encoded words */
-					if (!encoded)
-						g_string_append_len (out, lwsp, nlwsp);
-					
-					g_string_append (out, decoded);
-					g_free (decoded);
-					
-					encoded = TRUE;
-				} else {
-					/* append lwsp and invalid rfc2047 encoded-word token */
-					g_string_append_len (out, lwsp, nlwsp + n);
-					encoded = FALSE;
+			if ((token = rfc2047_token_new_encoded_word (word, n))) {
+				/* rfc2047 states that you must ignore all
+				 * whitespace between encoded words */
+				if (!encoded && lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
+				} else if (lwsp != NULL) {
+					rfc2047_token_free (lwsp);
 				}
-			} else {
-				/* append lwsp */
-				g_string_append_len (out, lwsp, nlwsp);
 				
-				/* append word token */
-				if (!ascii) {
-					/* *sigh* I hate broken mailers... */
-					decoded = g_mime_utils_decode_8bit (word, n);
-					g_string_append (out, decoded);
-					g_free (decoded);
-				} else {
-					g_string_append_len (out, word, n);
+				tail->next = token;
+				tail = token;
+				
+				encoded = TRUE;
+			} else {
+				/* append the lwsp and atom tokens */
+				if (lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
 				}
 				
+				token = rfc2047_token_new (word, n);
+				tail->next = token;
+				tail = token;
+				
 				encoded = FALSE;
 			}
 		} else {
-			/* appending trailing lwsp */
-			g_string_append_len (out, lwsp, nlwsp);
-			break;
+			/* append the lwsp token */
+			if (lwsp != NULL) {
+				tail->next = lwsp;
+				tail = lwsp;
+			}
+			
+			ascii = TRUE;
+			while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
+				ascii = ascii && is_ascii (*inptr);
+				inptr++;
+			}
+			
+			n = (size_t) (inptr - word);
+			token = rfc2047_token_new (word, n);
+			if (!ascii) {
+				/* *sigh* I hate broken mailers... */
+				token->is_8bit = 1;
+			}
+			
+			tail->next = token;
+			tail = token;
+			
+			encoded = FALSE;
 		}
 	}
 	
-	decoded = out->str;
-	g_string_free (out, FALSE);
+	*len = (size_t) (inptr - in);
 	
-	return decoded;
+	return list.next;
 }
 
-
-/**
- * g_mime_utils_header_decode_phrase:
- * @phrase: header to decode
- *
- * Decodes an rfc2047 encoded 'phrase' header.
- *
- * Note: See g_mime_set_user_charsets() for details on how charset
- * conversion is handled for unencoded 8bit text and/or wrongly
- * specified rfc2047 encoded-word tokens.
- *
- * Returns: a newly allocated UTF-8 string representing the the decoded
- * header.
- **/
-char *
-g_mime_utils_header_decode_phrase (const char *phrase)
+static rfc2047_token *
+tokenize_rfc2047_text (const char *in, size_t *len)
 {
 	gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
-	register const char *inptr = phrase;
+	rfc2047_token list, *lwsp, *token, *tail;
+	register const char *inptr = in;
 	gboolean encoded = FALSE;
-	const char *lwsp, *word;
-	size_t nlwsp, n;
+	const char *text, *word;
 	gboolean ascii;
-	char *decoded;
-	GString *out;
+	size_t n;
 	
-	if (phrase == NULL)
-		return g_strdup ("");
-	
-	out = g_string_sized_new (strlen (phrase) + 1);
+	tail = (rfc2047_token *) &list;
+	list.next = NULL;
+	lwsp = NULL;
 	
 	while (*inptr != '\0') {
-		lwsp = inptr;
+		text = inptr;
 		while (is_lwsp (*inptr))
 			inptr++;
 		
-		nlwsp = (size_t) (inptr - lwsp);
+		if (inptr > text)
+			lwsp = rfc2047_token_new (text, inptr - text);
+		else
+			lwsp = NULL;
 		
-		word = inptr;
-		if (is_atom (*inptr)) {
+		if (*inptr != '\0') {
+			word = inptr;
+			ascii = TRUE;
+			
 			if (G_UNLIKELY (enable_rfc2047_workarounds)) {
-				/* Make an extra effort to detect and
-				 * separate encoded-word tokens that
-				 * have been merged with other
-				 * words. */
-				
 				if (!strncmp (inptr, "=?", 2)) {
 					inptr += 2;
 					
 					/* skip past the charset (if one is even declared, sigh) */
-					while (*inptr && *inptr != '?')
+					while (*inptr && *inptr != '?') {
+						ascii = ascii && is_ascii (*inptr);
 						inptr++;
+					}
 					
 					/* sanity check encoding type */
 					if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
@@ -2053,12 +2084,16 @@ g_mime_utils_header_decode_phrase (const char *phrase)
 					inptr += 3;
 					
 					/* find the end of the rfc2047 encoded word token */
-					while (*inptr && strncmp (inptr, "?=", 2) != 0)
+					while (*inptr && strncmp (inptr, "?=", 2) != 0) {
+						ascii = ascii && is_ascii (*inptr);
 						inptr++;
+					}
 					
 					if (*inptr == '\0') {
 						/* didn't find an end marker... */
 						inptr = word + 2;
+						ascii = TRUE;
+						
 						goto non_rfc2047;
 					}
 					
@@ -2067,62 +2102,244 @@ g_mime_utils_header_decode_phrase (const char *phrase)
 				non_rfc2047:
 					/* stop if we encounter a possible rfc2047 encoded
 					 * token even if it's inside another word, sigh. */
-					while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
+					while (*inptr && !is_lwsp (*inptr) &&
+					       strncmp (inptr, "=?", 2) != 0) {
+						ascii = ascii && is_ascii (*inptr);
 						inptr++;
+					}
 				}
 			} else {
-				while (is_atom (*inptr))
+				while (*inptr && !is_lwsp (*inptr)) {
+					ascii = ascii && is_ascii (*inptr);
 					inptr++;
+				}
 			}
 			
 			n = (size_t) (inptr - word);
-			if (is_rfc2047_encoded_word (word, n)) {
-				if ((decoded = rfc2047_decode_word (word, n))) {
-					/* rfc2047 states that you must ignore all
-					 * whitespace between encoded words */
-					if (!encoded)
-						g_string_append_len (out, lwsp, nlwsp);
-					
-					g_string_append (out, decoded);
-					g_free (decoded);
-					
-					encoded = TRUE;
-				} else {
-					/* append lwsp and invalid rfc2047 encoded-word token */
-					g_string_append_len (out, lwsp, nlwsp + n);
-					encoded = FALSE;
+			if ((token = rfc2047_token_new_encoded_word (word, n))) {
+				/* rfc2047 states that you must ignore all
+				 * whitespace between encoded words */
+				if (!encoded && lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
+				} else if (lwsp != NULL) {
+					rfc2047_token_free (lwsp);
 				}
+				
+				tail->next = token;
+				tail = token;
+				
+				encoded = TRUE;
 			} else {
-				/* append lwsp and atom token */
-				g_string_append_len (out, lwsp, nlwsp + n);
+				/* append the lwsp and atom tokens */
+				if (lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
+				}
+				
+				token = rfc2047_token_new (word, n);
+				tail->next = token;
+				tail = token;
+				
 				encoded = FALSE;
 			}
 		} else {
-			g_string_append_len (out, lwsp, nlwsp);
+			if (lwsp != NULL) {
+				/* appending trailing lwsp */
+				tail->next = lwsp;
+				tail = lwsp;
+			}
 			
-			ascii = TRUE;
-			while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
-				ascii = ascii && is_ascii (*inptr);
-				inptr++;
+			break;
+		}
+	}
+	
+	*len = (size_t) (inptr - in);
+	
+	return list.next;
+}
+
+static size_t
+rfc2047_token_decode (rfc2047_token *token, unsigned char *outbuf, int *state, guint32 *save)
+{
+	const unsigned char *inbuf = (const unsigned char *) token->text;
+	size_t len = token->length;
+	
+	if (token->encoding == 'B')
+		return g_mime_encoding_base64_decode_step (inbuf, len, outbuf, state, save);
+	else
+		return quoted_decode (inbuf, len, outbuf, state, save);
+}
+
+static char *
+rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
+{
+	rfc2047_token *token, *next;
+	size_t outlen, ninval, len;
+	unsigned char *outptr;
+	const char *charset;
+	GByteArray *outbuf;
+	GString *decoded;
+	char encoding;
+	guint32 save;
+	iconv_t cd;
+	int state;
+	char *str;
+	
+	decoded = g_string_sized_new (buflen + 1);
+	outbuf = g_byte_array_sized_new (76);
+	
+	token = tokens;
+	while (token != NULL) {
+		next = token->next;
+		
+		if (token->encoding) {
+			/* In order to work around broken mailers, we need to combine
+			 * the raw decoded content of runs of identically encoded word
+			 * tokens before converting into UTF-8. */
+			encoding = token->encoding;
+			charset = token->charset;
+			len = token->length;
+			state = 0;
+			save = 0;
+			
+			/* find the end of the run (and measure the buffer length we'll need) */
+			while (next && next->encoding == encoding && !strcmp (next->charset, charset)) {
+				len += next->length;
+				next = next->next;
 			}
 			
-			n = (size_t) (inptr - word);
+			/* make sure our temporary output buffer is large enough... */
+			if (len > outbuf->len)
+				g_byte_array_set_size (outbuf, len);
 			
-			if (!ascii) {
-				/* *sigh* I hate broken mailers... */
-				decoded = g_mime_utils_decode_8bit (word, n);
-				g_string_append (out, decoded);
-				g_free (decoded);
+			/* base64 / quoted-printable decode each of the tokens... */
+			outptr = outbuf->data;
+			outlen = 0;
+			do {
+				/* Note: by not resetting state/save each loop, we effectively
+				 * treat the payloads as one continuous block, thus allowing
+				 * us to handle cases where a hex-encoded triplet of a
+				 * quoted-printable encoded payload is split between 2 or more
+				 * encoded-word tokens. */
+				len = rfc2047_token_decode (token, outptr, &state, &save);
+				token = token->next;
+				outptr += len;
+				outlen += len;
+			} while (token != next);
+			outptr = outbuf->data;
+			
+			/* convert the raw decoded text into UTF-8 */
+			if (!g_ascii_strcasecmp (charset, "UTF-8")) {
+				/* slight optimization over going thru iconv */
+				str = (char *) outptr;
+				len = outlen;
+				
+				while (!g_utf8_validate (str, len, (const char **) &str)) {
+					len = outlen - (str - (char *) outptr);
+					*str = '?';
+				}
+				
+				g_string_append_len (decoded, (char *) outptr, outlen);
+			} else if ((cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
+				w(g_warning ("Cannot convert from %s to UTF-8, header display may "
+					     "be corrupt: %s", charset[0] ? charset : "unspecified charset",
+					     g_strerror (errno)));
+				
+				str = g_mime_utils_decode_8bit ((char *) outptr, outlen);
+				g_string_append (decoded, str);
+				g_free (str);
 			} else {
-				g_string_append_len (out, word, n);
+				str = g_malloc (outlen + 1);
+				len = outlen;
+				
+				len = charset_convert (cd, (char *) outptr, outlen, &str, &len, &ninval);
+				g_mime_iconv_close (cd);
+				
+				g_string_append_len (decoded, str, len);
+				
+#if w(!)0
+				if (ninval > 0) {
+					g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
+						   "corrupt: %s", outlen, (char *) outptr, g_strerror (errno));
+				}
+#endif
 			}
-			
-			encoded = FALSE;
+		} else if (token->is_8bit) {
+			/* *sigh* I hate broken mailers... */
+			str = g_mime_utils_decode_8bit (token->text, token->length);
+			g_string_append (decoded, str);
+			g_free (str);
+		} else {
+			g_string_append_len (decoded, token->text, token->length);
 		}
+		
+		token = next;
 	}
 	
-	decoded = out->str;
-	g_string_free (out, FALSE);
+	g_byte_array_free (outbuf, TRUE);
+	
+	return g_string_free (decoded, FALSE);
+}
+
+
+/**
+ * g_mime_utils_header_decode_text:
+ * @text: header text to decode
+ *
+ * Decodes an rfc2047 encoded 'text' header.
+ *
+ * Note: See g_mime_set_user_charsets() for details on how charset
+ * conversion is handled for unencoded 8bit text and/or wrongly
+ * specified rfc2047 encoded-word tokens.
+ *
+ * Returns: a newly allocated UTF-8 string representing the decoded
+ * header.
+ **/
+char *
+g_mime_utils_header_decode_text (const char *text)
+{
+	rfc2047_token *tokens;
+	char *decoded;
+	size_t len;
+	
+	if (text == NULL)
+		return g_strdup ("");
+	
+	tokens = tokenize_rfc2047_text (text, &len);
+	decoded = rfc2047_decode_tokens (tokens, len);
+	rfc2047_token_list_free (tokens);
+	
+	return decoded;
+}
+
+
+/**
+ * g_mime_utils_header_decode_phrase:
+ * @phrase: header to decode
+ *
+ * Decodes an rfc2047 encoded 'phrase' header.
+ *
+ * Note: See g_mime_set_user_charsets() for details on how charset
+ * conversion is handled for unencoded 8bit text and/or wrongly
+ * specified rfc2047 encoded-word tokens.
+ *
+ * Returns: a newly allocated UTF-8 string representing the decoded
+ * header.
+ **/
+char *
+g_mime_utils_header_decode_phrase (const char *phrase)
+{
+	rfc2047_token *tokens;
+	char *decoded;
+	size_t len;
+	
+	if (phrase == NULL)
+		return g_strdup ("");
+	
+	tokens = tokenize_rfc2047_phrase (phrase, &len);
+	decoded = rfc2047_decode_tokens (tokens, len);
+	rfc2047_token_list_free (tokens);
 	
 	return decoded;
 }
diff --git a/tests/test-mime.c b/tests/test-mime.c
index e0ab8b6..3ac64c1 100644
--- a/tests/test-mime.c
+++ b/tests/test-mime.c
@@ -204,12 +204,10 @@ static struct {
 	{ "\"Biznes=?ISO-8859-2?Q?_?=INTERIA.PL\"=?ISO-8859-2?Q?_?=<biuletyny firma interia pl>",
 	  "\"Biznes INTERIA.PL \" <biuletyny firma interia pl>",
 	  "\"Biznes INTERIA.PL\" <biuletyny firma interia pl>", },
-#if 0
 	/* UTF-8 sequence split between multiple encoded-word tokens */
 	{ "=?utf-8?Q?{#D=C3=A8=C3=A9=C2=A3=C3=A5=C3=BD_M$=C3=A1=C3?= =?utf-8?Q?=AD.=C3=A7=C3=B8m@#}?= <user domain com>",
 	  "\"{#DÃ¨Ã©Â£Ã¥Ã½ M$Ã¡Ã.Ã§Ã¸m@#}\" <user domain com>",
 	  "=?iso-8859-1?b?eyNE6Omj5f0gTSTh7S7n+G1AI30=?= <user domain com>" },
-#endif
 };
 
 static void

Follow-Ups:
- Re: [gmime-devel] Issue with header decoding (continues)
  - From: evil legacy

References:
- [gmime-devel] Issue with header decoding (continues)
  - From: evil legacy
- Re: [gmime-devel] Issue with header decoding (continues)
  - From: Jeffrey Stedfast
- Re: [gmime-devel] Issue with header decoding (continues)
  - From: evil legacy

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]