[gmime] Fix for multibyte sequences split between 2 rfc2047 tokens



commit 85e30d0156226c567d134a9a803c7005d47e1c83
Author: Jeffrey Stedfast <fejj gnome org>
Date:   Sun Dec 18 20:23:49 2011 -0500

    Fix for multibyte sequences split between 2 rfc2047 tokens
    
    2011-12-18  Jeffrey Stedfast  <fejj gnome org>
    
    	* gmime/gmime-utils.c (quoted_decode): Made more robust and also
    	modified to keep state (e.g. for use when triplets span across
    	multiple encoded-word tokens).
    	(tokenize_rfc2047_phrase): Tokenizes a rfc822 phrase header for
    	later processing.
    	(tokenize_rfc2047_text): Tokenizes a rfc822 unstructured text
    	header for later processing.
    	(rfc2047_decode_tokens): Merge and decode rfc2047 tokens,
    	converting the decoded text into UTF-8.
    	(g_mime_utils_header_decode_phrase): Rewritten to use the above
    	functions.
    	(g_mime_utils_header_decode_text): Same.

 ChangeLog               |   15 +
 gmime/gmime-encodings.c |    1 +
 gmime/gmime-utils.c     |  716 +++++++++++++++++++++++++++++++----------------
 tests/test-mime.c       |    2 -
 4 files changed, 484 insertions(+), 250 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index f74e28f..98717e1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2011-12-18  Jeffrey Stedfast  <fejj gnome org>
 
+	* gmime/gmime-utils.c (quoted_decode): Made more robust and also
+	modified to keep state (e.g. for use when triplets span across
+	multiple encoded-word tokens).
+	(tokenize_rfc2047_phrase): Tokenizes a rfc822 phrase header for
+	later processing.
+	(tokenize_rfc2047_text): Tokenizes a rfc822 unstructured text
+	header for later processing.
+	(rfc2047_decode_tokens): Merge and decode rfc2047 tokens,
+	converting the decoded text into UTF-8.
+	(g_mime_utils_header_decode_phrase): Rewritten to use the above
+	functions.
+	(g_mime_utils_header_decode_text): Same.
+
+2011-12-18  Jeffrey Stedfast  <fejj gnome org>
+
 	* gmime/gmime-utils.c (charset_convert): Don't count the
 	terminating nul character in the returned string length.
 
diff --git a/gmime/gmime-encodings.c b/gmime/gmime-encodings.c
index aa54c5f..f052035 100644
--- a/gmime/gmime-encodings.c
+++ b/gmime/gmime-encodings.c
@@ -526,6 +526,7 @@ g_mime_encoding_base64_decode_step (const unsigned char *inbuf, size_t inlen, un
 				*outptr++ = saved >> 16;
 				*outptr++ = saved >> 8;
 				*outptr++ = saved;
+				saved = 0;
 				n = 0;
 				
 				if (npad > 0) {
diff --git a/gmime/gmime-utils.c b/gmime/gmime-utils.c
index 6fc17d6..a6acb46 100644
--- a/gmime/gmime-utils.c
+++ b/gmime/gmime-utils.c
@@ -225,10 +225,13 @@ typedef struct _date_token {
 static date_token *
 datetok (const char *date)
 {
-	date_token *tokens = NULL, *token, *tail = (date_token *) &tokens;
+	date_token tokens, *token, *tail;
 	const char *start, *end;
         unsigned char mask;
 	
+	tail = (date_token *) &tokens;
+	tokens.next = NULL;
+	
 	start = date;
 	while (*start) {
 		/* kill leading whitespace */
@@ -262,7 +265,7 @@ datetok (const char *date)
 			break;
 	}
 	
-	return tokens;
+	return tokens.next;
 }
 
 static int
@@ -995,14 +998,14 @@ g_mime_utils_decode_message_id (const char *message_id)
 GMimeReferences *
 g_mime_references_decode (const char *text)
 {
-	GMimeReferences *refs, *tail, *ref;
+	GMimeReferences refs, *tail, *ref;
 	const char *word, *inptr = text;
 	char *msgid;
 	
 	g_return_val_if_fail (text != NULL, NULL);
 	
-	refs = NULL;
 	tail = (GMimeReferences *) &refs;
+	refs.next = NULL;
 	
 	while (*inptr) {
 		decode_lwsp (&inptr);
@@ -1027,7 +1030,7 @@ g_mime_references_decode (const char *text)
 		}
 	}
 	
-	return refs;
+	return refs.next;
 }
 
 
@@ -1713,29 +1716,84 @@ g_mime_utils_decode_8bit (const char *text, size_t len)
 
 
 /* this decodes rfc2047's version of quoted-printable */
-static ssize_t
-quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
+static size_t
+quoted_decode (const unsigned char *in, size_t len, unsigned char *out, int *state, guint32 *save)
 {
 	register const unsigned char *inptr;
 	register unsigned char *outptr;
 	const unsigned char *inend;
 	unsigned char c, c1;
+	size_t need, i;
+	guint32 saved;
+	
+	if (len == 0)
+		return 0;
 	
 	inend = in + len;
 	outptr = out;
-	
 	inptr = in;
+	
+	need = (size_t) *state;
+	saved = *save;
+	
+	if (need > 0) {
+		if (isxdigit ((int) *inptr)) {
+			if (need == 1) {
+				c = g_ascii_toupper ((int) (saved & 0xff));
+				c1 = g_ascii_toupper ((int) *inptr++);
+				saved = 0;
+				need = 0;
+				
+				goto decode;
+			}
+
+			saved = 0;
+			need = 0;
+			
+			goto equals;
+		}
+		
+		/* last encoded-word ended in a malformed quoted-printable sequence */
+		*outptr++ = '=';
+		
+		if (need == 1)
+			*outptr++ = (char) (saved & 0xff);
+		
+		saved = 0;
+		need = 0;
+	}
+	
 	while (inptr < inend) {
 		c = *inptr++;
 		if (c == '=') {
+		equals:
 			if (inend - inptr >= 2) {
-				c = toupper (*inptr++);
-				c1 = toupper (*inptr++);
-				*outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
-					| ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
+				if (isxdigit ((int) inptr[0]) && isxdigit ((int) inptr[1])) {
+					c = g_ascii_toupper (*inptr++);
+					c1 = g_ascii_toupper (*inptr++);
+				decode:
+					*outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
+						| ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
+				} else {
+					/* malformed quoted-printable sequence? */
+					*outptr++ = '=';
+				}
 			} else {
-				/* data was truncated */
-				return -1;
+				/* truncated payload, maybe it was split across encoded-words? */
+				if (inptr < inend) {
+					if (isxdigit ((int) *inptr)) {
+						saved = *inptr;
+						need = 1;
+						break;
+					} else {
+						/* malformed quoted-printable sequence? */
+						*outptr++ = '=';
+					}
+				} else {
+					saved = 0;
+					need = 2;
+					break;
+				}
 			}
 		} else if (c == '_') {
 			/* _'s are an rfc2047 shortcut for encoding spaces */
@@ -1745,68 +1803,72 @@ quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
 		}
 	}
 	
-	return (ssize_t) (outptr - out);
+	*state = (int) need;
+	*save = saved;
+	
+	return (size_t) (outptr - out);
 }
 
 #define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))
 
-static char *
-rfc2047_decode_word (const char *in, size_t inlen)
+typedef struct _rfc2047_token {
+	struct _rfc2047_token *next;
+	const char *charset;
+	const char *text;
+	size_t length;
+	char encoding;
+	char is_8bit;
+} rfc2047_token;
+
+#define rfc2047_token_list_free(tokens) g_slice_free_chain (rfc2047_token, tokens, next)
+#define rfc2047_token_free(token) g_slice_free (rfc2047_token, token)
+
+static rfc2047_token *
+rfc2047_token_new (const char *text, size_t len)
 {
-	const unsigned char *instart = (const unsigned char *) in;
-	const register unsigned char *inptr = instart + 2;
-	const unsigned char *inend = instart + inlen - 2;
-	unsigned char *decoded;
+	rfc2047_token *token;
+	
+	token = g_slice_new0 (rfc2047_token);
+	token->length = len;
+	token->text = text;
+	
+	return token;
+}
+
+static rfc2047_token *
+rfc2047_token_new_encoded_word (const char *word, size_t len)
+{
+	rfc2047_token *token;
+	const char *payload;
 	const char *charset;
-	size_t len, ninval;
-	char *charenc, *p;
-	guint32 save = 0;
-	ssize_t declen;
-	int state = 0;
-	iconv_t cd;
-	char *buf;
+	const char *inptr;
+	char *buf, *lang;
+	char encoding;
+	size_t n;
 	
-	/* skip over the charset */
-	if (!(inptr = memchr (inptr, '?', inend - inptr)) || inptr[2] != '?')
+	/* check that this could even be an encoded-word token */
+	if (len < 7 || strncmp (word, "=?", 2) != 0 || strncmp (word + len - 2, "?=", 2) != 0)
 		return NULL;
 	
-	inptr++;
+	/* skip over '=?' */
+	inptr = word + 2;
+	charset = inptr;
 	
-	switch (*inptr) {
-	case 'B':
-	case 'b':
-		inptr += 2;
-		len = (size_t) (inend - inptr);
-		decoded = g_alloca (len);
-		declen = g_mime_encoding_base64_decode_step (inptr, len, decoded, &state, &save);
-		
-		if (declen == -1) {
-			d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
-			return NULL;
-		}
-		break;
-	case 'Q':
-	case 'q':
-		inptr += 2;
-		len = (size_t) (inend - inptr);
-		decoded = g_alloca (len);
-		declen = quoted_decode (inptr, len, decoded);
-		
-		if (declen == -1) {
-			d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
-			return NULL;
-		}
-		break;
-	default:
-		d(fprintf (stderr, "unknown encoding\n"));
+	if (*charset == '?' || *charset == '*') {
+		/* this would result in an empty charset */
 		return NULL;
 	}
 	
-	len = (inptr - 3) - (instart + 2);
-	charenc = g_alloca (len + 1);
-	memcpy (charenc, in + 2, len);
-	charenc[len] = '\0';
-	charset = charenc;
+	/* skip to the end of the charset */
+	if (!(inptr = memchr (inptr, '?', len - 2)) || inptr[2] != '?')
+		return NULL;
+	
+	/* copy the charset into a buffer */
+	n = (size_t) (inptr - charset);
+	buf = g_alloca (n + 1);
+	memcpy (buf, charset, n);
+	buf[n] = '\0';
+	charset = buf;
 	
 	/* rfc2231 updates rfc2047 encoded words...
 	 * The ABNF given in RFC 2047 for encoded-words is:
@@ -1816,98 +1878,79 @@ rfc2047_decode_word (const char *in, size_t inlen)
 	 */
 	
 	/* trim off the 'language' part if it's there... */
-	if ((p = strchr (charset, '*')))
-		*p = '\0';
+	if ((lang = strchr (charset, '*')))
+		*lang = '\0';
 	
-	/* slight optimization? */
-	if (!g_ascii_strcasecmp (charset, "UTF-8")) {
-		p = (char *) decoded;
-		len = declen;
-		
-		while (!g_utf8_validate (p, len, (const char **) &p)) {
-			len = declen - (p - (char *) decoded);
-			*p = '?';
-		}
-		
-		return g_strndup ((char *) decoded, declen);
-	}
+	/* skip over the '?' */
+	inptr++;
 	
-	if (!charset[0] || (cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
-		w(g_warning ("Cannot convert from %s to UTF-8, header display may "
-			     "be corrupt: %s", charset[0] ? charset : "unspecified charset",
-			     g_strerror (errno)));
-		
-		return g_mime_utils_decode_8bit ((char *) decoded, declen);
-	}
+	/* make sure the first char after the encoding is another '?' */
+	if (inptr[1] != '?')
+		return NULL;
 	
-	len = declen;
-	buf = g_malloc (len + 1);
+	switch (*inptr++) {
+	case 'B': case 'b':
+		encoding = 'B';
+		break;
+	case 'Q': case 'q':
+		encoding = 'Q';
+		break;
+	default:
+		return NULL;
+	}
 	
-	charset_convert (cd, (char *) decoded, declen, &buf, &len, &ninval);
+	/* the payload begins right after the '?' */
+	payload = inptr + 1;
 	
-	g_mime_iconv_close (cd);
+	/* find the end of the payload */
+	inptr = word + len - 2;
 	
-#if w(!)0
-	if (ninval > 0) {
-		g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
-			   "corrupt: %s", declen, decoded, g_strerror (errno));
-	}
-#endif
+	token = rfc2047_token_new (payload, inptr - payload);
+	token->charset = g_mime_charset_iconv_name (charset);
+	token->encoding = encoding;
 	
-	return buf;
+	return token;
 }
 
-
-/**
- * g_mime_utils_header_decode_text:
- * @text: header text to decode
- *
- * Decodes an rfc2047 encoded 'text' header.
- *
- * Note: See g_mime_set_user_charsets() for details on how charset
- * conversion is handled for unencoded 8bit text and/or wrongly
- * specified rfc2047 encoded-word tokens.
- *
- * Returns: a newly allocated UTF-8 string representing the the decoded
- * header.
- **/
-char *
-g_mime_utils_header_decode_text (const char *text)
+static rfc2047_token *
+tokenize_rfc2047_phrase (const char *in, size_t *len)
 {
 	gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
-	register const char *inptr = text;
+	rfc2047_token list, *lwsp, *token, *tail;
+	register const char *inptr = in;
 	gboolean encoded = FALSE;
-	const char *lwsp, *word;
-	size_t nlwsp, n;
+	const char *text, *word;
 	gboolean ascii;
-	char *decoded;
-	GString *out;
-	
-	if (text == NULL)
-		return g_strdup ("");
+	size_t n;
 	
-	out = g_string_sized_new (strlen (text) + 1);
+	tail = (rfc2047_token *) &list;
+	list.next = NULL;
+	lwsp = NULL;
 	
 	while (*inptr != '\0') {
-		lwsp = inptr;
+		text = inptr;
 		while (is_lwsp (*inptr))
 			inptr++;
 		
-		nlwsp = (size_t) (inptr - lwsp);
+		if (inptr > text)
+			lwsp = rfc2047_token_new (text, inptr - text);
+		else
+			lwsp = NULL;
 		
-		if (*inptr != '\0') {
-			word = inptr;
-			ascii = TRUE;
-			
+		word = inptr;
+		if (is_atom (*inptr)) {
 			if (G_UNLIKELY (enable_rfc2047_workarounds)) {
+				/* Make an extra effort to detect and
+				 * separate encoded-word tokens that
+				 * have been merged with other
+				 * words. */
+				
 				if (!strncmp (inptr, "=?", 2)) {
 					inptr += 2;
 					
 					/* skip past the charset (if one is even declared, sigh) */
-					while (*inptr && *inptr != '?') {
-						ascii = ascii && is_ascii (*inptr);
+					while (*inptr && *inptr != '?')
 						inptr++;
-					}
 					
 					/* sanity check encoding type */
 					if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
@@ -1916,16 +1959,12 @@ g_mime_utils_header_decode_text (const char *text)
 					inptr += 3;
 					
 					/* find the end of the rfc2047 encoded word token */
-					while (*inptr && strncmp (inptr, "?=", 2) != 0) {
-						ascii = ascii && is_ascii (*inptr);
+					while (*inptr && strncmp (inptr, "?=", 2) != 0)
 						inptr++;
-					}
 					
 					if (*inptr == '\0') {
 						/* didn't find an end marker... */
 						inptr = word + 2;
-						ascii = TRUE;
-						
 						goto non_rfc2047;
 					}
 					
@@ -1934,117 +1973,112 @@ g_mime_utils_header_decode_text (const char *text)
 				non_rfc2047:
 					/* stop if we encounter a possible rfc2047 encoded
 					 * token even if it's inside another word, sigh. */
-					while (*inptr && !is_lwsp (*inptr) &&
-					       strncmp (inptr, "=?", 2) != 0) {
-						ascii = ascii && is_ascii (*inptr);
+					while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
 						inptr++;
-					}
 				}
 			} else {
-				while (*inptr && !is_lwsp (*inptr)) {
-					ascii = ascii && is_ascii (*inptr);
+				while (is_atom (*inptr))
 					inptr++;
-				}
 			}
 			
 			n = (size_t) (inptr - word);
-			if (is_rfc2047_encoded_word (word, n)) {
-				if ((decoded = rfc2047_decode_word (word, n))) {
-					/* rfc2047 states that you must ignore all
-					 * whitespace between encoded words */
-					if (!encoded)
-						g_string_append_len (out, lwsp, nlwsp);
-					
-					g_string_append (out, decoded);
-					g_free (decoded);
-					
-					encoded = TRUE;
-				} else {
-					/* append lwsp and invalid rfc2047 encoded-word token */
-					g_string_append_len (out, lwsp, nlwsp + n);
-					encoded = FALSE;
+			if ((token = rfc2047_token_new_encoded_word (word, n))) {
+				/* rfc2047 states that you must ignore all
+				 * whitespace between encoded words */
+				if (!encoded && lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
+				} else if (lwsp != NULL) {
+					rfc2047_token_free (lwsp);
 				}
-			} else {
-				/* append lwsp */
-				g_string_append_len (out, lwsp, nlwsp);
 				
-				/* append word token */
-				if (!ascii) {
-					/* *sigh* I hate broken mailers... */
-					decoded = g_mime_utils_decode_8bit (word, n);
-					g_string_append (out, decoded);
-					g_free (decoded);
-				} else {
-					g_string_append_len (out, word, n);
+				tail->next = token;
+				tail = token;
+				
+				encoded = TRUE;
+			} else {
+				/* append the lwsp and atom tokens */
+				if (lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
 				}
 				
+				token = rfc2047_token_new (word, n);
+				tail->next = token;
+				tail = token;
+				
 				encoded = FALSE;
 			}
 		} else {
-			/* appending trailing lwsp */
-			g_string_append_len (out, lwsp, nlwsp);
-			break;
+			/* append the lwsp token */
+			if (lwsp != NULL) {
+				tail->next = lwsp;
+				tail = lwsp;
+			}
+			
+			ascii = TRUE;
+			while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
+				ascii = ascii && is_ascii (*inptr);
+				inptr++;
+			}
+			
+			n = (size_t) (inptr - word);
+			token = rfc2047_token_new (word, n);
+			if (!ascii) {
+				/* *sigh* I hate broken mailers... */
+				token->is_8bit = 1;
+			}
+			
+			tail->next = token;
+			tail = token;
+			
+			encoded = FALSE;
 		}
 	}
 	
-	decoded = out->str;
-	g_string_free (out, FALSE);
+	*len = (size_t) (inptr - in);
 	
-	return decoded;
+	return list.next;
 }
 
-
-/**
- * g_mime_utils_header_decode_phrase:
- * @phrase: header to decode
- *
- * Decodes an rfc2047 encoded 'phrase' header.
- *
- * Note: See g_mime_set_user_charsets() for details on how charset
- * conversion is handled for unencoded 8bit text and/or wrongly
- * specified rfc2047 encoded-word tokens.
- *
- * Returns: a newly allocated UTF-8 string representing the the decoded
- * header.
- **/
-char *
-g_mime_utils_header_decode_phrase (const char *phrase)
+static rfc2047_token *
+tokenize_rfc2047_text (const char *in, size_t *len)
 {
 	gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
-	register const char *inptr = phrase;
+	rfc2047_token list, *lwsp, *token, *tail;
+	register const char *inptr = in;
 	gboolean encoded = FALSE;
-	const char *lwsp, *word;
-	size_t nlwsp, n;
+	const char *text, *word;
 	gboolean ascii;
-	char *decoded;
-	GString *out;
-	
-	if (phrase == NULL)
-		return g_strdup ("");
+	size_t n;
 	
-	out = g_string_sized_new (strlen (phrase) + 1);
+	tail = (rfc2047_token *) &list;
+	list.next = NULL;
+	lwsp = NULL;
 	
 	while (*inptr != '\0') {
-		lwsp = inptr;
+		text = inptr;
 		while (is_lwsp (*inptr))
 			inptr++;
 		
-		nlwsp = (size_t) (inptr - lwsp);
+		if (inptr > text)
+			lwsp = rfc2047_token_new (text, inptr - text);
+		else
+			lwsp = NULL;
 		
-		word = inptr;
-		if (is_atom (*inptr)) {
+		if (*inptr != '\0') {
+			word = inptr;
+			ascii = TRUE;
+			
 			if (G_UNLIKELY (enable_rfc2047_workarounds)) {
-				/* Make an extra effort to detect and
-				 * separate encoded-word tokens that
-				 * have been merged with other
-				 * words. */
-				
 				if (!strncmp (inptr, "=?", 2)) {
 					inptr += 2;
 					
 					/* skip past the charset (if one is even declared, sigh) */
-					while (*inptr && *inptr != '?')
+					while (*inptr && *inptr != '?') {
+						ascii = ascii && is_ascii (*inptr);
 						inptr++;
+					}
 					
 					/* sanity check encoding type */
 					if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
@@ -2053,12 +2087,16 @@ g_mime_utils_header_decode_phrase (const char *phrase)
 					inptr += 3;
 					
 					/* find the end of the rfc2047 encoded word token */
-					while (*inptr && strncmp (inptr, "?=", 2) != 0)
+					while (*inptr && strncmp (inptr, "?=", 2) != 0) {
+						ascii = ascii && is_ascii (*inptr);
 						inptr++;
+					}
 					
 					if (*inptr == '\0') {
 						/* didn't find an end marker... */
 						inptr = word + 2;
+						ascii = TRUE;
+						
 						goto non_rfc2047;
 					}
 					
@@ -2067,62 +2105,244 @@ g_mime_utils_header_decode_phrase (const char *phrase)
 				non_rfc2047:
 					/* stop if we encounter a possible rfc2047 encoded
 					 * token even if it's inside another word, sigh. */
-					while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
+					while (*inptr && !is_lwsp (*inptr) &&
+					       strncmp (inptr, "=?", 2) != 0) {
+						ascii = ascii && is_ascii (*inptr);
 						inptr++;
+					}
 				}
 			} else {
-				while (is_atom (*inptr))
+				while (*inptr && !is_lwsp (*inptr)) {
+					ascii = ascii && is_ascii (*inptr);
 					inptr++;
+				}
 			}
 			
 			n = (size_t) (inptr - word);
-			if (is_rfc2047_encoded_word (word, n)) {
-				if ((decoded = rfc2047_decode_word (word, n))) {
-					/* rfc2047 states that you must ignore all
-					 * whitespace between encoded words */
-					if (!encoded)
-						g_string_append_len (out, lwsp, nlwsp);
-					
-					g_string_append (out, decoded);
-					g_free (decoded);
-					
-					encoded = TRUE;
-				} else {
-					/* append lwsp and invalid rfc2047 encoded-word token */
-					g_string_append_len (out, lwsp, nlwsp + n);
-					encoded = FALSE;
+			if ((token = rfc2047_token_new_encoded_word (word, n))) {
+				/* rfc2047 states that you must ignore all
+				 * whitespace between encoded words */
+				if (!encoded && lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
+				} else if (lwsp != NULL) {
+					rfc2047_token_free (lwsp);
 				}
+				
+				tail->next = token;
+				tail = token;
+				
+				encoded = TRUE;
 			} else {
-				/* append lwsp and atom token */
-				g_string_append_len (out, lwsp, nlwsp + n);
+				/* append the lwsp and atom tokens */
+				if (lwsp != NULL) {
+					tail->next = lwsp;
+					tail = lwsp;
+				}
+				
+				token = rfc2047_token_new (word, n);
+				tail->next = token;
+				tail = token;
+				
 				encoded = FALSE;
 			}
 		} else {
-			g_string_append_len (out, lwsp, nlwsp);
+			if (lwsp != NULL) {
+				/* appending trailing lwsp */
+				tail->next = lwsp;
+				tail = lwsp;
+			}
 			
-			ascii = TRUE;
-			while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
-				ascii = ascii && is_ascii (*inptr);
-				inptr++;
+			break;
+		}
+	}
+	
+	*len = (size_t) (inptr - in);
+	
+	return list.next;
+}
+
+static size_t
+rfc2047_token_decode (rfc2047_token *token, unsigned char *outbuf, int *state, guint32 *save)
+{
+	const unsigned char *inbuf = (const unsigned char *) token->text;
+	size_t len = token->length;
+	
+	if (token->encoding == 'B')
+		return g_mime_encoding_base64_decode_step (inbuf, len, outbuf, state, save);
+	else
+		return quoted_decode (inbuf, len, outbuf, state, save);
+}
+
+static char *
+rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
+{
+	rfc2047_token *token, *next;
+	size_t outlen, ninval, len;
+	unsigned char *outptr;
+	const char *charset;
+	GByteArray *outbuf;
+	GString *decoded;
+	char encoding;
+	guint32 save;
+	iconv_t cd;
+	int state;
+	char *str;
+	
+	decoded = g_string_sized_new (buflen + 1);
+	outbuf = g_byte_array_sized_new (76);
+	
+	token = tokens;
+	while (token != NULL) {
+		next = token->next;
+		
+		if (token->encoding) {
+			/* In order to work around broken mailers, we need to combine
+			 * the raw decoded content of runs of identically encoded word
+			 * tokens before converting into UTF-8. */
+			encoding = token->encoding;
+			charset = token->charset;
+			len = token->length;
+			state = 0;
+			save = 0;
+			
+			/* find the end of the run (and measure the buffer length we'll need) */
+			while (next && next->encoding == encoding && !strcmp (next->charset, charset)) {
+				len += next->length;
+				next = next->next;
 			}
 			
-			n = (size_t) (inptr - word);
+			/* make sure our temporary output buffer is large enough... */
+			if (len > outbuf->len)
+				g_byte_array_set_size (outbuf, len);
 			
-			if (!ascii) {
-				/* *sigh* I hate broken mailers... */
-				decoded = g_mime_utils_decode_8bit (word, n);
-				g_string_append (out, decoded);
-				g_free (decoded);
+			/* base64 / quoted-printable decode each of the tokens... */
+			outptr = outbuf->data;
+			outlen = 0;
+			do {
+				/* Note: by not resetting state/save each loop, we effectively
+				 * treat the payloads as one continuous block, thus allowing
+				 * us to handle cases where a hex-encoded triplet of a
+				 * quoted-printable encoded payload is split between 2 or more
+				 * encoded-word tokens. */
+				len = rfc2047_token_decode (token, outptr, &state, &save);
+				token = token->next;
+				outptr += len;
+				outlen += len;
+			} while (token != next);
+			outptr = outbuf->data;
+			
+			/* convert the raw decoded text into UTF-8 */
+			if (!g_ascii_strcasecmp (charset, "UTF-8")) {
+				/* slight optimization over going thru iconv */
+				str = (char *) outptr;
+				len = outlen;
+				
+				while (!g_utf8_validate (str, len, (const char **) &str)) {
+					len = outlen - (str - (char *) outptr);
+					*str = '?';
+				}
+				
+				g_string_append_len (decoded, (char *) outptr, outlen);
+			} else if ((cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
+				w(g_warning ("Cannot convert from %s to UTF-8, header display may "
+					     "be corrupt: %s", charset[0] ? charset : "unspecified charset",
+					     g_strerror (errno)));
+				
+				str = g_mime_utils_decode_8bit ((char *) outptr, outlen);
+				g_string_append (decoded, str);
+				g_free (str);
 			} else {
-				g_string_append_len (out, word, n);
+				str = g_malloc (outlen + 1);
+				len = outlen;
+				
+				len = charset_convert (cd, (char *) outptr, outlen, &str, &len, &ninval);
+				g_mime_iconv_close (cd);
+				
+				g_string_append_len (decoded, str, len);
+				
+#if w(!)0
+				if (ninval > 0) {
+					g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
+						   "corrupt: %s", outlen, (char *) outptr, g_strerror (errno));
+				}
+#endif
 			}
-			
-			encoded = FALSE;
+		} else if (token->is_8bit) {
+			/* *sigh* I hate broken mailers... */
+			str = g_mime_utils_decode_8bit (token->text, token->length);
+			g_string_append (decoded, str);
+			g_free (str);
+		} else {
+			g_string_append_len (decoded, token->text, token->length);
 		}
+		
+		token = next;
 	}
 	
-	decoded = out->str;
-	g_string_free (out, FALSE);
+	g_byte_array_free (outbuf, TRUE);
+	
+	return g_string_free (decoded, FALSE);
+}
+
+
+/**
+ * g_mime_utils_header_decode_text:
+ * @text: header text to decode
+ *
+ * Decodes an rfc2047 encoded 'text' header.
+ *
+ * Note: See g_mime_set_user_charsets() for details on how charset
+ * conversion is handled for unencoded 8bit text and/or wrongly
+ * specified rfc2047 encoded-word tokens.
+ *
+ * Returns: a newly allocated UTF-8 string representing the the decoded
+ * header.
+ **/
+char *
+g_mime_utils_header_decode_text (const char *text)
+{
+	rfc2047_token *tokens;
+	char *decoded;
+	size_t len;
+	
+	if (text == NULL)
+		return g_strdup ("");
+	
+	tokens = tokenize_rfc2047_text (text, &len);
+	decoded = rfc2047_decode_tokens (tokens, len);
+	rfc2047_token_list_free (tokens);
+	
+	return decoded;
+}
+
+
+/**
+ * g_mime_utils_header_decode_phrase:
+ * @phrase: header to decode
+ *
+ * Decodes an rfc2047 encoded 'phrase' header.
+ *
+ * Note: See g_mime_set_user_charsets() for details on how charset
+ * conversion is handled for unencoded 8bit text and/or wrongly
+ * specified rfc2047 encoded-word tokens.
+ *
+ * Returns: a newly allocated UTF-8 string representing the the decoded
+ * header.
+ **/
+char *
+g_mime_utils_header_decode_phrase (const char *phrase)
+{
+	rfc2047_token *tokens;
+	char *decoded;
+	size_t len;
+	
+	if (phrase == NULL)
+		return g_strdup ("");
+	
+	tokens = tokenize_rfc2047_phrase (phrase, &len);
+	decoded = rfc2047_decode_tokens (tokens, len);
+	rfc2047_token_list_free (tokens);
 	
 	return decoded;
 }
@@ -2248,13 +2468,13 @@ typedef struct _rfc822_word {
 static rfc822_word *
 rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
 {
-	rfc822_word *words, *tail, *word;
+	rfc822_word words, *tail, *word;
 	rfc822_word_t type = WORD_ATOM;
 	const char *inptr, *start, *last;
 	int count = 0, encoding = 0;
 	
-	words = NULL;
 	tail = (rfc822_word *) &words;
+	words.next = NULL;
 	
 	last = start = inptr = in;
 	while (inptr && *inptr) {
@@ -2339,7 +2559,7 @@ rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
 	
 #if d(!)0
 	printf ("rfc822 word tokens:\n");
-	word = words;
+	word = words.next;
 	while (word) {
 		printf ("\t'%.*s'; type=%d, encoding=%d\n",
 			word->end - word->start, word->start,
@@ -2349,7 +2569,7 @@ rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
 	}
 #endif
 	
-	return words;
+	return words.next;
 }
 
 #define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8))
diff --git a/tests/test-mime.c b/tests/test-mime.c
index e0ab8b6..3ac64c1 100644
--- a/tests/test-mime.c
+++ b/tests/test-mime.c
@@ -204,12 +204,10 @@ static struct {
 	{ "\"Biznes=?ISO-8859-2?Q?_?=INTERIA.PL\"=?ISO-8859-2?Q?_?=<biuletyny firma interia pl>",
 	  "\"Biznes INTERIA.PL \" <biuletyny firma interia pl>",
 	  "\"Biznes INTERIA.PL\" <biuletyny firma interia pl>", },
-#if 0
 	/* UTF-8 sequence split between multiple encoded-word tokens */
 	{ "=?utf-8?Q?{#D=C3=A8=C3=A9=C2=A3=C3=A5=C3=BD_M$=C3=A1=C3?= =?utf-8?Q?=AD =C3=A7=C3=B8m #}?= <user domain com>",
 	  "\"{#Dèé£åý M$áí çøm #}\" <user domain com>",
 	  "=?iso-8859-1?b?eyNE6Omj5f0gTSTh7S7n+G1AI30=?= <user domain com>" },
-#endif
 };
 
 static void



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]