Re: [evolution-patches] patch to work around broken encoded-words that cannot be parsed as rfc822 atom tokens

From: Not Zed <notzed ximian com>
To: Jeffrey Stedfast <fejj novell com>
Cc: evolution-patches ximian com
Subject: Re: [evolution-patches] patch to work around broken encoded-words that cannot be parsed as rfc822 atom tokens
Date: Mon, 28 Feb 2005 12:22:09 +0800

So, how many valid mails does this break?

e.g.

=?iso-8859-1?b?foo this is a hidden message not for evolution users bar?=

And why did you copy the append_ functions? There should be only one of each in the code. This is a messy enough hack as it is.

Instead of a test folder, any test data must be added to the regression test we already have for rfc2047 decoding.

BTW, using if ((p = memchr(inptr, '=', inend-inptr-2)) && p[1] == '?') is highly likely to be much more efficient than while (inptr < inend-2 && strncmp(inptr, "=?", 2) != 0) inptr++;

On Fri, 2005-02-25 at 13:47 -0500, Jeffrey Stedfast wrote:

unfortunately, there exist a bountiful number of shitful mailers out
there whose authors clearly couldn't be bothered to read or understand
the MIME specifications and so just pulled an encoding scheme out of
their proverbials.

the attached patch tries to deal with the scenarios that I'm aware of,
namely illegal characters in the encoded text portion of the
encoded-word token (which, sadly, even includes SPACE and TAB).

for the convenience of anyone reading this message who hasn't yet read
rfc2047, here's a good quote from the end of section 2:

   IMPORTANT: 'encoded-word's are designed to be recognized as 'atom's
   by an RFC 822 parser.  As a consequence, unencoded white space
   characters (such as SPACE and HTAB) are FORBIDDEN within an
   'encoded-word'.  For example, the character sequence

      =?iso-8859-1?q?this is some text?=

   would be parsed as four 'atom's, rather than as a single 'atom' (by
   an RFC 822 parser) or 'encoded-word' (by a parser which understands
   'encoded-words').  The correct way to encode the string "this is some
   text" is to encode the SPACE characters as well, e.g.

      =?iso-8859-1?q?this=20is=20some=20text?=

so yes, the behaviour of the broken mailers is explicitly FORBIDDEN but
that hasn't stopped them. oh well.


I've also attached a test mbox for everyone's convenience in testing
this patch (feel free to add to it)

With the patch applied, both Mozilla-Mail and Evolution render the
subjects (and other headers) exactly the same afaict.

text/plain attachment (broken-rfc2047.patch)

? broken-rfc2047.patch
? camel-mime-tables.c
? providers/imap4/imap4-XGWMOVE.patch
? providers/imap4/imap4.patch
Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/evolution-data-server/camel/ChangeLog,v
retrieving revision 1.2431
diff -u -p -r1.2431 ChangeLog
--- ChangeLog	15 Feb 2005 11:12:51 -0000	1.2431
+++ ChangeLog	25 Feb 2005 18:36:04 -0000
@@ -1,3 +1,13 @@
+2005-02-25  Jeffrey Stedfast  <fejj novell com>
+
+	* camel-mime-utils.c (quoted_decode): Allow spaces in the text we
+	are decoding.
+	(append_quoted_pair): Changed to take charset params and convert
+	un-quoted-pair'd strings to UTF-8.
+	(header_decode_text): Rewritten to work around broken rfc2047
+	encoded-words sent by mailers who's authors couldn't be bothered
+	to read the specs.
+
 2005-02-11  Radek Doulik  <rodo novell com>
 
 	* camel-filter-search.c (junk_test): use camel debug
Index: camel-mime-utils.c
===================================================================
RCS file: /cvs/gnome/evolution-data-server/camel/camel-mime-utils.c,v
retrieving revision 1.223
diff -u -p -r1.223 camel-mime-utils.c
--- camel-mime-utils.c	31 Jan 2005 06:56:28 -0000	1.223
+++ camel-mime-utils.c	25 Feb 2005 18:36:05 -0000
@@ -814,8 +814,12 @@ quoted_decode(const unsigned char *in, s
 			*outptr++ = 0x20;
 		} else if (c==' ' || c==0x09) {
 			/* FIXME: this is an error! ignore for now ... */
+#if ADHERE_TO_SPEC
 			ret = -1;
 			break;
+#else
+			*outptr++ = c;
+#endif
 		} else {
 			*outptr++ = c;
 		}
@@ -915,7 +919,7 @@ rfc2047_decode_word(const char *in, size
 
 	/* quick check to see if this could possibly be a real encoded word */
 	if (len < 8 || !(in[0] == '=' && in[1] == '?' && in[len-1] == '=' && in[len-2] == '?')) {
-		d(printf("invalid\n"));
+		d(printf("rfc2047_decode_word: invalid token\n"));
 		return NULL;
 	}
 	
@@ -1058,6 +1062,7 @@ append_8bit (GString *out, const char *i
 	
 }
 
+#ifdef ADHERE_TO_SPEC
 static GString *
 append_quoted_pair (GString *str, const char *in, gssize inlen)
 {
@@ -1072,7 +1077,7 @@ append_quoted_pair (GString *str, const 
 		else
 			g_string_append_c (str, c);
 	}
-
+	
 	return str;
 }
 
@@ -1140,6 +1145,115 @@ header_decode_text (const char *in, size
 	
 	return dword;
 }
+
+#else /* ! ADHERE_TO_SPEC */
+
+static void
+append_text (GString *str, const char *in, ssize_t inlen, const char *default_charset, const char *locale_charset)
+{
+	if ((default_charset == NULL || !append_8bit (str, in, inlen, default_charset))
+	    && (locale_charset == NULL || !append_8bit (str, in, inlen, locale_charset)))
+		append_latin1 (str, in, inlen);
+}
+
+static void
+append_quoted_pair (GString *str, const char *in, ssize_t inlen, const char *default_charset, const char *locale_charset)
+{
+	register const char *inptr = in;
+	const char *inend = in + inlen;
+	GString *unquoted;
+	char c;
+	
+	unquoted = g_string_new ("");
+	
+	while (inptr < inend) {
+		c = *inptr++;
+		if (c == '\\' && inptr < inend)
+			g_string_append_c (unquoted, *inptr++);
+		else
+			g_string_append_c (unquoted, c);
+	}
+	
+	append_text (str, unquoted->str, unquoted->len, default_charset, locale_charset);
+	g_string_free (unquoted, TRUE);
+}
+
+static char *
+header_decode_text (const char *in, size_t inlen, int ctext, const char *default_charset)
+{
+	void (* append) (GString *, const char *, ssize_t, const char *, const char *);
+	const char *inptr, *inend, *start, *encword, *locale_charset;
+	char *dword = NULL;
+	GString *out;
+	
+	locale_charset = e_iconv_locale_charset ();
+	
+	if (ctext)
+		append = append_quoted_pair;
+	else
+		append = append_text;
+	
+	out = g_string_new ("");
+	inptr = in;
+	inend = inptr + inlen;
+	
+	while (inptr < inend) {
+		start = inptr;
+		
+		while (inptr < (inend - 8) && strncmp (inptr, "=?", 2) != 0) {
+			if (!camel_mime_is_lwsp (*inptr))
+				dword = NULL;
+			inptr++;
+		}
+		
+		if (inptr == (inend - 8)) {
+			append (out, start, inend - start, default_charset, locale_charset);
+			break;
+		}
+		
+		/* could be an encoded word (or a broken encoded word which is why this code is so damn hairy) */
+		encword = inptr;
+		
+		inptr += 2;
+		while (inptr < (inend - 5) && *inptr != '?')
+			inptr++;
+		
+		if (inptr[0] == '?' && (inptr[1] == 'B' || inptr[1] == 'b' || inptr[1] == 'Q' || inptr[1] == 'q') && inptr[2] == '?') {
+			/* looking more and more like an encoded word... */
+			inptr += 3;
+			while (inptr < (inend - 2) && *inptr != '?')
+				inptr++;
+			
+			if (strncmp (inptr, "?=", 2) != 0)
+				goto not_encword;
+			
+			if (!dword)
+				append (out, start, encword - start, default_charset, locale_charset);
+			
+			inptr += 2;
+			
+			if ((dword = rfc2047_decode_word (encword, inptr - encword))) {
+				g_string_append (out, dword);
+				g_free (dword);
+			} else {
+				append (out, encword, inptr - encword, default_charset, locale_charset);
+			}
+		} else {
+			/* not an encoded word */
+		not_encword:
+			dword = NULL;
+			inptr = encword + 2;
+			
+			append (out, start, inptr - start, default_charset, locale_charset);
+		}
+	}
+	
+	dword = out->str;
+	g_string_free (out, FALSE);
+	
+	return dword;
+}
+#endif /* ADHERE_TO_SPEC */
 
 char *
 camel_header_decode_string (const char *in, const char *default_charset)

Follow-Ups:
- Re: [evolution-patches] patch to work around broken encoded-words that cannot be parsed as rfc822 atom tokens
  - From: Jeffrey Stedfast
- Re: [evolution-patches] patch to work around broken encoded-words that cannot be parsed as rfc822 atom tokens
  - From: Jeffrey Stedfast

References:
- [evolution-patches] patch to work around broken encoded-words that cannot be parsed as rfc822 atom tokens
  - From: Jeffrey Stedfast

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]