Re: [gmime-devel] Email parsing functions flawed and not practical to use



On 08/26/2010 09:00 AM, Damian Pietras wrote:
> On Thu, Aug 26, 2010 at 02:34:42PM +0200, Damian Pietras wrote:
>   
>>> The good news is that I *think* that the following example should be
>>> doable...
>>> Email: "=?ISO-8859-2?Q?TEST?=" <p p org>
>>>
>>> This last one will likely require changes to header_decode_phrase().
>>> I'll try looking into this one in the next few days.
>>>       
>> I think it's just matter of unquoting strings before decoding encoded
>> words. _internet_address_decode_name() should only do decoding,
>> unquoting should be done only in decode_address(). Since it uses
>> decode_word() that handles unquoting it should work, but I don't know
>> why it's not true.
>>     
> Attached is my proposition (a patch against your patched version).
>   

Instead of manually unquoting each qword as they are tokenized, you
could just swap unquote_string() to be called *before* decode_phrase().

As it turns out, doing that fixes your other two examples as well (well,
more-or-less - the results should be acceptable in any case).

Attached is the new patch (in case I made other changes since the
previous one, I can't recall).

Hope this works out for you,

Jeff

diff --git a/gmime/internet-address.c b/gmime/internet-address.c
index 7dbba42..3477ae3 100644
--- a/gmime/internet-address.c
+++ b/gmime/internet-address.c
@@ -1,6 +1,6 @@
 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
 /*  GMime
- *  Copyright (C) 2000-2009 Jeffrey Stedfast
+ *  Copyright (C) 2000-2010 Jeffrey Stedfast
  *
  *  This library is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU Lesser General Public License
@@ -286,15 +286,9 @@ _internet_address_set_name (InternetAddress *ia, const char *name)
 {
 	char *buf;
 	
-	if (name) {
-		buf = g_mime_utils_header_decode_phrase (name);
-		g_mime_utils_unquote_string (buf);
-		g_free (ia->name);
-		ia->name = buf;
-	} else {
-		g_free (ia->name);
-		ia->name = NULL;
-	}
+	buf = g_strdup (name);
+	g_free (ia->name);
+	ia->name = buf;
 }
 
 /**
@@ -446,8 +440,7 @@ internet_address_mailbox_new (const char *name, const char *addr)
 	mailbox = g_object_newv (INTERNET_ADDRESS_TYPE_MAILBOX, 0, NULL);
 	mailbox->addr = g_strdup (addr);
 	
-	if (name != NULL)
-		_internet_address_set_name ((InternetAddress *) mailbox, name);
+	_internet_address_set_name ((InternetAddress *) mailbox, name);
 	
 	return (InternetAddress *) mailbox;
 }
@@ -1352,82 +1345,65 @@ internet_address_list_writer (InternetAddressList *list, GString *str)
 	_internet_address_list_to_string (list, flags, &linelen, str);
 }
 
+static void
+_internet_address_decode_name (InternetAddress *ia, GString *name)
+{
+	char *value, *buf = NULL;
+	char *phrase;
+	
+	if (!g_utf8_validate (name->str, name->len, NULL)) {
+		/* A (broken) mailer has sent us raw 8bit/multibyte text data... */
+		buf = g_mime_utils_decode_8bit (name->str, name->len);
+		phrase = buf;
+	} else {
+		phrase = name->str;
+	}
+	
+	/* decode the phrase */
+	g_mime_utils_unquote_string (phrase);
+	value = g_mime_utils_header_decode_phrase (phrase);
+	g_free (ia->name);
+	ia->name = value;
+	g_free (buf);
+}
+
+static InternetAddress *decode_address (const char **in);
+
+static void
+skip_lwsp (const char **in)
+{
+	register const char *inptr = *in;
+	
+	while (*inptr && is_lwsp (*inptr))
+		inptr++;
+	
+	*in = inptr;
+}
+
 static InternetAddress *
-decode_mailbox (const char **in)
+decode_addrspec (const char **in)
 {
 	InternetAddress *mailbox = NULL;
-	const char *inptr, *word;
-	gboolean bracket = FALSE;
-	GString *name = NULL;
+	const char *start, *inptr, *word;
+	gboolean got_local = FALSE;
 	GString *addr;
-	size_t n = 0;
+	size_t len;
 	
 	addr = g_string_new ("");
-	
-	decode_lwsp (in);
 	inptr = *in;
 	
-	if ((word = decode_word (&inptr)))
-		n = inptr - word;
-	
 	decode_lwsp (&inptr);
-	if (*inptr && !strchr (",.@", *inptr)) {
-		gboolean retried = FALSE;
-		
-		/* this mailbox has a name part, so get the name */
-		name = g_string_new ("");
-		while (word) {
-			g_string_append_len (name, word, n);
-			retried = FALSE;
-		retry:
-			if ((word = decode_word (&inptr))) {
-				g_string_append_c (name, ' ');
-				n = inptr - word;
-			}
-		}
-		
-		decode_lwsp (&inptr);
-		if (*inptr == '<') {
-			inptr++;
-			bracket = TRUE;
-			if ((word = decode_word (&inptr)))
-				n = inptr - word;
-		} else if (!retried && *inptr) {
-			w(g_warning ("Unexpected char '%c' in mailbox: %s: attempting recovery.",
-				     *inptr, *in));
-			/* chew up this bad char and then attempt 1 more pass at parsing */
-			g_string_append_c (name, *inptr++);
-			retried = TRUE;
-			goto retry;
-		} else {
-			g_string_free (name, TRUE);
-			g_string_free (addr, TRUE);
-			*in = inptr;
-			
-			return NULL;
-		}
-	}
+	start = inptr;
 	
-	if (word) {
-		g_string_append_len (addr, word, n);
-	} else {
-		w(g_warning ("No local part for email address: %s", *in));
-		if (name)
-			g_string_free (name, TRUE);
-		g_string_free (addr, TRUE);
-		
-		/* comma will be eaten by our caller */
-		if (*inptr && *inptr != ',')
-			*in = inptr + 1;
-		else
-			*in = inptr;
-		
-		return NULL;
+	/* extract the first word of the local-part */
+	if ((word = decode_word (&inptr))) {
+		g_string_append_len (addr, word, (size_t) (inptr - word));
+		decode_lwsp (&inptr);
+		got_local = TRUE;
 	}
 	
-	/* get the rest of the local-part */
-	decode_lwsp (&inptr);
-	while (*inptr == '.' && word) {
+	/* extract the rest of the local-part */
+	while (word && *inptr == '.') {
 		/* Note: According to the spec, only a single '.' is
 		 * allowed between word tokens in the local-part of an
 		 * addr-spec token, but according to Evolution bug
@@ -1445,145 +1421,238 @@ decode_mailbox (const char **in)
 		decode_lwsp (&inptr);
 	}
 	
-	/* we should be at the '@' now... */
 	if (*inptr == '@') {
-		size_t len = addr->len;
+		len = addr->len;
 		
 		g_string_append_c (addr, '@');
 		inptr++;
 		
-		if (!decode_domain (&inptr, addr))
+		if (!decode_domain (&inptr, addr)) {
+			/* drop the @domain and continue as if it weren't there */
+			w(g_warning ("Missing domain in addr-spec: %.*s",
+				     inptr - start, start));
 			g_string_truncate (addr, len);
-	} else {
-		w(g_warning ("No domain in email address: %s", *in));
+		}
+	} else if (got_local) {
+		w(g_warning ("Missing '@' and domain in addr-spec: %.*s",
+			     inptr - start, start));
 	}
 	
-	if (bracket) {
-		decode_lwsp (&inptr);
-		if (*inptr == '>')
-			inptr++;
-		else
-			w(g_warning ("Missing closing '>' bracket for email address: %s", *in));
+	*in = inptr;
+	
+	if (!got_local) {
+		w(g_warning ("Invalid addr-spec, missing local-part: %.*s",
+			     inptr - start, start));
+		g_string_free (addr, TRUE);
+		return NULL;
 	}
 	
-	if (!name || !name->len) {
-		/* look for a trailing comment to use as the name? */
-		char *comment;
+	mailbox = g_object_newv (INTERNET_ADDRESS_TYPE_MAILBOX, 0, NULL);
+	((InternetAddressMailbox *) mailbox)->addr = addr->str;
+	g_string_free (addr, FALSE);
+	
+	return mailbox;
+}
+
+static InternetAddress *
+decode_group (const char **in)
+{
+	InternetAddressGroup *group;
+	InternetAddress *addr;
+	const char *inptr;
+	
+	inptr = *in;
+	
+	addr = internet_address_group_new (NULL);
+	group = (InternetAddressGroup *) addr;
+	
+	decode_lwsp (&inptr);
+	while (*inptr && *inptr != ';') {
+		InternetAddress *member;
 		
-		if (name) {
-			g_string_free (name, TRUE);
-			name = NULL;
-		}
+		if ((member = decode_address (&inptr)))
+			_internet_address_group_add_member (group, member);
 		
-		comment = (char *) inptr;
 		decode_lwsp (&inptr);
-		if (inptr > comment) {
-			if ((comment = memchr (comment, '(', inptr - comment))) {
-				const char *cend;
-				
-				/* find the end of the comment */
-				cend = inptr - 1;
-				while (cend > comment && is_lwsp (*cend))
-					cend--;
-				
-				if (*cend == ')')
-					cend--;
-				
-				comment = g_strndup (comment + 1, (size_t) (cend - comment));
-				g_strstrip (comment);
-				
-				name = g_string_new (comment);
-				g_free (comment);
-			}
-		}
-	}
-	
-	*in = inptr;
-	
-	if (addr->len) {
-		if (name && !g_utf8_validate (name->str, name->len, NULL)) {
-			/* A (broken) mailer has sent us raw 8bit/multibyte text data... */
-			char *utf8 = g_mime_utils_decode_8bit (name->str, name->len);
+		while (*inptr == ',') {
+			inptr++;
+			decode_lwsp (&inptr);
+			if ((member = decode_address (&inptr)))
+				_internet_address_group_add_member (group, member);
 			
-			g_string_truncate (name, 0);
-			g_string_append (name, utf8);
-			g_free (utf8);
+			decode_lwsp (&inptr);
 		}
-		
-		mailbox = g_object_newv (INTERNET_ADDRESS_TYPE_MAILBOX, 0, NULL);
-		((InternetAddressMailbox *) mailbox)->addr = addr->str;
-		
-		if (name)
-			_internet_address_set_name (mailbox, name->str);
 	}
 	
-	g_string_free (addr, mailbox ? FALSE : TRUE);
-	if (name)
-		g_string_free (name, TRUE);
+	*in = inptr;
 	
-	return mailbox;
+	return addr;
 }
 
 static InternetAddress *
 decode_address (const char **in)
 {
-	InternetAddressGroup *group;
+	const char *inptr, *start, *word, *comment = NULL;
 	InternetAddress *addr = NULL;
-	const char *inptr, *start;
-	const char *word;
+	gboolean has_lwsp = FALSE;
+	gboolean is_word;
 	GString *name;
 	
 	decode_lwsp (in);
 	start = inptr = *in;
 	
-	/* pre-scan */
 	name = g_string_new ("");
-	word = decode_word (&inptr);
 	
-	while (word) {
-		g_string_append_len (name, word, (size_t) (inptr - word));
+	/**
+	 * Both groups and mailboxes can begin with a phrase (denoting
+	 * the display name for the address). Collect all of the
+	 * tokens that make up this name phrase.
+	 **/
+	while (*inptr) {
 		if ((word = decode_word (&inptr)))
-			g_string_append_c (name, ' ');
-	}
-	
-	decode_lwsp (&inptr);
-	if (*inptr == ':') {
-		addr = internet_address_group_new (name->str);
-		group = (InternetAddressGroup *) addr;
-		inptr++;
+			g_string_append_len (name, word, (size_t) (inptr - word));
 		
-		decode_lwsp (&inptr);
-		while (*inptr && *inptr != ';') {
-			InternetAddress *member;
+		if (word) {
+		check_lwsp:
+			word = inptr;
+			skip_lwsp (&inptr);
 			
-			if ((member = decode_mailbox (&inptr)))
-				_internet_address_group_add_member (group, member);
+			/* is the next token a word token? */
+			is_word = *inptr == '"' || is_atom (*inptr);
+			
+			if (inptr > word && is_word) {
+				g_string_append_c (name, ' ');
+				has_lwsp = TRUE;
+			}
 			
+			if (is_word)
+				continue;
+		}
+		
+		/**
+		 * specials    =  "(" / ")" / "<" / ">" / "@"  ; Must be in quoted-
+		 *             /  "," / ";" / ":" / "\" / <">  ;  string, to use
+		 *             /  "." / "[" / "]"              ;  within a word.
+		 **/
+		if (*inptr == ':') {
+			/* group */
+			inptr++;
+			addr = decode_group (&inptr);
 			decode_lwsp (&inptr);
-			while (*inptr == ',') {
+			if (*inptr != ';')
+				w(g_warning ("Invalid group spec, missing closing ';': %.*s",
+					     inptr - start, start));
+			else
+				inptr++;
+			break;
+		} else if (*inptr == '<') {
+			/* mailbox route-addr */
+			inptr++;
+			addr = decode_addrspec (&inptr);
+			decode_lwsp (&inptr);
+			if (*inptr != '>') {
+				w(g_warning ("Invalid route-addr, missing closing '>': %.*s",
+					     inptr - start, start));
+				
+				while (*inptr && *inptr != '>' && *inptr != ',')
+					inptr++;
+				
+				if (*inptr == '>')
+					inptr++;
+			} else
+				inptr++;
+			
+			/* if comment is non-NULL, we can check for a comment containing a name */
+			comment = inptr;
+			break;
+		} else if (*inptr == '(') {
+			/* comment suggests we are looking at an addr-spec */
+			goto addrspec;
+		} else if (strchr ("@,;", *inptr)) {
+			if (name->len == 0) {
+				if (*inptr == '@') {
+					GString *domain;
+					
+					w(g_warning ("Unexpected address: %s: skipping.", start));
+					
+					/* skip over @domain? */
+					inptr++;
+					domain = g_string_new ("");
+					decode_domain (&inptr, domain);
+					g_string_free (domain, TRUE);
+				} else {
+					/* empty address */
+				}
+				break;
+			} else if (has_lwsp) {
+				/* assume this is just an unquoted special that we should
+				   treat as part of the name */
+				w(g_warning ("Unquoted '%c' in address name: %s: ignoring.", *inptr, start));
+				g_string_append_c (name, *inptr);
 				inptr++;
-				decode_lwsp (&inptr);
-				if ((member = decode_mailbox (&inptr)))
-					_internet_address_group_add_member (group, member);
 				
-				decode_lwsp (&inptr);
+				goto check_lwsp;
 			}
-		}
-		
-		if (*inptr == ';')
+			
+		addrspec:
+			/* what we thought was a name was actually an addrspec? */
+			g_string_truncate (name, 0);
+			inptr = start;
+			
+			addr = decode_addrspec (&inptr);
+			
+			/* if comment is non-NULL, we can check for a comment containing a name */
+			comment = inptr;
+			break;
+		} else if (*inptr == '.') {
+			/* This would normally signify that we are
+			 * decoding the local-part of an addr-spec,
+			 * but sadly, it is common for broken mailers
+			 * to forget to quote/encode .'s in the name
+			 * phrase. */
+			g_string_append_c (name, *inptr);
 			inptr++;
-		else
-			w(g_warning ("Invalid group spec, missing closing ';': %.*s",
-				     inptr - start, start));
-		
-		*in = inptr;
-	} else {
-		/* this is a mailbox */
-		addr = decode_mailbox (in);
+			
+			goto check_lwsp;
+		} else if (*inptr) {
+			/* Technically, these are all invalid tokens
+			 * but in the interest of being liberal in
+			 * what we accept, we'll ignore them. */
+			w(g_warning ("Unexpected char '%c' in address: %s: ignoring.", *inptr, start));
+			g_string_append_c (name, *inptr);
+			inptr++;
+			
+			goto check_lwsp;
+		}
+	}
+	
+	/* Note: will also skip over any comments */
+	decode_lwsp (&inptr);
+	
+	if (name->len == 0 && comment && inptr > comment) {
+		/* missing a name, look for a trailing comment */
+		if ((comment = memchr (comment, '(', inptr - comment))) {
+			const char *cend;
+			
+			/* find the end of the comment */
+			cend = inptr - 1;
+			while (cend > comment && is_lwsp (*cend))
+				cend--;
+			
+			if (*cend == ')')
+				cend--;
+			
+			g_string_append_len (name, comment + 1, (size_t) (cend - comment));
+		}
 	}
 	
+	if (addr && name->len > 0)
+		_internet_address_decode_name (addr, name);
+	
 	g_string_free (name, TRUE);
 	
+	*in = inptr;
+	
 	return addr;
 }
 
@@ -1619,9 +1688,9 @@ internet_address_list_parse_string (const char *str)
 		}
 		
 		decode_lwsp (&inptr);
-		if (*inptr == ',')
+		if (*inptr == ',') {
 			inptr++;
-		else if (*inptr) {
+		} else if (*inptr) {
 			w(g_warning ("Parse error at '%s': expected ','", inptr));
 			/* try skipping to the next address */
 			if ((inptr = strchr (inptr, ',')))
diff --git a/tests/test-mime.c b/tests/test-mime.c
index fe9711d..36d993f 100644
--- a/tests/test-mime.c
+++ b/tests/test-mime.c
@@ -78,6 +78,9 @@ static struct {
 	{ "Jeffrey \"fejj\" Stedfast <fejj helixcode com>",
 	  "Jeffrey fejj Stedfast <fejj helixcode com>",
 	  "Jeffrey fejj Stedfast <fejj helixcode com>" },
+	{ "\"Jeffrey \\\"fejj\\\" Stedfast\" <fejj helixcode com>",
+	  "Jeffrey \"fejj\" Stedfast <fejj helixcode com>",
+	  "\"Jeffrey \\\"fejj\\\" Stedfast\" <fejj helixcode com>" },
 	{ "\"Stedfast, Jeffrey\" <fejj helixcode com>",
 	  "\"Stedfast, Jeffrey\" <fejj helixcode com>",
 	  "\"Stedfast, Jeffrey\" <fejj helixcode com>" },
@@ -166,6 +169,21 @@ static struct {
 	{ "Canonical Patch Queue Manager<pqm pqm ubuntu com>",
 	  "Canonical Patch Queue Manager <pqm pqm ubuntu com>",
 	  "Canonical Patch Queue Manager <pqm pqm ubuntu com>" },
+	/* The following tests cases are meant to test forgivingness
+	 * of the parser when it encounters unquoted specials in the
+	 * name component */
+	{ "Warren Worthington, Jr. <warren worthington com>",
+	  "\"Warren Worthington, Jr.\" <warren worthington com>",
+	  "\"Warren Worthington, Jr.\" <warren worthington com>" },
+	{ "dot.com <dot.com>",
+	  "\"dot.com\" <dot.com>",
+	  "\"dot.com\" <dot.com>" },
+	{ "=?UTF-8?Q?agatest123_\"test\"?= <agatest123 o2 pl>",
+	  "agatest123 test <agatest123 o2 pl>",
+	  "agatest123 test <agatest123 o2 pl>" },
+	{ "\"=?ISO-8859-2?Q?TEST?=\" <p p org>",
+	  "TEST <p p org>",
+	  "TEST <p p org>" },
 };
 
 static void
@@ -186,12 +204,12 @@ test_addrspec (void)
 			
 			str = internet_address_list_to_string (addrlist, FALSE);
 			if (strcmp (addrspec[i].display, str) != 0)
-				throw (exception_new ("display addr-spec does not match: %s", str));
+				throw (exception_new ("display addr-spec %s does not match: %s", addrspec[i].display, str));
 			g_free (str);
 			
 			str = internet_address_list_to_string (addrlist, TRUE);
 			if (strcmp (addrspec[i].encoded, str) != 0)
-				throw (exception_new ("encoded addr-spec does not match: %s", str));
+				throw (exception_new ("encoded addr-spec %s does not match: %s", addrspec[i].encoded, str));
 			
 			testsuite_check_passed ();
 		} catch (ex) {
@@ -527,7 +545,7 @@ int main (int argc, char **argv)
 	testsuite_start ("addr-spec parser");
 	test_addrspec ();
 	testsuite_end ();
-	
+
 	testsuite_start ("date parser");
 	test_date_parser ();
 	testsuite_end ();
@@ -545,7 +563,7 @@ int main (int argc, char **argv)
 	testsuite_end ();
 	
 	g_mime_shutdown ();
-	
+
 	g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
 	testsuite_start ("broken rfc2047 encoding/decoding");
 	test_rfc2047 (TRUE);


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]