Re: [gmime-devel] Email parsing functions flawed and not practical to use
- From: Jeffrey Stedfast <fejj novell com>
- To: Damian Pietras <daper daper net>
- Cc: gmime-devel-list gnome org
- Subject: Re: [gmime-devel] Email parsing functions flawed and not practical to use
- Date: Thu, 26 Aug 2010 10:38:29 -0400
On 08/26/2010 09:00 AM, Damian Pietras wrote:
> On Thu, Aug 26, 2010 at 02:34:42PM +0200, Damian Pietras wrote:
>
>>> The good news is that I *think* that the following example should be
>>> doable...
>>> Email: "=?ISO-8859-2?Q?TEST?=" <p p org>
>>>
>>> This last one will likely require changes to header_decode_phrase().
>>> I'll try looking into this one in the next few days.
>>>
>> I think it's just matter of unquoting strings before decoding encoded
>> words. _internet_address_decode_name() should only do decoding,
>> unquoting should be done only in decode_address(). Since it uses
>> decode_word() that handles unquoting it should work, but I don't know
>> why it's not true.
>>
> Attached is my proposition (a patch against your patched version).
>
Instead of manually unquoting each qword as they are tokenized, you
could just swap unquote_string() to be called *before* decode_phrase().
As it turns out, doing that fixes your other two examples as well (well,
more-or-less - the results should be acceptable in any case).
Attached is the new patch (in case I made other changes since the
previous one, I can't recall).
Hope this works out for you,
Jeff
diff --git a/gmime/internet-address.c b/gmime/internet-address.c
index 7dbba42..3477ae3 100644
--- a/gmime/internet-address.c
+++ b/gmime/internet-address.c
@@ -1,6 +1,6 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* GMime
- * Copyright (C) 2000-2009 Jeffrey Stedfast
+ * Copyright (C) 2000-2010 Jeffrey Stedfast
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -286,15 +286,9 @@ _internet_address_set_name (InternetAddress *ia, const char *name)
{
char *buf;
- if (name) {
- buf = g_mime_utils_header_decode_phrase (name);
- g_mime_utils_unquote_string (buf);
- g_free (ia->name);
- ia->name = buf;
- } else {
- g_free (ia->name);
- ia->name = NULL;
- }
+ buf = g_strdup (name);
+ g_free (ia->name);
+ ia->name = buf;
}
/**
@@ -446,8 +440,7 @@ internet_address_mailbox_new (const char *name, const char *addr)
mailbox = g_object_newv (INTERNET_ADDRESS_TYPE_MAILBOX, 0, NULL);
mailbox->addr = g_strdup (addr);
- if (name != NULL)
- _internet_address_set_name ((InternetAddress *) mailbox, name);
+ _internet_address_set_name ((InternetAddress *) mailbox, name);
return (InternetAddress *) mailbox;
}
@@ -1352,82 +1345,65 @@ internet_address_list_writer (InternetAddressList *list, GString *str)
_internet_address_list_to_string (list, flags, &linelen, str);
}
+static void
+_internet_address_decode_name (InternetAddress *ia, GString *name)
+{
+ char *value, *buf = NULL;
+ char *phrase;
+
+ if (!g_utf8_validate (name->str, name->len, NULL)) {
+ /* A (broken) mailer has sent us raw 8bit/multibyte text data... */
+ buf = g_mime_utils_decode_8bit (name->str, name->len);
+ phrase = buf;
+ } else {
+ phrase = name->str;
+ }
+
+ /* decode the phrase */
+ g_mime_utils_unquote_string (phrase);
+ value = g_mime_utils_header_decode_phrase (phrase);
+ g_free (ia->name);
+ ia->name = value;
+ g_free (buf);
+}
+
+static InternetAddress *decode_address (const char **in);
+
+static void
+skip_lwsp (const char **in)
+{
+ register const char *inptr = *in;
+
+ while (*inptr && is_lwsp (*inptr))
+ inptr++;
+
+ *in = inptr;
+}
+
static InternetAddress *
-decode_mailbox (const char **in)
+decode_addrspec (const char **in)
{
InternetAddress *mailbox = NULL;
- const char *inptr, *word;
- gboolean bracket = FALSE;
- GString *name = NULL;
+ const char *start, *inptr, *word;
+ gboolean got_local = FALSE;
GString *addr;
- size_t n = 0;
+ size_t len;
addr = g_string_new ("");
-
- decode_lwsp (in);
inptr = *in;
- if ((word = decode_word (&inptr)))
- n = inptr - word;
-
decode_lwsp (&inptr);
- if (*inptr && !strchr (",.@", *inptr)) {
- gboolean retried = FALSE;
-
- /* this mailbox has a name part, so get the name */
- name = g_string_new ("");
- while (word) {
- g_string_append_len (name, word, n);
- retried = FALSE;
- retry:
- if ((word = decode_word (&inptr))) {
- g_string_append_c (name, ' ');
- n = inptr - word;
- }
- }
-
- decode_lwsp (&inptr);
- if (*inptr == '<') {
- inptr++;
- bracket = TRUE;
- if ((word = decode_word (&inptr)))
- n = inptr - word;
- } else if (!retried && *inptr) {
- w(g_warning ("Unexpected char '%c' in mailbox: %s: attempting recovery.",
- *inptr, *in));
- /* chew up this bad char and then attempt 1 more pass at parsing */
- g_string_append_c (name, *inptr++);
- retried = TRUE;
- goto retry;
- } else {
- g_string_free (name, TRUE);
- g_string_free (addr, TRUE);
- *in = inptr;
-
- return NULL;
- }
- }
+ start = inptr;
- if (word) {
- g_string_append_len (addr, word, n);
- } else {
- w(g_warning ("No local part for email address: %s", *in));
- if (name)
- g_string_free (name, TRUE);
- g_string_free (addr, TRUE);
-
- /* comma will be eaten by our caller */
- if (*inptr && *inptr != ',')
- *in = inptr + 1;
- else
- *in = inptr;
-
- return NULL;
+ /* extract the first word of the local-part */
+ if ((word = decode_word (&inptr))) {
+ g_string_append_len (addr, word, (size_t) (inptr - word));
+ decode_lwsp (&inptr);
+ got_local = TRUE;
}
- /* get the rest of the local-part */
- decode_lwsp (&inptr);
- while (*inptr == '.' && word) {
+ /* extract the rest of the local-part */
+ while (word && *inptr == '.') {
/* Note: According to the spec, only a single '.' is
* allowed between word tokens in the local-part of an
* addr-spec token, but according to Evolution bug
@@ -1445,145 +1421,238 @@ decode_mailbox (const char **in)
decode_lwsp (&inptr);
}
- /* we should be at the '@' now... */
if (*inptr == '@') {
- size_t len = addr->len;
+ len = addr->len;
g_string_append_c (addr, '@');
inptr++;
- if (!decode_domain (&inptr, addr))
+ if (!decode_domain (&inptr, addr)) {
+ /* drop the @domain and continue as if it weren't there */
+ w(g_warning ("Missing domain in addr-spec: %.*s",
+ inptr - start, start));
g_string_truncate (addr, len);
- } else {
- w(g_warning ("No domain in email address: %s", *in));
+ }
+ } else if (got_local) {
+ w(g_warning ("Missing '@' and domain in addr-spec: %.*s",
+ inptr - start, start));
}
- if (bracket) {
- decode_lwsp (&inptr);
- if (*inptr == '>')
- inptr++;
- else
- w(g_warning ("Missing closing '>' bracket for email address: %s", *in));
+ *in = inptr;
+
+ if (!got_local) {
+ w(g_warning ("Invalid addr-spec, missing local-part: %.*s",
+ inptr - start, start));
+ g_string_free (addr, TRUE);
+ return NULL;
}
- if (!name || !name->len) {
- /* look for a trailing comment to use as the name? */
- char *comment;
+ mailbox = g_object_newv (INTERNET_ADDRESS_TYPE_MAILBOX, 0, NULL);
+ ((InternetAddressMailbox *) mailbox)->addr = addr->str;
+ g_string_free (addr, FALSE);
+
+ return mailbox;
+}
+
+static InternetAddress *
+decode_group (const char **in)
+{
+ InternetAddressGroup *group;
+ InternetAddress *addr;
+ const char *inptr;
+
+ inptr = *in;
+
+ addr = internet_address_group_new (NULL);
+ group = (InternetAddressGroup *) addr;
+
+ decode_lwsp (&inptr);
+ while (*inptr && *inptr != ';') {
+ InternetAddress *member;
- if (name) {
- g_string_free (name, TRUE);
- name = NULL;
- }
+ if ((member = decode_address (&inptr)))
+ _internet_address_group_add_member (group, member);
- comment = (char *) inptr;
decode_lwsp (&inptr);
- if (inptr > comment) {
- if ((comment = memchr (comment, '(', inptr - comment))) {
- const char *cend;
-
- /* find the end of the comment */
- cend = inptr - 1;
- while (cend > comment && is_lwsp (*cend))
- cend--;
-
- if (*cend == ')')
- cend--;
-
- comment = g_strndup (comment + 1, (size_t) (cend - comment));
- g_strstrip (comment);
-
- name = g_string_new (comment);
- g_free (comment);
- }
- }
- }
-
- *in = inptr;
-
- if (addr->len) {
- if (name && !g_utf8_validate (name->str, name->len, NULL)) {
- /* A (broken) mailer has sent us raw 8bit/multibyte text data... */
- char *utf8 = g_mime_utils_decode_8bit (name->str, name->len);
+ while (*inptr == ',') {
+ inptr++;
+ decode_lwsp (&inptr);
+ if ((member = decode_address (&inptr)))
+ _internet_address_group_add_member (group, member);
- g_string_truncate (name, 0);
- g_string_append (name, utf8);
- g_free (utf8);
+ decode_lwsp (&inptr);
}
-
- mailbox = g_object_newv (INTERNET_ADDRESS_TYPE_MAILBOX, 0, NULL);
- ((InternetAddressMailbox *) mailbox)->addr = addr->str;
-
- if (name)
- _internet_address_set_name (mailbox, name->str);
}
- g_string_free (addr, mailbox ? FALSE : TRUE);
- if (name)
- g_string_free (name, TRUE);
+ *in = inptr;
- return mailbox;
+ return addr;
}
static InternetAddress *
decode_address (const char **in)
{
- InternetAddressGroup *group;
+ const char *inptr, *start, *word, *comment = NULL;
InternetAddress *addr = NULL;
- const char *inptr, *start;
- const char *word;
+ gboolean has_lwsp = FALSE;
+ gboolean is_word;
GString *name;
decode_lwsp (in);
start = inptr = *in;
- /* pre-scan */
name = g_string_new ("");
- word = decode_word (&inptr);
- while (word) {
- g_string_append_len (name, word, (size_t) (inptr - word));
+ /**
+ * Both groups and mailboxes can begin with a phrase (denoting
+ * the display name for the address). Collect all of the
+ * tokens that make up this name phrase.
+ **/
+ while (*inptr) {
if ((word = decode_word (&inptr)))
- g_string_append_c (name, ' ');
- }
-
- decode_lwsp (&inptr);
- if (*inptr == ':') {
- addr = internet_address_group_new (name->str);
- group = (InternetAddressGroup *) addr;
- inptr++;
+ g_string_append_len (name, word, (size_t) (inptr - word));
- decode_lwsp (&inptr);
- while (*inptr && *inptr != ';') {
- InternetAddress *member;
+ if (word) {
+ check_lwsp:
+ word = inptr;
+ skip_lwsp (&inptr);
- if ((member = decode_mailbox (&inptr)))
- _internet_address_group_add_member (group, member);
+ /* is the next token a word token? */
+ is_word = *inptr == '"' || is_atom (*inptr);
+
+ if (inptr > word && is_word) {
+ g_string_append_c (name, ' ');
+ has_lwsp = TRUE;
+ }
+ if (is_word)
+ continue;
+ }
+
+ /**
+ * specials = "(" / ")" / "<" / ">" / "@" ; Must be in quoted-
+ * / "," / ";" / ":" / "\" / <"> ; string, to use
+ * / "." / "[" / "]" ; within a word.
+ **/
+ if (*inptr == ':') {
+ /* group */
+ inptr++;
+ addr = decode_group (&inptr);
decode_lwsp (&inptr);
- while (*inptr == ',') {
+ if (*inptr != ';')
+ w(g_warning ("Invalid group spec, missing closing ';': %.*s",
+ inptr - start, start));
+ else
+ inptr++;
+ break;
+ } else if (*inptr == '<') {
+ /* mailbox route-addr */
+ inptr++;
+ addr = decode_addrspec (&inptr);
+ decode_lwsp (&inptr);
+ if (*inptr != '>') {
+ w(g_warning ("Invalid route-addr, missing closing '>': %.*s",
+ inptr - start, start));
+
+ while (*inptr && *inptr != '>' && *inptr != ',')
+ inptr++;
+
+ if (*inptr == '>')
+ inptr++;
+ } else
+ inptr++;
+
+ /* if comment is non-NULL, we can check for a comment containing a name */
+ comment = inptr;
+ break;
+ } else if (*inptr == '(') {
+ /* comment suggests we are looking at an addr-spec */
+ goto addrspec;
+ } else if (strchr ("@,;", *inptr)) {
+ if (name->len == 0) {
+ if (*inptr == '@') {
+ GString *domain;
+
+ w(g_warning ("Unexpected address: %s: skipping.", start));
+
+ /* skip over @domain? */
+ inptr++;
+ domain = g_string_new ("");
+ decode_domain (&inptr, domain);
+ g_string_free (domain, TRUE);
+ } else {
+ /* empty address */
+ }
+ break;
+ } else if (has_lwsp) {
+ /* assume this is just an unquoted special that we should
+ treat as part of the name */
+ w(g_warning ("Unquoted '%c' in address name: %s: ignoring.", *inptr, start));
+ g_string_append_c (name, *inptr);
inptr++;
- decode_lwsp (&inptr);
- if ((member = decode_mailbox (&inptr)))
- _internet_address_group_add_member (group, member);
- decode_lwsp (&inptr);
+ goto check_lwsp;
}
- }
-
- if (*inptr == ';')
+
+ addrspec:
+ /* what we thought was a name was actually an addrspec? */
+ g_string_truncate (name, 0);
+ inptr = start;
+
+ addr = decode_addrspec (&inptr);
+
+ /* if comment is non-NULL, we can check for a comment containing a name */
+ comment = inptr;
+ break;
+ } else if (*inptr == '.') {
+ /* This would normally signify that we are
+ * decoding the local-part of an addr-spec,
+ * but sadly, it is common for broken mailers
+ * to forget to quote/encode .'s in the name
+ * phrase. */
+ g_string_append_c (name, *inptr);
inptr++;
- else
- w(g_warning ("Invalid group spec, missing closing ';': %.*s",
- inptr - start, start));
-
- *in = inptr;
- } else {
- /* this is a mailbox */
- addr = decode_mailbox (in);
+
+ goto check_lwsp;
+ } else if (*inptr) {
+ /* Technically, these are all invalid tokens
+ * but in the interest of being liberal in
+ * what we accept, we'll ignore them. */
+ w(g_warning ("Unexpected char '%c' in address: %s: ignoring.", *inptr, start));
+ g_string_append_c (name, *inptr);
+ inptr++;
+
+ goto check_lwsp;
+ }
+ }
+
+ /* Note: will also skip over any comments */
+ decode_lwsp (&inptr);
+
+ if (name->len == 0 && comment && inptr > comment) {
+ /* missing a name, look for a trailing comment */
+ if ((comment = memchr (comment, '(', inptr - comment))) {
+ const char *cend;
+
+ /* find the end of the comment */
+ cend = inptr - 1;
+ while (cend > comment && is_lwsp (*cend))
+ cend--;
+
+ if (*cend == ')')
+ cend--;
+
+ g_string_append_len (name, comment + 1, (size_t) (cend - comment));
+ }
}
+ if (addr && name->len > 0)
+ _internet_address_decode_name (addr, name);
+
g_string_free (name, TRUE);
+ *in = inptr;
+
return addr;
}
@@ -1619,9 +1688,9 @@ internet_address_list_parse_string (const char *str)
}
decode_lwsp (&inptr);
- if (*inptr == ',')
+ if (*inptr == ',') {
inptr++;
- else if (*inptr) {
+ } else if (*inptr) {
w(g_warning ("Parse error at '%s': expected ','", inptr));
/* try skipping to the next address */
if ((inptr = strchr (inptr, ',')))
diff --git a/tests/test-mime.c b/tests/test-mime.c
index fe9711d..36d993f 100644
--- a/tests/test-mime.c
+++ b/tests/test-mime.c
@@ -78,6 +78,9 @@ static struct {
{ "Jeffrey \"fejj\" Stedfast <fejj helixcode com>",
"Jeffrey fejj Stedfast <fejj helixcode com>",
"Jeffrey fejj Stedfast <fejj helixcode com>" },
+ { "\"Jeffrey \\\"fejj\\\" Stedfast\" <fejj helixcode com>",
+ "Jeffrey \"fejj\" Stedfast <fejj helixcode com>",
+ "\"Jeffrey \\\"fejj\\\" Stedfast\" <fejj helixcode com>" },
{ "\"Stedfast, Jeffrey\" <fejj helixcode com>",
"\"Stedfast, Jeffrey\" <fejj helixcode com>",
"\"Stedfast, Jeffrey\" <fejj helixcode com>" },
@@ -166,6 +169,21 @@ static struct {
{ "Canonical Patch Queue Manager<pqm pqm ubuntu com>",
"Canonical Patch Queue Manager <pqm pqm ubuntu com>",
"Canonical Patch Queue Manager <pqm pqm ubuntu com>" },
+ /* The following tests cases are meant to test forgivingness
+ * of the parser when it encounters unquoted specials in the
+ * name component */
+ { "Warren Worthington, Jr. <warren worthington com>",
+ "\"Warren Worthington, Jr.\" <warren worthington com>",
+ "\"Warren Worthington, Jr.\" <warren worthington com>" },
+ { "dot.com <dot.com>",
+ "\"dot.com\" <dot.com>",
+ "\"dot.com\" <dot.com>" },
+ { "=?UTF-8?Q?agatest123_\"test\"?= <agatest123 o2 pl>",
+ "agatest123 test <agatest123 o2 pl>",
+ "agatest123 test <agatest123 o2 pl>" },
+ { "\"=?ISO-8859-2?Q?TEST?=\" <p p org>",
+ "TEST <p p org>",
+ "TEST <p p org>" },
};
static void
@@ -186,12 +204,12 @@ test_addrspec (void)
str = internet_address_list_to_string (addrlist, FALSE);
if (strcmp (addrspec[i].display, str) != 0)
- throw (exception_new ("display addr-spec does not match: %s", str));
+ throw (exception_new ("display addr-spec %s does not match: %s", addrspec[i].display, str));
g_free (str);
str = internet_address_list_to_string (addrlist, TRUE);
if (strcmp (addrspec[i].encoded, str) != 0)
- throw (exception_new ("encoded addr-spec does not match: %s", str));
+ throw (exception_new ("encoded addr-spec %s does not match: %s", addrspec[i].encoded, str));
testsuite_check_passed ();
} catch (ex) {
@@ -527,7 +545,7 @@ int main (int argc, char **argv)
testsuite_start ("addr-spec parser");
test_addrspec ();
testsuite_end ();
-
+
testsuite_start ("date parser");
test_date_parser ();
testsuite_end ();
@@ -545,7 +563,7 @@ int main (int argc, char **argv)
testsuite_end ();
g_mime_shutdown ();
-
+
g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
testsuite_start ("broken rfc2047 encoding/decoding");
test_rfc2047 (TRUE);
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]