[balsa] imap-util: Error in IMAP mailbox encoding



commit 78006803b74d0b30ee1d94e8274c8cecaab36779
Author: Albrecht Dreß <albrecht dress arcor de>
Date:   Sat Feb 23 14:14:27 2019 -0500

    imap-util: Error in IMAP mailbox encoding
    
    Balsa has a bug in encoding IMAP mailbox names to modified UTF-7
    as defined in RFC 3501, sect. 5.1.3 [1], as the '\' (0x5c) is
    encoded as '\\' which is explicitly prohibited.  To reproduce:
    activate main() in libbalsa/imap/util.c, compile it, and run
    e.g. (yes, the mailbox name is weird, but it includes all special
    cases from the RFC…)
    
    /util 'ϴä ab c&d+e/f\~ßx'
    orig='ϴä ab c&d+e/f\~ßx' mbx='&A,QA5A- ab c&-d+e/f\\~&AN8-x' back='ϴä ab c&d+e/f\\~ßx'
    WRONG CONVERSION: --------------------------------^^
    
    * libbalsa/imap/util.c (imap_utf8_to_mailbox),
      (imap_mailbox_to_utf8): replace the hand-coded conversion by
      utilising g_convert(), and also fix the bug
    * libbalsa/imap/util.h: declare them with the
      G_GNUC_WARN_UNUSED_RESULT attribute.
    
    Signed-off-by: Peter Bloomfield <PeterBloomfield bellsouth net>

 ChangeLog            |  21 ++++
 libbalsa/imap/util.c | 295 +++++++++++++++------------------------------------
 libbalsa/imap/util.h |   6 +-
 3 files changed, 113 insertions(+), 209 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 2385c884c..21ee412fb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2018-02-23  Albrecht Dreß  <albrecht dress arcor de>
+
+       Error in IMAP mailbox encoding
+
+       Balsa has a bug in encoding IMAP mailbox names to modified UTF-7
+       as defined in RFC 3501, sect. 5.1.3 [1], as the '\' (0x5c) is
+       encoded as '\\' which is explicitly prohibited.  To reproduce:
+       activate main() in libbalsa/imap/util.c, compile it, and run
+       e.g. (yes, the mailbox name is weird, but it includes all special
+       cases from the RFC…)
+
+       /util 'ϴä ab c&d+e/f\~ßx'
+       orig='ϴä ab c&d+e/f\~ßx' mbx='&A,QA5A- ab c&-d+e/f\\~&AN8-x' back='ϴä ab c&d+e/f\\~ßx'
+       WRONG CONVERSION: --------------------------------^^
+
+       * libbalsa/imap/util.c (imap_utf8_to_mailbox),
+         (imap_mailbox_to_utf8): replace the hand-coded conversion by
+         utilising g_convert(), and also fix the bug
+       * libbalsa/imap/util.h: declare them with the
+         G_GNUC_WARN_UNUSED_RESULT attribute.
+
 2019-02-22  Peter Bloomfield  <pbloomfield bellsouth net>
 
        * libbalsa/html.c (lbh_web_view_new): use a static location when
diff --git a/libbalsa/imap/util.c b/libbalsa/imap/util.c
index 6860ab3da..b2809f223 100644
--- a/libbalsa/imap/util.c
+++ b/libbalsa/imap/util.c
@@ -94,220 +94,101 @@ imap_next_word(char *s)
 /* ===================================================================
  * UTF-7 conversion routines as in RFC 2192
  * =================================================================== */
-/* UTF7 modified base64 alphabet */
-static char base64chars[] =
-  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
-#define UNDEFINED 64
 
-/* UTF16 definitions */
-#define UTF16MASK       0x03FFUL
-#define UTF16SHIFT      10
-#define UTF16BASE       0x10000UL
-#define UTF16HIGHSTART  0xD800UL
-#define UTF16HIGHEND    0xDBFFUL
-#define UTF16LOSTART    0xDC00UL
-#define UTF16LOEND      0xDFFFUL
+/* see RFC 3501, Section 5.1.3. Mailbox International Naming Convention:
+ * In modified UTF-7, printable US-ASCII characters, except for "&", represent themselves; that is, 
characters with octet values
+ * 0x20-0x25 and 0x27-0x7e. */
+#define IS_VALID_ASCII(c)      ((((c) >= '\x20') && ((c) <= '\x25')) || (((c) >= '\x27') && ((c) <= '\x7e')))
 
-
-/* Convert an IMAP mailbox to a UTF-8 string.
- *  dst needs to have roughly 4 times the storage space of src
- *    Hex encoding can triple the size of the input
- *    UTF-7 can be slightly denser than UTF-8
- *     (worst case: 8 octets UTF-7 becomes 9 octets UTF-8)
- */
-char*
-imap_mailbox_to_utf8(const char *mbox)
+gchar *
+imap_utf8_to_mailbox(const gchar *mbox)
 {
-  unsigned c, i, bitcount;
-  unsigned long ucs4, utf16, bitbuf;
-  unsigned char base64[256];
-  const char *src;
-  char *dst, *res  = malloc(2*strlen(mbox)+1);
-  
-  bitbuf = 0;
-  dst = res;
-  src = mbox;
-  if(!dst) return NULL;
-  /* initialize modified base64 decoding table */
-  memset(base64, UNDEFINED, sizeof (base64));
-  for (i = 0; i < sizeof (base64chars); ++i) {
-    base64[(unsigned)base64chars[i]] = i;
-  }
-  
-  /* loop until end of string */
-  while (*src != '\0') {
-    c = *src++;
-    /* deal with literal characters and &- */
-    if (c != '&' || *src == '-') {
-      /* encode literally */
-      *dst++ = c;
-      /* skip over the '-' if this is an &- sequence */
-      if (c == '&') ++src;
-    } else {
-      /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
-      bitbuf = 0;
-      bitcount = 0;
-      ucs4 = 0;
-      while ((c = base64[(unsigned char) *src]) != UNDEFINED) {
-        ++src;
-        bitbuf = (bitbuf << 6) | c;
-        bitcount += 6;
-        /* enough bits for a UTF-16 character? */
-        if (bitcount >= 16) {
-          bitcount -= 16;
-          utf16 = (bitcount ? bitbuf >> bitcount
-                   : bitbuf) & 0xffff;
-          /* convert UTF16 to UCS4 */
-          if
-            (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
-            ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
-            continue;
-          } else if
-            (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
-            ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
-          } else {
-            ucs4 = utf16;
-          }
-          
-          /* convert UTF-16 range of UCS4 to UTF-8 */
-          if (ucs4 <= 0x7fUL) {
-            dst[0] = ucs4;
-            dst += 1;
-          } else if (ucs4 <= 0x7ffUL) {
-            dst[0] = 0xc0 | (ucs4 >> 6);
-            dst[1] = 0x80 | (ucs4 & 0x3f);
-            dst += 2;
-          } else if (ucs4 <= 0xffffUL) {
-            dst[0] = 0xe0 | (ucs4 >> 12);
-            dst[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
-            dst[2] = 0x80 | (ucs4 & 0x3f);
-            dst += 3;
-          } else {
-            dst[0] = 0xf0 | (ucs4 >> 18);
-            dst[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
-            dst[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
-            dst[3] = 0x80 | (ucs4 & 0x3f);
-            dst += 4;
-          }
-        }
-      }
-      /* skip over trailing '-' in modified UTF-7 encoding */
-      if (*src == '-') ++src;
-    }
-  }
-  /* terminate destination string */
-  *dst = '\0';
-  return res;
+       GString *buffer;
+       const gchar *next_in;
+
+       buffer = g_string_sized_new(strlen(mbox));      /* sufficient size for ASCII only */
+       next_in = mbox;
+       while (*next_in != '\0') {
+               if (IS_VALID_ASCII(*next_in)) {
+                       g_string_append_c(buffer, *next_in++);
+               } else if (*next_in == '&') {
+                       g_string_append(buffer, "&-");          /* see RFC 3501, Section 5.1.3 */
+                       next_in++;
+               } else {
+                       const gchar *next_ascii;
+                       gchar *utf7;
+                       gsize utf7len;
+
+                       next_ascii = g_utf8_next_char(next_in);
+                       while ((*next_ascii != '\0') && !IS_VALID_ASCII(*next_ascii)) {
+                                next_ascii = g_utf8_next_char(next_ascii);
+                       }
+                       utf7 = g_convert(next_in, next_ascii - next_in, "utf7", "utf8", NULL, &utf7len, NULL);
+                       if (utf7 != NULL) {
+                               gsize n;
+                               utf7[0] = '&';                                  /* see RFC 3501, Section 
5.1.3 */
+
+                               for (n = 1U; n < utf7len; n++) {
+                                       if (utf7[n] == '/') {           /* see RFC 3501, Section 5.1.3 */
+                                               utf7[n] = ',';
+                                       }
+                               }
+                               g_string_append_len(buffer, utf7, utf7len);
+                               g_free(utf7);
+                       }
+                       next_in = next_ascii;
+               }
+       }
+
+       return g_string_free(buffer, FALSE);
 }
 
-/* Convert hex coded UTF-8 string to modified UTF-7 IMAP mailbox
- *  dst should be about twice the length of src to deal with non-hex
- *  coded URLs
- */
-char*
-imap_utf8_to_mailbox(const char *src)
+gchar *
+imap_mailbox_to_utf8(const gchar *mbox)
 {
-  unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
-  unsigned long ucs4 = 0, bitbuf = 0;
-
-  /* initialize hex lookup table */
-  char *dst, *res = malloc(2*strlen(src)+1);
-  dst = res;
-  if(!dst) return NULL;
+       GString *buffer;
+       const gchar *next_in;
+
+       buffer = g_string_sized_new(strlen(mbox));              /* always sufficiently long */
+       next_in = mbox;
+       while (*next_in != '\0') {
+               if (*next_in == '&') {
+                       if (next_in[1] == '-') {                                /* see RFC 3501, Section 
5.1.3 */
+                               g_string_append_c(buffer, '&');
+                               next_in = &next_in[2];
+                       } else {
+                               gchar *utf7buf;
+                               gchar *next_utf7;
+                               gchar *utf8;
+                               gsize utf8len;
+
+                               utf7buf = g_malloc0(strlen(next_in) + 1U);
+                               utf7buf[0] = '+';                                       /* RFC 2152 shift 
character */
+                               next_in++;
+                               for (next_utf7 = &utf7buf[1]; (*next_in != '\0') && (*next_in != '-'); 
next_in++) {
+                                       if (*next_in == ',') {                  /* see RFC 3501, Section 
5.1.3 */
+                                               *next_utf7++ = '/';
+                                       } else {
+                                               *next_utf7++ = *next_in;
+                                       }
+                               }
+                               *next_utf7 = *next_in;
+                               if (*next_in == '-') {
+                                       next_in++;
+                               }
+                               utf8 = g_convert(utf7buf, -1, "utf8", "utf7", NULL, &utf8len, NULL);
+                               if (utf8 != NULL) {
+                                       g_string_append_len(buffer, utf8, utf8len);
+                                       g_free(utf8);
+                               }
+                               g_free(utf7buf);
+                       }
+               } else {
+                       g_string_append_c(buffer, *next_in++);
+               }
+       }
 
-  utf7mode = 0;
-  utf8total = 0;
-  bitstogo = 0;
-  utf8pos = 0;
-  while ((c = (unsigned char)*src) != '\0') {
-    ++src;
-    /* normal character? */
-    if (c >= ' ' && c <= '~') {
-      /* switch out of UTF-7 mode */
-      if (utf7mode) {
-        if (bitstogo) {
-          *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
-        }
-        *dst++ = '-';
-        utf7mode = 0;
-        utf8pos  = 0;
-        bitstogo = 0;
-        utf8total= 0;
-      }
-      /* encode '\' as '\\', and '"' as '\"' */
-      if (c == '\\' || c == '"') {
-        *dst++ = '\\';
-      }
-      *dst++ = c;
-      /* encode '&' as '&-' */
-      if (c == '&') {
-        *dst++ = '-';
-      }
-      continue;
-    }
-    /* switch to UTF-7 mode */
-    if (!utf7mode) {
-      *dst++ = '&';
-      utf7mode = 1;
-    }
-    /* Encode US-ASCII characters as themselves */
-    if (c < 0x80) {
-      ucs4 = c;
-      utf8total = 1;
-    } else if (utf8total) {
-      /* save UTF8 bits into UCS4 */
-      ucs4 = (ucs4 << 6) | (c & 0x3FUL);
-      if (++utf8pos < utf8total) {
-        continue;
-      }
-    } else {
-      utf8pos = 1;
-      if (c < 0xE0) {
-        utf8total = 2;
-        ucs4 = c & 0x1F;
-      } else if (c < 0xF0) {
-        utf8total = 3;
-        ucs4 = c & 0x0F;
-      } else {
-        /* NOTE: can't convert UTF8 sequences longer than 4 */
-        utf8total = 4;
-        ucs4 = c & 0x03;
-      }
-      continue;
-    }
-    /* loop to split ucs4 into two utf16 chars if necessary */
-    utf8total = 0;
-    do {
-      if (ucs4 >= UTF16BASE) {
-        ucs4 -= UTF16BASE;
-        bitbuf = (bitbuf << 16) | ((ucs4 >> UTF16SHIFT)
-                                   + UTF16HIGHSTART);
-        ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
-        utf16flag = 1;
-      } else {
-        bitbuf = (bitbuf << 16) | ucs4;
-        utf16flag = 0;
-      }
-      bitstogo += 16;
-      /* spew out base64 */
-      while (bitstogo >= 6) {
-        bitstogo -= 6;
-        *dst++ = base64chars[(bitstogo ? (bitbuf >> bitstogo)
-                              : bitbuf)
-                             & 0x3F];
-      }
-    } while (utf16flag);
-  }
-  /* if in UTF-7 mode, finish in ASCII */
-  if (utf7mode) {
-    if (bitstogo) {
-      *dst++ = base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
-    }
-    *dst++ = '-';
-  }
-  /* tie off string */
-  *dst = '\0';
-  return res;
+       return g_string_free(buffer, FALSE);
 }
 
 #if 0
diff --git a/libbalsa/imap/util.h b/libbalsa/imap/util.h
index a01477f1d..1557e1a99 100644
--- a/libbalsa/imap/util.h
+++ b/libbalsa/imap/util.h
@@ -24,7 +24,9 @@ gchar *imap_quote_string(const gchar *src)
 char* imap_next_word(char *s);
 char* imap_skip_atom(char *s);
 
-char* imap_mailbox_to_utf8(const char *src);
-char* imap_utf8_to_mailbox(const char *src);
+gchar* imap_mailbox_to_utf8(const char *src)
+       G_GNUC_WARN_UNUSED_RESULT;
+gchar* imap_utf8_to_mailbox(const char *src)
+       G_GNUC_WARN_UNUSED_RESULT;
 
 #endif


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]