[vte] iconv: Fix the UTF-8 decoder when a NUL byte is encountered

From: Egmont Koblinger <egmontkob src gnome org>
To: commits-list gnome org
Cc:
Subject: [vte] iconv: Fix the UTF-8 decoder when a NUL byte is encountered
Date: Sat, 17 May 2014 11:47:35 +0000 (UTC)
commit 95879ab9b4aa113d7877e4d080fbb7b403963f3b
Author: Egmont Koblinger <egmont gmail com>
Date:   Sat May 17 13:45:09 2014 +0200

    iconv: Fix the UTF-8 decoder when a NUL byte is encountered
    
    https://bugzilla.gnome.org/show_bug.cgi?id=730220

 src/vteconv.c |   85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 83 insertions(+), 2 deletions(-)
---
diff --git a/src/vteconv.c b/src/vteconv.c
index dcf4fd7..fb48a89 100644
--- a/src/vteconv.c
+++ b/src/vteconv.c
@@ -54,6 +54,46 @@ _vte_conv_utf8_strlen(const gchar *p, gssize max)
        return length;
 }
 
+/* A variant of g_utf8_validate() that allows NUL characters.
+ * Requires that max_len >= 0 && end != NULL. */
+static gboolean
+_vte_conv_utf8_validate(const gchar *str,
+                        gssize max_len,
+                        const gchar **end)
+{
+        gboolean ret;
+        do {
+                ret = g_utf8_validate(str, max_len, end);
+                max_len -= *end - str;
+                str = *end;
+                /* Hitting a NUL is okay. Clear the error and iterate over them. */
+                while (max_len > 0 && *str == '\0') {
+                        ret = TRUE;
+                        max_len--;
+                        str++;
+                        *end = str;
+                }
+        } while (ret && max_len > 0);
+        return ret;
+}
+
+/* A variant of g_utf8_get_char_validated() that allows NUL characters.
+ * Requires that max_len >= 0. */
+static gunichar
+_vte_conv_utf8_get_char_validated(const gchar *p,
+                                  gssize max_len) {
+        gunichar ret;
+        /* Handle NUL at the beginning. */
+        if (max_len > 0 && p[0] == '\0')
+                return 0;
+        ret = g_utf8_get_char_validated(p, max_len);
+        /* If a partial match is returned but there's a NUL in the buffer
+         * then this is a wrong error, we're facing an invalid character. */
+        if (ret == (gunichar) -2 && memchr(p, '\0', max_len) != NULL)
+                ret = (gunichar) -1;
+        return ret;
+}
+
 /* A bogus UTF-8 to UTF-8 conversion function which attempts to provide the
  * same semantics as g_iconv(). */
 static size_t
@@ -71,7 +111,7 @@ _vte_conv_utf8_utf8(GIConv converter,
        g_assert(*outbytes_left >= *inbytes_left);
 
        /* The only error we can throw is EILSEQ, so check for that here. */
-       validated = g_utf8_validate(*inbuf, *inbytes_left, &endptr);
+        validated = _vte_conv_utf8_validate(*inbuf, *inbytes_left, &endptr);
 
        /* Copy whatever data was validated. */
        bytes = endptr - *inbuf;
@@ -88,7 +128,7 @@ _vte_conv_utf8_utf8(GIConv converter,
        }
 
        /* Determine why the end of the string is not valid. */
-       if (g_utf8_get_char_validated(*inbuf, *inbytes_left) == (gunichar) -2) {
+        if (_vte_conv_utf8_get_char_validated(*inbuf, *inbytes_left) == (gunichar) -2) {
                /* Prefix of a valid UTF-8 */
                errno = EINVAL;
        } else {
@@ -392,6 +432,8 @@ main(int argc, char **argv)
        const guchar *inbuf;
        guchar *outbuf;
        gsize inbytes, outbytes;
+        gchar *str;
+        const gchar *end;
        char mbyte_test[] = {0xe2, 0x94, 0x80};
        char mbyte_test_break[] = {0xe2, 0xe2, 0xe2};
        int i;
@@ -414,6 +456,45 @@ main(int argc, char **argv)
         i = _vte_conv_utf8_strlen("\xC2\xA0\xC2\xA0", 4);
         g_assert(i == 2);
 
+        /* Test _vte_conv_utf8_validate. */
+        str = "\0\0\0";
+        g_assert(_vte_conv_utf8_validate(str, 0, &end) == TRUE);
+        g_assert(end - str == 0);
+        g_assert(_vte_conv_utf8_validate(str, 1, &end) == TRUE);
+        g_assert(end - str == 1);
+        g_assert(_vte_conv_utf8_validate(str, 3, &end) == TRUE);
+        g_assert(end - str == 3);
+        str = "ab\0cd\0\0ef";
+        g_assert(_vte_conv_utf8_validate(str, 6, &end) == TRUE);
+        g_assert(end - str == 6);
+        g_assert(_vte_conv_utf8_validate(str, 7, &end) == TRUE);
+        g_assert(end - str == 7);
+        g_assert(_vte_conv_utf8_validate(str, 9, &end) == TRUE);
+        g_assert(end - str == 9);
+        str = "ab\xE2\x94\x80\0\xE2\x94\x80yz";
+        g_assert(_vte_conv_utf8_validate(str, 11, &end) == TRUE);
+        g_assert(end - str == 11);
+        str = "ab\x80\0cd";
+        g_assert(_vte_conv_utf8_validate(str, 6, &end) == FALSE);
+        g_assert(end - str == 2);
+        str = "ab\xE2\0cd";
+        g_assert(_vte_conv_utf8_validate(str, 6, &end) == FALSE);
+        g_assert(end - str == 2);
+
+        /* Test _vte_conv_utf8_get_char_validated. */
+        g_assert(_vte_conv_utf8_get_char_validated("", 0) == -2);
+        g_assert(_vte_conv_utf8_get_char_validated("\0", 1) == 0);
+        g_assert(_vte_conv_utf8_get_char_validated(mbyte_test, 1) == -2);
+        g_assert(_vte_conv_utf8_get_char_validated(mbyte_test, 2) == -2);
+        g_assert(_vte_conv_utf8_get_char_validated(mbyte_test, 3) == 0x2500);
+        g_assert(_vte_conv_utf8_get_char_validated(mbyte_test_break, 1) == -2);
+        g_assert(_vte_conv_utf8_get_char_validated(mbyte_test_break, 2) == -1);
+        g_assert(_vte_conv_utf8_get_char_validated(mbyte_test_break, 3) == -1);
+        g_assert(_vte_conv_utf8_get_char_validated("\x80\0", 2) == -1);
+        g_assert(_vte_conv_utf8_get_char_validated("\xE2\0", 2) == -1);
+        g_assert(_vte_conv_utf8_get_char_validated("\xE2\x94\0", 3) == -1);
+        g_assert(_vte_conv_utf8_get_char_validated("\xE2\x94\x80\0", 4) == 0x2500);
+
        /* Test g_iconv, no gunichar stuff. */
        clear(wide_test, narrow_test);
        memset(buf, 0, sizeof(buf));
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]