[vte] utf8: Make decoder conform to recommendation on replacement characters

From: Christian Persch <chpe src gnome org>
To: commits-list gnome org
Cc:
Subject: [vte] utf8: Make decoder conform to recommendation on replacement characters
Date: Wed, 12 Sep 2018 11:55:25 +0000 (UTC)
commit b8b1aa4ed5ef12368c5c3f6d85ebf3e1d72f91a8
Author: Christian Persch <chpe src gnome org>
Date:   Mon Sep 3 16:10:51 2018 +0200

    utf8: Make decoder conform to recommendation on replacement characters
    
    With this change, the decoder conforms to the W3 Encoding TR and
    the Unicode recommendation on inserting replacement characters
    from §3.9 of the Unicode core spec.
    
    https://gitlab.gnome.org/GNOME/vte/issues/30

 src/parser-cat.cc | 15 ++++++--------
 src/utf8-test.cc  | 58 ++++++++++---------------------------------------------
 src/utf8.cc       | 47 ++++++++++++++++++++++++++++++++++++--------
 src/utf8.hh       |  7 ++-----
 src/vte.cc        | 15 ++++++--------
 5 files changed, 63 insertions(+), 79 deletions(-)
---
diff --git a/src/parser-cat.cc b/src/parser-cat.cc
index 6a634bc6..5c79b34e 100644
--- a/src/parser-cat.cc
+++ b/src/parser-cat.cc
@@ -347,18 +347,15 @@ process_file_utf8(int fd,
 
                 for (auto sptr = buf; sptr < bufend; ++sptr) {
                         switch (decoder.decode(*sptr)) {
-                        case vte::base::UTF8Decoder::REJECT:
-                                decoder.reset();
-
-                                /* If a start byte occurred in the middle of a sequence,
-                                 * rewind the stream so we try to start a new character
-                                 * with it.
+                        case vte::base::UTF8Decoder::REJECT_REWIND:
+                                /* Rewind the stream.
                                  * Note that this will never lead to a loop, since in the
                                  * next round this byte *will* be consumed.
                                  */
-                                if (decoder.is_start_byte(*sptr))
-                                        --sptr;
-
+                                --sptr;
+                                /* [[fallthrough]]; */
+                        case vte::base::UTF8Decoder::REJECT:
+                                decoder.reset();
                                 /* Fall through to insert the U+FFFD replacement character. */
                                 /* [[fallthrough]]; */
                         case vte::base::UTF8Decoder::ACCEPT: {
diff --git a/src/utf8-test.cc b/src/utf8-test.cc
index 463fa404..9d1b84c4 100644
--- a/src/utf8-test.cc
+++ b/src/utf8-test.cc
@@ -48,21 +48,6 @@ test_utf8_decoder_decode(void)
         }
 }
 
-static constexpr bool
-is_utf8_start_byte(uint32_t c)
-{
-        return (c < 0x80u || (c >= 0xc2u && c <= 0xf4u));
-}
-
-static void
-test_utf8_decoder_start(void)
-{
-        decoder.reset();
-        for (uint32_t c = 0; c < 0x100u; ++c) {
-                g_assert_cmpint(decoder.is_start_byte(c), ==, is_utf8_start_byte(c));
-        }
-}
-
 static void
 decode(uint8_t const* in,
        size_t len,
@@ -74,19 +59,15 @@ decode(uint8_t const* in,
         uint32_t state = UTF8Decoder::ACCEPT;
         for (auto iptr = in; iptr < iend; ++iptr) {
                 switch ((state = decoder.decode(*iptr))) {
+                case vte::base::UTF8Decoder::REJECT_REWIND:
+                        /* Note that this will never lead to a loop, since in the
+                         * next round this byte *will* be consumed.
+                         */
+                        --iptr;
+                        // [[fallthrough]]; */
                 case vte::base::UTF8Decoder::REJECT:
                         decoder.reset();
                         state = UTF8Decoder::ACCEPT;
-
-                        /* If a start byte occurred in the middle of a sequence,
-                         * rewind the stream so we try to start a new character
-                         * with it.
-                         * Note that this will never lead to a loop, since in the
-                         * next round this byte *will* be consumed.
-                         */
-                        if (decoder.is_start_byte(*iptr))
-                                --iptr;
-
                         /* Fall through to insert the U+FFFD replacement character. */
                         /* [[fallthrough]]; */
                 case vte::base::UTF8Decoder::ACCEPT:
@@ -210,13 +191,11 @@ test_utf8_decoder_replacement(void)
         assert_decode("a\xC0\x80", -1, U"a\uFFFD\uFFFD"s);
         assert_decode("a\xC0\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
         // Lowest single-byte as three-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xE0\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xE0\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
         // Lowest single-byte as four-byte overlong sequence
         assert_decode("a\xF0\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // One below lowest single-byte
         assert_decode("a\xFF", -1, U"a\uFFFD"s);
         assert_decode("a\xFFZ", -1, U"a\uFFFDZ"s);
@@ -227,13 +206,11 @@ test_utf8_decoder_replacement(void)
         assert_decode("a\xC1\xBF", -1, U"a\uFFFD\uFFFD"s);
         assert_decode("a\xC1\xBFZ", -1, U"a\uFFFD\uFFFDZ"s);
         // Highest single-byte as three-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xE0\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xE0\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
         // Highest single-byte as four-byte overlong sequence
         assert_decode("a\xF0\x80\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x80\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // One past highest single byte (also lone continuation)
         assert_decode("a\x80Z", -1, U"a\uFFFDZ"s);
         assert_decode("a\x80", -1, U"a\uFFFD"s);
@@ -250,13 +227,11 @@ test_utf8_decoder_replacement(void)
         assert_decode("a\xC2\x80", -1, U"a\u0080"s);
         assert_decode("a\xC2\x80Z", -1, U"a\u0080Z"s);
         // Lowest two-byte as three-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xE0\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xE0\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
         // Lowest two-byte as four-byte overlong sequence
         assert_decode("a\xF0\x80\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x80\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // Lead one below lowest two-byte
         assert_decode("a\xC1\x80", -1, U"a\uFFFD\uFFFD"s);
         assert_decode("a\xC1\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
@@ -267,26 +242,21 @@ test_utf8_decoder_replacement(void)
         assert_decode("a\xDF\xBF", -1, U"a\u07FF"s);
         assert_decode("a\xDF\xBFZ", -1, U"a\u07FFZ"s);
         // Highest two-byte as three-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xE0\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xE0\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
         // Highest two-byte as four-byte overlong sequence
         assert_decode("a\xF0\x80\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x80\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // Lowest three-byte
         assert_decode("a\xE0\xA0\x80", -1, U"a\u0800"s);
         assert_decode("a\xE0\xA0\x80Z", -1, U"a\u0800Z"s);
         // Lowest three-byte as four-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xF0\x80\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x80\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // Highest below surrogates
         assert_decode("a\xED\x9F\xBF", -1, U"a\uD7FF"s);
         assert_decode("a\xED\x9F\xBFZ", -1, U"a\uD7FFZ"s);
         // Highest below surrogates as four-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xF0\x8D\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x8D\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
         // First surrogate
@@ -301,38 +271,31 @@ test_utf8_decoder_replacement(void)
         // Last surrogate as four-byte overlong sequence
         assert_decode("a\xF0\x8D\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x8D\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // Lowest above surrogates
         assert_decode("a\xEE\x80\x80", -1, U"a\uE000"s);
         assert_decode("a\xEE\x80\x80Z", -1, U"a\uE000Z"s);
         // Lowest above surrogates as four-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xF0\x8E\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x8E\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
         // Highest three-byte
         assert_decode("a\xEF\xBF\xBF", -1, U"a\uFFFF"s);
         assert_decode("a\xEF\xBF\xBFZ", -1, U"a\uFFFFZ"s);
         // Highest three-byte as four-byte overlong sequence
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xF0\x8F\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF0\x8F\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
         // Lowest four-byte
-        assert_decode("a\xF0\x90\x80\x80", -1, U"a\u10000"s);
-        assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\u10000Z"s);
+        assert_decode("a\xF0\x90\x80\x80", -1, U"a\U00010000"s);
+        assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\U00010000Z"s);
         // Highest four-byte
-        assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\u10FFFF"s);
-        assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\u10FFFFZ"s);
+        assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\U0010FFFF"s);
+        assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\U0010FFFFZ"s);
         // One past highest four-byte
         assert_decode("a\xF4\x90\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
         assert_decode("a\xF4\x90\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
-        #endif
 
         // Highest four-byte with last byte replaced with 0xFF
-        #ifdef INCLUDE_KNOWN_FAIL
         assert_decode("a\xF4\x8F\xBF\xFF", -1, U"a\uFFFD\uFFFD"s);
         assert_decode("a\xF4\x8F\xBF\xFFZ", -1, U"a\uFFFD\uFFFDZ"s);
-        #endif
 }
 
 int
@@ -342,7 +305,6 @@ main(int argc,
         g_test_init(&argc, &argv, nullptr);
 
         g_test_add_func("/vte/utf8/decoder/decode", test_utf8_decoder_decode);
-        g_test_add_func("/vte/utf8/decoder/start", test_utf8_decoder_start);
         g_test_add_func("/vte/utf8/decoder/replacement", test_utf8_decoder_replacement);
 
         return g_test_run();
diff --git a/src/utf8.cc b/src/utf8.cc
index f5373bfb..0d05f060 100644
--- a/src/utf8.cc
+++ b/src/utf8.cc
@@ -26,6 +26,7 @@
 #include "utf8.hh"
 
 #define RJ vte::base::UTF8Decoder::REJECT
+#define RW vte::base::UTF8Decoder::REJECT_REWIND
 
 uint8_t const vte::base::UTF8Decoder::kTable[] = {
         // The first part of the table maps bytes to character classes that
@@ -40,7 +41,7 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
         // 0xe0:       10
         // 0xe1..0xec: 3
         // 0xed:       4
-        // 0xee..0xff: 3
+        // 0xee..0xef: 3
         // 0xf0:       11
         // 0xf1..0xf3: 6
         // 0xf4:       5
@@ -64,6 +65,35 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
 
         // To understand this DFA, see transitions graph on the website
         // linked above.
+        //
+        // The following translates the states of the DFA to the
+        // algorithm of the UTF-8 decoder from the W3 Encodings spec
+        // [https://www.w3.org/TR/encoding/#utf-8]:
+        //
+        // DFA   │ bytes   bytes   lower   upper
+        // state │ seen    needed  bound   bound
+        // ──────┼─────────────────────────────────
+        //   0   │ 0       0       0x80    0xbf
+        //  12   │
+        //  24   │ 1,2,3   1       0x80    0xbf
+        //  36   │ 1,2     2       0x80    0xbf
+        //  48   │ 1       2       0xa0    0xbf
+        //  60   │ 1       2       0x80    0x9f
+        //  72   │ 1       3       0x90    0xbf
+        //  84   │ 1       3       0x80    0xbf
+        //  96   │ 1       3       0x80    0x8f
+        // 108   │
+        //
+        // If an unexpected byte is read in a non-ACCEPT/REJECT* state,
+        // transition to REJECT_REWIND so that the decoder will read that
+        // byte again after being reset; this makes the decoder conform
+        // to the Unicode recommendation for insering replacement
+        // characters, and to the W3 Encoding TR spec.
+        //
+        // If an unexpected byte is read in the ACCEPT or a REJECT* state,
+        // transition to REJECT; that byte must not be read again, since
+        // that would lead to an infinite loop.
+        //
         // For each state (row), the table records which state will
         // be transitioned to when consuming a character of the class
         // (column).
@@ -72,11 +102,12 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
         */
          0, RJ, 24, 36, 60, 96, 84, RJ, RJ, RJ, 48, 72, // state 0 (accept)
         RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 12 (reject)
-        RJ,  0, RJ, RJ, RJ, RJ, RJ,  0, RJ,  0, RJ, RJ, // state 24
-        RJ, 24, RJ, RJ, RJ, RJ, RJ, 24, RJ, 24, RJ, RJ, // state 36
-        RJ, RJ, RJ, RJ, RJ, RJ, RJ, 24, RJ, RJ, RJ, RJ, // state 48
-        RJ, 24, RJ, RJ, RJ, RJ, RJ, RJ, RJ, 24, RJ, RJ, // state 60
-        RJ, RJ, RJ, RJ, RJ, RJ, RJ, 36, RJ, 36, RJ, RJ, // state 72
-        RJ, 36, RJ, RJ, RJ, RJ, RJ, 36, RJ, 36, RJ, RJ, // state 84
-        RJ, 36, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 96
+        RW,  0, RW, RW, RW, RW, RW,  0, RW,  0, RW, RW, // state 24
+        RW, 24, RW, RW, RW, RW, RW, 24, RW, 24, RW, RW, // state 36
+        RW, RW, RW, RW, RW, RW, RW, 24, RW, RW, RW, RW, // state 48
+        RW, 24, RW, RW, RW, RW, RW, RW, RW, 24, RW, RW, // state 60
+        RW, RW, RW, RW, RW, RW, RW, 36, RW, 36, RW, RW, // state 72
+        RW, 36, RW, RW, RW, RW, RW, 36, RW, 36, RW, RW, // state 84
+        RW, 36, RW, RW, RW, RW, RW, RW, RW, RW, RW, RW, // state 96
+        RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 108 (reject-rewind)
 };
diff --git a/src/utf8.hh b/src/utf8.hh
index 3a6c1962..8b90bd75 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -38,7 +38,8 @@ class UTF8Decoder {
 public:
         enum {
                 ACCEPT = 0,
-                REJECT = 12
+                REJECT = 12,
+                REJECT_REWIND = 108
         };
 
         UTF8Decoder() noexcept = default;
@@ -66,10 +67,6 @@ public:
                 m_codepoint = 0xfffdU;
         }
 
-        inline bool is_start_byte(uint32_t byte) const noexcept {
-                return kTable[256 + 0 /* start state */ + kTable[byte]] != REJECT;
-        }
-
 private:
         uint32_t m_state{ACCEPT};
         uint32_t m_codepoint{0};
diff --git a/src/vte.cc b/src/vte.cc
index 069ff7dc..a6f8b4c3 100644
--- a/src/vte.cc
+++ b/src/vte.cc
@@ -3588,18 +3588,15 @@ Terminal::process_incoming()
                 for ( ; ip < iend; ++ip) {
 
                         switch (m_utf8_decoder.decode(*ip)) {
-                        case vte::base::UTF8Decoder::REJECT:
-                                m_utf8_decoder.reset();
-
-                                /* If a start byte occurred in the middle of a sequence,
-                                 * rewind the stream so we try to start a new character
-                                 * with it.
+                        case vte::base::UTF8Decoder::REJECT_REWIND:
+                                /* Rewind the stream.
                                  * Note that this will never lead to a loop, since in the
                                  * next round this byte *will* be consumed.
                                  */
-                                if (m_utf8_decoder.is_start_byte(*ip))
-                                        --ip;
-
+                                --ip;
+                                /* [[fallthrough]]; */
+                        case vte::base::UTF8Decoder::REJECT:
+                                m_utf8_decoder.reset();
                                 /* Fall through to insert the U+FFFD replacement character. */
                                 /* [[fallthrough]]; */
                         case vte::base::UTF8Decoder::ACCEPT: {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]