[gparted] Workaround g_utf8_get_char_validated() bug with embedded NUL bytes (#777973)



commit 8dbbb47ce2db0ee733ff909c1ead2f4de9475596
Author: Mike Fleetwood <mike fleetwood googlemail com>
Date:   Wed Mar 15 17:02:04 2017 +0000

    Workaround g_utf8_get_char_validated() bug with embedded NUL bytes (#777973)
    
    If PipeCapture reads a NUL byte in the middle of what is expected to be
    a multi-byte UTF-8 character then PipeCapture either returns only the
    characters captured up to the previous update or loops forever,
    depending on whether or not the end of the stream is encountered before
    the read buffer is full.  This is equivalent to saying whether or not
    the NUL byte occurs within the last 512 bytes of the output.
    
    This is caused by a bug in g_utf8_get_char_validated() reporting that a
    partial UTF-8 character has been found when the NUL byte is encountered
    in the middle of a multi-byte character even though more bytes are
    available in the length specified buffer.  g_utf8_get_char_validated()
    is always stopping at the NUL byte assuming it is working with a NUL
    terminated string.
    
    Work around this by checking for g_utf8_get_char_validated() claiming a
    partial UTF-8 character has been found when in fact there are at least
    enough bytes in the read buffer to instead determine that it is really
    an invalid UTF-8 character.
    
    Reference:
        Bug 780095 - g_utf8_get_char_validated() stopping at nul byte even
                     for length specified buffers
        https://bugzilla.gnome.org/show_bug.cgi?id=780095
    
    Bug 777973 - Segmentation fault on bad disk

 include/PipeCapture.h |    1 +
 src/PipeCapture.cc    |   37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 0 deletions(-)
---
diff --git a/include/PipeCapture.h b/include/PipeCapture.h
index 21e3b21..d986b3f 100644
--- a/include/PipeCapture.h
+++ b/include/PipeCapture.h
@@ -45,6 +45,7 @@ private:
                                     gpointer data );
        static void append_unichar_vector_to_utf8( std::string & str,
                                                   const std::vector<gunichar> & ucvec );
+       static int utf8_char_length( unsigned char firstbyte );
 
        Glib::RefPtr<Glib::IOChannel> channel;  // Wrapper around fd
        char * readbuf;                 // Bytes read from IOChannel (fd)
diff --git a/src/PipeCapture.cc b/src/PipeCapture.cc
index b900a7d..3579b2f 100644
--- a/src/PipeCapture.cc
+++ b/src/PipeCapture.cc
@@ -128,6 +128,18 @@ bool PipeCapture::OnReadable( Glib::IOCondition condition )
                        gunichar uc = g_utf8_get_char_validated( read_ptr, end_ptr - read_ptr );
                        if ( uc == UTF8_PARTIAL )
                        {
+                               // Workaround bug in g_utf8_get_char_validated() in which
+                               // it reports a partial UTF-8 char when a NUL byte is
+                               // encountered in the middle of a multi-byte character,
+                               // yet there are more bytes available in the length
+                               // specified buffer.  Report as invalid character instead.
+                               int len = utf8_char_length( *read_ptr );
+                               if ( len == -1 || read_ptr + len <= end_ptr )
+                                       uc = UTF8_INVALID;
+                       }
+
+                       if ( uc == UTF8_PARTIAL )
+                       {
                                // Partial UTF-8 character at end of read buffer.  Copy to
                                // start of read buffer.
                                size_t bytes_remaining = end_ptr - read_ptr;
@@ -231,6 +243,31 @@ void PipeCapture::append_unichar_vector_to_utf8( std::string & str, const std::v
        }
 }
 
+int PipeCapture::utf8_char_length( unsigned char firstbyte )
+{
+       // Recognise the size of FSS-UTF (1992) / UTF-8 (1993) characters given the first
+       // byte.  Characters can be up to 6 bytes.  (Later UTF-8 (2003) limited characters
+       // to 4 bytes and 21-bits of Unicode code-space).
+       // Reference:
+       //     https://en.wikipedia.org/wiki/UTF-8
+       if ( ( firstbyte & 0x80 ) == 0x00 )       // 0xxxxxxx - 1 byte UTF-8 char
+               return 1;
+       else if ( ( firstbyte & 0xE0 ) == 0xC0 )  // 110xxxxx - First byte of a 2 byte UTF-8 char
+               return 2;
+       else if ( ( firstbyte & 0xF0 ) == 0xE0 )  // 1110xxxx - First byte of a 3 byte UTF-8 char
+               return 3;
+       else if ( ( firstbyte & 0xF8 ) == 0xF0 )  // 11110xxx - First byte of a 4 byte UTF-8 char
+               return 4;
+       else if ( ( firstbyte & 0xFC ) == 0xF8 )  // 111110xx - First byte of a 5 byte UTF-8 char
+               return 5;
+       else if ( ( firstbyte & 0xFE ) == 0xFC )  // 1111110x - First byte of a 6 byte UTF-8 char
+               return 6;
+       else if ( ( firstbyte & 0xC0 ) == 0x80 )  // 10xxxxxx - Continuation byte
+               return -1;
+       else                                      // Invalid byte
+               return -1;
+}
+
 PipeCapture::~PipeCapture()
 {
        delete[] readbuf;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]