[glib] gutf8: Fix documentation for g_utf8_get_char_validated() length limits



commit 3e89b19c44d353edfafde876e12b56ddd29ef8a4
Author: Philip Withnall <withnall endlessm com>
Date:   Fri Mar 17 12:15:15 2017 +0000

    gutf8: Fix documentation for g_utf8_get_char_validated() length limits
    
    If g_utf8_get_char_validated() encounters a nul byte in the middle of a
    string whose given length extends past that byte, it returns -2,
    indicating a partial gunichar. That is not the obvious behaviour, but
    since g_utf8_get_char_validated() has been API for a long time, the
    behaviour cannot be changed.
    
    Document it, and add some unit tests (for this behaviour and the other
    behaviour of g_utf8_get_char_validated()).
    
    Signed-off-by: Philip Withnall <withnall endlessm com>
    
    https://bugzilla.gnome.org/show_bug.cgi?id=780095

 glib/gutf8.c               |    4 +++
 glib/tests/utf8-validate.c |   53 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 0 deletions(-)
---
diff --git a/glib/gutf8.c b/glib/gutf8.c
index e66e061..e9191e2 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -654,6 +654,10 @@ g_utf8_get_char_extended (const  gchar *p,
  * This function checks for incomplete characters, for invalid characters
  * such as characters that are out of the range of Unicode, and for
  * overlong encodings of valid characters.
+ *
+ * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
+ * @max_len is positive and any of the bytes in the first UTF-8 character
+ * sequence are nul.
  * 
  * Returns: the resulting character. If @p points to a partial
  *     sequence at the end of a string that could begin a valid 
diff --git a/glib/tests/utf8-validate.c b/glib/tests/utf8-validate.c
index 122aa76..1609bde 100644
--- a/glib/tests/utf8-validate.c
+++ b/glib/tests/utf8-validate.c
@@ -292,6 +292,57 @@ do_test (gconstpointer d)
     }
 }
 
+/* Table-driven test: call g_utf8_get_char_validated() on every (buf, max_len)
+ * pair below and assert that the returned value equals expected_result. */
+static void
+test_utf8_get_char_validated (void)
+{
+  const struct {
+    const gchar *buf;  /* bytes passed as the string argument */
+    gssize max_len;  /* length limit passed as the second argument */
+    gunichar expected_result;  /* value the call is asserted to return */
+  } test_vectors[] = {
+    /* Bug #780095: a nul byte inside the sequence gives (gunichar) -2: */
+    { "\xC0\x00_45678", 8, (gunichar) -2 },
+    { "\xC0\x00_45678", -1, (gunichar) -2 },
+    /* It seems odd that the return value for an empty string differs with
+     * the length input (0 versus -1), but that is how it is documented: */
+    { "", 0, (gunichar) -2 },
+    { "", -1, (gunichar) 0 },
+    /* Normal inputs (1- to 4-byte sequences, alone and with trailing text): */
+    { "hello", 5, (gunichar) 'h' },
+    { "hello", -1, (gunichar) 'h' },
+    { "\xD8\x9F", 2, 0x061F },
+    { "\xD8\x9F", -1, 0x061F },
+    { "\xD8\x9Fmore", 6, 0x061F },
+    { "\xD8\x9Fmore", -1, 0x061F },
+    { "\xE2\x96\xB3", 3, 0x25B3 },
+    { "\xE2\x96\xB3", -1, 0x25B3 },
+    { "\xE2\x96\xB3more", 7, 0x25B3 },
+    { "\xE2\x96\xB3more", -1, 0x25B3 },
+    { "\xF0\x9F\x92\xA9", 4, 0x1F4A9 },
+    { "\xF0\x9F\x92\xA9", -1, 0x1F4A9 },
+    { "\xF0\x9F\x92\xA9more", 8, 0x1F4A9 },
+    { "\xF0\x9F\x92\xA9more", -1, 0x1F4A9 },
+    /* Partial unichars (lead byte only, or cut short by max_len): */
+    { "\xD8", -1, (gunichar) -2 },
+    { "\xD8\x9F", 1, (gunichar) -2 },
+    { "\xCE", -1, (gunichar) -2 },
+    { "\xCE", 1, (gunichar) -2 },
+  };
+  gsize i;
+
+  for (i = 0; i < G_N_ELEMENTS (test_vectors); i++)
+    {
+      gunichar actual_result;
+
+      g_test_message ("Vector %" G_GSIZE_FORMAT, i);  /* log the vector index */
+      actual_result = g_utf8_get_char_validated (test_vectors[i].buf,
+                                                 test_vectors[i].max_len);
+      g_assert_cmpint (actual_result, ==, test_vectors[i].expected_result);
+    }
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -307,5 +358,7 @@ main (int argc, char *argv[])
       g_free (path);
     }
 
+  g_test_add_func ("/utf8/get-char-validated", test_utf8_get_char_validated);
+
   return g_test_run ();
 }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]