[gjs] Validate UTF-8 strings in gjs_string_to_utf8()



commit 62f57cb01cd2fdc8a8c1df1e90098ce7251e0aa9
Author: Owen W. Taylor <otaylor fishsoup net>
Date:   Thu Apr 30 17:45:58 2009 -0400

    Validate UTF-8 strings in gjs_string_to_utf8()
    
    Make sure that when converting a Javascript string to UTF-8 it is valid
    in GLib terms and doesn't contain any embedded NULs.
    
    http://bugzilla.gnome.org/show_bug.cgi?id=580947
---
 gjs/jsapi-util-string.c |   27 ++++++++++++++++++++++++++-
 test/js/testLocale.js   |   13 +++++++++++++
 2 files changed, 39 insertions(+), 1 deletions(-)

diff --git a/gjs/jsapi-util-string.c b/gjs/jsapi-util-string.c
index cea82b7..bad5f32 100644
--- a/gjs/jsapi-util-string.c
+++ b/gjs/jsapi-util-string.c
@@ -23,6 +23,8 @@
 
 #include <config.h>
 
+#include <string.h>
+
 #include "jsapi-util.h"
 
 JSBool
@@ -33,6 +35,8 @@ gjs_string_to_utf8(JSContext  *context,
     jschar *s;
     size_t s_length;
     char *utf8_string;
+    long read_items;
+    long utf8_length;
     GError *error;
 
     if (!JSVAL_IS_STRING(string_val)) {
@@ -47,7 +51,7 @@ gjs_string_to_utf8(JSContext  *context,
     error = NULL;
     utf8_string = g_utf16_to_utf8(s,
                                   (glong)s_length,
-                                  NULL, NULL,
+                                  &read_items, &utf8_length,
                                   &error);
 
     if (!utf8_string) {
@@ -59,6 +63,27 @@ gjs_string_to_utf8(JSContext  *context,
         return JS_FALSE;
     }
 
+    if ((size_t)read_items != s_length) {
+        gjs_throw(context, "JS string contains embedded NULs");
+        g_free(utf8_string);
+        return JS_FALSE;
+    }
+
+    /* Our assumption is that the string is being converted to UTF-8
+     * in order to use with GLib-style APIs; Javascript has a looser
+     * sense of valid Unicode than GLib, so validate here to
+     * prevent problems later on. Given the success of the above,
+     * the only thing that could really be wrong here is including
+     * non-characters like a byte-reversed BOM. If the validation
+     * ever becomes a bottleneck, we could do an inline-special
+     * case of all-ASCII.
+     */
+    if (!g_utf8_validate (utf8_string, utf8_length, NULL)) {
+        gjs_throw(context, "JS string contains invalid Unicode characters");
+        g_free(utf8_string);
+        return JS_FALSE;
+    }
+
     *utf8_string_p = utf8_string;
     return JS_TRUE;
 }
diff --git a/test/js/testLocale.js b/test/js/testLocale.js
index 1984efe..214b5e6 100644
--- a/test/js/testLocale.js
+++ b/test/js/testLocale.js
@@ -41,4 +41,17 @@ function testToLocaleCompare() {
     assertRaises(function() { "a".localeCompare("\ud800"); });
 }
 
+function testInvalidStrings() {
+    // Not really related to locale handling - here we are checking
+    // that gjs_string_to_utf8() properly rejects strings we would
+    // otherwise choke on later.
+
+    // Unpaired surrogate (a lone U+D800)
+    assertRaises(function() { "\ud800".toLocaleLowerCase(); });
+    // Embedded NUL (U+0000)
+    assertRaises(function() { "\u0000".toLocaleLowerCase(); });
+    // Byte-reversed BOM, U+FFFE (an example of a non-character)
+    assertRaises(function() { "\ufffe".toLocaleLowerCase(); });
+}
+
 gjstestRun();



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]