[gjs/wip/ptomato/mozjs38: 13/17] js: Accommodate both Latin-1 and two-byte strings

From: Philip Chimento <pchimento src gnome org>
To: commits-list gnome org
Cc:
Subject: [gjs/wip/ptomato/mozjs38: 13/17] js: Accommodate both Latin-1 and two-byte strings
Date: Fri, 20 Jan 2017 03:12:38 +0000 (UTC)
commit 517a4b9995b44669b0ef88b4fa70d6613c2bdcc9
Author: Philip Chimento <philip chimento gmail com>
Date:   Tue Jan 17 23:36:07 2017 -0800

    js: Accommodate both Latin-1 and two-byte strings
    
    Starting with SpiderMonkey 38, strings can be stored internally in the
    JS engine as either Latin-1 or two bytes per character (sort-of-UTF-16).
    When operating on the characters directly, we must check
    JS_StringHasLatin1Chars() and use the appropriate accessors depending on
    how they are stored.
    
    Additionally, since the string's characters are still owned by the
    string and may get moved around during garbage collection, we cannot use
    any JSAPI functions that might trigger a GC while maintaining a pointer
    to the string's characters. To enforce this, we must construct a
    JS::AutoCheckCannotGC object, which will cause a crash if a GC starts
    while it is in scope.

 gjs/byteArray.cpp                   |   47 ++++++++++++++-------
 gjs/jsapi-util-string.cpp           |   78 ++++++++++++++++++++++++++++------
 installed-tests/js/testByteArray.js |   20 +++++++++
 test/gjs-tests.cpp                  |   75 ++++++++++++++++++++++++++++++----
 4 files changed, 183 insertions(+), 37 deletions(-)
---
diff --git a/gjs/byteArray.cpp b/gjs/byteArray.cpp
index 2809abd..bc356b3 100644
--- a/gjs/byteArray.cpp
+++ b/gjs/byteArray.cpp
@@ -603,24 +603,41 @@ from_string_func(JSContext *context,
         g_byte_array_append(priv->array, (guint8*) utf8, strlen(utf8));
         g_free(utf8);
     } else {
-        char *encoded;
+        JSString *str = argv[0].toString();  /* Rooted by argv */
+        GError *error = NULL;
+        char *encoded = NULL;
         gsize bytes_written;
-        GError *error;
-        const char16_t *u16_chars;
-        gsize u16_len;
 
-        u16_chars = JS_GetStringCharsAndLength(context, argv[0].toString(), &u16_len);
-        if (u16_chars == NULL)
-            return false;
+        /* Scope for AutoCheckCannotGC, will crash if a GC is triggered
+         * while we are using the string's chars */
+        {
+            size_t len;
+            JS::AutoCheckCannotGC nogc;
+            if (JS_StringHasLatin1Chars(str)) {
+                const JS::Latin1Char *chars =
+                    JS_GetLatin1StringCharsAndLength(context, nogc, str, &len);
+                if (chars == NULL)
+                    return false;
+
+                encoded = g_convert((char *) chars, len,
+                                    encoding,  /* to_encoding */
+                                    "LATIN1",  /* from_encoding */
+                                    NULL,  /* bytes read */
+                                    &bytes_written, &error);
+            } else {
+                const char16_t *chars =
+                    JS_GetTwoByteStringCharsAndLength(context, nogc, str, &len);
+                if (chars == NULL)
+                    return false;
+
+                encoded = g_convert((char *) chars, len * 2,
+                                    encoding,  /* to_encoding */
+                                    "UTF-16",  /* from_encoding */
+                                    NULL,  /* bytes read */
+                                    &bytes_written, &error);
+            }
+        }
 
-        error = NULL;
-        encoded = g_convert((char*) u16_chars,
-                            u16_len * 2,
-                            encoding, /* to_encoding */
-                            "UTF-16", /* from_encoding */
-                            NULL, /* bytes read */
-                            &bytes_written,
-                            &error);
         g_free(encoding);
         if (encoded == NULL) {
             /* frees the GError */
diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp
index 6143b47..361b632 100644
--- a/gjs/jsapi-util-string.cpp
+++ b/gjs/jsapi-util-string.cpp
@@ -23,6 +23,7 @@
 
 #include <config.h>
 
+#include <algorithm>
 #include <string.h>
 
 #include "jsapi-util.h"
@@ -180,27 +181,46 @@ gjs_string_get_char16_data(JSContext       *context,
                            char16_t       **data_p,
                            size_t          *len_p)
 {
-    const char16_t *js_data;
-    bool retval = false;
-
-    JS_BeginRequest(context);
+    JSAutoRequest ar(context);
 
     if (!value.isString()) {
         gjs_throw(context,
                   "Value is not a string, can't return binary data from it");
-        goto out;
+        return false;
+    }
+
+    /* From this point on, crash if a GC is triggered while we are using
+     * the string's chars */
+    JS::AutoCheckCannotGC nogc;
+
+    if (JS_StringHasLatin1Chars(value.toString())) {
+        const JS::Latin1Char *js_data =
+            JS_GetLatin1StringCharsAndLength(context, nogc,
+                                             value.toString(), len_p);
+        if (js_data == NULL)
+            return false;
+
+        /* Unicode codepoints 0x00-0xFF are the same as Latin-1
+         * codepoints, so we can preserve the string length and simply
+         * copy the codepoints to an array of different-sized ints */
+
+        *data_p = g_new(char16_t, *len_p);
+
+        /* This will probably use a loop, unfortunately */
+        std::copy(js_data, js_data + *len_p, *data_p);
+        return true;
     }
 
-    js_data = JS_GetStringCharsAndLength(context, value.toString(), len_p);
+    const char16_t *js_data =
+        JS_GetTwoByteStringCharsAndLength(context, nogc,
+                                          value.toString(), len_p);
+
     if (js_data == NULL)
-        goto out;
+        return false;
 
     *data_p = (char16_t *) g_memdup(js_data, sizeof(*js_data) * (*len_p));
 
-    retval = true;
-out:
-    JS_EndRequest(context);
-    return retval;
+    return true;
 }
 
 /**
@@ -228,10 +248,40 @@ gjs_string_to_ucs4(JSContext      *cx,
 
     JSAutoRequest ar(cx);
     JS::RootedString str(cx, value.toString());
-    size_t utf16_len;
+    size_t len;
     GError *error = NULL;
 
-    const char16_t *utf16 = JS_GetStringCharsAndLength(cx, str, &utf16_len);
+    /* From this point on, crash if a GC is triggered while we are using
+     * the string's chars */
+    JS::AutoCheckCannotGC nogc;
+
+    if (JS_StringHasLatin1Chars(str)) {
+        const JS::Latin1Char *latin1 =
+            JS_GetLatin1StringCharsAndLength(cx, nogc, str, &len);
+
+        if (latin1 == NULL) {
+            gjs_throw(cx, "Failed to get Latin-1 string data");
+            return false;
+        }
+
+        if (ucs4_string_p == NULL)
+            return true;
+
+        /* Unicode codepoints 0x00-0xFF are the same as Latin-1
+         * codepoints, so we can preserve the string length and simply
+         * copy the codepoints to an array of different-sized ints */
+
+        *ucs4_string_p = g_new(gunichar, len);
+        std::copy(latin1, latin1 + len, *ucs4_string_p);
+        if (len_p != NULL)
+            *len_p = len;
+
+        return true;
+    }
+
+    const char16_t *utf16 =
+        JS_GetTwoByteStringCharsAndLength(cx, nogc, str, &len);
+
     if (utf16 == NULL) {
         gjs_throw(cx, "Failed to get UTF-16 string data");
         return false;
@@ -240,7 +290,7 @@ gjs_string_to_ucs4(JSContext      *cx,
     if (ucs4_string_p != NULL) {
         long length;
         *ucs4_string_p = g_utf16_to_ucs4(reinterpret_cast<const gunichar2 *>(utf16),
-                                         utf16_len, NULL, &length, &error);
+                                         len, NULL, &length, &error);
         if (*ucs4_string_p == NULL) {
             gjs_throw(cx, "Failed to convert UTF-16 string to UCS-4: %s",
                       error->message);
diff --git a/installed-tests/js/testByteArray.js b/installed-tests/js/testByteArray.js
index 97f93e3..3d64ca4 100644
--- a/installed-tests/js/testByteArray.js
+++ b/installed-tests/js/testByteArray.js
@@ -94,6 +94,26 @@ describe('Byte array', function () {
         [97, 98, 99, 100].forEach((val, ix) => expect(a[ix]).toEqual(val));
     });
 
+    it('can be encoded from a string', function () {
+        // Pick a string likely to be stored internally as Latin1
+        let a = ByteArray.fromString('äbcd', 'LATIN1');
+        expect(a.length).toEqual(4);
+        [228, 98, 99, 100].forEach((val, ix) => expect(a[ix]).toEqual(val));
+
+        // Try again with a string not likely to be Latin1 internally
+        a = ByteArray.fromString('⅜b', 'UTF-16');
+        expect(a.length).toEqual(6);
+        // g_convert() apparently adds a BOM
+        [0xff, 0xfe, 0x5c, 0x21, 98, 0]
+            .forEach((val, ix) => expect(a[ix]).toEqual(val));
+    });
+
+    it('encodes as UTF-8 by default', function () {
+        let a = ByteArray.fromString('⅜');
+        expect(a.length).toEqual(3);
+        [0xe2, 0x85, 0x9c].forEach((val, ix) => expect(a[ix]).toEqual(val));
+    });
+
     it('can be created from an array', function () {
         let a = ByteArray.fromArray([ 1, 2, 3, 4 ]);
         expect(a.length).toEqual(4);
diff --git a/test/gjs-tests.cpp b/test/gjs-tests.cpp
index ca5af1a..f639ac1 100644
--- a/test/gjs-tests.cpp
+++ b/test/gjs-tests.cpp
@@ -22,6 +22,9 @@
  */
 
 #include <config.h>
+
+#include <string>
+
 #include <glib.h>
 #include <glib-object.h>
 #include <util/glib.h>
@@ -180,11 +183,63 @@ gjstest_test_func_gjs_jsapi_util_error_throw(GjsUnitTestFixture *fx,
 }
 
 static void
-gjstest_test_func_gjs_jsapi_util_debug_string_valid_utf8(GjsUnitTestFixture *fx,
-                                                         gconstpointer       unused)
+test_jsapi_util_string_char16_data(GjsUnitTestFixture *fx,
+                                   gconstpointer       unused)
+{
+    char16_t *chars;
+    size_t len;
+    JS::RootedValue v_string(fx->cx);
+
+    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1,
+                                       &v_string));
+    g_assert_true(gjs_string_get_char16_data(fx->cx, v_string, &chars,
+                                             &len));
+    std::u16string result(chars, len);
+    g_assert_true(result == u"\xc9\xd6 foobar \u30df");
+    g_free(chars);
+
+    /* Try with a string that is likely to be stored as Latin-1 */
+    v_string.setString(JS_NewStringCopyZ(fx->cx, "abcd"));
+    g_assert_true(gjs_string_get_char16_data(fx->cx, v_string, &chars,
+                                             &len));
+
+    result.assign(chars, len);
+    g_assert_true(result == u"abcd");
+    g_free(chars);
+}
+
+static void
+test_jsapi_util_string_to_ucs4(GjsUnitTestFixture *fx,
+                               gconstpointer       unused)
+{
+    gunichar *chars;
+    size_t len;
+    JS::RootedValue v_string(fx->cx);
+
+    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1,
+                                       &v_string));
+    g_assert_true(gjs_string_to_ucs4(fx->cx, v_string, &chars, &len));
+
+    std::u32string result(chars, chars + len);
+    g_assert_true(result == U"\xc9\xd6 foobar \u30df");
+    g_free(chars);
+
+    /* Try with a string that is likely to be stored as Latin-1 */
+    v_string.setString(JS_NewStringCopyZ(fx->cx, "abcd"));
+    g_assert_true(gjs_string_to_ucs4(fx->cx, v_string, &chars, &len));
+
+    result.assign(chars, chars + len);
+    g_assert_true(result == U"abcd");
+    g_free(chars);
+}
+
+static void
+test_jsapi_util_debug_string_valid_utf8(GjsUnitTestFixture *fx,
+                                        gconstpointer       unused)
 {
     JS::RootedValue v_string(fx->cx);
-    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1, &v_string));
+    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1,
+                                       &v_string));
 
     char *debug_output = gjs_value_debug_string(fx->cx, v_string);
 
@@ -195,8 +250,8 @@ gjstest_test_func_gjs_jsapi_util_debug_string_valid_utf8(GjsUnitTestFixture *fx,
 }
 
 static void
-gjstest_test_func_gjs_jsapi_util_debug_string_invalid_utf8(GjsUnitTestFixture *fx,
-                                                           gconstpointer       unused)
+test_jsapi_util_debug_string_invalid_utf8(GjsUnitTestFixture *fx,
+                                          gconstpointer       unused)
 {
     g_test_skip("SpiderMonkey doesn't validate UTF-8 after encoding it");
 
@@ -207,7 +262,7 @@ gjstest_test_func_gjs_jsapi_util_debug_string_invalid_utf8(GjsUnitTestFixture *f
     char *debug_output = gjs_value_debug_string(fx->cx, v_string);
 
     g_assert_nonnull(debug_output);
-    /* g_assert_cmpstr("\"\\xf5\\xf6\"", ==, debug_output); */
+    /* g_assert_cmpstr("\"\\xff\\xff\\xff\\xff\"", ==, debug_output); */
 
     g_free(debug_output);
 }
@@ -331,10 +386,14 @@ main(int    argc,
                         gjstest_test_func_gjs_jsapi_util_error_throw);
     ADD_JSAPI_UTIL_TEST("string/js/string/utf8",
                         gjstest_test_func_gjs_jsapi_util_string_js_string_utf8);
+    ADD_JSAPI_UTIL_TEST("string/char16_data",
+                        test_jsapi_util_string_char16_data);
+    ADD_JSAPI_UTIL_TEST("string/to_ucs4",
+                        test_jsapi_util_string_to_ucs4);
     ADD_JSAPI_UTIL_TEST("debug_string/valid-utf8",
-                        gjstest_test_func_gjs_jsapi_util_debug_string_valid_utf8);
+                        test_jsapi_util_debug_string_valid_utf8);
     ADD_JSAPI_UTIL_TEST("debug_string/invalid-utf8",
-                        gjstest_test_func_gjs_jsapi_util_debug_string_invalid_utf8);
+                        test_jsapi_util_debug_string_invalid_utf8);
 
 #undef ADD_JSAPI_UTIL_TEST
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]