[gjs] byteArray: Refactor functionality into Encoding



commit 4b02a015e20145c760743f3d2cb0493e631a4e5b
Author: Evan Welsh <contact evanwelsh com>
Date:   Sun Apr 25 13:14:58 2021 -0700

    byteArray: Refactor functionality into Encoding
    
    Copies common functionality from byteArray into an Encoding native
    module. This refactor is the basis for the WHATWG Encoding work.

 gjs/byteArray.cpp     | 111 +-----------------------------------
 gjs/context.cpp       |   3 +
 gjs/text-encoding.cpp | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++
 gjs/text-encoding.h   |  23 ++++++++
 meson.build           |   1 +
 5 files changed, 184 insertions(+), 108 deletions(-)
---
diff --git a/gjs/byteArray.cpp b/gjs/byteArray.cpp
index 1635e902..60826ab2 100644
--- a/gjs/byteArray.cpp
+++ b/gjs/byteArray.cpp
@@ -28,6 +28,7 @@
 #include "gjs/deprecation.h"
 #include "gjs/jsapi-util-args.h"
 #include "gjs/jsapi-util.h"
+#include "gjs/text-encoding.h"
 #include "util/misc.h"  // for _gjs_memdup2
 
 /* Callbacks to use with JS::NewExternalArrayBuffer() */
@@ -42,112 +43,6 @@ static void bytes_unref_arraybuffer(void* contents [[maybe_unused]],
     g_bytes_unref(gbytes);
 }
 
-GJS_JSAPI_RETURN_CONVENTION
-bool to_string_impl_slow(JSContext* cx, uint8_t* data, uint32_t len,
-                         const char* encoding, JS::MutableHandleValue rval) {
-    size_t bytes_written;
-    GError* error = nullptr;
-    GjsAutoChar u16_str = g_convert(reinterpret_cast<char*>(data), len,
-    // Make sure the bytes of the UTF-16 string are laid out in memory
-    // such that we can simply reinterpret_cast<char16_t> them.
-#if G_BYTE_ORDER == G_LITTLE_ENDIAN
-                                    "UTF-16LE",
-#else
-                                    "UTF-16BE",
-#endif
-                                    encoding, nullptr, /* bytes read */
-                                    &bytes_written, &error);
-    if (!u16_str)
-        return gjs_throw_gerror_message(cx, error);  // frees GError
-
-    // bytes_written should be bytes in a UTF-16 string so should be a multiple
-    // of 2
-    g_assert((bytes_written % 2) == 0);
-
-    // g_convert 0-terminates the string, although the 0 isn't included in
-    // bytes_written
-    JSString* s =
-        JS_NewUCStringCopyZ(cx, reinterpret_cast<char16_t*>(u16_str.get()));
-    if (!s)
-        return false;
-
-    rval.setString(s);
-    return true;
-}
-
-/* implement toString() with an optional encoding arg */
-GJS_JSAPI_RETURN_CONVENTION
-static bool to_string_impl(JSContext* context, JS::HandleObject byte_array,
-                           const char* encoding, JS::MutableHandleValue rval) {
-    if (!JS_IsUint8Array(byte_array)) {
-        gjs_throw(context,
-                  "Argument to ByteArray.toString() must be a Uint8Array");
-        return false;
-    }
-
-    bool encoding_is_utf8;
-    uint8_t* data;
-
-    if (encoding) {
-        /* maybe we should be smarter about utf8 synonyms here.
-         * doesn't matter much though. encoding_is_utf8 is
-         * just an optimization anyway.
-         */
-        encoding_is_utf8 = (strcmp(encoding, "UTF-8") == 0);
-    } else {
-        encoding_is_utf8 = true;
-    }
-
-    uint32_t len;
-    bool is_shared_memory;
-    js::GetUint8ArrayLengthAndData(byte_array, &len, &is_shared_memory, &data);
-
-    if (len == 0) {
-        rval.setString(JS_GetEmptyString(context));
-        return true;
-    }
-
-    if (!encoding_is_utf8)
-        return to_string_impl_slow(context, data, len, encoding, rval);
-
-    // optimization, avoids iconv overhead and runs libmozjs hardwired
-    // utf8-to-utf16
-
-    // If there are any 0 bytes, including the terminating byte, stop at the
-    // first one
-    if (data[len - 1] == 0 || memchr(data, 0, len)) {
-        if (!gjs_string_from_utf8(context, reinterpret_cast<char*>(data), rval))
-            return false;
-    } else {
-        if (!gjs_string_from_utf8_n(context, reinterpret_cast<char*>(data), len,
-                                    rval))
-            return false;
-    }
-
-    uint8_t* current_data;
-    uint32_t current_len;
-    bool ignore_val;
-
-    // If a garbage collection occurs between when we call
-    // js::GetUint8ArrayLengthAndData and return from gjs_string_from_utf8, a
-    // use-after-free corruption can occur if the garbage collector shifts the
-    // location of the Uint8Array's private data. To mitigate this we call
-    // js::GetUint8ArrayLengthAndData again and then compare if the length and
-    // pointer are still the same. If the pointers differ, we use the slow path
-    // to ensure no data corruption occurred. The shared-ness of an array cannot
-    // change between calls, so we ignore it.
-    js::GetUint8ArrayLengthAndData(byte_array, &current_len, &ignore_val,
-                                   &current_data);
-
-    // Ensure the private data hasn't changed
-    if (current_len == len && current_data == data)
-        return true;
-
-    // This was the UTF-8 optimized path, so we explicitly pass the encoding
-    return to_string_impl_slow(context, current_data, current_len, "UTF-8",
-                               rval);
-}
-
 GJS_JSAPI_RETURN_CONVENTION
 static bool to_string_func(JSContext* cx, unsigned argc, JS::Value* vp) {
     JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
@@ -158,7 +53,7 @@ static bool to_string_func(JSContext* cx, unsigned argc, JS::Value* vp) {
                              &byte_array, "encoding", &encoding))
         return false;
 
-    return to_string_impl(cx, byte_array, encoding.get(), args.rval());
+    return bytearray_to_string(cx, byte_array, encoding.get(), args.rval());
 }
 
 /* Workaround to keep existing code compatible. This function is tacked onto
@@ -176,7 +71,7 @@ static bool instance_to_string_func(JSContext* cx, unsigned argc,
     if (!gjs_parse_call_args(cx, "toString", args, "|s", "encoding", &encoding))
         return false;
 
-    return to_string_impl(cx, this_obj, encoding.get(), args.rval());
+    return bytearray_to_string(cx, this_obj, encoding.get(), args.rval());
 }
 
 GJS_JSAPI_RETURN_CONVENTION
diff --git a/gjs/context.cpp b/gjs/context.cpp
index 948ca8c6..41e3297e 100644
--- a/gjs/context.cpp
+++ b/gjs/context.cpp
@@ -77,6 +77,7 @@
 #include "gjs/objectbox.h"
 #include "gjs/profiler-private.h"
 #include "gjs/profiler.h"
+#include "gjs/text-encoding.h"
 #include "modules/modules.h"
 #include "util/log.h"
 
@@ -319,6 +320,8 @@ gjs_context_class_init(GjsContextClass *klass)
     }
 
     gjs_register_native_module("_byteArrayNative", gjs_define_byte_array_stuff);
+    gjs_register_native_module("_encodingNative",
+                               gjs_define_text_encoding_stuff);
     gjs_register_native_module("_gi", gjs_define_private_gi_stuff);
     gjs_register_native_module("gi", gjs_define_repo);
 
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
new file mode 100644
index 00000000..98d3dca9
--- /dev/null
+++ b/gjs/text-encoding.cpp
@@ -0,0 +1,154 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; -*- */
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2010 litl, LLC
+// SPDX-FileCopyrightText: 2021 Evan Welsh
+
+#include <config.h>
+
+#include <stdint.h>
+#include <string.h>  // for strcmp, memchr, strlen
+
+#include <algorithm>
+#include <vector>
+
+#include <gio/gio.h>
+#include <girepository.h>
+#include <glib-object.h>
+#include <glib.h>
+
+#include <js/ArrayBuffer.h>
+#include <js/CallArgs.h>
+#include <js/CharacterEncoding.h>
+#include <js/GCAPI.h>  // for AutoCheckCannotGC
+#include <js/PropertySpec.h>
+#include <js/RootingAPI.h>
+#include <js/TypeDecls.h>
+#include <js/Utility.h>   // for UniqueChars
+#include <jsapi.h>        // for JS_DefineFunctionById, JS_DefineFun...
+#include <jsfriendapi.h>  // for JS_NewUint8ArrayWithBuffer, GetUint...
+
+#include "gi/boxed.h"
+#include "gjs/atoms.h"
+#include "gjs/context-private.h"
+#include "gjs/deprecation.h"
+#include "gjs/jsapi-util-args.h"
+#include "gjs/jsapi-util.h"
+#include "gjs/text-encoding.h"
+
+GJS_JSAPI_RETURN_CONVENTION
+static bool to_string_impl_slow(JSContext* cx, uint8_t* data, uint32_t len,
+                                const char* encoding,
+                                JS::MutableHandleValue rval) {
+    size_t bytes_written;
+    GError* error = nullptr;
+    GjsAutoChar u16_str = g_convert(reinterpret_cast<char*>(data), len,
+    // Make sure the bytes of the UTF-16 string are laid out in memory
+    // such that we can simply reinterpret_cast<char16_t> them.
+#if G_BYTE_ORDER == G_LITTLE_ENDIAN
+                                    "UTF-16LE",
+#else
+                                    "UTF-16BE",
+#endif
+                                    encoding, nullptr, /* bytes read */
+                                    &bytes_written, &error);
+    if (!u16_str)
+        return gjs_throw_gerror_message(cx, error);  // frees GError
+
+    // bytes_written should be bytes in a UTF-16 string so should be a multiple
+    // of 2
+    g_assert((bytes_written % 2) == 0);
+
+    // g_convert 0-terminates the string, although the 0 isn't included in
+    // bytes_written
+    JSString* s =
+        JS_NewUCStringCopyZ(cx, reinterpret_cast<char16_t*>(u16_str.get()));
+    if (!s)
+        return false;
+
+    rval.setString(s);
+    return true;
+}
+
+// implement ByteArray.toString() with an optional encoding arg
+bool bytearray_to_string(JSContext* context, JS::HandleObject byte_array,
+                         const char* encoding, JS::MutableHandleValue rval) {
+    if (!JS_IsUint8Array(byte_array)) {
+        gjs_throw(context,
+                  "Argument to ByteArray.toString() must be a Uint8Array");
+        return false;
+    }
+
+    bool encoding_is_utf8;
+    uint8_t* data;
+
+    if (encoding) {
+        /* maybe we should be smarter about utf8 synonyms here.
+         * doesn't matter much though. encoding_is_utf8 is
+         * just an optimization anyway.
+         */
+        encoding_is_utf8 = (strcmp(encoding, "UTF-8") == 0);
+    } else {
+        encoding_is_utf8 = true;
+    }
+
+    uint32_t len;
+    bool is_shared_memory;
+    js::GetUint8ArrayLengthAndData(byte_array, &len, &is_shared_memory, &data);
+
+    if (len == 0) {
+        rval.setString(JS_GetEmptyString(context));
+        return true;
+    }
+
+    if (!encoding_is_utf8)
+        return to_string_impl_slow(context, data, len, encoding, rval);
+
+    // optimization, avoids iconv overhead and runs libmozjs hardwired
+    // utf8-to-utf16
+
+    // If there are any 0 bytes, including the terminating byte, stop at the
+    // first one
+    if (data[len - 1] == 0 || memchr(data, 0, len)) {
+        if (!gjs_string_from_utf8(context, reinterpret_cast<char*>(data), rval))
+            return false;
+    } else {
+        if (!gjs_string_from_utf8_n(context, reinterpret_cast<char*>(data), len,
+                                    rval))
+            return false;
+    }
+
+    uint8_t* current_data;
+    uint32_t current_len;
+    bool ignore_val;
+
+    // If a garbage collection occurs between when we call
+    // js::GetUint8ArrayLengthAndData and return from gjs_string_from_utf8, a
+    // use-after-free corruption can occur if the garbage collector shifts the
+    // location of the Uint8Array's private data. To mitigate this we call
+    // js::GetUint8ArrayLengthAndData again and then compare if the length and
+    // pointer are still the same. If the pointers differ, we use the slow path
+    // to ensure no data corruption occurred. The shared-ness of an array cannot
+    // change between calls, so we ignore it.
+    js::GetUint8ArrayLengthAndData(byte_array, &current_len, &ignore_val,
+                                   &current_data);
+
+    // Ensure the private data hasn't changed
+    if (current_len == len && current_data == data)
+        return true;
+
+    // This was the UTF-8 optimized path, so we explicitly pass the encoding
+    return to_string_impl_slow(context, current_data, current_len, "UTF-8",
+                               rval);
+}
+
+static JSFunctionSpec gjs_text_encoding_module_funcs[] = {JS_FS_END};
+
+bool gjs_define_text_encoding_stuff(JSContext* cx,
+                                    JS::MutableHandleObject module) {
+    JSObject* new_obj = JS_NewPlainObject(cx);
+    if (!new_obj)
+        return false;
+    module.set(new_obj);
+
+    return JS_DefineFunctions(cx, module, gjs_text_encoding_module_funcs);
+}
diff --git a/gjs/text-encoding.h b/gjs/text-encoding.h
new file mode 100644
index 00000000..e9425392
--- /dev/null
+++ b/gjs/text-encoding.h
@@ -0,0 +1,23 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; -*- */
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh
+
+#pragma once
+
+#include <config.h>
+
+#include <stddef.h>  // for size_t
+
+#include <glib.h>
+
+#include <js/TypeDecls.h>
+
+#include "gjs/macros.h"
+
+GJS_JSAPI_RETURN_CONVENTION
+bool bytearray_to_string(JSContext* cx, JS::HandleObject uint8array,
+                         const char* encoding, JS::MutableHandleValue rval);
+
+GJS_JSAPI_RETURN_CONVENTION
+bool gjs_define_text_encoding_stuff(JSContext* cx,
+                                    JS::MutableHandleObject module);
diff --git a/meson.build b/meson.build
index 267ab9ad..1a8a0b63 100644
--- a/meson.build
+++ b/meson.build
@@ -406,6 +406,7 @@ libgjs_sources = [
     'gjs/native.cpp', 'gjs/native.h',
     'gjs/objectbox.cpp', 'gjs/objectbox.h',
     'gjs/profiler.cpp', 'gjs/profiler-private.h',
+    'gjs/text-encoding.cpp', 'gjs/text-encoding.h',
     'gjs/stack.cpp',
     'modules/console.cpp', 'modules/console.h',
     'modules/modules.cpp', 'modules/modules.h',


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]