[gjs/ewlsh/text-encoding: 3/5] modules: Implement non-fatal encoding and decoding

From: Evan Welsh <ewlsh src gnome org>
To: commits-list gnome org
Cc:
Subject: [gjs/ewlsh/text-encoding: 3/5] modules: Implement non-fatal encoding and decoding
Date: Mon, 5 Jul 2021 05:24:27 +0000 (UTC)

commit ab3f360172a23a6ad657b5157ecf464c2b6dbf58
Author: Evan Welsh <contact evanwelsh com>
Date:   Sun Jul 4 22:16:36 2021 -0700

    modules: Implement non-fatal encoding and decoding

 gjs/jsapi-util-string.cpp |  44 +++++++++++
 gjs/jsapi-util.h          |   5 ++
 gjs/text-encoding.cpp     | 189 ++++++++++++++++++++++++++++++++++++++--------
 gjs/text-encoding.h       |   3 +-
 modules/core/_text.js     |   2 +-
 5 files changed, 211 insertions(+), 32 deletions(-)
---
diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp
index 49226d53..511df527 100644
--- a/gjs/jsapi-util-string.cpp
+++ b/gjs/jsapi-util-string.cpp
@@ -127,6 +127,50 @@ bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, char** output,
     return true;
 }
 
+/**
+ * gjs_lossy_string_from_utf8:
+ *
+ * @brief Converts a null-terminated array of UTF-8 characters to a JS
+ * string. Instead of throwing, any invalid sequences are replaced by the
+ * lossy decoder's replacement character (presumably U+FFFD -- confirm).
+ *
+ * @param cx the current #JSContext
+ * @param utf8_string a null-terminated array of UTF-8 characters
+ * @returns the newly created JS string, or nullptr on failure
+ */
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string) {
+    JS::ConstUTF8CharsZ chars(utf8_string, strlen(utf8_string));
+    size_t outlen;
+    JS::UniqueTwoByteChars twobyte_chars(
+        JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen,
+                                             js::MallocArena)
+            .get());
+    if (!twobyte_chars)
+        return nullptr;
+
+    return JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen);
+}
+
+/**
+ * gjs_lossy_string_from_utf8_n:
+ *
+ * @brief Same conversion as gjs_lossy_string_from_utf8(), but reads exactly
+ * len bytes of utf8_string instead of measuring the input with strlen().
+ */
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+                                       size_t len) {
+    JS::UTF8Chars chars(utf8_string, len);
+    size_t outlen;
+    JS::UniqueTwoByteChars twobyte_chars(
+        JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen,
+                                             js::MallocArena)
+            .get());
+    if (!twobyte_chars)
+        return nullptr;
+
+    return JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen);
+}
+
 bool
 gjs_string_from_utf8(JSContext             *context,
                      const char            *utf8_string,
diff --git a/gjs/jsapi-util.h b/gjs/jsapi-util.h
index c43fd860..a98a7974 100644
--- a/gjs/jsapi-util.h
+++ b/gjs/jsapi-util.h
@@ -454,6 +454,11 @@ GJS_JSAPI_RETURN_CONVENTION
 bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, char** output,
                           size_t* output_len);
 GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string);
+GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+                                       size_t len);
+GJS_JSAPI_RETURN_CONVENTION
 bool gjs_string_from_utf8(JSContext             *context,
                           const char            *utf8_string,
                           JS::MutableHandleValue value_p);
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 0f5ddb68..8e25a22f 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -61,6 +61,124 @@ static const char* UTF16_CODESET = "UTF-16LE";
 static const char* UTF16_CODESET = "UTF-16BE";
 #endif
 
+GJS_JSAPI_RETURN_CONVENTION
+static JSString* gjs_lossy_decode_from_uint8array_slow(
+    JSContext* cx, uint8_t* bytes, size_t bytes_len, const char* from_codeset) {
+    GError* error = nullptr;
+    GjsAutoUnref<GCharsetConverter> converter(
+        g_charset_converter_new(UTF16_CODESET, from_codeset, &error));
+
+    // This should only fail if an encoding is not available.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // TODO(ewlsh): We can likely be more intelligent about our initial
+    // allocation and allocate based on bytes_len
+    int buffer_size = 1024;
+
+    // Cast data to correct input types
+    const char* input = reinterpret_cast<const char*>(bytes);
+    size_t input_len = bytes_len;
+
+    // The UTF-16 accumulator that each converted chunk is appended to.
+    std::u16string output_str = u"";
+
+    do {
+        // Create a fresh buffer (of the current buffer_size) to convert into.
+        std::vector<char> buffer(buffer_size);
+        size_t bytes_written = 0, bytes_read = 0;
+
+        g_converter_convert(G_CONVERTER(converter.get()), input, input_len,
+                            buffer.data(), buffer.size(),
+                            G_CONVERTER_INPUT_AT_END, &bytes_read,
+                            &bytes_written, &error);
+
+        // If bytes were read, advance past the consumed input.
+        if (bytes_read > 0) {
+            input += bytes_read;
+            input_len -= bytes_read;
+        }
+
+        // If bytes were written, append the buffer contents to our string
+        // accumulator
+        if (bytes_written > 0) {
+            char16_t* utf16_buffer = reinterpret_cast<char16_t*>(buffer.data());
+            // Each UTF-16 code unit is exactly 2 bytes.
+            output_str.append(utf16_buffer, bytes_written / 2);
+        } else if (error) {
+            // A PARTIAL_INPUT error can only occur if the user does not provide
+            // the full sequence for a multi-byte character; we skip over the
+            // next byte and insert a Unicode fallback.
+
+            // An INVALID_DATA error occurs when there is no way to decode a
+            // given byte into UTF-16 or the given byte does not exist in the
+            // source encoding.
+            if (g_error_matches(error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA) ||
+                g_error_matches(error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
+                // If we're already at the end of the input, don't insert a
+                // fallback.
+                if (input_len > 0) {
+                    // Skip the next byte and reduce length by one.
+                    input += 1;
+                    input_len -= 1;
+
+                    // Append the U+FFFD replacement character to the output
+                    output_str.append(u"\ufffd", 1);
+                }
+
+                // Clear the error.
+                g_clear_error(&error);
+            } else if (g_error_matches(error, G_IO_ERROR,
+                                       G_IO_ERROR_NO_SPACE)) {
+                // If the buffer was full, increase the buffer
+                // size and retry the conversion on the next iteration.
+                buffer_size += 512;
+
+                // Clear the error.
+                g_clear_error(&error);
+            }
+        }
+
+        // Stop decoding if any other (unhandled) error is still set.
+    } while (input_len > 0 && !error);
+
+    // An unexpected error occurred.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // Copy the accumulator's data into a JSString of Unicode (UTF-16) chars.
+    return JS_NewUCStringCopyN(cx, output_str.c_str(), output_str.size());
+}
+
+GJS_JSAPI_RETURN_CONVENTION
+static JSString* gjs_decode_from_uint8array_slow(JSContext* cx, uint8_t* input,
+                                                 uint32_t input_len,
+                                                 const char* encoding,
+                                                 bool fatal) {
+    // Non-fatal decoding is delegated entirely to the lossy decoder.
+    if (!fatal)
+        return gjs_lossy_decode_from_uint8array_slow(cx, input, input_len,
+                                                     encoding);
+
+    size_t bytes_written, bytes_read;
+    GError* error = nullptr;
+
+    GjsAutoChar bytes =
+        g_convert(reinterpret_cast<char*>(input), input_len, UTF16_CODESET,
+                  encoding, &bytes_read, &bytes_written, &error);
+
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // bytes_written counts bytes of UTF-16 output, so it should be a
+    // multiple of 2
+    g_assert((bytes_written % 2) == 0);
+
+    // Cast g_convert's output to char16_t and copy the data.
+    const char16_t* unicode_bytes = reinterpret_cast<char16_t*>(bytes.get());
+    return JS_NewUCStringCopyN(cx, unicode_bytes, bytes_written / 2);
+}
+
 [[nodiscard]] static bool is_utf8_label(const char* encoding) {
     // We could be smarter about utf8 synonyms here.
     // For now, we handle any casing and trailing/leading
@@ -90,7 +208,8 @@ static const char* UTF16_CODESET = "UTF-16BE";
 // decode() function implementation
 JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
                                      const char* encoding,
-                                     GjsStringTermination string_termination) {
+                                     GjsStringTermination string_termination,
+                                     bool fatal) {
     if (!JS_IsUint8Array(byte_array)) {
         gjs_throw(cx, "Argument to decode() must be a Uint8Array");
         return nullptr;
@@ -117,37 +236,44 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
     // and encoders.
     bool encoding_is_utf8 = is_utf8_label(encoding);
     if (!encoding_is_utf8)
-        return gjs_decode_from_uint8array_slow(cx, data, len, encoding);
+        return gjs_decode_from_uint8array_slow(cx, data, len, encoding, fatal);
 
     JS::RootedString decoded(cx);
-    JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
-    JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
-
-    // If an exception occurred, we need to check if the
-    // exception was an InternalError. Unfortunately,
-    // SpiderMonkey's decoder can throw InternalError for some
-    // invalid UTF-8 sources, we have to convert this into a
-    // TypeError to match the Encoding specification.
-    if (str) {
-        decoded.set(str);
+    if (!fatal) {
+        decoded.set(gjs_lossy_string_from_utf8_n(
+            cx, reinterpret_cast<char*>(data), len));
     } else {
-        if (!JS_IsExceptionPending(cx))
-            return nullptr;
-        JS::RootedValue exc(cx);
-        if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
-            return nullptr;
+        JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
+        JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
+
+        // If an exception occurred, we need to check if the
+        // exception was an InternalError. Unfortunately,
+        // SpiderMonkey's decoder can throw InternalError for some
+        // invalid UTF-8 sources, so we have to convert this into a
+        // TypeError to match the Encoding specification.
+        if (str) {
+            decoded.set(str);
+        } else {
+            if (!JS_IsExceptionPending(cx))
+                return nullptr;
+            JS::RootedValue exc(cx);
 
-        JS::RootedObject exc_obj(cx, &exc.toObject());
-        const JSClass* internal_error =
-            js::ProtoKeyToClass(JSProto_InternalError);
-        if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
-            // Clear the existing exception.
-            JS_ClearPendingException(cx);
-            gjs_throw_custom(cx, JSProto_TypeError, nullptr,
-                             "The provided encoded data was not valid UTF-8");
-        }
+            if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
+                return nullptr;
 
-        return nullptr;
+            JS::RootedObject exc_obj(cx, &exc.toObject());
+            const JSClass* internal_error =
+                js::ProtoKeyToClass(JSProto_InternalError);
+            if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
+                // Clear the existing exception.
+                JS_ClearPendingException(cx);
+                gjs_throw_custom(
+                    cx, JSProto_TypeError, nullptr,
+                    "The provided encoded data was not valid UTF-8");
+            }
+
+            return nullptr;
+        }
     }
 
     uint8_t* current_data;
@@ -188,13 +314,16 @@ static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) {
 
     JS::RootedObject byte_array(cx);
     JS::UniqueChars encoding;
-    if (!gjs_parse_call_args(cx, "decode", args, "os", "byteArray", &byte_array,
-                             "encoding", &encoding))
+    bool fatal = false;
+    if (!gjs_parse_call_args(cx, "decode", args, "os|b", "byteArray",
+                             &byte_array, "encoding", &encoding, "fatal",
+                             &fatal))
         return false;
 
     JS::RootedString decoded(
         cx, gjs_decode_from_uint8array(cx, byte_array, encoding.get(),
-                                       GjsStringTermination::EXPLICIT_LENGTH));
+                                       GjsStringTermination::EXPLICIT_LENGTH,
+                                       fatal));
     if (!decoded)
         return false;
 
diff --git a/gjs/text-encoding.h b/gjs/text-encoding.h
index eee174bb..e4daa85e 100644
--- a/gjs/text-encoding.h
+++ b/gjs/text-encoding.h
@@ -22,7 +22,8 @@ enum class GjsStringTermination {
 GJS_JSAPI_RETURN_CONVENTION
 JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject uint8array,
                                      const char* encoding,
-                                     GjsStringTermination string_termination);
+                                     GjsStringTermination string_termination,
+                                     bool fatal);
 
 GJS_JSAPI_RETURN_CONVENTION
 JSObject* gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str,
diff --git a/modules/core/_text.js b/modules/core/_text.js
index a54d4342..e3cefb90 100644
--- a/modules/core/_text.js
+++ b/modules/core/_text.js
@@ -125,7 +125,7 @@ var TextDecoder = class TextDecoder {
             input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
         }
 
-        return Encoding.decode(input, this._internalEncoding);
+        return Encoding.decode(input, this._internalEncoding, this.fatal);
     }
 }
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]