[gjs/ewlsh/text-encoding: 3/4] modules: Implement non-fatal encoding and decoding
- From: Evan Welsh <ewlsh src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gjs/ewlsh/text-encoding: 3/4] modules: Implement non-fatal encoding and decoding
- Date: Thu, 12 Aug 2021 01:35:48 +0000 (UTC)
commit e85ad680ae1e60d7a19a63ced17f71caa4efcdf9
Author: Evan Welsh <contact evanwelsh com>
Date: Sun Jul 4 22:16:36 2021 -0700
modules: Implement non-fatal encoding and decoding
gjs/byteArray.cpp | 4 +-
gjs/jsapi-util-string.cpp | 44 ++++++++
gjs/jsapi-util.h | 5 +
gjs/text-encoding.cpp | 204 ++++++++++++++++++++++++++++++++------
gjs/text-encoding.h | 3 +-
modules/esm/_encoding/encoding.js | 2 +-
6 files changed, 227 insertions(+), 35 deletions(-)
---
diff --git a/gjs/byteArray.cpp b/gjs/byteArray.cpp
index a5979df0..e3afc11c 100644
--- a/gjs/byteArray.cpp
+++ b/gjs/byteArray.cpp
@@ -50,7 +50,7 @@ static bool to_string_func(JSContext* cx, unsigned argc, JS::Value* vp) {
const char* actual_encoding = encoding ? encoding.get() : "utf-8";
JS::RootedString str(
cx, gjs_decode_from_uint8array(cx, byte_array, actual_encoding,
- GjsStringTermination::ZERO_TERMINATED));
+ GjsStringTermination::ZERO_TERMINATED, true));
if (!str)
return false;
@@ -76,7 +76,7 @@ static bool instance_to_string_func(JSContext* cx, unsigned argc,
const char* actual_encoding = encoding ? encoding.get() : "utf-8";
JS::RootedString str(
cx, gjs_decode_from_uint8array(cx, this_obj, actual_encoding,
- GjsStringTermination::ZERO_TERMINATED));
+ GjsStringTermination::ZERO_TERMINATED, true));
if (!str)
return false;
diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp
index 51ca5f9c..692a719d 100644
--- a/gjs/jsapi-util-string.cpp
+++ b/gjs/jsapi-util-string.cpp
@@ -129,6 +129,50 @@ bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, JS::UniqueChars*
return true;
}
+/**
+ * gjs_lossy_string_from_utf8:
+ *
+ * @brief Converts a zero-terminated array of UTF-8 characters to a JS string.
+ * Instead of throwing, the conversion is delegated to SpiderMonkey's
+ * lossy decoder (JS::LossyUTF8CharsToNewTwoByteCharsZ), which substitutes
+ * a fallback character for invalid input (presumably U+FFFD, the Unicode
+ * replacement character; confirm against the SpiderMonkey documentation).
+ *
+ * @param cx the current #JSContext
+ * @param utf8_string a zero-terminated array of UTF-8 characters; its
+ * length is determined with strlen()
+ *
+ * @returns the newly created string, or nullptr if the lossy conversion
+ * did not produce a buffer or the string could not be created
+ */
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string) {
+ JS::ConstUTF8CharsZ chars(utf8_string, strlen(utf8_string));
+ size_t outlen;
+ JS::UniqueTwoByteChars twobyte_chars(
+ JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen,
+ js::MallocArena)
+ .get());
+ if (!twobyte_chars)
+ return nullptr;
+
+ return JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen);
+}
+
+/**
+ * gjs_lossy_string_from_utf8_n:
+ *
+ * @brief Provides the same conversion behavior as gjs_lossy_string_from_utf8
+ * with a fixed length. See gjs_lossy_string_from_utf8()
+ *
+ * @param cx the current #JSContext
+ * @param utf8_string an array of UTF-8 characters; its length is given
+ * explicitly by @len rather than found with strlen()
+ * @param len the number of bytes of @utf8_string to convert
+ *
+ * @returns the newly created string, or nullptr if the lossy conversion
+ * did not produce a buffer or the string could not be created
+ */
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+ size_t len) {
+ JS::UTF8Chars chars(utf8_string, len);
+ size_t outlen;
+ JS::UniqueTwoByteChars twobyte_chars(
+ JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen,
+ js::MallocArena)
+ .get());
+ if (!twobyte_chars)
+ return nullptr;
+
+ return JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen);
+}
+
bool
gjs_string_from_utf8(JSContext *context,
const char *utf8_string,
diff --git a/gjs/jsapi-util.h b/gjs/jsapi-util.h
index ec648347..388de031 100644
--- a/gjs/jsapi-util.h
+++ b/gjs/jsapi-util.h
@@ -454,6 +454,11 @@ GJS_JSAPI_RETURN_CONVENTION
bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, JS::UniqueChars* output,
size_t* output_len);
GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string);
+GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+ size_t len);
+GJS_JSAPI_RETURN_CONVENTION
bool gjs_string_from_utf8(JSContext *context,
const char *utf8_string,
JS::MutableHandleValue value_p);
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 10ae498e..edf9b354 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -63,9 +63,141 @@ static const char* UTF16_CODESET = "UTF-16BE";
#endif
GJS_JSAPI_RETURN_CONVENTION
-static JSString* gjs_decode_from_uint8array_slow(JSContext* cx, uint8_t* input,
+static JSString* gjs_lossy_decode_from_uint8array_slow(
+    JSContext* cx, const uint8_t* bytes, size_t bytes_len,
+    const char* from_codeset) {
+    // Lossily decodes bytes_len bytes of from_codeset-encoded data into a
+    // new JS string by converting them to UTF-16 with a GCharsetConverter.
+    // Input that cannot be decoded is skipped one byte at a time and
+    // replaced with U+FFFD instead of aborting the conversion. If the
+    // converter cannot be created, or an unexpected conversion error
+    // occurs, the GError is handed to gjs_throw_type_error_from_gerror()
+    // and its result is returned.
+    GError* error = nullptr;
+    GjsAutoUnref<GCharsetConverter> converter(
+        g_charset_converter_new(UTF16_CODESET, from_codeset, &error));
+
+    // This should only throw if an encoding is not available.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // This function converts *to* UTF-16, using a std::u16string
+    // as its buffer.
+    //
+    // UTF-16 represents each character with 2 bytes or
+    // 4 bytes, the best case scenario when converting to
+    // UTF-16 is that every input byte encodes to two bytes,
+    // this is typical for ASCII and non-supplementary characters.
+    // Because we are converting from an unknown encoding
+    // technically a single byte could be supplementary in
+    // Unicode (4 bytes) or even represent multiple Unicode characters.
+    //
+    // std::u16string does not care about these implementation
+    // details, its only concern is that it consists of byte pairs.
+    // Given this, a single UTF-16 character could be represented
+    // by one or two std::u16string characters.
+
+    // Allocate bytes_len * 2 + 12 as our initial buffer.
+    // bytes_len * 2 is the "best case" for LATIN1 strings
+    // and strings which are in the basic multilingual plane.
+    // Add 12 as a slight cushion and set the minimum allocation
+    // at 256 to prefer running a single iteration for
+    // small strings with supplemental plane characters.
+    //
+    // When converting Chinese characters, for example,
+    // some dialectal characters are in the supplemental plane
+    // Adding a padding of 12 prevents a few dialectal characters
+    // from requiring a reallocation.
+    //
+    // std::max is instantiated explicitly for size_t: an `lu` literal is
+    // unsigned long, which is not the same type as size_t on every
+    // platform, and mismatched argument types fail template deduction.
+    size_t buffer_size = std::max<size_t>(bytes_len * 2 + 12, 256);
+
+    // Cast data to correct input types
+    const char* input = reinterpret_cast<const char*>(bytes);
+    size_t input_len = bytes_len;
+
+    // The base string that we'll append to.
+    std::u16string output_str = u"";
+
+    do {
+        // Create a buffer to convert into.
+        std::vector<char> buffer(buffer_size);
+        size_t bytes_written = 0, bytes_read = 0;
+
+        g_converter_convert(G_CONVERTER(converter.get()), input, input_len,
+                            buffer.data(), buffer.size(),
+                            G_CONVERTER_INPUT_AT_END, &bytes_read,
+                            &bytes_written, &error);
+
+        // If bytes were read, adjust input.
+        if (bytes_read > 0) {
+            input += bytes_read;
+            input_len -= bytes_read;
+        }
+
+        // If bytes were written append the buffer contents to our string
+        // accumulator
+        if (bytes_written > 0) {
+            char16_t* utf16_buffer = reinterpret_cast<char16_t*>(buffer.data());
+            // std::u16string uses exactly 2 bytes for every character.
+            output_str.append(utf16_buffer, bytes_written / 2);
+        } else if (error) {
+            // A PARTIAL_INPUT error can only occur if the user does not provide
+            // the full sequence for a multi-byte character, we skip over the
+            // next character and insert a unicode fallback.
+
+            // An INVALID_DATA error occurs when there is no way to decode a
+            // given byte into UTF-16 or the given byte does not exist in the
+            // source encoding.
+            if (g_error_matches(error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA) ||
+                g_error_matches(error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
+                // If we're already at the end of the string, don't insert a
+                // fallback.
+                if (input_len > 0) {
+                    // Skip the next byte and reduce length by one.
+                    input += 1;
+                    input_len -= 1;
+
+                    // Append the unicode fallback character to the output
+                    output_str.append(u"\ufffd", 1);
+                }
+
+                // Clear the error.
+                g_clear_error(&error);
+            } else if (g_error_matches(error, G_IO_ERROR,
+                                       G_IO_ERROR_NO_SPACE)) {
+                // If the buffer was full increase the buffer
+                // size and re-try the conversion.
+                //
+                // This logic allocates bytes_len * 3 first,
+                // then bytes_len * 4 (the worst case scenario
+                // is nearly impossible) and then continues appending
+                // arbitrary padding because we'll trust Gio and give
+                // it additional space.
+                if (buffer_size > bytes_len * 4) {
+                    buffer_size += 256;
+                } else {
+                    buffer_size += bytes_len;
+                }
+
+                // Clear the error.
+                g_clear_error(&error);
+            }
+        }
+
+        // Stop decoding if an unknown error occurs.
+    } while (input_len > 0 && !error);
+
+    // An unexpected error occurred.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // Copy the accumulator's data into a JSString of Unicode (UTF-16) chars.
+    return JS_NewUCStringCopyN(cx, output_str.c_str(), output_str.size());
+}
+
+GJS_JSAPI_RETURN_CONVENTION
+static JSString* gjs_decode_from_uint8array_slow(JSContext* cx,
+ const uint8_t* input,
uint32_t input_len,
- const char* encoding) {
+ const char* encoding,
+ bool fatal) {
+ // If the decoding is not fatal we use the lossy decoder.
+ if (!fatal)
+ return gjs_lossy_decode_from_uint8array_slow(cx, input, input_len,
+ encoding);
+
size_t bytes_written, bytes_read;
GError* error = nullptr;
@@ -121,7 +253,8 @@ template <class T, class L>
// decode() function implementation
JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
const char* encoding,
- GjsStringTermination string_termination) {
+ GjsStringTermination string_termination,
+ bool fatal) {
g_assert(encoding && "encoding must be non-null");
if (!JS_IsUint8Array(byte_array)) {
@@ -149,35 +282,41 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
// and encoders.
bool encoding_is_utf8 = is_utf8_label(encoding);
if (!encoding_is_utf8)
- return gjs_decode_from_uint8array_slow(cx, data, len, encoding);
+ return gjs_decode_from_uint8array_slow(cx, data, len, encoding, fatal);
JS::RootedString decoded(cx);
- JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
- JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
-
- // If an exception occurred, we need to check if the
- // exception was an InternalError. Unfortunately,
- // SpiderMonkey's decoder can throw InternalError for some
- // invalid UTF-8 sources, we have to convert this into a
- // TypeError to match the Encoding specification.
- if (str) {
- decoded.set(str);
+ if (!fatal) {
+ decoded.set(gjs_lossy_string_from_utf8_n(
+ cx, reinterpret_cast<char*>(data), len));
} else {
- JS::RootedValue exc(cx);
- if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
- return nullptr;
+ JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
+ JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
+
+ // If an exception occurred, we need to check if the
+ // exception was an InternalError. Unfortunately,
+ // SpiderMonkey's decoder can throw InternalError for some
+ // invalid UTF-8 sources, we have to convert this into a
+ // TypeError to match the Encoding specification.
+ if (str) {
+ decoded.set(str);
+ } else {
+ JS::RootedValue exc(cx);
+ if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
+ return nullptr;
+
+ JS::RootedObject exc_obj(cx, &exc.toObject());
+ const JSClass* internal_error =
+ js::ProtoKeyToClass(JSProto_InternalError);
+ if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
+ // Clear the existing exception.
+ JS_ClearPendingException(cx);
+ gjs_throw_custom(
+ cx, JSProto_TypeError, nullptr,
+ "The provided encoded data was not valid UTF-8");
+ }
- JS::RootedObject exc_obj(cx, &exc.toObject());
- const JSClass* internal_error =
- js::ProtoKeyToClass(JSProto_InternalError);
- if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
- // Clear the existing exception.
- JS_ClearPendingException(cx);
- gjs_throw_custom(cx, JSProto_TypeError, nullptr,
- "The provided encoded data was not valid UTF-8");
+ return nullptr;
}
-
- return nullptr;
}
uint8_t* current_data;
@@ -204,7 +343,7 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
// This was the UTF-8 optimized path, so we explicitly pass the encoding
return gjs_decode_from_uint8array_slow(cx, current_data, current_len,
- "UTF-8");
+ "utf-8", fatal);
}
GJS_JSAPI_RETURN_CONVENTION
@@ -213,13 +352,16 @@ static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) {
JS::RootedObject byte_array(cx);
JS::UniqueChars encoding;
- if (!gjs_parse_call_args(cx, "decode", args, "os", "byteArray", &byte_array,
- "encoding", &encoding))
+ bool fatal = false;
+ if (!gjs_parse_call_args(cx, "decode", args, "os|b", "byteArray",
+ &byte_array, "encoding", &encoding, "fatal",
+ &fatal))
return false;
JS::RootedString decoded(
cx, gjs_decode_from_uint8array(cx, byte_array, encoding.get(),
- GjsStringTermination::EXPLICIT_LENGTH));
+ GjsStringTermination::EXPLICIT_LENGTH,
+ fatal));
if (!decoded)
return false;
diff --git a/gjs/text-encoding.h b/gjs/text-encoding.h
index eee174bb..e4daa85e 100644
--- a/gjs/text-encoding.h
+++ b/gjs/text-encoding.h
@@ -22,7 +22,8 @@ enum class GjsStringTermination {
GJS_JSAPI_RETURN_CONVENTION
JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject uint8array,
const char* encoding,
- GjsStringTermination string_termination);
+ GjsStringTermination string_termination,
+ bool fatal);
GJS_JSAPI_RETURN_CONVENTION
JSObject* gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str,
diff --git a/modules/esm/_encoding/encoding.js b/modules/esm/_encoding/encoding.js
index 3e2f449b..e84b752b 100644
--- a/modules/esm/_encoding/encoding.js
+++ b/modules/esm/_encoding/encoding.js
@@ -132,7 +132,7 @@ class TextDecoder {
input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
}
- return Encoding.decode(input, this._internalEncoding);
+ return Encoding.decode(input, this._internalEncoding, this.fatal);
}
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]