[gjs/ewlsh/text-encoding: 3/5] modules: Implement non-fatal encoding and decoding
- From: Evan Welsh <ewlsh src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gjs/ewlsh/text-encoding: 3/5] modules: Implement non-fatal encoding and decoding
- Date: Mon, 5 Jul 2021 05:24:27 +0000 (UTC)
commit ab3f360172a23a6ad657b5157ecf464c2b6dbf58
Author: Evan Welsh <contact evanwelsh com>
Date: Sun Jul 4 22:16:36 2021 -0700
modules: Implement non-fatal encoding and decoding
gjs/jsapi-util-string.cpp | 44 +++++++++++
gjs/jsapi-util.h | 5 ++
gjs/text-encoding.cpp | 189 ++++++++++++++++++++++++++++++++++++++--------
gjs/text-encoding.h | 3 +-
modules/core/_text.js | 2 +-
5 files changed, 211 insertions(+), 32 deletions(-)
---
diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp
index 49226d53..511df527 100644
--- a/gjs/jsapi-util-string.cpp
+++ b/gjs/jsapi-util-string.cpp
@@ -127,6 +127,50 @@ bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, char** output,
return true;
}
+/**
+ * gjs_lossy_string_from_utf8:
+ *
+ * @brief Converts a NUL-terminated array of UTF-8 characters to a JS string.
+ * Instead of throwing, any invalid sequences will be converted to the
+ * Unicode replacement character.
+ *
+ * @param cx the current #JSContext
+ * @param utf8_string a NUL-terminated array of UTF-8 characters
+ *
+ * @returns the newly created string, or nullptr if the conversion or the
+ * string allocation failed
+ */
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string) {
+    // Share a single implementation with the explicit-length variant rather
+    // than duplicating the conversion here.
+    // NOTE(review): the previous body wrapped the input in a
+    // JS::ConstUTF8CharsZ with an explicit length, which is believed to
+    // assert well-formed UTF-8 in debug SpiderMonkey builds; a lossy decoder
+    // must accept malformed input, so JS::UTF8Chars (used by the _n variant)
+    // looks like the safer wrapper - confirm against js/CharacterEncoding.h.
+    return gjs_lossy_string_from_utf8_n(cx, utf8_string, strlen(utf8_string));
+}
+
+/**
+ * gjs_lossy_string_from_utf8_n:
+ *
+ * @brief Length-delimited counterpart of gjs_lossy_string_from_utf8():
+ * exactly @len bytes of @utf8_string are converted, with the same lossy
+ * treatment of invalid input.
+ *
+ * @param cx the current #JSContext
+ * @param utf8_string an array of UTF-8 characters
+ * @param len the number of bytes of @utf8_string to convert
+ *
+ * @returns the newly created string, or nullptr if the conversion or the
+ * string allocation failed
+ */
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+                                       size_t len) {
+    size_t utf16_len;
+    JS::UTF8Chars utf8(utf8_string, len);
+
+    // Take ownership of the inflated buffer immediately so that it is
+    // released after its contents have been copied into the JSString.
+    JS::UniqueTwoByteChars utf16(
+        JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, utf8, &utf16_len,
+                                             js::MallocArena)
+            .get());
+
+    return utf16 ? JS_NewUCStringCopyN(cx, utf16.get(), utf16_len) : nullptr;
+}
+
bool
gjs_string_from_utf8(JSContext *context,
const char *utf8_string,
diff --git a/gjs/jsapi-util.h b/gjs/jsapi-util.h
index c43fd860..a98a7974 100644
--- a/gjs/jsapi-util.h
+++ b/gjs/jsapi-util.h
@@ -454,6 +454,11 @@ GJS_JSAPI_RETURN_CONVENTION
bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, char** output,
size_t* output_len);
GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string);
+GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+ size_t len);
+GJS_JSAPI_RETURN_CONVENTION
bool gjs_string_from_utf8(JSContext *context,
const char *utf8_string,
JS::MutableHandleValue value_p);
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 0f5ddb68..8e25a22f 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -61,6 +61,124 @@ static const char* UTF16_CODESET = "UTF-16LE";
static const char* UTF16_CODESET = "UTF-16BE";
#endif
+/**
+ * gjs_lossy_decode_from_uint8array_slow:
+ *
+ * @brief Decodes bytes in an arbitrary codeset into a JS string by running
+ * them through a GCharsetConverter targeting UTF16_CODESET. Input that the
+ * converter rejects as invalid or incomplete does not abort the decoding:
+ * one byte is skipped and U+FFFD is appended to the output in its place.
+ *
+ * @param cx the current #JSContext
+ * @param bytes the encoded input
+ * @param bytes_len the number of bytes in @bytes
+ * @param from_codeset the name of the codeset @bytes is encoded in
+ *
+ * @returns the decoded string; if the converter cannot be created or fails
+ * with an unexpected error, the result of gjs_throw_type_error_from_gerror()
+ */
+GJS_JSAPI_RETURN_CONVENTION
+static JSString* gjs_lossy_decode_from_uint8array_slow(
+    JSContext* cx, uint8_t* bytes, size_t bytes_len, const char* from_codeset) {
+    GError* error = nullptr;
+    GjsAutoUnref<GCharsetConverter> converter(
+        g_charset_converter_new(UTF16_CODESET, from_codeset, &error));
+
+    // This should only throw if an encoding is not available.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // Output buffer sizes, counted in UTF-16 code units: 1024 bytes to start
+    // with, grown by 512 bytes whenever the converter runs out of space.
+    const size_t initial_units = 512;
+    const size_t growth_units = 256;
+
+    // TODO(ewlsh): We can likely be more intelligent about our initial
+    // allocation and allocate based on bytes_len
+    //
+    // The buffer holds char16_t directly so that the converter's output can
+    // be appended without reinterpreting a char array, and it lives outside
+    // the loop so that it is not reallocated on every iteration.
+    std::vector<char16_t> buffer(initial_units);
+
+    // Cast data to correct input types
+    const char* input = reinterpret_cast<const char*>(bytes);
+    size_t input_len = bytes_len;
+
+    // The base string that we'll append to.
+    std::u16string output_str;
+
+    do {
+        size_t bytes_written = 0, bytes_read = 0;
+
+        g_converter_convert(G_CONVERTER(converter.get()), input, input_len,
+                            buffer.data(), buffer.size() * sizeof(char16_t),
+                            G_CONVERTER_INPUT_AT_END, &bytes_read,
+                            &bytes_written, &error);
+
+        // Advance past whatever was consumed (a no-op if nothing was read).
+        input += bytes_read;
+        input_len -= bytes_read;
+
+        if (bytes_written > 0) {
+            // The converter reports bytes; the accumulator counts UTF-16
+            // code units, which are two bytes each.
+            output_str.append(buffer.data(), bytes_written / sizeof(char16_t));
+        } else if (error) {
+            // A PARTIAL_INPUT error can only occur if the user does not
+            // provide the full sequence for a multi-byte character.
+            //
+            // An INVALID_DATA error occurs when there is no way to decode a
+            // given byte into UTF-16 or the given byte does not exist in the
+            // source encoding.
+            //
+            // In both cases we skip over the next byte and insert a Unicode
+            // fallback character.
+            if (g_error_matches(error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA) ||
+                g_error_matches(error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
+                // If we're already at the end of the input, don't insert a
+                // fallback.
+                if (input_len > 0) {
+                    input += 1;
+                    input_len -= 1;
+
+                    output_str.push_back(u'\ufffd');
+                }
+
+                g_clear_error(&error);
+            } else if (g_error_matches(error, G_IO_ERROR,
+                                       G_IO_ERROR_NO_SPACE)) {
+                // The buffer was too small: enlarge it and re-try the
+                // conversion on the next iteration.
+                buffer.resize(buffer.size() + growth_units);
+
+                g_clear_error(&error);
+            }
+            // Any other error is left set, which ends the loop below.
+        }
+
+        // Stop decoding if an unknown error occurs.
+    } while (input_len > 0 && !error);
+
+    // An unexpected error occurred.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // Copy the accumulator's data into a JSString of Unicode (UTF-16) chars.
+    return JS_NewUCStringCopyN(cx, output_str.c_str(), output_str.size());
+}
+
+/**
+ * gjs_decode_from_uint8array_slow:
+ *
+ * @brief Decodes bytes in an arbitrary codeset into a JS string. With @fatal
+ * unset the work is handed to gjs_lossy_decode_from_uint8array_slow();
+ * otherwise the whole input is converted in one g_convert() call and any
+ * conversion error is reported through gjs_throw_type_error_from_gerror().
+ *
+ * @param cx the current #JSContext
+ * @param input the encoded input
+ * @param input_len the number of bytes in @input
+ * @param encoding the name of the codeset @input is encoded in
+ * @param fatal whether a conversion error should be reported instead of
+ * being decoded lossily
+ *
+ * @returns the decoded string, or the result of the error/lossy path
+ */
+GJS_JSAPI_RETURN_CONVENTION
+static JSString* gjs_decode_from_uint8array_slow(JSContext* cx, uint8_t* input,
+                                                 uint32_t input_len,
+                                                 const char* encoding,
+                                                 bool fatal) {
+    // Non-fatal decoding is the lossy decoder's job.
+    if (!fatal)
+        return gjs_lossy_decode_from_uint8array_slow(cx, input, input_len,
+                                                     encoding);
+
+    GError* error = nullptr;
+    size_t bytes_read, bytes_written;
+    const char* encoded = reinterpret_cast<char*>(input);
+
+    GjsAutoChar converted =
+        g_convert(encoded, input_len, UTF16_CODESET, encoding, &bytes_read,
+                  &bytes_written, &error);
+
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // g_convert() produced UTF-16, so the byte count has to be even.
+    g_assert((bytes_written % 2) == 0);
+
+    // View the converted bytes as UTF-16 code units and copy them into a
+    // new JSString.
+    const auto* utf16 = reinterpret_cast<const char16_t*>(converted.get());
+    return JS_NewUCStringCopyN(cx, utf16, bytes_written / 2);
+}
+
[[nodiscard]] static bool is_utf8_label(const char* encoding) {
// We could be smarter about utf8 synonyms here.
// For now, we handle any casing and trailing/leading
@@ -90,7 +208,8 @@ static const char* UTF16_CODESET = "UTF-16BE";
// decode() function implementation
JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
const char* encoding,
- GjsStringTermination string_termination) {
+ GjsStringTermination string_termination,
+ bool fatal) {
if (!JS_IsUint8Array(byte_array)) {
gjs_throw(cx, "Argument to decode() must be a Uint8Array");
return nullptr;
@@ -117,37 +236,44 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
// and encoders.
bool encoding_is_utf8 = is_utf8_label(encoding);
if (!encoding_is_utf8)
- return gjs_decode_from_uint8array_slow(cx, data, len, encoding);
+ return gjs_decode_from_uint8array_slow(cx, data, len, encoding, fatal);
JS::RootedString decoded(cx);
- JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
- JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
-
- // If an exception occurred, we need to check if the
- // exception was an InternalError. Unfortunately,
- // SpiderMonkey's decoder can throw InternalError for some
- // invalid UTF-8 sources, we have to convert this into a
- // TypeError to match the Encoding specification.
- if (str) {
- decoded.set(str);
+ if (!fatal) {
+ decoded.set(gjs_lossy_string_from_utf8_n(
+ cx, reinterpret_cast<char*>(data), len));
} else {
- if (!JS_IsExceptionPending(cx))
- return nullptr;
- JS::RootedValue exc(cx);
- if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
- return nullptr;
+ JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
+ JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
+
+ // If an exception occurred, we need to check if the
+ // exception was an InternalError. Unfortunately,
+ // SpiderMonkey's decoder can throw InternalError for some
+ // invalid UTF-8 sources, we have to convert this into a
+ // TypeError to match the Encoding specification.
+ if (str) {
+ decoded.set(str);
+ } else {
+ if (!JS_IsExceptionPending(cx))
+ return nullptr;
+ JS::RootedValue exc(cx);
- JS::RootedObject exc_obj(cx, &exc.toObject());
- const JSClass* internal_error =
- js::ProtoKeyToClass(JSProto_InternalError);
- if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
- // Clear the existing exception.
- JS_ClearPendingException(cx);
- gjs_throw_custom(cx, JSProto_TypeError, nullptr,
- "The provided encoded data was not valid UTF-8");
- }
+ if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
+ return nullptr;
- return nullptr;
+ JS::RootedObject exc_obj(cx, &exc.toObject());
+ const JSClass* internal_error =
+ js::ProtoKeyToClass(JSProto_InternalError);
+ if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
+ // Clear the existing exception.
+ JS_ClearPendingException(cx);
+ gjs_throw_custom(
+ cx, JSProto_TypeError, nullptr,
+ "The provided encoded data was not valid UTF-8");
+ }
+
+ return nullptr;
+ }
}
uint8_t* current_data;
@@ -188,13 +314,16 @@ static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) {
JS::RootedObject byte_array(cx);
JS::UniqueChars encoding;
- if (!gjs_parse_call_args(cx, "decode", args, "os", "byteArray", &byte_array,
- "encoding", &encoding))
+ bool fatal = false;
+ if (!gjs_parse_call_args(cx, "decode", args, "os|b", "byteArray",
+ &byte_array, "encoding", &encoding, "fatal",
+ &fatal))
return false;
JS::RootedString decoded(
cx, gjs_decode_from_uint8array(cx, byte_array, encoding.get(),
- GjsStringTermination::EXPLICIT_LENGTH));
+ GjsStringTermination::EXPLICIT_LENGTH,
+ fatal));
if (!decoded)
return false;
diff --git a/gjs/text-encoding.h b/gjs/text-encoding.h
index eee174bb..e4daa85e 100644
--- a/gjs/text-encoding.h
+++ b/gjs/text-encoding.h
@@ -22,7 +22,8 @@ enum class GjsStringTermination {
GJS_JSAPI_RETURN_CONVENTION
JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject uint8array,
const char* encoding,
- GjsStringTermination string_termination);
+ GjsStringTermination string_termination,
+ bool fatal);
GJS_JSAPI_RETURN_CONVENTION
JSObject* gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str,
diff --git a/modules/core/_text.js b/modules/core/_text.js
index a54d4342..e3cefb90 100644
--- a/modules/core/_text.js
+++ b/modules/core/_text.js
@@ -125,7 +125,7 @@ var TextDecoder = class TextDecoder {
input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
}
- return Encoding.decode(input, this._internalEncoding);
+ return Encoding.decode(input, this._internalEncoding, this.fatal);
}
}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]