[gjs/ewlsh/text-encoding: 1/3] modules: Implement fatal TextEncoder and TextDecoder APIs
- From: Evan Welsh <ewlsh src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gjs/ewlsh/text-encoding: 1/3] modules: Implement fatal TextEncoder and TextDecoder APIs
- Date: Thu, 12 Aug 2021 01:27:44 +0000 (UTC)
commit 5d7bc6e289daee1fe2bf0aadcfb9f80cabd8d652
Author: Evan Welsh <contact evanwelsh com>
Date: Mon Jul 5 21:25:54 2021 -0700
modules: Implement fatal TextEncoder and TextDecoder APIs
.eslintignore | 3 +
.eslintrc.yml | 5 +
gjs/text-encoding.cpp | 56 ++++++-
js.gresource.xml | 4 +
modules/esm/_bootstrap/default.js | 3 +
modules/esm/_encoding/encoding.js | 172 ++++++++++++++++++++
modules/esm/_encoding/encodingMap.js | 305 +++++++++++++++++++++++++++++++++++
modules/esm/_encoding/util.js | 37 +++++
8 files changed, 584 insertions(+), 1 deletion(-)
---
diff --git a/.eslintignore b/.eslintignore
index 9ee950d3..60e57336 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -3,4 +3,7 @@
installed-tests/js/jasmine.js
installed-tests/js/modules/badOverrides/WarnLib.js
+# Until ESLint merges class fields.
+# https://github.com/eslint/eslint/issues/14343
+modules/esm/_encoding/encoding.js
modules/script/jsUnit.js
diff --git a/.eslintrc.yml b/.eslintrc.yml
index 7ddf0e38..dadf40bd 100644
--- a/.eslintrc.yml
+++ b/.eslintrc.yml
@@ -242,6 +242,9 @@ rules:
- inside
yield-star-spacing: error
yoda: error
+settings:
+ jsdoc:
+ mode: typescript
globals:
ARGV: readonly
Debugger: readonly
@@ -254,5 +257,7 @@ globals:
print: readonly
printerr: readonly
window: readonly
+ TextEncoder: readonly
+ TextDecoder: readonly
parserOptions:
ecmaVersion: 2020
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 03600778..3aba1827 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -206,6 +206,26 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
"UTF-8");
}
+GJS_JSAPI_RETURN_CONVENTION
+static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) {
+ JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
+
+ JS::RootedObject byte_array(cx);
+ JS::UniqueChars encoding;
+ if (!gjs_parse_call_args(cx, "decode", args, "os", "byteArray", &byte_array,
+ "encoding", &encoding))
+ return false;
+
+ JS::RootedString decoded(
+ cx, gjs_decode_from_uint8array(cx, byte_array, encoding.get(),
+ GjsStringTermination::EXPLICIT_LENGTH));
+ if (!decoded)
+ return false;
+
+ args.rval().setString(decoded);
+ return true;
+}
+
// encode() function implementation
JSObject* gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str,
const char* encoding,
@@ -348,7 +368,41 @@ static bool gjs_encode_into_uint8array(JSContext* cx, JS::HandleString str,
return true;
}
-static JSFunctionSpec gjs_text_encoding_module_funcs[] = {JS_FS_END};
+GJS_JSAPI_RETURN_CONVENTION
+static bool gjs_encode(JSContext* cx, unsigned argc, JS::Value* vp) {
+ JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
+ JS::RootedString str(cx);
+ JS::UniqueChars encoding;
+ if (!gjs_parse_call_args(cx, "encode", args, "Ss", "string", &str,
+ "encoding", &encoding))
+ return false;
+
+ JS::RootedObject uint8array(
+ cx, gjs_encode_to_uint8array(cx, str, encoding.get(),
+ GjsStringTermination::EXPLICIT_LENGTH));
+ if (!uint8array)
+ return false;
+
+ args.rval().setObject(*uint8array);
+ return true;
+}
+
+GJS_JSAPI_RETURN_CONVENTION
+static bool gjs_encode_into(JSContext* cx, unsigned argc, JS::Value* vp) {
+ JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
+ JS::RootedString str(cx);
+ JS::RootedObject uint8array(cx);
+ if (!gjs_parse_call_args(cx, "encodeInto", args, "So", "string", &str,
+ "byteArray", &uint8array))
+ return false;
+
+ return gjs_encode_into_uint8array(cx, str, uint8array, args.rval());
+}
+
+static JSFunctionSpec gjs_text_encoding_module_funcs[] = {
+ JS_FN("decode", gjs_decode, 3, 0),
+ JS_FN("encodeInto", gjs_encode_into, 2, 0),
+ JS_FN("encode", gjs_encode, 2, 0), JS_FS_END};
bool gjs_define_text_encoding_stuff(JSContext* cx,
JS::MutableHandleObject module) {
diff --git a/js.gresource.xml b/js.gresource.xml
index 47be6425..947049c2 100644
--- a/js.gresource.xml
+++ b/js.gresource.xml
@@ -9,6 +9,10 @@
<!-- ESM-based modules -->
<file>modules/esm/_bootstrap/default.js</file>
+
+ <file>modules/esm/_encoding/encoding.js</file>
+ <file>modules/esm/_encoding/encodingMap.js</file>
+ <file>modules/esm/_encoding/util.js</file>
<file>modules/esm/cairo.js</file>
<file>modules/esm/gettext.js</file>
diff --git a/modules/esm/_bootstrap/default.js b/modules/esm/_bootstrap/default.js
index fefeb51b..eb315af7 100644
--- a/modules/esm/_bootstrap/default.js
+++ b/modules/esm/_bootstrap/default.js
@@ -2,3 +2,6 @@
// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
// Bootstrap file which supports ESM imports.
+
+// Bootstrap the Encoding API
+import '_encoding/encoding';
diff --git a/modules/esm/_encoding/encoding.js b/modules/esm/_encoding/encoding.js
new file mode 100644
index 00000000..3e2f449b
--- /dev/null
+++ b/modules/esm/_encoding/encoding.js
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
+
+const Encoding = import.meta.importSync('_encodingNative');
+
+import {getEncodingFromLabel} from './encodingMap.js';
+
+class TextDecoder {
+ /**
+ * @type {string}
+ */
+ encoding;
+
+ /**
+ * @type {boolean}
+ */
+ ignoreBOM;
+
+ /**
+ * @type {boolean}
+ */
+ fatal;
+
+ get [Symbol.toStringTag]() {
+ return 'TextDecoder';
+ }
+
+ /**
+ * @param {string} encoding The label of the encoding to decode from
+ * @param {object} [options] Decoding options
+ * @param {boolean=} options.fatal Whether to throw or substitute when invalid characters are encountered
+ * @param {boolean=} options.ignoreBOM Whether to ignore the byte order mark (BOM) for UTF-8 arrays
+ */
+ constructor(encoding = 'utf-8', options = {}) {
+ const {fatal = false, ignoreBOM = false} = options;
+
+ const encodingDefinition = getEncodingFromLabel(`${encoding}`);
+
+ if (!encodingDefinition)
+ throw new RangeError(`Invalid encoding label: '${encoding}'`);
+
+
+ if (encodingDefinition.label === 'replacement') {
+ throw new RangeError(
+ `Unsupported replacement encoding: '${encoding}'`
+ );
+ }
+
+ Object.defineProperty(this, '_internalEncoding', {
+ value: encodingDefinition.internalLabel,
+ enumerable: false,
+ writable: false,
+ configurable: false,
+ });
+
+ Object.defineProperty(this, 'encoding', {
+ value: encodingDefinition.label,
+ enumerable: true,
+ writable: false,
+ configurable: false,
+ });
+
+ Object.defineProperty(this, 'ignoreBOM', {
+ value: Boolean(ignoreBOM),
+ enumerable: true,
+ writable: false,
+ configurable: false,
+ });
+
+ Object.defineProperty(this, 'fatal', {
+ value: Boolean(fatal),
+ enumerable: true,
+ writable: false,
+ configurable: false,
+ });
+ }
+
+ /**
+ * @param {unknown} bytes a typed array of bytes to decode
+ * @param {object} [options] Decoding options
+ * @param {boolean=} options.stream Unsupported option. Whether to stream the decoded bytes.
+ * @returns {string} the decoded string
+ */
+ decode(bytes, options = {}) {
+ const {stream = false} = options;
+
+ if (stream) {
+ throw new Error(
+ 'TextDecoder does not implement the \'stream\' option.'
+ );
+ }
+
+ /** @type {Uint8Array} */
+ let input;
+
+ if (bytes instanceof ArrayBuffer) {
+ input = new Uint8Array(bytes);
+ } else if (bytes instanceof Uint8Array) {
+ input = bytes;
+ } else if (bytes instanceof Object.getPrototypeOf(Uint8Array)) {
+ let {buffer, byteLength, byteOffset} =
+ /** @type {Uint32Array} */ bytes;
+ input = new Uint8Array(buffer, byteOffset, byteLength);
+ } else if (
+ typeof bytes === 'object' &&
+ bytes !== null &&
+ 'buffer' in bytes &&
+ bytes.buffer instanceof ArrayBuffer
+ ) {
+ let {buffer, byteLength, byteOffset} = bytes;
+ input = new Uint8Array(buffer, byteOffset, byteLength);
+ } else if (bytes === undefined) {
+ input = new Uint8Array(0);
+ } else {
+ throw new Error(
+ 'Provided input cannot be converted to ArrayBufferView or ArrayBuffer'
+ );
+ }
+
+ if (
+ this.ignoreBOM &&
+ input.length > 2 &&
+ input[0] === 0xef &&
+ input[1] === 0xbb &&
+ input[2] === 0xbf
+ ) {
+ if (this.encoding !== 'utf-8')
+ throw new Error('Cannot ignore BOM for non-UTF8 encoding.');
+
+
+ let {buffer, byteLength, byteOffset} = input;
+ input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
+ }
+
+ return Encoding.decode(input, this._internalEncoding);
+ }
+}
+
+class TextEncoder {
+ get [Symbol.toStringTag]() {
+ return 'TextEncoder';
+ }
+
+ get encoding() {
+ return 'utf-8';
+ }
+
+ encode(input = '') {
+ // The TextEncoder specification only allows for UTF-8 encoding.
+ return Encoding.encode(`${input}`, 'utf-8');
+ }
+
+ encodeInto(input = '', output = new Uint8Array()) {
+ // The TextEncoder specification only allows for UTF-8 encoding.
+ return Encoding.encodeInto(`${input}`, output);
+ }
+}
+
+Object.defineProperties(globalThis, {
+ TextEncoder: {
+ configurable: false,
+ enumerable: true,
+ writable: false,
+ value: TextEncoder,
+ },
+ TextDecoder: {
+ configurable: false,
+ enumerable: true,
+ writable: false,
+ value: TextDecoder,
+ },
+});
diff --git a/modules/esm/_encoding/encodingMap.js b/modules/esm/_encoding/encodingMap.js
new file mode 100644
index 00000000..b0f17702
--- /dev/null
+++ b/modules/esm/_encoding/encodingMap.js
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
+
+import {trimAsciiWhitespace} from './util.js';
+
+// Data derived from https://encoding.spec.whatwg.org/encodings.json
+const encodingMap = {
+ 'utf-8': [
+ 'unicode-1-1-utf-8',
+ 'unicode11utf8',
+ 'unicode20utf8',
+ 'utf-8',
+ 'utf8',
+ 'x-unicode20utf8',
+ ],
+ ibm866: ['866', 'cp866', 'csibm866', 'ibm866'],
+ 'iso-8859-2': [
+ 'csisolatin2',
+ 'iso-8859-2',
+ 'iso-ir-101',
+ 'iso8859-2',
+ 'iso88592',
+ 'iso_8859-2',
+ 'iso_8859-2:1987',
+ 'l2',
+ 'latin2',
+ ],
+ 'iso-8859-3': [
+ 'csisolatin3',
+ 'iso-8859-3',
+ 'iso-ir-109',
+ 'iso8859-3',
+ 'iso88593',
+ 'iso_8859-3',
+ 'iso_8859-3:1988',
+ 'l3',
+ 'latin3',
+ ],
+ 'iso-8859-4': [
+ 'csisolatin4',
+ 'iso-8859-4',
+ 'iso-ir-110',
+ 'iso8859-4',
+ 'iso88594',
+ 'iso_8859-4',
+ 'iso_8859-4:1988',
+ 'l4',
+ 'latin4',
+ ],
+ 'iso-8859-5': [
+ 'csisolatincyrillic',
+ 'cyrillic',
+ 'iso-8859-5',
+ 'iso-ir-144',
+ 'iso8859-5',
+ 'iso88595',
+ 'iso_8859-5',
+ 'iso_8859-5:1988',
+ ],
+ 'iso-8859-6': [
+ 'arabic',
+ 'asmo-708',
+ 'csiso88596e',
+ 'csiso88596i',
+ 'csisolatinarabic',
+ 'ecma-114',
+ 'iso-8859-6',
+ 'iso-8859-6-e',
+ 'iso-8859-6-i',
+ 'iso-ir-127',
+ 'iso8859-6',
+ 'iso88596',
+ 'iso_8859-6',
+ 'iso_8859-6:1987',
+ ],
+ 'iso-8859-7': [
+ 'csisolatingreek',
+ 'ecma-118',
+ 'elot_928',
+ 'greek',
+ 'greek8',
+ 'iso-8859-7',
+ 'iso-ir-126',
+ 'iso8859-7',
+ 'iso88597',
+ 'iso_8859-7',
+ 'iso_8859-7:1987',
+ 'sun_eu_greek',
+ ],
+ 'iso-8859-8': [
+ 'csiso88598e',
+ 'csisolatinhebrew',
+ 'hebrew',
+ 'iso-8859-8',
+ 'iso-8859-8-e',
+ 'iso-ir-138',
+ 'iso8859-8',
+ 'iso88598',
+ 'iso_8859-8',
+ 'iso_8859-8:1988',
+ 'visual',
+ ],
+ 'iso-8859-8-i': ['csiso88598i', 'iso-8859-8-i', 'logical'],
+ 'iso-8859-10': [
+ 'csisolatin6',
+ 'iso-8859-10',
+ 'iso-ir-157',
+ 'iso8859-10',
+ 'iso885910',
+ 'l6',
+ 'latin6',
+ ],
+ 'iso-8859-13': ['iso-8859-13', 'iso8859-13', 'iso885913'],
+ 'iso-8859-14': ['iso-8859-14', 'iso8859-14', 'iso885914'],
+ 'iso-8859-15': [
+ 'csisolatin9',
+ 'iso-8859-15',
+ 'iso8859-15',
+ 'iso885915',
+ 'iso_8859-15',
+ 'l9',
+ ],
+ 'iso-8859-16': ['iso-8859-16'],
+ 'koi8-r': ['cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r'],
+ 'koi8-u': ['koi8-ru', 'koi8-u'],
+ macintosh: ['csmacintosh', 'mac', 'macintosh', 'x-mac-roman'],
+ 'windows-874': [
+ 'dos-874',
+ 'iso-8859-11',
+ 'iso8859-11',
+ 'iso885911',
+ 'tis-620',
+ 'windows-874',
+ ],
+ 'windows-1250': ['cp1250', 'windows-1250', 'x-cp1250'],
+ 'windows-1251': ['cp1251', 'windows-1251', 'x-cp1251'],
+ 'windows-1252': [
+ 'ansi_x3.4-1968',
+ 'ascii',
+ 'cp1252',
+ 'cp819',
+ 'csisolatin1',
+ 'ibm819',
+ 'iso-8859-1',
+ 'iso-ir-100',
+ 'iso8859-1',
+ 'iso88591',
+ 'iso_8859-1',
+ 'iso_8859-1:1987',
+ 'l1',
+ 'latin1',
+ 'us-ascii',
+ 'windows-1252',
+ 'x-cp1252',
+ ],
+ 'windows-1253': ['cp1253', 'windows-1253', 'x-cp1253'],
+ 'windows-1254': [
+ 'cp1254',
+ 'csisolatin5',
+ 'iso-8859-9',
+ 'iso-ir-148',
+ 'iso8859-9',
+ 'iso88599',
+ 'iso_8859-9',
+ 'iso_8859-9:1989',
+ 'l5',
+ 'latin5',
+ 'windows-1254',
+ 'x-cp1254',
+ ],
+ 'windows-1255': ['cp1255', 'windows-1255', 'x-cp1255'],
+ 'windows-1256': ['cp1256', 'windows-1256', 'x-cp1256'],
+ 'windows-1257': ['cp1257', 'windows-1257', 'x-cp1257'],
+ 'windows-1258': ['cp1258', 'windows-1258', 'x-cp1258'],
+ 'x-mac-cyrillic': ['x-mac-cyrillic', 'x-mac-ukrainian'],
+ gbk: [
+ 'chinese',
+ 'csgb2312',
+ 'csiso58gb231280',
+ 'gb2312',
+ 'gb_2312',
+ 'gb_2312-80',
+ 'gbk',
+ 'iso-ir-58',
+ 'x-gbk',
+ ],
+ gb18030: ['gb18030'],
+ big5: [
+ 'big5',
+ // Unlike the standard WHATWG encoder
+ // the Hong Kong Supplementary Character Set
+ // is not bundled in big5 by iconv
+ // "big5-hkscs",
+ 'cn-big5',
+ 'csbig5',
+ 'x-x-big5',
+ ],
+ 'euc-jp': ['cseucpkdfmtjapanese', 'euc-jp', 'x-euc-jp'],
+ 'iso-2022-jp': ['csiso2022jp', 'iso-2022-jp'],
+ shift_jis: [
+ 'csshiftjis',
+ 'ms932',
+ 'ms_kanji',
+ 'shift-jis',
+ 'shift_jis',
+ 'sjis',
+ 'windows-31j',
+ 'x-sjis',
+ ],
+ 'euc-kr': [
+ 'cseuckr',
+ 'csksc56011987',
+ 'euc-kr',
+ 'iso-ir-149',
+ 'korean',
+ 'ks_c_5601-1987',
+ 'ks_c_5601-1989',
+ 'ksc5601',
+ 'ksc_5601',
+ 'windows-949',
+ ],
+ 'utf-16be': ['unicodefffe', 'utf-16be'],
+ 'utf-16le': [
+ 'csunicode',
+ 'iso-10646-ucs-2',
+ 'ucs-2',
+ 'unicode',
+ 'unicodefeff',
+ 'utf-16',
+ 'utf-16le',
+ ],
+};
+
+/**
+ * Construct a map from each potential label to the canonical label
+ * for an encoding.
+ */
+const encodings = new Map(
+ Object.entries(encodingMap).flatMap(([encoding, labels]) => {
+ return labels.map(label => [label, encoding]);
+ })
+);
+
+// Maps WHATWG specified labels to the appropriate iconv
+// encoding label if iconv does not support the WHATWG label.
+//
+// Mapping here preserves the WHATWG label as the label on the
+// TextDecoder, so this change is transparent to API users.
+const internalEncodings = new Map([
+ // iso-8859-8-i is functionally equivalent to iso-8859-8
+ // as we are not encoding or decoding control characters.
+ ['iso-8859-8-i', 'iso-8859-8'],
+ // iconv follows a different naming convention for this
+ // encoding
+ ['x-mac-cyrillic', 'MacCyrillic'],
+ // Support HKSCS as a standalone encoding, iconv doesn't
+ // bundle it with Big5 like WHATWG does...
+ ['big5-hkscs', 'big5-hkscs'],
+]);
+
+/**
+ * @typedef Encoding
+ * @property {string} internalLabel
+ * @property {string} label
+ */
+
+/**
+ * @param {string} label the encoding label
+ * @returns {Encoding | null}
+ */
+export function getEncodingFromLabel(label) {
+ const formattedLabel = trimAsciiWhitespace(label.toLowerCase());
+
+ let canonicalLabel = encodings.get(formattedLabel);
+
+ // Lookup an internal mapping using the canonical name, if found, or
+ // the formatted label otherwise.
+ //
+ // x-mac-ukrainian > x-mac-cyrillic > MacCyrillic
+ // (canonical label) (internal label)
+ //
+ // big5-hkscs > undefined > big5-hkscs
+ // (canonical label) (internal label)
+ //
+ let internalLabel = internalEncodings.get(
+ canonicalLabel ?? formattedLabel
+ );
+
+ // If neither the canonical label nor the internal encoding
+ // is found, this encoding is unsupported.
+ if (!canonicalLabel && !internalLabel)
+ return null;
+
+ if (internalLabel) {
+ return {
+ label: canonicalLabel ?? formattedLabel,
+ internalLabel,
+ };
+ }
+
+ return {
+ label: canonicalLabel,
+ internalLabel: canonicalLabel,
+ };
+}
diff --git a/modules/esm/_encoding/util.js b/modules/esm/_encoding/util.js
new file mode 100644
index 00000000..9ee450fe
--- /dev/null
+++ b/modules/esm/_encoding/util.js
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: Node.js contributors. All rights reserved.
+
+// Modified from https://github.com/nodejs/node/blob/78680c1cbc8b0c435963bc512e826b2a6227c315/lib/internal/encoding.js
+
+/**
+ * Trims ASCII whitespace from a string.
+ * `String.prototype.trim` is not used because it also removes non-ASCII whitespace.
+ *
+ * @param {string} label the label to trim
+ * @returns {string}
+ */
+export const trimAsciiWhitespace = label => {
+ let s = 0;
+ let e = label.length;
+ while (
+ s < e &&
+ (label[s] === '\u0009' ||
+ label[s] === '\u000a' ||
+ label[s] === '\u000c' ||
+ label[s] === '\u000d' ||
+ label[s] === '\u0020')
+ )
+ s++;
+
+ while (
+ e > s &&
+ (label[e - 1] === '\u0009' ||
+ label[e - 1] === '\u000a' ||
+ label[e - 1] === '\u000c' ||
+ label[e - 1] === '\u000d' ||
+ label[e - 1] === '\u0020')
+ )
+ e--;
+
+ return label.slice(s, e);
+};
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]