[gjs/ewlsh/text-encoding: 1/3] modules: Implement fatal TextEncoder and TextDecoder APIs

From: Evan Welsh <ewlsh src gnome org>
To: commits-list gnome org
Cc:
Subject: [gjs/ewlsh/text-encoding: 1/3] modules: Implement fatal TextEncoder and TextDecoder APIs
Date: Thu, 12 Aug 2021 01:27:44 +0000 (UTC)

commit 5d7bc6e289daee1fe2bf0aadcfb9f80cabd8d652
Author: Evan Welsh <contact evanwelsh com>
Date:   Mon Jul 5 21:25:54 2021 -0700

    modules: Implement fatal TextEncoder and TextDecoder APIs

 .eslintignore                        |   3 +
 .eslintrc.yml                        |   5 +
 gjs/text-encoding.cpp                |  56 ++++++-
 js.gresource.xml                     |   4 +
 modules/esm/_bootstrap/default.js    |   3 +
 modules/esm/_encoding/encoding.js    | 172 ++++++++++++++++++++
 modules/esm/_encoding/encodingMap.js | 305 +++++++++++++++++++++++++++++++++++
 modules/esm/_encoding/util.js        |  37 +++++
 8 files changed, 584 insertions(+), 1 deletion(-)
---
diff --git a/.eslintignore b/.eslintignore
index 9ee950d3..60e57336 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -3,4 +3,7 @@
 
 installed-tests/js/jasmine.js
 installed-tests/js/modules/badOverrides/WarnLib.js
+# Until ESLint merges class fields.
+# https://github.com/eslint/eslint/issues/14343
+modules/esm/_encoding/encoding.js
 modules/script/jsUnit.js
diff --git a/.eslintrc.yml b/.eslintrc.yml
index 7ddf0e38..dadf40bd 100644
--- a/.eslintrc.yml
+++ b/.eslintrc.yml
@@ -242,6 +242,9 @@ rules:
     - inside
   yield-star-spacing: error
   yoda: error
+settings:
+  jsdoc:
+    mode: typescript
 globals:
   ARGV: readonly
   Debugger: readonly
@@ -254,5 +257,7 @@ globals:
   print: readonly
   printerr: readonly
   window: readonly
+  TextEncoder: readonly
+  TextDecoder: readonly
 parserOptions:
   ecmaVersion: 2020
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 03600778..3aba1827 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -206,6 +206,26 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
                                            "UTF-8");
 }
 
+GJS_JSAPI_RETURN_CONVENTION
+static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) {
+    JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
+    // Native entry point for decode(byteArray, encoding); returns a string.
+    JS::RootedObject byte_array(cx);
+    JS::UniqueChars encoding;
+    if (!gjs_parse_call_args(cx, "decode", args, "os", "byteArray", &byte_array,
+                             "encoding", &encoding))
+        return false;
+    // A null result from the helper is reported to the caller as failure.
+    JS::RootedString decoded(
+        cx, gjs_decode_from_uint8array(cx, byte_array, encoding.get(),
+                                       GjsStringTermination::EXPLICIT_LENGTH));
+    if (!decoded)
+        return false;
+
+    args.rval().setString(decoded);
+    return true;
+}
+
 // encode() function implementation
 JSObject* gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str,
                                    const char* encoding,
@@ -348,7 +368,41 @@ static bool gjs_encode_into_uint8array(JSContext* cx, JS::HandleString str,
     return true;
 }
 
-static JSFunctionSpec gjs_text_encoding_module_funcs[] = {JS_FS_END};
+GJS_JSAPI_RETURN_CONVENTION
+static bool gjs_encode(JSContext* cx, unsigned argc, JS::Value* vp) {
+    JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
+    JS::RootedString str(cx);
+    JS::UniqueChars encoding;
+    if (!gjs_parse_call_args(cx, "encode", args, "Ss", "string", &str,
+                             "encoding", &encoding))
+        return false;
+    // Native encode(string, encoding); a null helper result means failure.
+    JS::RootedObject uint8array(
+        cx, gjs_encode_to_uint8array(cx, str, encoding.get(),
+                                     GjsStringTermination::EXPLICIT_LENGTH));
+    if (!uint8array)
+        return false;
+
+    args.rval().setObject(*uint8array);
+    return true;
+}
+
+GJS_JSAPI_RETURN_CONVENTION
+static bool gjs_encode_into(JSContext* cx, unsigned argc, JS::Value* vp) {
+    JS::CallArgs args = JS::CallArgsFromVp(argc, vp);
+    JS::RootedString str(cx);
+    JS::RootedObject uint8array(cx);
+    if (!gjs_parse_call_args(cx, "encodeInto", args, "So", "string", &str,
+                             "byteArray", &uint8array))
+        return false;
+    // The helper receives args.rval() and is expected to set the result.
+    return gjs_encode_into_uint8array(cx, str, uint8array, args.rval());
+}
+
+static JSFunctionSpec gjs_text_encoding_module_funcs[] = {
+    JS_FN("decode", gjs_decode, 2, 0),  // (byteArray, encoding)
+    JS_FN("encodeInto", gjs_encode_into, 2, 0),  // (string, byteArray)
+    JS_FN("encode", gjs_encode, 2, 0), JS_FS_END};  // (string, encoding)
 
 bool gjs_define_text_encoding_stuff(JSContext* cx,
                                     JS::MutableHandleObject module) {
diff --git a/js.gresource.xml b/js.gresource.xml
index 47be6425..947049c2 100644
--- a/js.gresource.xml
+++ b/js.gresource.xml
@@ -9,6 +9,10 @@
 
     <!-- ESM-based modules -->
     <file>modules/esm/_bootstrap/default.js</file>
+
+    <file>modules/esm/_encoding/encoding.js</file>
+    <file>modules/esm/_encoding/encodingMap.js</file>
+    <file>modules/esm/_encoding/util.js</file>
   
     <file>modules/esm/cairo.js</file>
     <file>modules/esm/gettext.js</file>
diff --git a/modules/esm/_bootstrap/default.js b/modules/esm/_bootstrap/default.js
index fefeb51b..eb315af7 100644
--- a/modules/esm/_bootstrap/default.js
+++ b/modules/esm/_bootstrap/default.js
@@ -2,3 +2,6 @@
 // SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
 
 // Bootstrap file which supports ESM imports.
+
+// Bootstrap the Encoding API
+import '_encoding/encoding';
diff --git a/modules/esm/_encoding/encoding.js b/modules/esm/_encoding/encoding.js
new file mode 100644
index 00000000..3e2f449b
--- /dev/null
+++ b/modules/esm/_encoding/encoding.js
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
+
+const Encoding = import.meta.importSync('_encodingNative');
+
+import {getEncodingFromLabel} from './encodingMap.js';
+
+class TextDecoder {
+    /**
+     * @type {string}
+     */
+    encoding;
+
+    /**
+     * @type {boolean}
+     */
+    ignoreBOM;
+
+    /**
+     * @type {boolean}
+     */
+    fatal;
+
+    get [Symbol.toStringTag]() {
+        return 'TextDecoder';
+    }
+
+    /**
+     * @param {string} encoding The label of the encoding to decode from
+     * @param {object} [options] Decoding options
+     * @param {boolean=} options.fatal Whether to throw or substitute when invalid characters are encountered
+     * @param {boolean=} options.ignoreBOM Whether to ignore the byte order mark (BOM) of UTF-8 input
+     */
+    constructor(encoding = 'utf-8', options = {}) {
+        const {fatal = false, ignoreBOM = false} = options;
+
+        const encodingDefinition = getEncodingFromLabel(`${encoding}`);
+
+        if (!encodingDefinition)
+            throw new RangeError(`Invalid encoding label: '${encoding}'`);
+
+        // The 'replacement' encoding is rejected explicitly instead of being decoded.
+        if (encodingDefinition.label === 'replacement') {
+            throw new RangeError(
+                `Unsupported replacement encoding: '${encoding}'`
+            );
+        }
+
+        Object.defineProperty(this, '_internalEncoding', {
+            value: encodingDefinition.internalLabel,
+            enumerable: false,
+            writable: false,
+            configurable: false,
+        });
+
+        Object.defineProperty(this, 'encoding', {
+            value: encodingDefinition.label,
+            enumerable: true,
+            writable: false,
+            configurable: false,
+        });
+
+        Object.defineProperty(this, 'ignoreBOM', {
+            value: Boolean(ignoreBOM),
+            enumerable: true,
+            writable: false,
+            configurable: false,
+        });
+        // NOTE(review): 'fatal' is recorded here, but decode() below never forwards it.
+        Object.defineProperty(this, 'fatal', {
+            value: Boolean(fatal),
+            enumerable: true,
+            writable: false,
+            configurable: false,
+        });
+    }
+
+    /**
+     * @param {unknown} bytes an ArrayBuffer or a view onto one; undefined is treated as empty input
+     * @param {object} [options] Decoding options
+     * @param {boolean=} options.stream Unsupported option. Whether to stream the decoded bytes.
+     * @returns {string} the decoded text returned by Encoding.decode
+     */
+    decode(bytes, options = {}) {
+        const {stream = false} = options;
+
+        if (stream) {
+            throw new Error(
+                'TextDecoder does not implement the \'stream\' option.'
+            );
+        }
+
+        /** @type {Uint8Array} */
+        let input;
+
+        if (bytes instanceof ArrayBuffer) {
+            input = new Uint8Array(bytes);
+        } else if (bytes instanceof Uint8Array) {
+            input = bytes;
+        } else if (bytes instanceof Object.getPrototypeOf(Uint8Array)) {
+            let {buffer, byteLength, byteOffset} =
+                /** @type {Uint32Array} */ bytes;
+            input = new Uint8Array(buffer, byteOffset, byteLength);
+        } else if (
+            typeof bytes === 'object' &&
+            bytes !== null &&
+            'buffer' in bytes &&
+            bytes.buffer instanceof ArrayBuffer
+        ) {
+            let {buffer, byteLength, byteOffset} = bytes;
+            input = new Uint8Array(buffer, byteOffset, byteLength);
+        } else if (bytes === undefined) {
+            input = new Uint8Array(0);
+        } else {
+            throw new Error(
+                'Provided input cannot be converted to ArrayBufferView or ArrayBuffer'
+            );
+        }
+        // NOTE(review): BOM bytes are stripped only when ignoreBOM is set; confirm vs. the WHATWG spec.
+        if (
+            this.ignoreBOM &&
+            input.length > 2 &&
+            input[0] === 0xef &&
+            input[1] === 0xbb &&
+            input[2] === 0xbf
+        ) {
+            if (this.encoding !== 'utf-8')
+                throw new Error('Cannot ignore BOM for non-UTF8 encoding.');
+
+            // Re-wrap the same buffer, skipping the three BOM bytes.
+            let {buffer, byteLength, byteOffset} = input;
+            input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
+        }
+
+        return Encoding.decode(input, this._internalEncoding);
+    }
+}
+
+class TextEncoder {
+    get [Symbol.toStringTag]() {
+        return 'TextEncoder';
+    }
+
+    get encoding() {
+        return 'utf-8';
+    }
+
+    encode(input = '') {
+        // The TextEncoder specification only allows for UTF-8 encoding.
+        return Encoding.encode(`${input}`, 'utf-8');
+    }
+
+    encodeInto(input = '', output = new Uint8Array()) {
+        // Writes into the caller-supplied array; no encoding label is passed.
+        return Encoding.encodeInto(`${input}`, output);
+    }
+}
+
+Object.defineProperties(globalThis, { // install both classes as read-only globals
+    TextEncoder: {
+        configurable: false,
+        enumerable: true,
+        writable: false,
+        value: TextEncoder,
+    },
+    TextDecoder: {
+        configurable: false,
+        enumerable: true,
+        writable: false,
+        value: TextDecoder,
+    },
+});
diff --git a/modules/esm/_encoding/encodingMap.js b/modules/esm/_encoding/encodingMap.js
new file mode 100644
index 00000000..b0f17702
--- /dev/null
+++ b/modules/esm/_encoding/encodingMap.js
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
+
+import {trimAsciiWhitespace} from './util.js';
+
+// Data derived from https://encoding.spec.whatwg.org/encodings.json
+const encodingMap = {
+    'utf-8': [
+        'unicode-1-1-utf-8',
+        'unicode11utf8',
+        'unicode20utf8',
+        'utf-8',
+        'utf8',
+        'x-unicode20utf8',
+    ],
+    ibm866: ['866', 'cp866', 'csibm866', 'ibm866'],
+    'iso-8859-2': [
+        'csisolatin2',
+        'iso-8859-2',
+        'iso-ir-101',
+        'iso8859-2',
+        'iso88592',
+        'iso_8859-2',
+        'iso_8859-2:1987',
+        'l2',
+        'latin2',
+    ],
+    'iso-8859-3': [
+        'csisolatin3',
+        'iso-8859-3',
+        'iso-ir-109',
+        'iso8859-3',
+        'iso88593',
+        'iso_8859-3',
+        'iso_8859-3:1988',
+        'l3',
+        'latin3',
+    ],
+    'iso-8859-4': [
+        'csisolatin4',
+        'iso-8859-4',
+        'iso-ir-110',
+        'iso8859-4',
+        'iso88594',
+        'iso_8859-4',
+        'iso_8859-4:1988',
+        'l4',
+        'latin4',
+    ],
+    'iso-8859-5': [
+        'csisolatincyrillic',
+        'cyrillic',
+        'iso-8859-5',
+        'iso-ir-144',
+        'iso8859-5',
+        'iso88595',
+        'iso_8859-5',
+        'iso_8859-5:1988',
+    ],
+    'iso-8859-6': [
+        'arabic',
+        'asmo-708',
+        'csiso88596e',
+        'csiso88596i',
+        'csisolatinarabic',
+        'ecma-114',
+        'iso-8859-6',
+        'iso-8859-6-e',
+        'iso-8859-6-i',
+        'iso-ir-127',
+        'iso8859-6',
+        'iso88596',
+        'iso_8859-6',
+        'iso_8859-6:1987',
+    ],
+    'iso-8859-7': [
+        'csisolatingreek',
+        'ecma-118',
+        'elot_928',
+        'greek',
+        'greek8',
+        'iso-8859-7',
+        'iso-ir-126',
+        'iso8859-7',
+        'iso88597',
+        'iso_8859-7',
+        'iso_8859-7:1987',
+        'sun_eu_greek',
+    ],
+    'iso-8859-8': [
+        'csiso88598e',
+        'csisolatinhebrew',
+        'hebrew',
+        'iso-8859-8',
+        'iso-8859-8-e',
+        'iso-ir-138',
+        'iso8859-8',
+        'iso88598',
+        'iso_8859-8',
+        'iso_8859-8:1988',
+        'visual',
+    ],
+    'iso-8859-8-i': ['csiso88598i', 'iso-8859-8-i', 'logical'],
+    'iso-8859-10': [
+        'csisolatin6',
+        'iso-8859-10',
+        'iso-ir-157',
+        'iso8859-10',
+        'iso885910',
+        'l6',
+        'latin6',
+    ],
+    'iso-8859-13': ['iso-8859-13', 'iso8859-13', 'iso885913'],
+    'iso-8859-14': ['iso-8859-14', 'iso8859-14', 'iso885914'],
+    'iso-8859-15': [
+        'csisolatin9',
+        'iso-8859-15',
+        'iso8859-15',
+        'iso885915',
+        'iso_8859-15',
+        'l9',
+    ],
+    'iso-8859-16': ['iso-8859-16'],
+    'koi8-r': ['cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r'],
+    'koi8-u': ['koi8-ru', 'koi8-u'],
+    macintosh: ['csmacintosh', 'mac', 'macintosh', 'x-mac-roman'],
+    'windows-874': [
+        'dos-874',
+        'iso-8859-11',
+        'iso8859-11',
+        'iso885911',
+        'tis-620',
+        'windows-874',
+    ],
+    'windows-1250': ['cp1250', 'windows-1250', 'x-cp1250'],
+    'windows-1251': ['cp1251', 'windows-1251', 'x-cp1251'],
+    'windows-1252': [
+        'ansi_x3.4-1968',
+        'ascii',
+        'cp1252',
+        'cp819',
+        'csisolatin1',
+        'ibm819',
+        'iso-8859-1',
+        'iso-ir-100',
+        'iso8859-1',
+        'iso88591',
+        'iso_8859-1',
+        'iso_8859-1:1987',
+        'l1',
+        'latin1',
+        'us-ascii',
+        'windows-1252',
+        'x-cp1252',
+    ],
+    'windows-1253': ['cp1253', 'windows-1253', 'x-cp1253'],
+    'windows-1254': [
+        'cp1254',
+        'csisolatin5',
+        'iso-8859-9',
+        'iso-ir-148',
+        'iso8859-9',
+        'iso88599',
+        'iso_8859-9',
+        'iso_8859-9:1989',
+        'l5',
+        'latin5',
+        'windows-1254',
+        'x-cp1254',
+    ],
+    'windows-1255': ['cp1255', 'windows-1255', 'x-cp1255'],
+    'windows-1256': ['cp1256', 'windows-1256', 'x-cp1256'],
+    'windows-1257': ['cp1257', 'windows-1257', 'x-cp1257'],
+    'windows-1258': ['cp1258', 'windows-1258', 'x-cp1258'],
+    'x-mac-cyrillic': ['x-mac-cyrillic', 'x-mac-ukrainian'],
+    gbk: [
+        'chinese',
+        'csgb2312',
+        'csiso58gb231280',
+        'gb2312',
+        'gb_2312',
+        'gb_2312-80',
+        'gbk',
+        'iso-ir-58',
+        'x-gbk',
+    ],
+    gb18030: ['gb18030'],
+    big5: [
+        'big5',
+        // Unlike the standard WHATWG encoder
+        // the Hong Kong Supplementary Character Set
+        // is not bundled in big5 by iconv
+        // "big5-hkscs",
+        'cn-big5',
+        'csbig5',
+        'x-x-big5',
+    ],
+    'euc-jp': ['cseucpkdfmtjapanese', 'euc-jp', 'x-euc-jp'],
+    'iso-2022-jp': ['csiso2022jp', 'iso-2022-jp'],
+    shift_jis: [
+        'csshiftjis',
+        'ms932',
+        'ms_kanji',
+        'shift-jis',
+        'shift_jis',
+        'sjis',
+        'windows-31j',
+        'x-sjis',
+    ],
+    'euc-kr': [
+        'cseuckr',
+        'csksc56011987',
+        'euc-kr',
+        'iso-ir-149',
+        'korean',
+        'ks_c_5601-1987',
+        'ks_c_5601-1989',
+        'ksc5601',
+        'ksc_5601',
+        'windows-949',
+    ],
+    'utf-16be': ['unicodefffe', 'utf-16be'],
+    'utf-16le': [
+        'csunicode',
+        'iso-10646-ucs-2',
+        'ucs-2',
+        'unicode',
+        'unicodefeff',
+        'utf-16',
+        'utf-16le',
+    ],
+};
+
+/**
+ * Lookup table from every accepted label (alias) to the canonical
+ * label of its encoding, built by inverting `encodingMap`.
+ */
+const encodings = new Map(
+    Object.entries(encodingMap).flatMap(([encoding, labels]) => {
+        return labels.map(label => [label, encoding]);
+    })
+);
+
+// Maps WHATWG specified labels to the appropriate iconv
+// encoding label if iconv does not support the WHATWG label.
+//
+// Mapping here preserves the WHATWG label as the label on the
+// TextDecoder, so this change is transparent to API users.
+const internalEncodings = new Map([
+    // iso-8859-8-i is functionally equivalent to iso-8859-8
+    // as we are not encoding or decoding control characters.
+    ['iso-8859-8-i', 'iso-8859-8'],
+    // iconv follows a different naming convention for this
+    // encoding
+    ['x-mac-cyrillic', 'MacCyrillic'],
+    // Support HKSCS as a standalone encoding, iconv doesn't
+    // bundle it with Big5 like WHATWG does...
+    ['big5-hkscs', 'big5-hkscs'],
+]);
+
+/**
+ * @typedef Encoding
+ * @property {string} internalLabel
+ * @property {string} label
+ */
+
+/**
+ * @param {string} label the encoding label; lowercased and trimmed of ASCII whitespace before lookup
+ * @returns {Encoding | null} the canonical and internal labels, or null for an unsupported label
+ */
+export function getEncodingFromLabel(label) {
+    const formattedLabel = trimAsciiWhitespace(label.toLowerCase());
+
+    let canonicalLabel = encodings.get(formattedLabel);
+
+    // Look up an internal mapping using the canonical name, if found, or
+    // the formatted label otherwise.
+    //
+    // x-mac-ukrainian   >   x-mac-cyrillic   >   MacCyrillic
+    //                      (canonical label)    (internal label)
+    //
+    // big5-hkscs        >   undefined        >   big5-hkscs
+    //                      (canonical label)    (internal label)
+    //
+    let internalLabel = internalEncodings.get(
+        canonicalLabel ?? formattedLabel
+    );
+
+    // If both the canonical label and the internal encoding
+    // are not found, this encoding is unsupported.
+    if (!canonicalLabel && !internalLabel)
+        return null;
+
+    if (internalLabel) {
+        return {
+            label: canonicalLabel ?? formattedLabel,
+            internalLabel,
+        };
+    }
+
+    return {
+        label: canonicalLabel,
+        internalLabel: canonicalLabel,
+    };
+}
diff --git a/modules/esm/_encoding/util.js b/modules/esm/_encoding/util.js
new file mode 100644
index 00000000..9ee450fe
--- /dev/null
+++ b/modules/esm/_encoding/util.js
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: Node.js contributors. All rights reserved.
+
+// Modified from https://github.com/nodejs/node/blob/78680c1cbc8b0c435963bc512e826b2a6227c315/lib/internal/encoding.js
+
+/**
+ * Trims ASCII whitespace from a string.
+ * `String.prototype.trim` is not used because it also removes non-ASCII whitespace.
+ *
+ * @param {string} label the label to trim
+ * @returns {string} the label without leading or trailing tab, LF, FF, CR or space characters
+ */
+export const trimAsciiWhitespace = label => {
+    let s = 0;
+    let e = label.length;
+    while (
+        s < e &&
+        (label[s] === '\u0009' ||
+            label[s] === '\u000a' ||
+            label[s] === '\u000c' ||
+            label[s] === '\u000d' ||
+            label[s] === '\u0020')
+    )
+        s++;
+
+    while (
+        e > s &&
+        (label[e - 1] === '\u0009' ||
+            label[e - 1] === '\u000a' ||
+            label[e - 1] === '\u000c' ||
+            label[e - 1] === '\u000d' ||
+            label[e - 1] === '\u0020')
+    )
+        e--;
+
+    return label.slice(s, e);
+};
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]