[gjs/ewlsh/text-encoding] Updates

From: Evan Welsh <ewlsh src gnome org>
To: commits-list gnome org
Cc:
Subject: [gjs/ewlsh/text-encoding] Updates
Date: Thu, 12 Aug 2021 01:23:18 +0000 (UTC)

commit cc6cfd141bf7b6d4d9cdb1d215bddbf58cfbed0d
Author: Evan Welsh <contact evanwelsh com>
Date:   Wed Aug 11 18:07:52 2021 -0700

    Updates

 .eslintignore                                      |   2 +-
 gjs/text-encoding.cpp                              |  41 ++-
 installed-tests/js/.eslintrc.yml                   |   2 +
 installed-tests/js/matchers.js                     |  29 ++
 installed-tests/js/meson.build                     |   6 +-
 installed-tests/js/testEncoding.js                 | 114 +++++---
 js.gresource.xml                                   |   6 +-
 jsconfig.json                                      |   5 +
 modules/core/_encodings.js                         | 312 ---------------------
 modules/esm/_bootstrap/default.js                  |   3 +
 .../{core/_text.js => esm/_encoding/encoding.js}   | 104 ++++---
 modules/esm/_encoding/encodingMap.js               | 305 ++++++++++++++++++++
 modules/esm/_encoding/util.js                      |  37 +++
 modules/script/_bootstrap/default.js               |  13 -
 14 files changed, 566 insertions(+), 413 deletions(-)
---
diff --git a/.eslintignore b/.eslintignore
index 9e12e9f0..60e57336 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -5,5 +5,5 @@ installed-tests/js/jasmine.js
 installed-tests/js/modules/badOverrides/WarnLib.js
 # Until ESLint merges class fields.
 # https://github.com/eslint/eslint/issues/14343
-modules/core/_text.js
+modules/esm/_encoding/encoding.js
 modules/script/jsUnit.js
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 228c8c40..2dd384e8 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -73,13 +73,34 @@ static JSString* gjs_lossy_decode_from_uint8array_slow(
     if (error)
         return gjs_throw_type_error_from_gerror(cx, error);
 
+    // This function converts *to* UTF-16, using a std::u16string
+    // as its buffer.
+    //
+    // UTF-16 represents each character with 2 bytes or
+    // 4 bytes. The best case scenario when converting to
+    // UTF-16 is that every input byte encodes to two bytes;
+    // this is typical for ASCII and non-supplementary characters.
+    // Because we are converting from an unknown encoding,
+    // technically a single byte could be supplementary in
+    // Unicode (4 bytes) or even represent multiple Unicode characters.
+    //
+    // std::u16string does not care about these implementation
+    // details; its only concern is that it consists of byte pairs.
+    // Given this, a single UTF-16 character could be represented
+    // by one or two std::u16string characters.
+
     // Allocate bytes_len * 2 + 12 as our initial buffer.
     // bytes_len * 2 is the "best case" for LATIN1 strings
-    // and strings are in the basic multilingual plane.
+    // and strings which are in the basic multilingual plane.
     // Add 12 as a slight cushion and set the minimum allocation
     // at 256 to prefer running a single iteration for
     // small strings with supplemental plane characters.
-    int buffer_size = std::min(bytes_len * 2 + 12, 256lu);
+    //
+    // When converting Chinese characters, for example,
+    // some dialectal characters are in the supplemental plane.
+    // Adding a padding of 12 prevents a few dialectal characters
+    // from requiring a reallocation.
+    size_t buffer_size = std::max(bytes_len * 2 + 12, 256lu);
 
     // Cast data to correct input types
     const char* input = reinterpret_cast<const char*>(bytes);
@@ -108,7 +129,7 @@ static JSString* gjs_lossy_decode_from_uint8array_slow(
         // accumulator
         if (bytes_written > 0) {
             char16_t* utf16_buffer = reinterpret_cast<char16_t*>(buffer.data());
-            // UTF-16 uses exactly 2 bytes for every character.
+            // std::u16string uses exactly 2 bytes for every character.
             output_str.append(utf16_buffer, bytes_written / 2);
         } else if (error) {
             // A PARTIAL_INPUT error can only occur if the user does not provide
@@ -138,10 +159,16 @@ static JSString* gjs_lossy_decode_from_uint8array_slow(
                 // If the buffer was full increase the buffer
                 // size and re-try the conversion.
                 //
-                // In most cases this will allocate bytes_len * 4
-                // (the worst case scenario) in the next loop
-                // where invalid bytes are not found.
-                buffer_size *= 2;
+                // This logic allocates bytes_len * 3 first,
+                // then bytes_len * 4 (the worst case scenario
+                // is nearly impossible) and then continues appending
+                // arbitrary padding because we'll trust Gio and give
+                // it additional space.
+                if (buffer_size > bytes_len * 4) {
+                    buffer_size += 256;
+                } else {
+                    buffer_size += bytes_len;
+                }
 
                 // Clear the error.
                 g_clear_error(&error);
diff --git a/installed-tests/js/.eslintrc.yml b/installed-tests/js/.eslintrc.yml
index c1a4c9bd..bcdadbea 100644
--- a/installed-tests/js/.eslintrc.yml
+++ b/installed-tests/js/.eslintrc.yml
@@ -31,8 +31,10 @@ globals:
   setTimeout: writable
 overrides:
   - files:
+      - matchers.js
       - testCairoModule.js
       - testESModules.js
+      - testEncoding.js
       - modules/importmeta.js
       - modules/exports.js
       - modules/say.js
diff --git a/installed-tests/js/matchers.js b/installed-tests/js/matchers.js
new file mode 100644
index 00000000..e4d29835
--- /dev/null
+++ b/installed-tests/js/matchers.js
@@ -0,0 +1,29 @@
+/**
+ * A jasmine asymmetric matcher which expects an array-like object
+ * to contain the given element array in the same order with the
+ * same length. Useful for testing typed arrays.
+ *
+ * @template T
+ * @param {T[]} elements an array of elements to compare with
+ * @returns
+ */
+export function arrayLikeWithExactContents(elements) {
+    return {
+        /**
+         * @param {ArrayLike<T>} compareTo an array-like object to compare to
+         * @returns {boolean}
+         */
+        asymmetricMatch(compareTo) {
+            return (
+                compareTo.length === elements.length &&
+                elements.every((e, i) => e === compareTo[i])
+            );
+        },
+        /**
+         * @returns {string}
+         */
+        jasmineToString() {
+            return `${JSON.stringify(elements)}`;
+        },
+    };
+}
diff --git a/installed-tests/js/meson.build b/installed-tests/js/meson.build
index cca525c1..f5fa8e08 100644
--- a/installed-tests/js/meson.build
+++ b/installed-tests/js/meson.build
@@ -94,7 +94,6 @@ subdir('libgjstesttools')
 jasmine_tests = [
     'self',
     'ByteArray',
-    'Encoding',
     'Exceptions',
     'Format',
     'Fundamental',
@@ -217,7 +216,10 @@ endif
 # tests using ES modules are also separate because they need an extra
 # minijasmine flag
 
-modules_tests = ['ESModules']
+modules_tests = [
+    'ESModules',
+    'Encoding',
+]
 if build_cairo
     modules_tests += 'CairoModule'
 endif
diff --git a/installed-tests/js/testEncoding.js b/installed-tests/js/testEncoding.js
index dcd05ee6..bd02ee75 100644
--- a/installed-tests/js/testEncoding.js
+++ b/installed-tests/js/testEncoding.js
@@ -4,7 +4,9 @@
 // Some test inputs are derived from https://github.com/denoland/deno/blob/923214c53725651792f6d55c5401bf6b475622ea/op_crates/web/08_text_encoding.js
 // Data originally from https://encoding.spec.whatwg.org/encodings.json
 
-const {Gio} = imports.gi;
+import Gio from 'gi://Gio';
+
+import {arrayLikeWithExactContents} from './matchers.js';
 
 /**
  * Loads a JSON file from a URI and parses it.
@@ -23,36 +25,6 @@ function loadJSONFromResource(src) {
     return json;
 }
 
-/**
- * A jasmine asymmetric matcher which expects an array-like object
- * to contain the given element array in the same order with the
- * same length. Useful for testing typed arrays.
- *
- * @template T
- * @param {T[]} elements an array of elements to compare with
- * @returns
- */
-function withElements(elements) {
-    return {
-        /**
-         * @param {ArrayLike<T>} compareTo an array-like object to compare to
-         * @returns {boolean}
-         */
-        asymmetricMatch(compareTo) {
-            return (
-                compareTo.length === elements.length &&
-                elements.every((e, i) => e === compareTo[i])
-            );
-        },
-        /**
-         * @returns {string}
-         */
-        jasmineToString() {
-            return `${JSON.stringify(elements)}`;
-        },
-    };
-}
-
 /**
  * Encoded form of '𝓽𝓮𝔁𝓽'
  *
@@ -83,7 +55,7 @@ describe('Text Encoding', function () {
                 const encoded = encoder.encode(input);
 
                 expect(encoded).toEqual(
-                    withElements([...encodedMultibyteCharArray()])
+                    arrayLikeWithExactContents([...encodedMultibyteCharArray()])
                 );
             });
         });
@@ -98,7 +70,7 @@ describe('Text Encoding', function () {
                 expect(result.written).toBe(4);
 
                 expect(bytes).toEqual(
-                    withElements([0x74, 0x65, 0x78, 0x74, 0x00])
+                    arrayLikeWithExactContents([0x74, 0x65, 0x78, 0x74, 0x00])
                 );
             });
 
@@ -111,7 +83,10 @@ describe('Text Encoding', function () {
                 expect(result.written).toBe(16);
 
                 expect(bytes).toEqual(
-                    withElements([...encodedMultibyteCharArray(), 0x00])
+                    arrayLikeWithExactContents([
+                        ...encodedMultibyteCharArray(),
+                        0x00,
+                    ])
                 );
             });
 
@@ -124,7 +99,7 @@ describe('Text Encoding', function () {
                 expect(result.written).toBe(4);
 
                 expect(bytes).toEqual(
-                    withElements([
+                    arrayLikeWithExactContents([
                         ...encodedMultibyteCharArray().slice(0, 4),
                         0x00,
                     ])
@@ -302,6 +277,75 @@ describe('Text Encoding', function () {
 
                 expect(decoded).toEqual(longResult);
             });
+
+            it('can decode Big-5 HKSCS with supplemental characters', function () {
+                // The characters below roughly mean 'hard' or 'solid' and
+                // 'rooster' respectively. They were chosen for their Unicode
+                // and HKSCS positioning, not meaning.
+
+                // Big5-HKSCS bytes for the supplemental character 𠕇       
+                const supplementalBytes = [250, 64];
+                // Big5-HKSCS bytes for the non-supplemental characters 公雞
+                const nonSupplementalBytes = [164, 189, 194, 251];
+
+                const decoder = new TextDecoder('big5-hkscs');
+
+                // We currently allocate 12 additional bytes of padding
+                // and a minimum of 256...
+
+                // This should produce 400 non-supplemental characters (50 * 2 * 4),
+                // i.e. 800 UTF-16 bytes, and 16 supplemental bytes (4 * 4)
+                const repeatedNonSupplementalBytes = new Array(50).fill(nonSupplementalBytes).flat();
+                const bytes = [
+                    ...repeatedNonSupplementalBytes,
+                    ...supplementalBytes,
+                    ...repeatedNonSupplementalBytes,
+                    ...supplementalBytes,
+                    ...repeatedNonSupplementalBytes,
+                    ...supplementalBytes,
+                    ...repeatedNonSupplementalBytes,
+                    ...supplementalBytes,
+                ];
+
+                const expectedNonSupplemental  = new Array(50).fill('公雞');
+                const expected = [
+                    ...expectedNonSupplemental,
+                    '𠕇',
+                    ...expectedNonSupplemental,
+                    '𠕇',
+                    ...expectedNonSupplemental,
+                    '𠕇',
+                    ...expectedNonSupplemental,
+                    '𠕇',
+                ].join('');
+
+                // Calculate the number of bytes the UTF-16 characters should
+                // occupy.
+                const expectedU16Bytes = [...expected].reduce((prev, next) => {
+                    const utf16code = next.codePointAt(0);
+
+                    // Test whether this unit is supplemental
+                    const additionalBytes = utf16code > 0xFFFF ? 2 : 0;
+
+                    return prev + 2 + additionalBytes;
+                }, 0);
+
+
+                // We set a minimum buffer allocation of 256 bytes,
+                // this ensures that this test exceeds that.
+                expect(expectedU16Bytes / 2).toBeGreaterThan(256);
+
+                // The length of the input bytes should always be less
+                // than the expected output because UTF-16 uses 4 bytes
+                // to represent some characters HKSCS needs only 2 for.
+                expect(bytes.length).toBeLessThan(expectedU16Bytes);
+                // 4 supplemental characters, each with two additional bytes.
+                expect(bytes.length + 4 * 2).toBe(expectedU16Bytes);
+
+                const decoded = decoder.decode(new Uint8Array(bytes));
+
+                expect(decoded).toBe(expected);
+            });
         });
 
         describe('Single Byte Encoding Converter', function () {
diff --git a/js.gresource.xml b/js.gresource.xml
index a809930b..947049c2 100644
--- a/js.gresource.xml
+++ b/js.gresource.xml
@@ -9,6 +9,10 @@
 
     <!-- ESM-based modules -->
     <file>modules/esm/_bootstrap/default.js</file>
+
+    <file>modules/esm/_encoding/encoding.js</file>
+    <file>modules/esm/_encoding/encodingMap.js</file>
+    <file>modules/esm/_encoding/util.js</file>
   
     <file>modules/esm/cairo.js</file>
     <file>modules/esm/gettext.js</file>
@@ -44,10 +48,8 @@
 
     <file>modules/core/_cairo.js</file>
     <file>modules/core/_common.js</file>
-    <file>modules/core/_encodings.js</file>
     <file>modules/core/_format.js</file>
     <file>modules/core/_gettext.js</file>
     <file>modules/core/_signals.js</file>
-    <file>modules/core/_text.js</file>
   </gresource>
 </gresources>
diff --git a/jsconfig.json b/jsconfig.json
new file mode 100644
index 00000000..43feb2ed
--- /dev/null
+++ b/jsconfig.json
@@ -0,0 +1,5 @@
+{
+    "compilerOptions": {
+        "lib": ["es2020"],
+    }
+}
\ No newline at end of file
diff --git a/modules/esm/_bootstrap/default.js b/modules/esm/_bootstrap/default.js
index fefeb51b..eb315af7 100644
--- a/modules/esm/_bootstrap/default.js
+++ b/modules/esm/_bootstrap/default.js
@@ -2,3 +2,6 @@
 // SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
 
 // Bootstrap file which supports ESM imports.
+
+// Bootstrap the Encoding API
+import '_encoding/encoding';
diff --git a/modules/core/_text.js b/modules/esm/_encoding/encoding.js
similarity index 53%
rename from modules/core/_text.js
rename to modules/esm/_encoding/encoding.js
index 39413b73..e84b752b 100644
--- a/modules/core/_text.js
+++ b/modules/esm/_encoding/encoding.js
@@ -1,22 +1,22 @@
 // SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
-// SPDX-FileCopyrightText: Evan Welsh
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
 
-const Encoding = imports._encodingNative;
+const Encoding = import.meta.importSync('_encodingNative');
 
-const { getEncodingFromLabel } = imports._encodings;
+import {getEncodingFromLabel} from './encodingMap.js';
 
-var TextDecoder = class TextDecoder {
-    /**  
+class TextDecoder {
+    /**
      * @type {string}
      */
     encoding;
 
-    /**  
+    /**
      * @type {boolean}
      */
     ignoreBOM;
 
-    /**  
+    /**
      * @type {boolean}
      */
     fatal;
@@ -26,22 +26,24 @@ var TextDecoder = class TextDecoder {
     }
 
     /**
-     * @param {string} encoding 
-     * @param {object} [options]
-     * @param {boolean=} options.fatal
-     * @param {boolean=} options.ignoreBOM 
+     * @param {string} encoding The encoding to decode from
+     * @param {object} [options] Decoding options
+     * @param {boolean=} options.fatal Whether to throw or substitute when invalid characters are encountered
+     * @param {boolean=} options.ignoreBOM Whether to ignore the byte order mark for UTF-8 arrays
      */
     constructor(encoding = 'utf-8', options = {}) {
-        const { fatal = false, ignoreBOM = false } = options;
+        const {fatal = false, ignoreBOM = false} = options;
 
         const encodingDefinition = getEncodingFromLabel(`${encoding}`);
 
-        if (!encodingDefinition) {
+        if (!encodingDefinition)
             throw new RangeError(`Invalid encoding label: '${encoding}'`);
-        }
+
 
         if (encodingDefinition.label === 'replacement') {
-            throw new RangeError(`Unsupported replacement encoding: '${encoding}'`);
+            throw new RangeError(
+                `Unsupported replacement encoding: '${encoding}'`
+            );
         }
 
         Object.defineProperty(this, '_internalEncoding', {
@@ -73,19 +75,19 @@ var TextDecoder = class TextDecoder {
         });
     }
 
-
-
     /**
-     * @param {unknown} bytes 
-     * @param {object} [options]
-     * @param {boolean=} options.stream
-     * @returns 
+     * @param {unknown} bytes a typed array of bytes to decode
+     * @param {object} [options] Decoding options
+     * @param {boolean=} options.stream Unsupported option. Whether to stream the decoded bytes.
+     * @returns
      */
     decode(bytes, options = {}) {
-        const { stream = false } = options;
+        const {stream = false} = options;
 
         if (stream) {
-            throw new Error(`TextDecoder does not implement the 'stream' option.`);
+            throw new Error(
+                'TextDecoder does not implement the \'stream\' option.'
+            );
         }
 
         /** @type {Uint8Array} */
@@ -96,32 +98,37 @@ var TextDecoder = class TextDecoder {
         } else if (bytes instanceof Uint8Array) {
             input = bytes;
         } else if (bytes instanceof Object.getPrototypeOf(Uint8Array)) {
-            let { buffer, byteLength, byteOffset } = /** @type {Uint32Array} */ (bytes);
+            let {buffer, byteLength, byteOffset} =
+                /** @type {Uint32Array} */ bytes;
             input = new Uint8Array(buffer, byteOffset, byteLength);
         } else if (
-            typeof bytes === "object" &&
+            typeof bytes === 'object' &&
             bytes !== null &&
-            "buffer" in bytes &&
+            'buffer' in bytes &&
             bytes.buffer instanceof ArrayBuffer
         ) {
-            let { buffer, byteLength, byteOffset } = bytes;
-            input = new Uint8Array(
-                buffer,
-                byteOffset,
-                byteLength
-            );
+            let {buffer, byteLength, byteOffset} = bytes;
+            input = new Uint8Array(buffer, byteOffset, byteLength);
         } else if (bytes === undefined) {
             input = new Uint8Array(0);
         } else {
-            throw new Error(`Provided input cannot be converted to ArrayBufferView or ArrayBuffer`);
+            throw new Error(
+                'Provided input cannot be converted to ArrayBufferView or ArrayBuffer'
+            );
         }
 
-        if (this.ignoreBOM && input.length > 2 && input[0] === 0xEF && input[1] === 0xBB && input[2] === 0xBF) {
-            if (this.encoding !== 'utf-8') {
-                throw new Error(`Cannot ignore BOM for non-UTF8 encoding.`);
-            }
+        if (
+            this.ignoreBOM &&
+            input.length > 2 &&
+            input[0] === 0xef &&
+            input[1] === 0xbb &&
+            input[2] === 0xbf
+        ) {
+            if (this.encoding !== 'utf-8')
+                throw new Error('Cannot ignore BOM for non-UTF8 encoding.');
+
 
-            let { buffer, byteLength, byteOffset } = input;
+            let {buffer, byteLength, byteOffset} = input;
             input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
         }
 
@@ -129,7 +136,7 @@ var TextDecoder = class TextDecoder {
     }
 }
 
-var TextEncoder = class TextEncoder {
+class TextEncoder {
     get [Symbol.toStringTag]() {
         return 'TextEncoder';
     }
@@ -140,11 +147,26 @@ var TextEncoder = class TextEncoder {
 
     encode(input = '') {
         // The TextEncoder specification only allows for UTF-8 encoding.
-        return Encoding.encode(`${input}`, 'UTF-8');
+        return Encoding.encode(`${input}`, 'utf-8');
     }
 
     encodeInto(input = '', output = new Uint8Array()) {
         // The TextEncoder specification only allows for UTF-8 encoding.
         return Encoding.encodeInto(`${input}`, output);
     }
-}
\ No newline at end of file
+}
+
+Object.defineProperties(globalThis, {
+    TextEncoder: {
+        configurable: false,
+        enumerable: true,
+        writable: false,
+        value: TextEncoder,
+    },
+    TextDecoder: {
+        configurable: false,
+        enumerable: true,
+        writable: false,
+        value: TextDecoder,
+    },
+});
diff --git a/modules/esm/_encoding/encodingMap.js b/modules/esm/_encoding/encodingMap.js
new file mode 100644
index 00000000..b0f17702
--- /dev/null
+++ b/modules/esm/_encoding/encodingMap.js
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
+
+import {trimAsciiWhitespace} from './util.js';
+
+// Data derived from https://encoding.spec.whatwg.org/encodings.json
+const encodingMap = {
+    'utf-8': [
+        'unicode-1-1-utf-8',
+        'unicode11utf8',
+        'unicode20utf8',
+        'utf-8',
+        'utf8',
+        'x-unicode20utf8',
+    ],
+    ibm866: ['866', 'cp866', 'csibm866', 'ibm866'],
+    'iso-8859-2': [
+        'csisolatin2',
+        'iso-8859-2',
+        'iso-ir-101',
+        'iso8859-2',
+        'iso88592',
+        'iso_8859-2',
+        'iso_8859-2:1987',
+        'l2',
+        'latin2',
+    ],
+    'iso-8859-3': [
+        'csisolatin3',
+        'iso-8859-3',
+        'iso-ir-109',
+        'iso8859-3',
+        'iso88593',
+        'iso_8859-3',
+        'iso_8859-3:1988',
+        'l3',
+        'latin3',
+    ],
+    'iso-8859-4': [
+        'csisolatin4',
+        'iso-8859-4',
+        'iso-ir-110',
+        'iso8859-4',
+        'iso88594',
+        'iso_8859-4',
+        'iso_8859-4:1988',
+        'l4',
+        'latin4',
+    ],
+    'iso-8859-5': [
+        'csisolatincyrillic',
+        'cyrillic',
+        'iso-8859-5',
+        'iso-ir-144',
+        'iso8859-5',
+        'iso88595',
+        'iso_8859-5',
+        'iso_8859-5:1988',
+    ],
+    'iso-8859-6': [
+        'arabic',
+        'asmo-708',
+        'csiso88596e',
+        'csiso88596i',
+        'csisolatinarabic',
+        'ecma-114',
+        'iso-8859-6',
+        'iso-8859-6-e',
+        'iso-8859-6-i',
+        'iso-ir-127',
+        'iso8859-6',
+        'iso88596',
+        'iso_8859-6',
+        'iso_8859-6:1987',
+    ],
+    'iso-8859-7': [
+        'csisolatingreek',
+        'ecma-118',
+        'elot_928',
+        'greek',
+        'greek8',
+        'iso-8859-7',
+        'iso-ir-126',
+        'iso8859-7',
+        'iso88597',
+        'iso_8859-7',
+        'iso_8859-7:1987',
+        'sun_eu_greek',
+    ],
+    'iso-8859-8': [
+        'csiso88598e',
+        'csisolatinhebrew',
+        'hebrew',
+        'iso-8859-8',
+        'iso-8859-8-e',
+        'iso-ir-138',
+        'iso8859-8',
+        'iso88598',
+        'iso_8859-8',
+        'iso_8859-8:1988',
+        'visual',
+    ],
+    'iso-8859-8-i': ['csiso88598i', 'iso-8859-8-i', 'logical'],
+    'iso-8859-10': [
+        'csisolatin6',
+        'iso-8859-10',
+        'iso-ir-157',
+        'iso8859-10',
+        'iso885910',
+        'l6',
+        'latin6',
+    ],
+    'iso-8859-13': ['iso-8859-13', 'iso8859-13', 'iso885913'],
+    'iso-8859-14': ['iso-8859-14', 'iso8859-14', 'iso885914'],
+    'iso-8859-15': [
+        'csisolatin9',
+        'iso-8859-15',
+        'iso8859-15',
+        'iso885915',
+        'iso_8859-15',
+        'l9',
+    ],
+    'iso-8859-16': ['iso-8859-16'],
+    'koi8-r': ['cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r'],
+    'koi8-u': ['koi8-ru', 'koi8-u'],
+    macintosh: ['csmacintosh', 'mac', 'macintosh', 'x-mac-roman'],
+    'windows-874': [
+        'dos-874',
+        'iso-8859-11',
+        'iso8859-11',
+        'iso885911',
+        'tis-620',
+        'windows-874',
+    ],
+    'windows-1250': ['cp1250', 'windows-1250', 'x-cp1250'],
+    'windows-1251': ['cp1251', 'windows-1251', 'x-cp1251'],
+    'windows-1252': [
+        'ansi_x3.4-1968',
+        'ascii',
+        'cp1252',
+        'cp819',
+        'csisolatin1',
+        'ibm819',
+        'iso-8859-1',
+        'iso-ir-100',
+        'iso8859-1',
+        'iso88591',
+        'iso_8859-1',
+        'iso_8859-1:1987',
+        'l1',
+        'latin1',
+        'us-ascii',
+        'windows-1252',
+        'x-cp1252',
+    ],
+    'windows-1253': ['cp1253', 'windows-1253', 'x-cp1253'],
+    'windows-1254': [
+        'cp1254',
+        'csisolatin5',
+        'iso-8859-9',
+        'iso-ir-148',
+        'iso8859-9',
+        'iso88599',
+        'iso_8859-9',
+        'iso_8859-9:1989',
+        'l5',
+        'latin5',
+        'windows-1254',
+        'x-cp1254',
+    ],
+    'windows-1255': ['cp1255', 'windows-1255', 'x-cp1255'],
+    'windows-1256': ['cp1256', 'windows-1256', 'x-cp1256'],
+    'windows-1257': ['cp1257', 'windows-1257', 'x-cp1257'],
+    'windows-1258': ['cp1258', 'windows-1258', 'x-cp1258'],
+    'x-mac-cyrillic': ['x-mac-cyrillic', 'x-mac-ukrainian'],
+    gbk: [
+        'chinese',
+        'csgb2312',
+        'csiso58gb231280',
+        'gb2312',
+        'gb_2312',
+        'gb_2312-80',
+        'gbk',
+        'iso-ir-58',
+        'x-gbk',
+    ],
+    gb18030: ['gb18030'],
+    big5: [
+        'big5',
+        // Unlike the standard WHATWG encoder
+        // the Hong Kong Supplementary Character Set
+        // is not bundled in big5 by iconv
+        // "big5-hkscs",
+        'cn-big5',
+        'csbig5',
+        'x-x-big5',
+    ],
+    'euc-jp': ['cseucpkdfmtjapanese', 'euc-jp', 'x-euc-jp'],
+    'iso-2022-jp': ['csiso2022jp', 'iso-2022-jp'],
+    shift_jis: [
+        'csshiftjis',
+        'ms932',
+        'ms_kanji',
+        'shift-jis',
+        'shift_jis',
+        'sjis',
+        'windows-31j',
+        'x-sjis',
+    ],
+    'euc-kr': [
+        'cseuckr',
+        'csksc56011987',
+        'euc-kr',
+        'iso-ir-149',
+        'korean',
+        'ks_c_5601-1987',
+        'ks_c_5601-1989',
+        'ksc5601',
+        'ksc_5601',
+        'windows-949',
+    ],
+    'utf-16be': ['unicodefffe', 'utf-16be'],
+    'utf-16le': [
+        'csunicode',
+        'iso-10646-ucs-2',
+        'ucs-2',
+        'unicode',
+        'unicodefeff',
+        'utf-16',
+        'utf-16le',
+    ],
+};
+
+/**
+ * Construct a map from each potential label to the canonical label
+ * for an encoding.
+ */
+const encodings = new Map(
+    Object.entries(encodingMap).flatMap(([encoding, labels]) => {
+        return labels.map(label => [label, encoding]);
+    })
+);
+
+// Maps WHATWG specified labels to the appropriate iconv
+// encoding label if iconv does not support the WHATWG label.
+//
+// Mapping here preserves the WHATWG label as the label on the
+// TextDecoder so this change is transparent to API users.
+const internalEncodings = new Map([
+    // iso-8859-8-i is functionally equivalent to iso-8859-8
+    // as we are not encoding or decoding control characters.
+    ['iso-8859-8-i', 'iso-8859-8'],
+    // iconv follows a different naming convention for this
+    // encoding
+    ['x-mac-cyrillic', 'MacCyrillic'],
+    // Support HKSCS as a standalone encoding, iconv doesn't
+    // bundle it with Big5 like WHATWG does...
+    ['big5-hkscs', 'big5-hkscs'],
+]);
+
+/**
+ * @typedef Encoding
+ * @property {string} internalLabel
+ * @property {string} label
+ */
+
+/**
+ * @param {string} label the encoding label
+ * @returns {Encoding | null}
+ */
+export function getEncodingFromLabel(label) {
+    const formattedLabel = trimAsciiWhitespace(label.toLowerCase());
+
+    let canonicalLabel = encodings.get(formattedLabel);
+
+    // Lookup an internal mapping using the canonical name, if found, or
+    // the formatted label otherwise.
+    //
+    // x-mac-ukrainian   >   x-mac-cyrillic   >   MacCyrillic
+    //                      (canonical label)    (internal label)
+    //
+    // big5-hkscs        >   undefined        >   big5-hkscs
+    //                      (canonical label)    (internal label)
+    //
+    let internalLabel = internalEncodings.get(
+        canonicalLabel ?? formattedLabel
+    );
+
+    // If both the canonical label and the internal encoding
+    // are not found, this encoding is unsupported.
+    if (!canonicalLabel && !internalLabel)
+        return null;
+
+    if (internalLabel) {
+        return {
+            label: canonicalLabel ?? formattedLabel,
+            internalLabel,
+        };
+    }
+
+    return {
+        label: canonicalLabel,
+        internalLabel: canonicalLabel,
+    };
+}
diff --git a/modules/esm/_encoding/util.js b/modules/esm/_encoding/util.js
new file mode 100644
index 00000000..9ee450fe
--- /dev/null
+++ b/modules/esm/_encoding/util.js
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: Node.js contributors. All rights reserved.
+
+// Modified from https://github.com/nodejs/node/blob/78680c1cbc8b0c435963bc512e826b2a6227c315/lib/internal/encoding.js
+
+/**
+ * Trims ASCII whitespace from a string.
+ * `String.prototype.trim` is not used because it also removes non-ASCII whitespace.
+ *
+ * @param {string} label the label to trim
+ * @returns {string}
+ */
+export const trimAsciiWhitespace = label => {
+    let s = 0;
+    let e = label.length;
+    while (
+        s < e &&
+        (label[s] === '\u0009' ||
+            label[s] === '\u000a' ||
+            label[s] === '\u000c' ||
+            label[s] === '\u000d' ||
+            label[s] === '\u0020')
+    )
+        s++;
+
+    while (
+        e > s &&
+        (label[e - 1] === '\u0009' ||
+            label[e - 1] === '\u000a' ||
+            label[e - 1] === '\u000c' ||
+            label[e - 1] === '\u000d' ||
+            label[e - 1] === '\u0020')
+    )
+        e--;
+
+    return label.slice(s, e);
+};
diff --git a/modules/script/_bootstrap/default.js b/modules/script/_bootstrap/default.js
index fe354a02..952d7fe3 100644
--- a/modules/script/_bootstrap/default.js
+++ b/modules/script/_bootstrap/default.js
@@ -6,7 +6,6 @@
     'use strict';
 
     const {print, printerr, log, logError} = imports._print;
-    const {TextEncoder, TextDecoder} = imports._text;
 
     Object.defineProperties(exports, {
         ARGV: {
@@ -17,18 +16,6 @@
                 return imports.system.programArgs;
             },
         },
-        TextEncoder: {
-            configurable: false,
-            enumerable: true,
-            writable: false,
-            value: TextEncoder,
-        },
-        TextDecoder: {
-            configurable: false,
-            enumerable: true,
-            writable: false,
-            value: TextDecoder,
-        },
         print: {
             configurable: false,
             enumerable: true,
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]