[gjs/ewlsh/text-encoding] Updates
- From: Evan Welsh <ewlsh src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gjs/ewlsh/text-encoding] Updates
- Date: Thu, 12 Aug 2021 01:23:18 +0000 (UTC)
commit cc6cfd141bf7b6d4d9cdb1d215bddbf58cfbed0d
Author: Evan Welsh <contact evanwelsh com>
Date: Wed Aug 11 18:07:52 2021 -0700
Updates
.eslintignore | 2 +-
gjs/text-encoding.cpp | 41 ++-
installed-tests/js/.eslintrc.yml | 2 +
installed-tests/js/matchers.js | 29 ++
installed-tests/js/meson.build | 6 +-
installed-tests/js/testEncoding.js | 114 +++++---
js.gresource.xml | 6 +-
jsconfig.json | 5 +
modules/core/_encodings.js | 312 ---------------------
modules/esm/_bootstrap/default.js | 3 +
.../{core/_text.js => esm/_encoding/encoding.js} | 104 ++++---
modules/esm/_encoding/encodingMap.js | 305 ++++++++++++++++++++
modules/esm/_encoding/util.js | 37 +++
modules/script/_bootstrap/default.js | 13 -
14 files changed, 566 insertions(+), 413 deletions(-)
---
diff --git a/.eslintignore b/.eslintignore
index 9e12e9f0..60e57336 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -5,5 +5,5 @@ installed-tests/js/jasmine.js
installed-tests/js/modules/badOverrides/WarnLib.js
# Until ESLint merges class fields.
# https://github.com/eslint/eslint/issues/14343
-modules/core/_text.js
+modules/esm/_encoding/encoding.js
modules/script/jsUnit.js
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 228c8c40..2dd384e8 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -73,13 +73,34 @@ static JSString* gjs_lossy_decode_from_uint8array_slow(
if (error)
return gjs_throw_type_error_from_gerror(cx, error);
+ // This function converts *to* UTF-16, using a std::u16string
+ // as its buffer.
+ //
+ // UTF-16 represents each character with 2 bytes or
+ // 4 bytes. The best case scenario when converting to
+ // UTF-16 is that every input byte encodes to two bytes;
+ // this is typical for ASCII and non-supplementary characters.
+ // Because we are converting from an unknown encoding,
+ // technically a single byte could be supplementary in
+ // Unicode (4 bytes) or even represent multiple Unicode characters.
+ //
+ // std::u16string does not care about these implementation
+ // details; its only concern is that it consists of byte pairs.
+ // Given this, a single UTF-16 character could be represented
+ // by one or two std::u16string characters.
+
// Allocate bytes_len * 2 + 12 as our initial buffer.
// bytes_len * 2 is the "best case" for LATIN1 strings
- // and strings are in the basic multilingual plane.
+ // and strings which are in the basic multilingual plane.
// Add 12 as a slight cushion and set the minimum allocation
// at 256 to prefer running a single iteration for
// small strings with supplemental plane characters.
- int buffer_size = std::min(bytes_len * 2 + 12, 256lu);
+ //
+ // When converting Chinese characters, for example,
+ // some dialectal characters are in the supplemental plane.
+ // Adding a padding of 12 prevents a few dialectal characters
+ // from requiring a reallocation.
+ size_t buffer_size = std::max(bytes_len * 2 + 12, 256lu);
// Cast data to correct input types
const char* input = reinterpret_cast<const char*>(bytes);
@@ -108,7 +129,7 @@ static JSString* gjs_lossy_decode_from_uint8array_slow(
// accumulator
if (bytes_written > 0) {
char16_t* utf16_buffer = reinterpret_cast<char16_t*>(buffer.data());
- // UTF-16 uses exactly 2 bytes for every character.
+ // std::u16string uses exactly 2 bytes for every character.
output_str.append(utf16_buffer, bytes_written / 2);
} else if (error) {
// A PARTIAL_INPUT error can only occur if the user does not provide
@@ -138,10 +159,16 @@ static JSString* gjs_lossy_decode_from_uint8array_slow(
// If the buffer was full increase the buffer
// size and re-try the conversion.
//
- // In most cases this will allocate bytes_len * 4
- // (the worst case scenario) in the next loop
- // where invalid bytes are not found.
- buffer_size *= 2;
+ // This logic allocates bytes_len * 3 first,
+ // then bytes_len * 4 (the worst case scenario
+ // is nearly impossible) and then continues appending
+ // arbitrary padding because we'll trust Gio and give
+ // it additional space.
+ if (buffer_size > bytes_len * 4) {
+ buffer_size += 256;
+ } else {
+ buffer_size += bytes_len;
+ }
// Clear the error.
g_clear_error(&error);
diff --git a/installed-tests/js/.eslintrc.yml b/installed-tests/js/.eslintrc.yml
index c1a4c9bd..bcdadbea 100644
--- a/installed-tests/js/.eslintrc.yml
+++ b/installed-tests/js/.eslintrc.yml
@@ -31,8 +31,10 @@ globals:
setTimeout: writable
overrides:
- files:
+ - matchers.js
- testCairoModule.js
- testESModules.js
+ - testEncoding.js
- modules/importmeta.js
- modules/exports.js
- modules/say.js
diff --git a/installed-tests/js/matchers.js b/installed-tests/js/matchers.js
new file mode 100644
index 00000000..e4d29835
--- /dev/null
+++ b/installed-tests/js/matchers.js
@@ -0,0 +1,29 @@
+/**
+ * A jasmine asymmetric matcher which expects an array-like object
+ * to contain the given element array in the same order with the
+ * same length. Useful for testing typed arrays.
+ *
+ * @template T
+ * @param {T[]} elements an array of elements to compare with
+ * @returns {object} a matcher object implementing asymmetricMatch() and jasmineToString()
+ */
+export function arrayLikeWithExactContents(elements) {
+ return {
+ /**
+ * @param {ArrayLike<T>} compareTo an array-like object to compare to
+ * @returns {boolean}
+ */
+ asymmetricMatch(compareTo) {
+ return (
+ compareTo.length === elements.length &&
+ elements.every((e, i) => e === compareTo[i])
+ );
+ },
+ /**
+ * @returns {string}
+ */
+ jasmineToString() {
+ return `${JSON.stringify(elements)}`;
+ },
+ };
+}
diff --git a/installed-tests/js/meson.build b/installed-tests/js/meson.build
index cca525c1..f5fa8e08 100644
--- a/installed-tests/js/meson.build
+++ b/installed-tests/js/meson.build
@@ -94,7 +94,6 @@ subdir('libgjstesttools')
jasmine_tests = [
'self',
'ByteArray',
- 'Encoding',
'Exceptions',
'Format',
'Fundamental',
@@ -217,7 +216,10 @@ endif
# tests using ES modules are also separate because they need an extra
# minijasmine flag
-modules_tests = ['ESModules']
+modules_tests = [
+ 'ESModules',
+ 'Encoding',
+]
if build_cairo
modules_tests += 'CairoModule'
endif
diff --git a/installed-tests/js/testEncoding.js b/installed-tests/js/testEncoding.js
index dcd05ee6..bd02ee75 100644
--- a/installed-tests/js/testEncoding.js
+++ b/installed-tests/js/testEncoding.js
@@ -4,7 +4,9 @@
// Some test inputs are derived from
https://github.com/denoland/deno/blob/923214c53725651792f6d55c5401bf6b475622ea/op_crates/web/08_text_encoding.js
// Data originally from https://encoding.spec.whatwg.org/encodings.json
-const {Gio} = imports.gi;
+import Gio from 'gi://Gio';
+
+import {arrayLikeWithExactContents} from './matchers.js';
/**
* Loads a JSON file from a URI and parses it.
@@ -23,36 +25,6 @@ function loadJSONFromResource(src) {
return json;
}
-/**
- * A jasmine asymmetric matcher which expects an array-like object
- * to contain the given element array in the same order with the
- * same length. Useful for testing typed arrays.
- *
- * @template T
- * @param {T[]} elements an array of elements to compare with
- * @returns
- */
-function withElements(elements) {
- return {
- /**
- * @param {ArrayLike<T>} compareTo an array-like object to compare to
- * @returns {boolean}
- */
- asymmetricMatch(compareTo) {
- return (
- compareTo.length === elements.length &&
- elements.every((e, i) => e === compareTo[i])
- );
- },
- /**
- * @returns {string}
- */
- jasmineToString() {
- return `${JSON.stringify(elements)}`;
- },
- };
-}
-
/**
* Encoded form of '𝓽𝓮𝔁𝓽'
*
@@ -83,7 +55,7 @@ describe('Text Encoding', function () {
const encoded = encoder.encode(input);
expect(encoded).toEqual(
- withElements([...encodedMultibyteCharArray()])
+ arrayLikeWithExactContents([...encodedMultibyteCharArray()])
);
});
});
@@ -98,7 +70,7 @@ describe('Text Encoding', function () {
expect(result.written).toBe(4);
expect(bytes).toEqual(
- withElements([0x74, 0x65, 0x78, 0x74, 0x00])
+ arrayLikeWithExactContents([0x74, 0x65, 0x78, 0x74, 0x00])
);
});
@@ -111,7 +83,10 @@ describe('Text Encoding', function () {
expect(result.written).toBe(16);
expect(bytes).toEqual(
- withElements([...encodedMultibyteCharArray(), 0x00])
+ arrayLikeWithExactContents([
+ ...encodedMultibyteCharArray(),
+ 0x00,
+ ])
);
});
@@ -124,7 +99,7 @@ describe('Text Encoding', function () {
expect(result.written).toBe(4);
expect(bytes).toEqual(
- withElements([
+ arrayLikeWithExactContents([
...encodedMultibyteCharArray().slice(0, 4),
0x00,
])
@@ -302,6 +277,75 @@ describe('Text Encoding', function () {
expect(decoded).toEqual(longResult);
});
+
+ it('can decode Big-5 HKSCS with supplemental characters', function () {
+ // The characters below roughly mean 'hard' or 'solid' and
+ // 'rooster' respectively. They were chosen for their Unicode
+ // and HKSCS positioning, not meaning.
+
+ // Big5-HKSCS bytes for the supplemental character 𠕇
+ const supplementalBytes = [250, 64];
+ // Big5-HKSCS bytes for the non-supplemental characters 公雞
+ const nonSupplementalBytes = [164, 189, 194, 251];
+
+ const decoder = new TextDecoder('big5-hkscs');
+
+ // We currently allocate 12 additional bytes of padding
+ // and a minimum of 256...
+
+ // This should produce 400 non-supplemental characters (50 * 2 * 4),
+ // i.e. 800 UTF-16 bytes, and 16 supplemental bytes (4 * 4)
+ const repeatedNonSupplementalBytes = new Array(50).fill(nonSupplementalBytes).flat();
+ const bytes = [
+ ...repeatedNonSupplementalBytes,
+ ...supplementalBytes,
+ ...repeatedNonSupplementalBytes,
+ ...supplementalBytes,
+ ...repeatedNonSupplementalBytes,
+ ...supplementalBytes,
+ ...repeatedNonSupplementalBytes,
+ ...supplementalBytes,
+ ];
+
+ const expectedNonSupplemental = new Array(50).fill('公雞');
+ const expected = [
+ ...expectedNonSupplemental,
+ '𠕇',
+ ...expectedNonSupplemental,
+ '𠕇',
+ ...expectedNonSupplemental,
+ '𠕇',
+ ...expectedNonSupplemental,
+ '𠕇',
+ ].join('');
+
+ // Calculate the number of bytes the UTF-16 characters should
+ // occupy.
+ const expectedU16Bytes = [...expected].reduce((prev, next) => {
+ const utf16code = next.codePointAt(0);
+
+ // Test whether this unit is supplemental
+ const additionalBytes = utf16code > 0xFFFF ? 2 : 0;
+
+ return prev + 2 + additionalBytes;
+ }, 0);
+
+
+ // We set a minimum buffer allocation of 256 bytes,
+ // this ensures that this test exceeds that.
+ expect(expectedU16Bytes / 2).toBeGreaterThan(256);
+
+ // The length of the input bytes should always be less
+ // than the expected output because UTF-16 uses 4 bytes
+ // to represent some characters HKSCS needs only 2 for.
+ expect(bytes.length).toBeLessThan(expectedU16Bytes);
+ // 4 supplemental characters, each with two additional bytes.
+ expect(bytes.length + 4 * 2).toBe(expectedU16Bytes);
+
+ const decoded = decoder.decode(new Uint8Array(bytes));
+
+ expect(decoded).toBe(expected);
+ });
});
describe('Single Byte Encoding Converter', function () {
diff --git a/js.gresource.xml b/js.gresource.xml
index a809930b..947049c2 100644
--- a/js.gresource.xml
+++ b/js.gresource.xml
@@ -9,6 +9,10 @@
<!-- ESM-based modules -->
<file>modules/esm/_bootstrap/default.js</file>
+
+ <file>modules/esm/_encoding/encoding.js</file>
+ <file>modules/esm/_encoding/encodingMap.js</file>
+ <file>modules/esm/_encoding/util.js</file>
<file>modules/esm/cairo.js</file>
<file>modules/esm/gettext.js</file>
@@ -44,10 +48,8 @@
<file>modules/core/_cairo.js</file>
<file>modules/core/_common.js</file>
- <file>modules/core/_encodings.js</file>
<file>modules/core/_format.js</file>
<file>modules/core/_gettext.js</file>
<file>modules/core/_signals.js</file>
- <file>modules/core/_text.js</file>
</gresource>
</gresources>
diff --git a/jsconfig.json b/jsconfig.json
new file mode 100644
index 00000000..43feb2ed
--- /dev/null
+++ b/jsconfig.json
@@ -0,0 +1,5 @@
+{
+ "compilerOptions": {
+ "lib": ["es2020"],
+ }
+}
\ No newline at end of file
diff --git a/modules/esm/_bootstrap/default.js b/modules/esm/_bootstrap/default.js
index fefeb51b..eb315af7 100644
--- a/modules/esm/_bootstrap/default.js
+++ b/modules/esm/_bootstrap/default.js
@@ -2,3 +2,6 @@
// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
// Bootstrap file which supports ESM imports.
+
+// Bootstrap the Encoding API
+import '_encoding/encoding';
diff --git a/modules/core/_text.js b/modules/esm/_encoding/encoding.js
similarity index 53%
rename from modules/core/_text.js
rename to modules/esm/_encoding/encoding.js
index 39413b73..e84b752b 100644
--- a/modules/core/_text.js
+++ b/modules/esm/_encoding/encoding.js
@@ -1,22 +1,22 @@
// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
-// SPDX-FileCopyrightText: Evan Welsh
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
-const Encoding = imports._encodingNative;
+const Encoding = import.meta.importSync('_encodingNative');
-const { getEncodingFromLabel } = imports._encodings;
+import {getEncodingFromLabel} from './encodingMap.js';
-var TextDecoder = class TextDecoder {
- /**
+class TextDecoder {
+ /**
* @type {string}
*/
encoding;
- /**
+ /**
* @type {boolean}
*/
ignoreBOM;
- /**
+ /**
* @type {boolean}
*/
fatal;
@@ -26,22 +26,24 @@ var TextDecoder = class TextDecoder {
}
/**
- * @param {string} encoding
- * @param {object} [options]
- * @param {boolean=} options.fatal
- * @param {boolean=} options.ignoreBOM
+ * @param {string} encoding The encoding to decode from
+ * @param {object} [options] Decoding options
+ * @param {boolean=} options.fatal Whether to throw or substitute when invalid characters are encountered
+ * @param {boolean=} options.ignoreBOM Whether to ignore the byte order mark for UTF-8 arrays
*/
constructor(encoding = 'utf-8', options = {}) {
- const { fatal = false, ignoreBOM = false } = options;
+ const {fatal = false, ignoreBOM = false} = options;
const encodingDefinition = getEncodingFromLabel(`${encoding}`);
- if (!encodingDefinition) {
+ if (!encodingDefinition)
throw new RangeError(`Invalid encoding label: '${encoding}'`);
- }
+
if (encodingDefinition.label === 'replacement') {
- throw new RangeError(`Unsupported replacement encoding: '${encoding}'`);
+ throw new RangeError(
+ `Unsupported replacement encoding: '${encoding}'`
+ );
}
Object.defineProperty(this, '_internalEncoding', {
@@ -73,19 +75,19 @@ var TextDecoder = class TextDecoder {
});
}
-
-
/**
- * @param {unknown} bytes
- * @param {object} [options]
- * @param {boolean=} options.stream
- * @returns
+ * @param {unknown} bytes a typed array of bytes to decode
+ * @param {object} [options] Decoding options
+ * @param {boolean=} options.stream Unsupported option. Whether to stream the decoded bytes.
+ * @returns
*/
decode(bytes, options = {}) {
- const { stream = false } = options;
+ const {stream = false} = options;
if (stream) {
- throw new Error(`TextDecoder does not implement the 'stream' option.`);
+ throw new Error(
+ 'TextDecoder does not implement the \'stream\' option.'
+ );
}
/** @type {Uint8Array} */
@@ -96,32 +98,37 @@ var TextDecoder = class TextDecoder {
} else if (bytes instanceof Uint8Array) {
input = bytes;
} else if (bytes instanceof Object.getPrototypeOf(Uint8Array)) {
- let { buffer, byteLength, byteOffset } = /** @type {Uint32Array} */ (bytes);
+ let {buffer, byteLength, byteOffset} =
+ /** @type {Uint32Array} */ bytes;
input = new Uint8Array(buffer, byteOffset, byteLength);
} else if (
- typeof bytes === "object" &&
+ typeof bytes === 'object' &&
bytes !== null &&
- "buffer" in bytes &&
+ 'buffer' in bytes &&
bytes.buffer instanceof ArrayBuffer
) {
- let { buffer, byteLength, byteOffset } = bytes;
- input = new Uint8Array(
- buffer,
- byteOffset,
- byteLength
- );
+ let {buffer, byteLength, byteOffset} = bytes;
+ input = new Uint8Array(buffer, byteOffset, byteLength);
} else if (bytes === undefined) {
input = new Uint8Array(0);
} else {
- throw new Error(`Provided input cannot be converted to ArrayBufferView or ArrayBuffer`);
+ throw new Error(
+ 'Provided input cannot be converted to ArrayBufferView or ArrayBuffer'
+ );
}
- if (this.ignoreBOM && input.length > 2 && input[0] === 0xEF && input[1] === 0xBB && input[2] ===
0xBF) {
- if (this.encoding !== 'utf-8') {
- throw new Error(`Cannot ignore BOM for non-UTF8 encoding.`);
- }
+ if (
+ this.ignoreBOM &&
+ input.length > 2 &&
+ input[0] === 0xef &&
+ input[1] === 0xbb &&
+ input[2] === 0xbf
+ ) {
+ if (this.encoding !== 'utf-8')
+ throw new Error('Cannot ignore BOM for non-UTF8 encoding.');
+
- let { buffer, byteLength, byteOffset } = input;
+ let {buffer, byteLength, byteOffset} = input;
input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
}
@@ -129,7 +136,7 @@ var TextDecoder = class TextDecoder {
}
}
-var TextEncoder = class TextEncoder {
+class TextEncoder {
get [Symbol.toStringTag]() {
return 'TextEncoder';
}
@@ -140,11 +147,26 @@ var TextEncoder = class TextEncoder {
encode(input = '') {
// The TextEncoder specification only allows for UTF-8 encoding.
- return Encoding.encode(`${input}`, 'UTF-8');
+ return Encoding.encode(`${input}`, 'utf-8');
}
encodeInto(input = '', output = new Uint8Array()) {
// The TextEncoder specification only allows for UTF-8 encoding.
return Encoding.encodeInto(`${input}`, output);
}
-}
\ No newline at end of file
+}
+
+Object.defineProperties(globalThis, {
+ TextEncoder: {
+ configurable: false,
+ enumerable: true,
+ writable: false,
+ value: TextEncoder,
+ },
+ TextDecoder: {
+ configurable: false,
+ enumerable: true,
+ writable: false,
+ value: TextDecoder,
+ },
+});
diff --git a/modules/esm/_encoding/encodingMap.js b/modules/esm/_encoding/encodingMap.js
new file mode 100644
index 00000000..b0f17702
--- /dev/null
+++ b/modules/esm/_encoding/encodingMap.js
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later
+// SPDX-FileCopyrightText: 2021 Evan Welsh <contact evanwelsh com>
+
+import {trimAsciiWhitespace} from './util.js';
+
+// Data derived from https://encoding.spec.whatwg.org/encodings.json
+const encodingMap = {
+ 'utf-8': [
+ 'unicode-1-1-utf-8',
+ 'unicode11utf8',
+ 'unicode20utf8',
+ 'utf-8',
+ 'utf8',
+ 'x-unicode20utf8',
+ ],
+ ibm866: ['866', 'cp866', 'csibm866', 'ibm866'],
+ 'iso-8859-2': [
+ 'csisolatin2',
+ 'iso-8859-2',
+ 'iso-ir-101',
+ 'iso8859-2',
+ 'iso88592',
+ 'iso_8859-2',
+ 'iso_8859-2:1987',
+ 'l2',
+ 'latin2',
+ ],
+ 'iso-8859-3': [
+ 'csisolatin3',
+ 'iso-8859-3',
+ 'iso-ir-109',
+ 'iso8859-3',
+ 'iso88593',
+ 'iso_8859-3',
+ 'iso_8859-3:1988',
+ 'l3',
+ 'latin3',
+ ],
+ 'iso-8859-4': [
+ 'csisolatin4',
+ 'iso-8859-4',
+ 'iso-ir-110',
+ 'iso8859-4',
+ 'iso88594',
+ 'iso_8859-4',
+ 'iso_8859-4:1988',
+ 'l4',
+ 'latin4',
+ ],
+ 'iso-8859-5': [
+ 'csisolatincyrillic',
+ 'cyrillic',
+ 'iso-8859-5',
+ 'iso-ir-144',
+ 'iso8859-5',
+ 'iso88595',
+ 'iso_8859-5',
+ 'iso_8859-5:1988',
+ ],
+ 'iso-8859-6': [
+ 'arabic',
+ 'asmo-708',
+ 'csiso88596e',
+ 'csiso88596i',
+ 'csisolatinarabic',
+ 'ecma-114',
+ 'iso-8859-6',
+ 'iso-8859-6-e',
+ 'iso-8859-6-i',
+ 'iso-ir-127',
+ 'iso8859-6',
+ 'iso88596',
+ 'iso_8859-6',
+ 'iso_8859-6:1987',
+ ],
+ 'iso-8859-7': [
+ 'csisolatingreek',
+ 'ecma-118',
+ 'elot_928',
+ 'greek',
+ 'greek8',
+ 'iso-8859-7',
+ 'iso-ir-126',
+ 'iso8859-7',
+ 'iso88597',
+ 'iso_8859-7',
+ 'iso_8859-7:1987',
+ 'sun_eu_greek',
+ ],
+ 'iso-8859-8': [
+ 'csiso88598e',
+ 'csisolatinhebrew',
+ 'hebrew',
+ 'iso-8859-8',
+ 'iso-8859-8-e',
+ 'iso-ir-138',
+ 'iso8859-8',
+ 'iso88598',
+ 'iso_8859-8',
+ 'iso_8859-8:1988',
+ 'visual',
+ ],
+ 'iso-8859-8-i': ['csiso88598i', 'iso-8859-8-i', 'logical'],
+ 'iso-8859-10': [
+ 'csisolatin6',
+ 'iso-8859-10',
+ 'iso-ir-157',
+ 'iso8859-10',
+ 'iso885910',
+ 'l6',
+ 'latin6',
+ ],
+ 'iso-8859-13': ['iso-8859-13', 'iso8859-13', 'iso885913'],
+ 'iso-8859-14': ['iso-8859-14', 'iso8859-14', 'iso885914'],
+ 'iso-8859-15': [
+ 'csisolatin9',
+ 'iso-8859-15',
+ 'iso8859-15',
+ 'iso885915',
+ 'iso_8859-15',
+ 'l9',
+ ],
+ 'iso-8859-16': ['iso-8859-16'],
+ 'koi8-r': ['cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r'],
+ 'koi8-u': ['koi8-ru', 'koi8-u'],
+ macintosh: ['csmacintosh', 'mac', 'macintosh', 'x-mac-roman'],
+ 'windows-874': [
+ 'dos-874',
+ 'iso-8859-11',
+ 'iso8859-11',
+ 'iso885911',
+ 'tis-620',
+ 'windows-874',
+ ],
+ 'windows-1250': ['cp1250', 'windows-1250', 'x-cp1250'],
+ 'windows-1251': ['cp1251', 'windows-1251', 'x-cp1251'],
+ 'windows-1252': [
+ 'ansi_x3.4-1968',
+ 'ascii',
+ 'cp1252',
+ 'cp819',
+ 'csisolatin1',
+ 'ibm819',
+ 'iso-8859-1',
+ 'iso-ir-100',
+ 'iso8859-1',
+ 'iso88591',
+ 'iso_8859-1',
+ 'iso_8859-1:1987',
+ 'l1',
+ 'latin1',
+ 'us-ascii',
+ 'windows-1252',
+ 'x-cp1252',
+ ],
+ 'windows-1253': ['cp1253', 'windows-1253', 'x-cp1253'],
+ 'windows-1254': [
+ 'cp1254',
+ 'csisolatin5',
+ 'iso-8859-9',
+ 'iso-ir-148',
+ 'iso8859-9',
+ 'iso88599',
+ 'iso_8859-9',
+ 'iso_8859-9:1989',
+ 'l5',
+ 'latin5',
+ 'windows-1254',
+ 'x-cp1254',
+ ],
+ 'windows-1255': ['cp1255', 'windows-1255', 'x-cp1255'],
+ 'windows-1256': ['cp1256', 'windows-1256', 'x-cp1256'],
+ 'windows-1257': ['cp1257', 'windows-1257', 'x-cp1257'],
+ 'windows-1258': ['cp1258', 'windows-1258', 'x-cp1258'],
+ 'x-mac-cyrillic': ['x-mac-cyrillic', 'x-mac-ukrainian'],
+ gbk: [
+ 'chinese',
+ 'csgb2312',
+ 'csiso58gb231280',
+ 'gb2312',
+ 'gb_2312',
+ 'gb_2312-80',
+ 'gbk',
+ 'iso-ir-58',
+ 'x-gbk',
+ ],
+ gb18030: ['gb18030'],
+ big5: [
+ 'big5',
+ // Unlike the standard WHATWG encoder
+ // the Hong Kong Supplementary Character Set
+ // is not bundled in big5 by iconv
+ // "big5-hkscs",
+ 'cn-big5',
+ 'csbig5',
+ 'x-x-big5',
+ ],
+ 'euc-jp': ['cseucpkdfmtjapanese', 'euc-jp', 'x-euc-jp'],
+ 'iso-2022-jp': ['csiso2022jp', 'iso-2022-jp'],
+ shift_jis: [
+ 'csshiftjis',
+ 'ms932',
+ 'ms_kanji',
+ 'shift-jis',
+ 'shift_jis',
+ 'sjis',
+ 'windows-31j',
+ 'x-sjis',
+ ],
+ 'euc-kr': [
+ 'cseuckr',
+ 'csksc56011987',
+ 'euc-kr',
+ 'iso-ir-149',
+ 'korean',
+ 'ks_c_5601-1987',
+ 'ks_c_5601-1989',
+ 'ksc5601',
+ 'ksc_5601',
+ 'windows-949',
+ ],
+ 'utf-16be': ['unicodefffe', 'utf-16be'],
+ 'utf-16le': [
+ 'csunicode',
+ 'iso-10646-ucs-2',
+ 'ucs-2',
+ 'unicode',
+ 'unicodefeff',
+ 'utf-16',
+ 'utf-16le',
+ ],
+};
+
+/**
+ * Construct a map from each potential label to the canonical label
+ * for an encoding.
+ */
+const encodings = new Map(
+ Object.entries(encodingMap).flatMap(([encoding, labels]) => {
+ return labels.map(label => [label, encoding]);
+ })
+);
+
+// Maps WHATWG specified labels to the appropriate iconv
+// encoding label if iconv does not support the WHATWG label.
+//
+// Mapping here preserves the WHATWG label as the label on the
+// TextDecoder so this change is transparent to API users.
+const internalEncodings = new Map([
+ // iso-8859-8-i is functionally equivalent to iso-8859-8
+ // as we are not encoding or decoding control characters.
+ ['iso-8859-8-i', 'iso-8859-8'],
+ // iconv follows a different naming convention for this
+ // encoding
+ ['x-mac-cyrillic', 'MacCyrillic'],
+ // Support HKSCS as a standalone encoding, iconv doesn't
+ // bundle it with Big5 like WHATWG does...
+ ['big5-hkscs', 'big5-hkscs'],
+]);
+
+/**
+ * @typedef Encoding
+ * @property {string} internalLabel
+ * @property {string} label
+ */
+
+/**
+ * @param {string} label the encoding label
+ * @returns {Encoding | null}
+ */
+export function getEncodingFromLabel(label) {
+ const formattedLabel = trimAsciiWhitespace(label.toLowerCase());
+
+ let canonicalLabel = encodings.get(formattedLabel);
+
+ // Lookup an internal mapping using the canonical name, if found, or
+ // the formatted label otherwise.
+ //
+ // x-mac-ukrainian > x-mac-cyrillic > MacCyrillic
+ // (canonical label) (internal label)
+ //
+ // big5-hkscs > undefined > big5-hkscs
+ // (canonical label) (internal label)
+ //
+ let internalLabel = internalEncodings.get(
+ canonicalLabel ?? formattedLabel
+ );
+
+ // If both the canonical label and the internal encoding
+ // are not found, this encoding is unsupported.
+ if (!canonicalLabel && !internalLabel)
+ return null;
+
+ if (internalLabel) {
+ return {
+ label: canonicalLabel ?? formattedLabel,
+ internalLabel,
+ };
+ }
+
+ return {
+ label: canonicalLabel,
+ internalLabel: canonicalLabel,
+ };
+}
diff --git a/modules/esm/_encoding/util.js b/modules/esm/_encoding/util.js
new file mode 100644
index 00000000..9ee450fe
--- /dev/null
+++ b/modules/esm/_encoding/util.js
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: Node.js contributors. All rights reserved.
+
+// Modified from
https://github.com/nodejs/node/blob/78680c1cbc8b0c435963bc512e826b2a6227c315/lib/internal/encoding.js
+
+/**
+ * Trims ASCII whitespace from a string.
+ * `String.prototype.trim` also removes non-ASCII whitespace, so it is not used here.
+ *
+ * @param {string} label the label to trim
+ * @returns {string}
+ */
+export const trimAsciiWhitespace = label => {
+ let s = 0;
+ let e = label.length;
+ while (
+ s < e &&
+ (label[s] === '\u0009' ||
+ label[s] === '\u000a' ||
+ label[s] === '\u000c' ||
+ label[s] === '\u000d' ||
+ label[s] === '\u0020')
+ )
+ s++;
+
+ while (
+ e > s &&
+ (label[e - 1] === '\u0009' ||
+ label[e - 1] === '\u000a' ||
+ label[e - 1] === '\u000c' ||
+ label[e - 1] === '\u000d' ||
+ label[e - 1] === '\u0020')
+ )
+ e--;
+
+ return label.slice(s, e);
+};
diff --git a/modules/script/_bootstrap/default.js b/modules/script/_bootstrap/default.js
index fe354a02..952d7fe3 100644
--- a/modules/script/_bootstrap/default.js
+++ b/modules/script/_bootstrap/default.js
@@ -6,7 +6,6 @@
'use strict';
const {print, printerr, log, logError} = imports._print;
- const {TextEncoder, TextDecoder} = imports._text;
Object.defineProperties(exports, {
ARGV: {
@@ -17,18 +16,6 @@
return imports.system.programArgs;
},
},
- TextEncoder: {
- configurable: false,
- enumerable: true,
- writable: false,
- value: TextEncoder,
- },
- TextDecoder: {
- configurable: false,
- enumerable: true,
- writable: false,
- value: TextDecoder,
- },
print: {
configurable: false,
enumerable: true,
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]