From 58f9802a80834676f9d7fcde0cd89c010ee47f7d Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Mon, 9 Feb 2026 16:36:27 +0400 Subject: [PATCH] perf: make encoding-browser 2x smaller --- encoding-browser.browser.js | 59 ++++-- fallback/encoding.api.js | 43 ----- fallback/encoding.js | 43 ++++- tests/encoding/browser.test.js | 170 ++++++++++++++++++ tests/encoding/generic.test.js | 1 - .../whatwg-encoding/whatwg-encoding-mock.js | 2 +- whatwg.js | 8 +- 7 files changed, 260 insertions(+), 66 deletions(-) create mode 100644 tests/encoding/browser.test.js diff --git a/encoding-browser.browser.js b/encoding-browser.browser.js index 05454f68..d0ba969b 100644 --- a/encoding-browser.browser.js +++ b/encoding-browser.browser.js @@ -1,10 +1,4 @@ -import { - fromSource, - getBOMEncoding, - normalizeEncoding, - E_ENCODING, -} from './fallback/encoding.api.js' -import labels from './fallback/encoding.labels.js' +import { getBOMEncoding } from './fallback/encoding.api.js' // Lite-weight version which re-exports existing implementations on browsers, // while still being aliased to the full impl in RN and Node.js @@ -13,17 +7,48 @@ import labels from './fallback/encoding.labels.js' const { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream } = globalThis -export { normalizeEncoding, getBOMEncoding, labelToName } from './fallback/encoding.api.js' +export { getBOMEncoding } from './fallback/encoding.api.js' export { TextDecoder, TextEncoder, TextDecoderStream, TextEncoderStream } -// https://encoding.spec.whatwg.org/#decode +export function normalizeEncoding(label) { + if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8' + if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252' + try { + return new TextDecoder(label).encoding + } catch {} + + if (/[^\w\t\n\f\r .:-]/i.test(label)) return null + const l = `${label}`.trim().toLowerCase() + if ( + l === 'replacement' || + l === 'csiso2022kr' || + l === 'hz-gb-2312' || + l === 'iso-2022-cn' || + l === 'iso-2022-cn-ext' || + l === 'iso-2022-kr' + ) { + return 'replacement' + } + + return null +} + export function legacyHookDecode(input, fallbackEncoding = 'utf-8') { - let u8 = fromSource(input) - const bomEncoding = getBOMEncoding(u8) - if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2) - const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else" - if (enc === 'utf-8') return new TextDecoder('utf-8', { ignoreBOM: true }).decode(u8) // fast path - if (enc === 'replacement') return u8.byteLength > 0 ? '\uFFFD' : '' - if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING) - return new TextDecoder(enc, { ignoreBOM: true }).decode(u8) + const enc = getBOMEncoding(input) ?? normalizeEncoding(fallbackEncoding) + if (enc === 'replacement') return input.byteLength > 0 ? '\uFFFD' : '' + return new TextDecoder(enc).decode(input) +} + +export function labelToName(label) { + const enc = normalizeEncoding(label) + if (enc === 'utf-8') return 'UTF-8' + if (!enc) return enc + const p = enc.slice(0, 3) + if (p === 'utf' || p === 'iso' || p === 'koi' || p === 'euc' || p === 'ibm' || p === 'gbk') { + return enc.toUpperCase() + } + + if (enc === 'big5') return 'Big5' + if (enc === 'shift_jis') return 'Shift_JIS' + return enc } diff --git a/fallback/encoding.api.js b/fallback/encoding.api.js index 6ca9172a..8dc5243a 100644 --- a/fallback/encoding.api.js +++ b/fallback/encoding.api.js @@ -1,32 +1,3 @@ -import labels from './encoding.labels.js' - -let labelsMap - -export const E_ENCODING = 'Unknown encoding' - -// Warning: unlike whatwg-encoding, returns lowercased labels -// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them -// https://encoding.spec.whatwg.org/#names-and-labels -export function normalizeEncoding(label) { - // fast path - if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8' - if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252' - // full map - if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace) - const low = `${label}`.trim().toLowerCase() - if (Object.hasOwn(labels, low)) return low - if (!labelsMap) { - labelsMap = new Map() - for (const [label, aliases] of Object.entries(labels)) { - for (const alias of aliases) labelsMap.set(alias, label) - } - } - - const mapped = labelsMap.get(low) - if (mapped) return mapped - return null -} - // TODO: make this more strict against Symbol.toStringTag // Is not very significant though, anything faking Symbol.toStringTag could as well override // prototypes, which is not something we protect against @@ -65,17 +36,3 @@ export function getBOMEncoding(input) { if (u8[0] === 0xfe && u8[1] === 0xff) return 'utf-16be' return null } - -const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk']) - -// Unlike normalizeEncoding, case-sensitive -// https://encoding.spec.whatwg.org/#names-and-labels -export function labelToName(label) { - const enc = normalizeEncoding(label) - if (enc === 'utf-8') return 'UTF-8' // fast path - if (!enc) return enc - if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase() - if (enc === 'big5') return 'Big5' - if (enc === 'shift_jis') return 'Shift_JIS' - return enc -} diff --git a/fallback/encoding.js b/fallback/encoding.js index 37eb7019..2c8d3c0e 100644 --- a/fallback/encoding.js +++ b/fallback/encoding.js @@ -5,17 +5,56 @@ import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js' import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js' import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js' import labels from './encoding.labels.js' -import { fromSource, getBOMEncoding, normalizeEncoding, E_ENCODING } from './encoding.api.js' +import { fromSource, getBOMEncoding } from './encoding.api.js' import { unfinishedBytes, mergePrefix } from './encoding.util.js' -export { labelToName, getBOMEncoding, normalizeEncoding } from './encoding.api.js' +export { getBOMEncoding } from './encoding.api.js' +export const E_ENCODING = 'Unknown encoding' const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support" const E_OPTIONS = 'The "options" argument must be of type object' const replacementChar = '\uFFFD' const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore let createMultibyteDecoder, multibyteEncoder +let labelsMap +// Warning: unlike whatwg-encoding, returns lowercased labels +// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them +// https://encoding.spec.whatwg.org/#names-and-labels +export function normalizeEncoding(label) { + // fast path + if (label === 'utf-8' || label === 'utf8' || label === 'UTF-8' || label === 'UTF8') return 'utf-8' + if (label === 'windows-1252' || label === 'ascii' || label === 'latin1') return 'windows-1252' + // full map + if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace) + const low = `${label}`.trim().toLowerCase() + if (Object.hasOwn(labels, low)) return low + if (!labelsMap) { + labelsMap = new Map() + for (const [label, aliases] of Object.entries(labels)) { + for (const alias of aliases) labelsMap.set(alias, label) + } + } + + const mapped = labelsMap.get(low) + if (mapped) return mapped + return null +} + +const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk']) + +// Unlike normalizeEncoding, case-sensitive +// https://encoding.spec.whatwg.org/#names-and-labels +export function labelToName(label) { + const enc = normalizeEncoding(label) + if (enc === 'utf-8') return 'UTF-8' // fast path + if (!enc) return enc + if (uppercasePrefixes.has(enc.slice(0, 3))) return enc.toUpperCase() + if (enc === 'big5') return 'Big5' + if (enc === 'shift_jis') return 'Shift_JIS' + return enc +} + export const isMultibyte = (enc) => multibyteSet.has(enc) export function setMultibyte(createDecoder, createEncoder) { createMultibyteDecoder = createDecoder diff --git a/tests/encoding/browser.test.js b/tests/encoding/browser.test.js new file mode 100644 index 00000000..2ec6e35e --- /dev/null +++ b/tests/encoding/browser.test.js @@ -0,0 +1,170 @@ +import { + TextDecoder, + TextEncoder, + getBOMEncoding, + legacyHookDecode, +} from '@exodus/bytes/encoding-browser.js' +import { fromHex } from '@exodus/bytes/hex.js' +import { test, describe } from 'node:test' +import { labels } from './fixtures/encodings.cjs' +import unfinishedBytesFixtures from './fixtures/unfinishedBytes.js' + +test('Unfinished bytes', (t) => { + for (const [encoding, trail, u8] of unfinishedBytesFixtures) { + const decoder = new TextDecoder(encoding) + const a0 = decoder.decode(u8, { stream: true }) + const b0 = decoder.decode() + const ab = new TextDecoder(encoding).decode(u8) + const a1 = new TextDecoder(encoding).decode(u8.subarray(0, u8.length - trail)) + const b1 = new TextDecoder(encoding).decode(u8.subarray(u8.length - trail)) + t.assert.strictEqual(a0, a1) + t.assert.strictEqual(b0, b1) + t.assert.strictEqual(a0 + b0, ab) + t.assert.strictEqual(decoder.decode(u8), ab) // reuse + + if (trail === 0) { + t.assert.strictEqual(a0, ab) + t.assert.strictEqual(b0, '') + } + + if (trail === u8.length) { + t.assert.strictEqual(a0, '') + t.assert.strictEqual(b0, ab) + } + } +}) + +test('String coercion', (t) => { + const encoder = new TextEncoder() + const map = [ + [{}, '[object Object]'], + [null, 'null'], + [undefined, 'undefined'], + ] + + for (const [arg, string] of map) { + const length = string.length + const a = encoder.encode(string) + t.assert.strictEqual(a.length, length) + + const b = encoder.encode(arg) + if (arg === undefined) { + // undefined is special + t.assert.strictEqual(b.length, 0) + t.assert.deepStrictEqual(b, Uint8Array.of()) + } else { + t.assert.strictEqual(b.length, length) + t.assert.deepStrictEqual(b, a) + } + + const c = new Uint8Array(20) + t.assert.deepStrictEqual(encoder.encodeInto(arg, c), { read: length, written: length }) + t.assert.deepStrictEqual(c.subarray(0, length), a) + } +}) + +// https://encoding.spec.whatwg.org/#x-user-defined-decoder +test('x-user-defined encoding', (t) => { + const decoder = new TextDecoder('x-user-defined') + for (let byte = 0; byte < 256; byte++) { + const codePoint = byte >= 128 ? 0xf7_80 + byte - 0x80 : byte + t.assert.strictEqual(decoder.decode(Uint8Array.of(byte)), String.fromCodePoint(codePoint)) + } +}) + +// iso-8859-1, iso-8859-9, iso-8859-11 differ in WHATWG Encoding spec from https://unicode.org/Public/MAPPINGS/ISO8859 +// and map to windows-1252, windows-1254, windows-874 instead +test('not all ISO-8859 encodings are present in TextDecoder', (t) => { + t.assert.strictEqual(new TextDecoder('iso-8859-1').encoding, 'windows-1252') + t.assert.strictEqual(new TextDecoder('iso-8859-2').encoding, 'iso-8859-2') // present + t.assert.strictEqual(new TextDecoder('iso-8859-9').encoding, 'windows-1254') + t.assert.strictEqual(new TextDecoder('iso-8859-11').encoding, 'windows-874') + t.assert.throws(() => new TextDecoder('iso-8859-12')) + t.assert.strictEqual(new TextDecoder('iso-8859-13').encoding, 'iso-8859-13') // present +}) + +describe('encodings are ASCII supersets, except utf-16 and iso-2022-jp', () => { + for (const label of labels) { + if (label === 'replacement' || label === 'utf-16le' || label === 'utf-16be') continue + test(label, (t) => { + const loose = new TextDecoder(label) + const fatal = new TextDecoder(label, { fatal: true }) + for (let i = 0; i < 128; i++) { + if (label === 'iso-2022-jp' && [0x0e, 0x0f, 0x1b].includes(i)) continue + t.assert.strictEqual(loose.decode(Uint8Array.of(i)), String.fromCodePoint(i)) + t.assert.strictEqual(fatal.decode(Uint8Array.of(i)), String.fromCodePoint(i)) + } + }) + } +}) + +describe('legacyHookDecode', () => { + const fixtures = { + replacement: [ + ['', ''], + ['00', '\uFFFD'], + ['ff', '\uFFFD'], + ['20', '\uFFFD'], + ['2020', '\uFFFD'], + // BOM takes preference + ['efbbbf', ''], + ['efbbbf2a', '*'], + ['efbbbf202a', ' *'], + ['fffe', ''], + ['fffe2a20', '\u202A'], + ['fffe2a', '\uFFFD'], + ['fffe00d72a', '\uD700\uFFFD'], + ['fffe00d82a', '\uFFFD'], + ['fffe00dc2a', '\uFFFD\uFFFD'], + ['feff', ''], + ['feff202a', '\u202A'], + ['feff20', '\uFFFD'], + ['feffd70020', '\uD700\uFFFD'], + ['feffd80020', '\uFFFD'], + ['feffdc0020', '\uFFFD\uFFFD'], + ], + // non-normalized names + Utf8: [['c280', '\x80']], + unicodefeff: [['c280', '\u80C2']], + UnicodeFFFE: [['c280', '\uC280']], + } + + test('null encoding', (t) => { + t.assert.throws(() => legacyHookDecode(Uint8Array.of(), null), RangeError) + }) + + for (const [encoding, data] of Object.entries(fixtures)) { + test(encoding, (t) => { + for (const [hex, string] of data) { + t.assert.strictEqual(legacyHookDecode(fromHex(hex), encoding), string, `${hex}`) + } + }) + } +}) + +test('getBOMEncoding', (t) => { + const fixtures = [ + [null, ''], + [null, 'ff'], + [null, 'fe'], + [null, 'ef'], + [null, 'efbb'], + [null, 'efbb00'], + [null, 'efbfbb'], + [null, 'ffbbbf'], + ['utf-8', 'efbbbf'], + ['utf-8', 'efbbbf00'], + ['utf-16le', 'fffe'], + ['utf-16le', 'fffefffe'], + ['utf-16le', 'fffefffefffe'], + ['utf-16le', 'fffebb'], + ['utf-16le', 'fffebf'], + ['utf-16be', 'feff'], + ['utf-16be', 'fefffeff'], + ['utf-16be', 'fefffefffeff'], + ] + + for (const [enc, hex] of fixtures) { + t.assert.strictEqual(getBOMEncoding(fromHex(hex)), enc, `${hex} -> ${enc}`) + } +}) diff --git a/tests/encoding/generic.test.js b/tests/encoding/generic.test.js index 9a2d1cb5..abe276be 100644 --- a/tests/encoding/generic.test.js +++ b/tests/encoding/generic.test.js @@ -53,7 +53,6 @@ test('String coercion', (t) => { t.assert.strictEqual(b.length, 0) t.assert.deepStrictEqual(b, Uint8Array.of()) } else { - const b = encoder.encode(arg) t.assert.strictEqual(b.length, length) t.assert.deepStrictEqual(b, a) } diff --git a/tests/vendor/whatwg-encoding/whatwg-encoding-mock.js b/tests/vendor/whatwg-encoding/whatwg-encoding-mock.js index 43194e8f..902cf4c6 100644 --- a/tests/vendor/whatwg-encoding/whatwg-encoding-mock.js +++ b/tests/vendor/whatwg-encoding/whatwg-encoding-mock.js @@ -1,4 +1,4 @@ -import * as api from '@exodus/bytes/encoding.js' +import * as api from '@exodus/bytes/encoding-browser.js' // prettier-ignore const supported = new Set([ diff --git a/whatwg.js b/whatwg.js index 3c9c0e11..7f4d05ce 100644 --- a/whatwg.js +++ b/whatwg.js @@ -1,7 +1,11 @@ import { utf8fromStringLoose } from '@exodus/bytes/utf8.js' import { createSinglebyteEncoder } from '@exodus/bytes/single-byte.js' -import { isMultibyte, getMultibyteEncoder } from './fallback/encoding.js' -import { normalizeEncoding, E_ENCODING } from './fallback/encoding.api.js' +import { + isMultibyte, + getMultibyteEncoder, + normalizeEncoding, + E_ENCODING, +} from './fallback/encoding.js' import { percentEncoder } from './fallback/percent.js' import { encodeMap } from './fallback/single-byte.js' import { E_STRING } from './fallback/_utils.js'