feat: support other iso-8859 encodings in single-byte

ChALkeR · ChALkeR · commit 8005f554111b · 2026-01-01T23:46:07.000+08:00
This brings in actual iso-8859-1 support, as well as iso-8859-9 and iso-8859-11.
diff --git a/README.md b/README.md
@@ -131,16 +131,42 @@ import { createSinglebyteDecoder, createSinglebyteEncoder } from '@exodus/bytes/
 import { windows1252toString, windows1252fromString } from '@exodus/bytes/single-byte.js'
 ```
 
-Decode the legacy single-byte encodings according to the [Encoding standard](https://encoding.spec.whatwg.org/)
-([§9](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) and
-[§14.5](https://encoding.spec.whatwg.org/#x-user-defined)).
+Decode / encode the legacy single-byte encodings according to the
+[Encoding standard](https://encoding.spec.whatwg.org/)
+([§9](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings),
+[§14.5](https://encoding.spec.whatwg.org/#x-user-defined)),
+and [unicode.org](https://unicode.org/Public/MAPPINGS/ISO8859) `iso-8859-*` mappings.
 
-Supports all single-byte encodings listed in the standard:
+Supports all single-byte encodings listed in the WHATWG Encoding standard:
 `ibm866`, `iso-8859-2`, `iso-8859-3`, `iso-8859-4`, `iso-8859-5`, `iso-8859-6`, `iso-8859-7`, `iso-8859-8`,
 `iso-8859-8-i`, `iso-8859-10`, `iso-8859-13`, `iso-8859-14`, `iso-8859-15`, `iso-8859-16`, `koi8-r`, `koi8-u`,
 `macintosh`, `windows-874`, `windows-1250`, `windows-1251`, `windows-1252`, `windows-1253`, `windows-1254`,
 `windows-1255`, `windows-1256`, `windows-1257`, `windows-1258`, `x-mac-cyrillic` and `x-user-defined`.
 
+Also supports `iso-8859-1`, `iso-8859-9` and `iso-8859-11` as defined at
+[unicode.org](https://unicode.org/Public/MAPPINGS/ISO8859)
+(all other `iso-8859-*` encodings listed there are supported too, as their mappings match WHATWG).
+
+> [!NOTE]
+> While all `iso-8859-*` encodings supported by the [WHATWG Encoding standard](https://encoding.spec.whatwg.org/) match
+> [unicode.org](https://unicode.org/Public/MAPPINGS/ISO8859), the WHATWG Encoding spec doesn't support
+> `iso-8859-1`, `iso-8859-9`, `iso-8859-11`, and instead maps them as labels to `windows-1252`, `windows-1254`, `windows-874`.\
+> `createSinglebyteDecoder()` (unlike `TextDecoder` or `legacyHookDecode()`) does not perform this label mapping,
+> so its results will differ from `TextDecoder` for those encoding names.
+
+```js
+> new TextDecoder('iso-8859-1').encoding
+'windows-1252'
+> new TextDecoder('iso-8859-9').encoding
+'windows-1254'
+> new TextDecoder('iso-8859-11').encoding
+'windows-874'
+> new TextDecoder('iso-8859-9').decode(Uint8Array.of(0x80, 0x81, 0xd0))
+'€\x81Ğ' // this is actually decoded according to windows-1254 per TextDecoder spec
+> createSinglebyteDecoder('iso-8859-9')(Uint8Array.of(0x80, 0x81, 0xd0))
+'\x80\x81Ğ' // this is iso-8859-9 as defined at https://unicode.org/Public/MAPPINGS/ISO8859/8859-9.txt
+```
+
 ##### `createSinglebyteDecoder(encoding, loose = false)`
 
 Create a decoder for a supported one-byte `encoding`, given its lowercased name `encoding`.
@@ -156,12 +182,35 @@ Returns a function `encode(string)` that encodes a string to bytes.
 In `'fatal'` mode (default), will throw on non well-formed strings or any codepoints which could
 not be encoded in the target encoding.
 
+##### `latin1toString(arr)`
+
+Decode `iso-8859-1` bytes to a string.
+
+There is no loose variant for this encoding, all bytes can be decoded.
+
+Same as:
+```js
+const latin1toString = createSinglebyteDecoder('iso-8859-1')
+```
+
+Note: this is different from `new TextDecoder('iso-8859-1')` and `new TextDecoder('latin1')`, as
+in `TextDecoder` both of those labels are aliases for `windows-1252`.
+
+##### `latin1fromString(string)`
+
+Encode a string to `iso-8859-1` bytes.
+
+Will throw on non well-formed strings or any codepoints which could not be encoded in `iso-8859-1`.
+
+Same as:
+```js
+const latin1fromString = createSinglebyteEncoder('iso-8859-1', { mode: 'fatal' })
+```
+
 ##### `windows1252toString(arr)`
 
 Decode `windows-1252` bytes to a string.
 
-Also supports `ascii` and `latin-1` as those are strict subsets of `windows-1252`.
-
 There is no loose variant for this encoding, all bytes can be decoded.
 
 Same as:
@@ -173,8 +222,6 @@ const windows1252toString = createSinglebyteDecoder('windows-1252')
 
 Encode a string to `windows-1252` bytes.
 
-Also supports `ascii` and `latin-1` as those are strict subsets of `windows-1252`.
-
 Will throw on non well-formed strings or any codepoints which could not be encoded in `windows-1252`.
 
 Same as:
diff --git a/benchmarks/latin1.bench.js b/benchmarks/latin1.bench.js
@@ -1,3 +1,4 @@
+import { latin1fromString, latin1toString } from '@exodus/bytes/single-byte.js'
 import { benchmark } from '@exodus/test/benchmark' // eslint-disable-line @exodus/import/no-unresolved
 import buffer from 'buffer/index.js'
 import { describe, test } from 'node:test'
@@ -10,9 +11,9 @@ if (!globalThis.Buffer) globalThis.Buffer = buffer.Buffer
 const bufferIsPolyfilled = Buffer === buffer.Buffer
 const toBuffer = (x, B) => B.from(x.buffer, x.byteOffset, x.byteLength)
 
-const strings = bufs.map((x) => toBuffer(x, Buffer).toString('latin1'))
+const strings = bufs.map((x) => latin1toString(x))
 const asciiBufs = bufs.map((x) => x.map((c) => (c >= 0x80 ? c - 0x80 : c)))
-const asciiStrings = asciiBufs.map((x) => toBuffer(x, Buffer).toString())
+const asciiStrings = asciiBufs.map((x) => latin1toString(x))
 
 const isNative = (x) => x && (!bufferIsPolyfilled || `${x}`.includes('[native code]')) // we consider Node.js TextDecoder/TextEncoder native
 const { TextEncoder, TextDecoder, btoa } = globalThis
@@ -27,6 +28,7 @@ describe('benchmarks: latin1', async () => {
 
   // [name, impl, skip]
   const decodeLatin1 = [
+    ['@exodus/bytes', (x) => latin1toString(x)],
     ['./fallback/latin1', (x) => latin1.decodeLatin1(x)],
     ['Buffer', (x) => toBuffer(x, Buffer).toString('latin1')],
     // ['Buffer.from', (x) => Buffer.from(x).toString('latin1')],
@@ -37,6 +39,7 @@ describe('benchmarks: latin1', async () => {
 
   // [name, impl, skip]
   const encodeLatin1 = [
+    ['@exodus/bytes', (x) => latin1fromString(x)],
     ['./fallback/latin1', (x) => latin1.encodeLatin1(x)],
     ['Buffer', (x) => Buffer.from(x, 'latin1')],
     ['buffer/Buffer', (x) => buffer.Buffer.from(x, 'latin1'), bufferIsPolyfilled],
@@ -46,6 +49,7 @@ describe('benchmarks: latin1', async () => {
 
   // [name, impl, skip]
   const decodeAscii = [
+    ['@exodus/bytes latin1', (x) => latin1toString(x)],
     ['./fallback/latin1', (x) => latin1.decodeAscii(x)],
     ['Buffer (ascii)', (x) => toBuffer(x, Buffer).toString('ascii')],
     ['Buffer (latin1)', (x) => toBuffer(x, Buffer).toString('latin1')],
@@ -58,6 +62,7 @@ describe('benchmarks: latin1', async () => {
 
   // [name, impl, skip]
   const encodeAscii = [
+    ['@exodus/bytes latin1', (x) => latin1fromString(x)],
     ['./fallback/latin1', (x) => latin1.encodeAscii(x, 'ERR'), !textEncoder],
     ['Buffer (ascii)', (x) => Buffer.from(x, 'ascii')],
     ['Buffer (latin1)', (x) => Buffer.from(x, 'latin1')],
diff --git a/fallback/single-byte.js b/fallback/single-byte.js
@@ -65,6 +65,7 @@ export function encodingMapper(encoding) {
 export function encodingDecoder(encoding) {
   const cached = decoders.get(encoding)
   if (cached) return cached
+  if (encoding === 'iso-8859-1') return (arr, loose = false) => decodeLatin1(arr)
 
   let strings
   const codes = getEncoding(encoding)
diff --git a/single-byte.js b/single-byte.js
@@ -1,6 +1,6 @@
 import { assertUint8 } from './assert.js'
 import { canDecoders, nativeEncoder, isHermes, E_STRING } from './fallback/_utils.js'
-import { encodeAscii, encodeAsciiPrefix } from './fallback/latin1.js'
+import { encodeAscii, encodeAsciiPrefix, encodeLatin1 } from './fallback/latin1.js'
 import { assertEncoding, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
 
 const { TextDecoder } = globalThis
@@ -9,8 +9,9 @@ let windows1252works
 
 // prettier-ignore
 const skipNative = new Set([
-  'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
+  'iso-8859-1', 'iso-8859-9', 'iso-8859-11', // non-WHATWG
   'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
+  'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
 ])
 
 function shouldUseNative(enc) {
@@ -92,12 +93,17 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
   // TODO: replacement, truncate (replacement will need varying length)
   if (mode !== 'fatal') throw new Error('Unsupported mode')
   const m = encodeMap(encoding) // asserts
+  const isLatin1 = encoding === 'iso-8859-1'
 
   // No single-byte encoder produces surrogate pairs, so any surrogate is invalid
   // This needs special treatment only to decide how many replacement chars to output, one or two
   // Not much use in running isWellFormed, most likely cause of error is unmapped chars, not surrogate pairs
   return (s) => {
     if (typeof s !== 'string') throw new TypeError(E_STRING)
+    if (isLatin1) {
+      if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
+      return encodeLatin1(s)
+    }
 
     // Instead of an ASCII regex check, encode optimistically - this is faster
     // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
@@ -113,5 +119,7 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
   }
 }
 
+export const latin1toString = createSinglebyteDecoder('iso-8859-1')
+export const latin1fromString = createSinglebyteEncoder('iso-8859-1')
 export const windows1252toString = createSinglebyteDecoder('windows-1252')
 export const windows1252fromString = createSinglebyteEncoder('windows-1252')
diff --git a/single-byte.node.js b/single-byte.node.js
@@ -23,7 +23,6 @@ function latin1Prefix(arr, start) {
 
 export function createSinglebyteDecoder(encoding, loose = false) {
   if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
-  const latin1path = encoding === 'windows-1252'
   if (isDeno) {
     const jsDecoder = encodingDecoder(encoding) // asserts
     return (arr) => {
@@ -34,11 +33,12 @@ export function createSinglebyteDecoder(encoding, loose = false) {
     }
   }
 
+  const latin1path = encoding === 'windows-1252'
   const { incomplete, mapper } = encodingMapper(encoding) // asserts
   return (arr) => {
     assertUint8(arr)
     if (arr.byteLength === 0) return ''
-    if (isAscii(arr)) return toBuf(arr).latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
+    if (encoding === 'iso-8859-1' || isAscii(arr)) return toBuf(arr).latin1Slice() // .latin1Slice is faster than .asciiSlice
 
     // Node.js TextDecoder is broken, so we can't use it. It's also slow anyway
 
@@ -64,9 +64,15 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
   // TODO: replacement, truncate (replacement will need varying length)
   if (mode !== 'fatal') throw new Error('Unsupported mode')
   const m = encodeMap(encoding) // asserts
+  const isLatin1 = encoding === 'iso-8859-1'
 
   return (s) => {
     if (typeof s !== 'string') throw new TypeError(E_STRING)
+    if (isLatin1) {
+      if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
+      const b = Buffer.from(s, 'latin1')
+      return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
+    }
 
     // Instead of an ASCII regex check, encode optimistically - this is faster
     // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
@@ -104,5 +110,7 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
   }
 }
 
+export const latin1toString = createSinglebyteDecoder('iso-8859-1')
+export const latin1fromString = createSinglebyteEncoder('iso-8859-1')
 export const windows1252toString = createSinglebyteDecoder('windows-1252')
 export const windows1252fromString = createSinglebyteEncoder('windows-1252')
diff --git a/tests/single-byte.test.js b/tests/single-byte.test.js
@@ -6,6 +6,7 @@ import { encodingDecoder } from '../fallback/single-byte.js'
 import encodingsObject from '../fallback/single-byte.encodings.js'
 
 const encodings = Object.keys(encodingsObject)
+const nonWhatwg = new Set(['iso-8859-1', 'iso-8859-9', 'iso-8859-11'])
 
 // See also tests/encoding/single-byte.tables.test.js for similar TextDecoder tests
 
@@ -132,6 +133,7 @@ describe('single-byte encodings index: Unicode', () => {
 
 describe('single-byte encodings index: WHATWG', () => {
   for (const encoding of encodings) {
+    if (nonWhatwg.has(encoding)) continue
     test(encoding, (t) => {
       const decoder = createSinglebyteDecoder(encoding)
       const decoderLoose = createSinglebyteDecoder(encoding, true)