Skip to content

Commit 8005f55

Browse files
committed
feat: support other iso-8859 encodings in single-byte
This brings in actual iso-8859-1 support, also iso-8859-9 and iso-8859-11
1 parent 5c6fff4 commit 8005f55

6 files changed

Lines changed: 85 additions & 14 deletions

File tree

README.md

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -131,16 +131,42 @@ import { createSinglebyteDecoder, createSinglebyteEncoder } from '@exodus/bytes/
131131
import { windows1252toString, windows1252fromString } from '@exodus/bytes/single-byte.js'
132132
```
133133

134-
Decode the legacy single-byte encodings according to the [Encoding standard](https://encoding.spec.whatwg.org/)
135-
([§9](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) and
136-
[§14.5](https://encoding.spec.whatwg.org/#x-user-defined)).
134+
Decode / encode the legacy single-byte encodings according to the
135+
[Encoding standard](https://encoding.spec.whatwg.org/)
136+
([§9](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings),
137+
[§14.5](https://encoding.spec.whatwg.org/#x-user-defined)),
138+
and [unicode.org](https://unicode.org/Public/MAPPINGS/ISO8859) `iso-8859-*` mappings.
137139

138-
Supports all single-byte encodings listed in the standard:
140+
Supports all single-byte encodings listed in the WHATWG Encoding standard:
139141
`ibm866`, `iso-8859-2`, `iso-8859-3`, `iso-8859-4`, `iso-8859-5`, `iso-8859-6`, `iso-8859-7`, `iso-8859-8`,
140142
`iso-8859-8-i`, `iso-8859-10`, `iso-8859-13`, `iso-8859-14`, `iso-8859-15`, `iso-8859-16`, `koi8-r`, `koi8-u`,
141143
`macintosh`, `windows-874`, `windows-1250`, `windows-1251`, `windows-1252`, `windows-1253`, `windows-1254`,
142144
`windows-1255`, `windows-1256`, `windows-1257`, `windows-1258`, `x-mac-cyrillic` and `x-user-defined`.
143145

146+
Also supports `iso-8859-1`, `iso-8859-9`, `iso-8859-11` as defined at
147+
[unicode.org](https://unicode.org/Public/MAPPINGS/ISO8859)
148+
(as well as all other `iso-8859-*` encodings listed there, since those match their WHATWG definitions).
149+
150+
> [!NOTE]
151+
> While all `iso-8859-*` encodings supported by the [WHATWG Encoding standard](https://encoding.spec.whatwg.org/) match
152+
> [unicode.org](https://unicode.org/Public/MAPPINGS/ISO8859), the WHATWG Encoding spec doesn't support
153+
> `iso-8859-1`, `iso-8859-9`, `iso-8859-11`, and instead maps them as labels to `windows-1252`, `windows-1254`, `windows-874`.\
154+
> `createSinglebyteDecoder()` (unlike `TextDecoder` or `legacyHookDecode()`) does not do such mapping,
155+
> so its results will differ from `TextDecoder` for those encoding names.
156+
157+
```js
158+
> new TextDecoder('iso-8859-1').encoding
159+
'windows-1252'
160+
> new TextDecoder('iso-8859-9').encoding
161+
'windows-1254'
162+
> new TextDecoder('iso-8859-11').encoding
163+
'windows-874'
164+
> new TextDecoder('iso-8859-9').decode(Uint8Array.of(0x80, 0x81, 0xd0))
165+
'€\x81Ğ' // this is actually decoded according to windows-1254 per TextDecoder spec
166+
> createSinglebyteDecoder('iso-8859-9')(Uint8Array.of(0x80, 0x81, 0xd0))
167+
'\x80\x81Ğ' // this is iso-8859-9 as defined at https://unicode.org/Public/MAPPINGS/ISO8859/8859-9.txt
168+
```
169+
144170
##### `createSinglebyteDecoder(encoding, loose = false)`
145171

146172
Create a decoder for a supported single-byte encoding, given its lowercased name `encoding`.
@@ -156,12 +182,35 @@ Returns a function `encode(string)` that encodes a string to bytes.
156182
In `'fatal'` mode (default), will throw on non well-formed strings or any codepoints which could
157183
not be encoded in the target encoding.
158184

185+
##### `latin1toString(arr)`
186+
187+
Decode `iso-8859-1` bytes to a string.
188+
189+
There is no loose variant for this encoding, as all bytes can be decoded.
190+
191+
Same as:
192+
```js
193+
const latin1toString = createSinglebyteDecoder('iso-8859-1')
194+
```
195+
196+
Note: this is different from `new TextDecoder('iso-8859-1')` and `new TextDecoder('latin1')`, as
197+
those alias to `new TextDecoder('windows-1252')`.
198+
199+
##### `latin1fromString(string)`
200+
201+
Encode a string to `iso-8859-1` bytes.
202+
203+
Will throw on non well-formed strings or any codepoints which could not be encoded in `iso-8859-1`.
204+
205+
Same as:
206+
```js
207+
const latin1fromString = createSinglebyteEncoder('iso-8859-1', { mode: 'fatal' })
208+
```
209+
159210
##### `windows1252toString(arr)`
160211

161212
Decode `windows-1252` bytes to a string.
162213

163-
Also supports `ascii` and `latin-1` as those are strict subsets of `windows-1252`.
164-
165214
There is no loose variant for this encoding, as all bytes can be decoded.
166215

167216
Same as:
@@ -173,8 +222,6 @@ const windows1252toString = createSinglebyteDecoder('windows-1252')
173222

174223
Encode a string to `windows-1252` bytes.
175224

176-
Also supports `ascii` and `latin-1` as those are strict subsets of `windows-1252`.
177-
178225
Will throw on non well-formed strings or any codepoints which could not be encoded in `windows-1252`.
179226

180227
Same as:

benchmarks/latin1.bench.js

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { latin1fromString, latin1toString } from '@exodus/bytes/single-byte.js'
12
import { benchmark } from '@exodus/test/benchmark' // eslint-disable-line @exodus/import/no-unresolved
23
import buffer from 'buffer/index.js'
34
import { describe, test } from 'node:test'
@@ -10,9 +11,9 @@ if (!globalThis.Buffer) globalThis.Buffer = buffer.Buffer
1011
const bufferIsPolyfilled = Buffer === buffer.Buffer
1112
const toBuffer = (x, B) => B.from(x.buffer, x.byteOffset, x.byteLength)
1213

13-
const strings = bufs.map((x) => toBuffer(x, Buffer).toString('latin1'))
14+
const strings = bufs.map((x) => latin1toString(x))
1415
const asciiBufs = bufs.map((x) => x.map((c) => (c >= 0x80 ? c - 0x80 : c)))
15-
const asciiStrings = asciiBufs.map((x) => toBuffer(x, Buffer).toString())
16+
const asciiStrings = asciiBufs.map((x) => latin1toString(x))
1617

1718
const isNative = (x) => x && (!bufferIsPolyfilled || `${x}`.includes('[native code]')) // we consider Node.js TextDecoder/TextEncoder native
1819
const { TextEncoder, TextDecoder, btoa } = globalThis
@@ -27,6 +28,7 @@ describe('benchmarks: latin1', async () => {
2728

2829
// [name, impl, skip]
2930
const decodeLatin1 = [
31+
['@exodus/bytes', (x) => latin1toString(x)],
3032
['./fallback/latin1', (x) => latin1.decodeLatin1(x)],
3133
['Buffer', (x) => toBuffer(x, Buffer).toString('latin1')],
3234
// ['Buffer.from', (x) => Buffer.from(x).toString('latin1')],
@@ -37,6 +39,7 @@ describe('benchmarks: latin1', async () => {
3739

3840
// [name, impl, skip]
3941
const encodeLatin1 = [
42+
['@exodus/bytes', (x) => latin1fromString(x)],
4043
['./fallback/latin1', (x) => latin1.encodeLatin1(x)],
4144
['Buffer', (x) => Buffer.from(x, 'latin1')],
4245
['buffer/Buffer', (x) => buffer.Buffer.from(x, 'latin1'), bufferIsPolyfilled],
@@ -46,6 +49,7 @@ describe('benchmarks: latin1', async () => {
4649

4750
// [name, impl, skip]
4851
const decodeAscii = [
52+
['@exodus/bytes latin1', (x) => latin1toString(x)],
4953
['./fallback/latin1', (x) => latin1.decodeAscii(x)],
5054
['Buffer (ascii)', (x) => toBuffer(x, Buffer).toString('ascii')],
5155
['Buffer (latin1)', (x) => toBuffer(x, Buffer).toString('latin1')],
@@ -58,6 +62,7 @@ describe('benchmarks: latin1', async () => {
5862

5963
// [name, impl, skip]
6064
const encodeAscii = [
65+
['@exodus/bytes latin1', (x) => latin1fromString(x)],
6166
['./fallback/latin1', (x) => latin1.encodeAscii(x, 'ERR'), !textEncoder],
6267
['Buffer (ascii)', (x) => Buffer.from(x, 'ascii')],
6368
['Buffer (latin1)', (x) => Buffer.from(x, 'latin1')],

fallback/single-byte.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ export function encodingMapper(encoding) {
6565
export function encodingDecoder(encoding) {
6666
const cached = decoders.get(encoding)
6767
if (cached) return cached
68+
if (encoding === 'iso-8859-1') return (arr, loose = false) => decodeLatin1(arr)
6869

6970
let strings
7071
const codes = getEncoding(encoding)

single-byte.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { assertUint8 } from './assert.js'
22
import { canDecoders, nativeEncoder, isHermes, E_STRING } from './fallback/_utils.js'
3-
import { encodeAscii, encodeAsciiPrefix } from './fallback/latin1.js'
3+
import { encodeAscii, encodeAsciiPrefix, encodeLatin1 } from './fallback/latin1.js'
44
import { assertEncoding, encodingDecoder, encodeMap, E_STRICT } from './fallback/single-byte.js'
55

66
const { TextDecoder } = globalThis
@@ -9,8 +9,9 @@ let windows1252works
99

1010
// prettier-ignore
1111
const skipNative = new Set([
12-
'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
12+
'iso-8859-1', 'iso-8859-9', 'iso-8859-11', // non-WHATWG
1313
'iso-8859-6', 'iso-8859-8', 'iso-8859-8-i', // slow in all 3 engines
14+
'iso-8859-16', // iso-8859-16 is somehow broken in WebKit, at least on CI
1415
])
1516

1617
function shouldUseNative(enc) {
@@ -92,12 +93,17 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
9293
// TODO: replacement, truncate (replacement will need varying length)
9394
if (mode !== 'fatal') throw new Error('Unsupported mode')
9495
const m = encodeMap(encoding) // asserts
96+
const isLatin1 = encoding === 'iso-8859-1'
9597

9698
// No single-byte encoder produces surrogate pairs, so any surrogate is invalid
9799
// This needs special treatment only to decide how many replacement chars to output, one or two
98100
// Not much use in running isWellFormed, most likely cause of error is unmapped chars, not surrogate pairs
99101
return (s) => {
100102
if (typeof s !== 'string') throw new TypeError(E_STRING)
103+
if (isLatin1) {
104+
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
105+
return encodeLatin1(s)
106+
}
101107

102108
// Instead of an ASCII regex check, encode optimistically - this is faster
103109
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
@@ -113,5 +119,7 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
113119
}
114120
}
115121

122+
export const latin1toString = createSinglebyteDecoder('iso-8859-1')
123+
export const latin1fromString = createSinglebyteEncoder('iso-8859-1')
116124
export const windows1252toString = createSinglebyteDecoder('windows-1252')
117125
export const windows1252fromString = createSinglebyteEncoder('windows-1252')

single-byte.node.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ function latin1Prefix(arr, start) {
2323

2424
export function createSinglebyteDecoder(encoding, loose = false) {
2525
if (typeof loose !== 'boolean') throw new TypeError('loose option should be boolean')
26-
const latin1path = encoding === 'windows-1252'
2726
if (isDeno) {
2827
const jsDecoder = encodingDecoder(encoding) // asserts
2928
return (arr) => {
@@ -34,11 +33,12 @@ export function createSinglebyteDecoder(encoding, loose = false) {
3433
}
3534
}
3635

36+
const latin1path = encoding === 'windows-1252'
3737
const { incomplete, mapper } = encodingMapper(encoding) // asserts
3838
return (arr) => {
3939
assertUint8(arr)
4040
if (arr.byteLength === 0) return ''
41-
if (isAscii(arr)) return toBuf(arr).latin1Slice(0, arr.byteLength) // .latin1Slice is faster than .asciiSlice
41+
if (encoding === 'iso-8859-1' || isAscii(arr)) return toBuf(arr).latin1Slice() // .latin1Slice is faster than .asciiSlice
4242

4343
// Node.js TextDecoder is broken, so we can't use it. It's also slow anyway
4444

@@ -64,9 +64,15 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
6464
// TODO: replacement, truncate (replacement will need varying length)
6565
if (mode !== 'fatal') throw new Error('Unsupported mode')
6666
const m = encodeMap(encoding) // asserts
67+
const isLatin1 = encoding === 'iso-8859-1'
6768

6869
return (s) => {
6970
if (typeof s !== 'string') throw new TypeError(E_STRING)
71+
if (isLatin1) {
72+
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
73+
const b = Buffer.from(s, 'latin1')
74+
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
75+
}
7076

7177
// Instead of an ASCII regex check, encode optimistically - this is faster
7278
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
@@ -104,5 +110,7 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
104110
}
105111
}
106112

113+
export const latin1toString = createSinglebyteDecoder('iso-8859-1')
114+
export const latin1fromString = createSinglebyteEncoder('iso-8859-1')
107115
export const windows1252toString = createSinglebyteDecoder('windows-1252')
108116
export const windows1252fromString = createSinglebyteEncoder('windows-1252')

tests/single-byte.test.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { encodingDecoder } from '../fallback/single-byte.js'
66
import encodingsObject from '../fallback/single-byte.encodings.js'
77

88
const encodings = Object.keys(encodingsObject)
9+
const nonWhatwg = new Set(['iso-8859-1', 'iso-8859-9', 'iso-8859-11'])
910

1011
// See also tests/encoding/single-byte.tables.test.js for similar TextDecoder tests
1112

@@ -132,6 +133,7 @@ describe('single-byte encodings index: Unicode', () => {
132133

133134
describe('single-byte encodings index: WHATWG', () => {
134135
for (const encoding of encodings) {
136+
if (nonWhatwg.has(encoding)) continue
135137
test(encoding, (t) => {
136138
const decoder = createSinglebyteDecoder(encoding)
137139
const decoderLoose = createSinglebyteDecoder(encoding, true)

0 commit comments

Comments
 (0)