Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,41 @@ do not provide sufficiently complete / non-buggy `TextDecoder` APIs.
> but they are fixing them and the expected update window is short.\
> If you want to circumvent browser bugs, use full `@exodus/bytes/encoding.js` import.

### `@exodus/bytes/whatwg.js`

WHATWG helpers

```js
import '@exodus/bytes/encoding.js' // For full legacy multi-byte encodings support
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
```

#### `percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false)`

Implements [percent-encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding)
per WHATWG URL specification.

> [!IMPORTANT]
> You must import `@exodus/bytes/encoding.js` for this API to accept legacy multi-byte encodings.

Encodings `utf-16le`, `utf-16be`, and `replacement` are not accepted.

The [C0 control percent-encode set](https://url.spec.whatwg.org/#c0-control-percent-encode-set)
(bytes below 0x20 and above 0x7e) is always percent-encoded.

`percentEncodeSet` is an addition to that, and must be a string of unique increasing codepoints
in range 0x20 - 0x7e, e.g. `' "#<>'`.

This method accepts [DOMStrings](https://webidl.spec.whatwg.org/#idl-DOMString) and converts them
to [USVStrings](https://webidl.spec.whatwg.org/#idl-USVString).
This is different from e.g. `encodeURI` and `encodeURIComponent` which throw on surrogates:
```js
> percentEncodeAfterEncoding('utf8', '\ud800', ' "#$%&+,/:;<=>?@[\\]^`{|}') // component
'%EF%BF%BD'
> encodeURIComponent('\ud800')
Uncaught URIError: URI malformed
```

## Changelog

See [GitHub Releases](https://github.com/ExodusOSS/bytes/releases) tab
Expand Down
31 changes: 31 additions & 0 deletions fallback/percent.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { decodeAscii, encodeLatin1 } from './latin1.js'
import { decode2string } from './_utils.js'

const ERR = 'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e'

// Built encoders, keyed by the set string followed by the spaceAsPlus flag digit (0 or 1)
const percentMap = new Map()

// Lazily built 256-entry tables shared by every encoder:
// hex[i] is the '%XX' escape of byte i, base[i] is the default output for byte i
let hex
let base

/**
 * Returns a cached function that turns a byte range into a percent-encoded string.
 *
 * Bytes below 0x20 and above 0x7e are always escaped; `set` names the additional
 * printable ASCII bytes to escape.
 *
 * @param {string} set - unique, increasing codepoints in range 0x20 - 0x7e
 * @param {boolean} [spaceAsPlus=false] - map byte 0x20 to '+', whatever `set` says about it
 * @returns {Function} `(u8, start = 0, end = u8.length) => string`, delegating to `decode2string`
 * @throws {TypeError} when `set` or `spaceAsPlus` does not satisfy the above
 */
export function percentEncoder(set, spaceAsPlus = false) {
  const isPrintableAscii = typeof set === 'string' && !/[^\x20-\x7E]/.test(set)
  if (!isPrintableAscii) throw new TypeError(ERR)
  if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean')

  const cacheKey = set + +spaceAsPlus
  const existing = percentMap.get(cacheKey)
  if (existing) return existing

  // The string is known to be ASCII here; sorting the bytes and comparing back to the input
  // rejects any set that is not already in increasing order
  const codes = encodeLatin1(set).sort()
  const isSorted = decodeAscii(codes) === set
  if (!isSorted || new Set(codes).size !== codes.length) throw new TypeError(ERR)

  if (!base) {
    const escapes = []
    const defaults = []
    for (let i = 0; i < 256; i++) {
      const escaped = `%${i.toString(16).padStart(2, '0').toUpperCase()}`
      escapes.push(escaped)
      defaults.push(i < 0x20 || i > 0x7e ? escaped : String.fromCharCode(i))
    }

    hex = escapes
    base = defaults
  }

  // Per-encoder table: start from a copy of the defaults, then escape everything listed in the set
  const map = [...base]
  for (const code of codes) map[code] = hex[code]
  if (spaceAsPlus) map[0x20] = '+' // takes precedence over the set

  // Internal use only: the input is not typechecked
  const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, map)
  percentMap.set(cacheKey, percentEncode)
  return percentEncode
}
7 changes: 7 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
"/fallback/encoding.util.js",
"/fallback/hex.js",
"/fallback/latin1.js",
"/fallback/percent.js",
"/fallback/multi-byte.encodings.cjs",
"/fallback/multi-byte.encodings.json",
"/fallback/multi-byte.js",
Expand Down Expand Up @@ -119,6 +120,8 @@
"/utf8.js",
"/utf8.d.ts",
"/utf8.node.js",
"/whatwg.js",
"/whatwg.d.ts",
"/wif.js",
"/wif.d.ts"
],
Expand Down Expand Up @@ -199,6 +202,10 @@
"node": "./utf8.node.js",
"default": "./utf8.js"
},
"./whatwg.js": {
"types": "./whatwg.d.ts",
"default": "./whatwg.js"
},
"./wif.js": {
"types": "./wif.d.ts",
"default": "./wif.js"
Expand Down
148 changes: 148 additions & 0 deletions tests/whatwg.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import '@exodus/bytes/encoding.js'
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
import { describe, test } from 'node:test'
import { labels } from './encoding/fixtures/encodings.cjs'

// Each set is written as a string of increasing ASCII codepoints

// https://tc39.es/ecma262/#sec-encodeuri-uri
const jsuri = ' "%<>[\\]^`{|}'
// https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent
const jsuricomponent = ' "#$%&+,/:;<=>?@[\\]^`{|}'
// https://url.spec.whatwg.org/#fragment-percent-encode-set
const fragment = ' "<>`'
// https://url.spec.whatwg.org/#query-percent-encode-set
const query = ' "#<>'
// https://url.spec.whatwg.org/#special-query-percent-encode-set
const specialquery = ' "#\'<>'
// https://url.spec.whatwg.org/#path-percent-encode-set
const path = ' "#<>?^`{}'
// https://url.spec.whatwg.org/#userinfo-percent-encode-set
const userinfo = ' "#/:;<=>?@[\\]^`{|}'
// https://url.spec.whatwg.org/#component-percent-encode-set
const component = ' "#$%&+,/:;<=>?@[\\]^`{|}'
// https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
const form = ' !"#$%&\'()+,/:;<=>?@[\\]^`{|}~'

// Sets exercised by the per-encoding tests
const sets = ['', userinfo, jsuri, jsuricomponent]
// Encodings with no encoder, see https://encoding.spec.whatwg.org/#get-an-encoder
const invalid = ['replacement', 'utf-16le', 'utf-16be']

// Platforms on which the exhaustive loops are shortened
const slowEngine = ['quickjs', 'xs', 'engine262'].includes(process.env.EXODUS_TEST_PLATFORM)

// Cross-checks the percent-encode set constants of this file against how the specs define them:
// each set must equal the previous set plus the listed extra codepoints, in sorted order
test('percent-encode sets coherence', (t) => {
  // Passes when `a` consists of exactly the characters of `b`, sorted
  const eq = (a, b) => t.assert.deepStrictEqual([...a], [...b].sort())
  // https://tc39.es/ecma262/#sec-encodeuri-uri step 2
  eq(jsuricomponent, jsuri + ';/?:@&=+$,#')
  // https://url.spec.whatwg.org/#fragment-percent-encode-set
  eq(fragment, String.fromCharCode(0x20, 0x22, 0x3c, 0x3e, 0x60))
  // https://url.spec.whatwg.org/#query-percent-encode-set
  eq(query, String.fromCharCode(0x20, 0x22, 0x23, 0x3c, 0x3e))
  // https://url.spec.whatwg.org/#special-query-percent-encode-set
  eq(specialquery, query + String.fromCharCode(0x27))
  // https://url.spec.whatwg.org/#path-percent-encode-set
  eq(path, query + String.fromCharCode(0x3f, 0x5e, 0x60, 0x7b, 0x7d))
  // https://url.spec.whatwg.org/#userinfo-percent-encode-set
  eq(userinfo, path + String.fromCharCode(0x2f, 0x3a, 0x3b, 0x3d, 0x40, 0x5b, 0x5c, 0x5d, 0x7c))
  // https://url.spec.whatwg.org/#component-percent-encode-set
  eq(component, userinfo + String.fromCharCode(0x24, 0x25, 0x26, 0x2b, 0x2c))
  // The encodeURIComponent set and the WHATWG component set are expected to coincide
  t.assert.strictEqual(jsuricomponent, component)
  // https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
  eq(form, component + String.fromCharCode(0x21, 0x27, 0x28, 0x29, 0x7e))
})

// Behavioral tests for percentEncodeAfterEncoding: spec examples, rejected and accepted
// encodings, lone surrogates, and parity with encodeURI / encodeURIComponent
describe('percent-encode after encoding', () => {
  const f = percentEncodeAfterEncoding

  // https://url.spec.whatwg.org/#example-percent-encode-operations
  test('examples from spec', (t) => {
    // At https://github.com/whatwg/url/commit/5c50135f8304dc8cb9bb49367b364699cc5bb031
    t.assert.strictEqual(f('Shift_JIS', ' ', userinfo), '%20')
    t.assert.strictEqual(f('Shift_JIS', '≡', userinfo), '%81%DF')
    t.assert.strictEqual(f('Shift_JIS', '‽', userinfo), '%26%238253%3B')
    t.assert.strictEqual(f('ISO-2022-JP', '¥', userinfo), '%1B(J%5C%1B(B')
    t.assert.strictEqual(
      f('Shift_JIS', '1+1 ≡ 2%20‽', userinfo, true),
      '1+1+%81%DF+2%20%26%238253%3B'
    )
    t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1')
    t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD')
    t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD')

    // At https://github.com/whatwg/url/pull/896
    t.assert.strictEqual(f('Shift_JIS', ' ', specialquery), '%20')
    t.assert.strictEqual(f('Shift_JIS', '≡', specialquery), '%81%DF')
    t.assert.strictEqual(f('Shift_JIS', '‽', specialquery), '%26%238253%3B')
    t.assert.strictEqual(f('ISO-2022-JP', '¥', specialquery), '%1B(J\\%1B(B')
    t.assert.strictEqual(
      f('Shift_JIS', '1+1 ≡ 2%20‽', form, true),
      '1%2B1+%81%DF+2%2520%26%238253%3B'
    )
    t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1')
    t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD')
    t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD')
  })

  // https://encoding.spec.whatwg.org/#get-an-encoder
  describe('throws on unknown, utf-16 and replacement', () => {
    for (const encoding of [...invalid, 'what', 'UTF-16', 'unicode']) {
      test(encoding, (t) => {
        // Must throw for every set, including on empty input and with spaceAsPlus
        for (const set of sets) {
          t.assert.throws(() => f(encoding, '', set), /encoding/)
          t.assert.throws(() => f(encoding, ' ', set), /encoding/)
          t.assert.throws(() => f(encoding, ' ', set, true), /encoding/)
          t.assert.throws(() => f(encoding, '\uFFFD', set, true), /encoding/)
        }
      })
    }
  })

  describe('all valid encodings are recognized', () => {
    for (const encoding of labels) {
      if (invalid.includes(encoding)) continue
      test(encoding, (t) => {
        for (const set of sets) {
          t.assert.strictEqual(f(encoding, '', set), '')
          // Even non-ASCII encodings passthrough on a lone space
          t.assert.strictEqual(f(encoding, ' ', set), set.includes(' ') ? '%20' : ' ')
          t.assert.strictEqual(f(encoding, ' ', set, true), '+')
        }
      })
    }
  })

  describe('replaces non-scalarvalue', () => {
    for (const encoding of labels) {
      if (invalid.includes(encoding)) continue
      test(encoding, (t) => {
        // Every lone surrogate must produce the same output as U+FFFD does
        const a = f(encoding, '\uFFFD', userinfo)
        const b = f(encoding, '\uFFFD', jsuri)
        for (let cp = 0xd8_00; cp < 0xe0_00; cp++) {
          const s = String.fromCodePoint(cp)
          t.assert.strictEqual(f(encoding, s, userinfo), a)
          t.assert.strictEqual(f(encoding, s, jsuri), b)
        }
      })
    }
  })

  describe('encodeURI / encodeURIComponent', () => {
    describe('ASCII supersets', () => {
      const ascii = Array.from({ length: 128 }, (_, i) => String.fromCharCode(i)).join('')
      for (const encoding of labels) {
        if (invalid.includes(encoding)) continue
        if (encoding === 'iso-2022-jp') continue // not an ASCII superset
        test(encoding, (t) => {
          // Compared both as one 128-character string and one character at a time
          t.assert.strictEqual(f(encoding, ascii, jsuricomponent), encodeURIComponent(ascii))
          t.assert.strictEqual(f(encoding, ascii, jsuri), encodeURI(ascii))
          for (let i = 0; i < 128; i++) {
            const s = String.fromCharCode(i)
            t.assert.strictEqual(f(encoding, s, jsuricomponent), encodeURIComponent(s))
            t.assert.strictEqual(f(encoding, s, jsuri), encodeURI(s))
          }
        })
      }
    })

    test('UTF-8: full Unicode', (t) => {
      // Slow engines stop at 0x1ffff instead of covering the whole range
      const MAX = slowEngine ? 0x1_ff_ff : 0x10_ff_ff // Max Unicode codepoint
      for (let cp = 0; cp <= MAX; cp++) {
        // Surrogates are skipped: encodeURI / encodeURIComponent throw on them
        if (cp >= 0xd8_00 && cp < 0xe0_00) continue
        const s = String.fromCodePoint(cp)
        t.assert.strictEqual(f('utf8', s, jsuricomponent), encodeURIComponent(s))
        t.assert.strictEqual(f('utf8', s, jsuri), encodeURI(s))
      }
    })
  })
})
48 changes: 48 additions & 0 deletions tests/wpt/fixtures/url/resources/percent-encoding.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
[
"Tests for percent-encoding.",
{
"input": "\u2020",
"output": {
"big5": "%26%238224%3B",
"euc-kr": "%A2%D3",
"utf-8": "%E2%80%A0",
"windows-1252": "%86"
}
},
"This uses a trailing A to prevent the URL parser from trimming the C0 control.",
{
"input": "\u000EA",
"output": {
"big5": "%0EA",
"iso-2022-jp": "%26%2365533%3BA",
"utf-8": "%0EA"
}
},
{
"input": "\u203E\u005C",
"output": {
"iso-2022-jp": "%1B(J~%1B(B\\",
"utf-8": "%E2%80%BE\\"
}
},
{
"input": "\uE5E5",
"output": {
"gb18030": "%26%2358853%3B",
"utf-8": "%EE%97%A5"
}
},
{
"input": "\u2212",
"output": {
"shift_jis": "%81|",
"utf-8": "%E2%88%92"
}
},
{
"input": "á|",
"output": {
"utf-8": "%C3%A1|"
}
}
]
45 changes: 24 additions & 21 deletions tests/wpt/mulibyte-encoder.test.js
Original file line number Diff line number Diff line change
@@ -1,28 +1,14 @@
import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js'
import { multibyteEncoder } from '../../fallback/multi-byte.js'
import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js'
import '@exodus/bytes/encoding.js'
import { encodeLatin1 } from '../../fallback/latin1.js'
import { describe, test } from 'node:test'
import { readFileSync } from 'node:fs'
import { join } from 'node:path'

const { unescape } = globalThis

// Query percent-encode set as a byte predicate: everything outside 0x21..0x7e, plus " # < >
const querySet = (x) => {
  if (x < 0x21 || x > 0x7e) return true
  return x === 0x22 || x === 0x23 || x === 0x3c || x === 0x3e
}

// '%XX' escape of a single byte, uppercase hex
const esc1 = (x) => `%${x.toString(16).padStart(2, '0').toUpperCase()}`

// Percent-encodes an iterable of bytes using the query percent-encode set
const escArr = (u) => {
  let out = ''
  for (const x of u) out += querySet(x) ? esc1(x) : String.fromCharCode(x)
  return out
}
const specialquery = ` "#'<>` // https://url.spec.whatwg.org/#special-query-percent-encode-set

// Encodes `input` with the given multi-byte encoding and percent-encodes the resulting bytes.
// Each code point reported through the encoder callback is written as an escaped `&#cp;`.
// NOTE(review): presumably the callback fires for code points the encoding cannot represent,
// with `offset` being the current write position in `bytes` — confirm against multibyteEncoder
function toUrl(encoding, input) {
  const pieces = []
  let consumed = 0

  const onCodePoint = (cp, bytes, offset) => {
    // Flush the bytes written since the previous callback, then the escaped &#cp;
    pieces.push(escArr(bytes.subarray(consumed, offset)), `%26%23${cp}%3B`)
    consumed = offset
    return 0 // no bytes emitted
  }

  const encode = multibyteEncoder(encoding, onCodePoint)
  const bytes = encode(input)
  pieces.push(escArr(bytes.subarray(consumed)))
  return pieces.join('')
}
const { unescape } = globalThis

function testEncoder(encoding, fn) {
describe(encoding, () => {
Expand All @@ -38,7 +24,7 @@ function testEncoder(encoding, fn) {
}

// Full check
t.assert.strictEqual(toUrl(encoding, input), escaped)
t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, specialquery), escaped)
})
})
})
Expand Down Expand Up @@ -109,3 +95,20 @@ testEncoder('iso-2022-jp', (encode) => {
encode('\uFF61\uFFFD', '%1B$B!%23%1B(B%26%2365533%3B', 'Katakana U+FFFD')
encode('\u0393\uFFFD', '%1B$B&%23%1B(B%26%2365533%3B', 'jis0208 U+FFFD')
})

// Checks percentEncodeAfterEncoding against every input / encoding pair of the WPT fixture
test('url/resources/percent-encoding.json', (t) => {
  const fixturePath = join(import.meta.dirname, `fixtures/url/resources/percent-encoding.json`)
  const entries = JSON.parse(readFileSync(fixturePath, 'utf8'))

  // Doc: https://github.com/web-platform-tests/wpt/blob/master/url/README.md
  // > _percentEncodeSet_ set to special-query percent-encode set and _spaceAsPlus_ set to false.
  const set = specialquery
  const spaceAsPlus = false

  for (const entry of entries) {
    const { input, output } = entry
    if (!input && !output) continue // comment
    Object.entries(output).forEach(([encoding, escaped]) => {
      t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, set, spaceAsPlus), escaped)
    })
  }
})
Loading