From 9b7f2806955235c9eda18a7eaa40eee2f0b9286a Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Sat, 24 Jan 2026 00:15:52 +0400 Subject: [PATCH] feat: add percentEncodeAfterEncoding --- README.md | 35 +++++ fallback/percent.js | 31 ++++ package.json | 7 + tests/whatwg.test.js | 148 ++++++++++++++++++ .../url/resources/percent-encoding.json | 48 ++++++ tests/wpt/mulibyte-encoder.test.js | 45 +++--- whatwg.d.ts | 48 ++++++ whatwg.js | 76 +++++++++ 8 files changed, 417 insertions(+), 21 deletions(-) create mode 100644 fallback/percent.js create mode 100644 tests/whatwg.test.js create mode 100644 tests/wpt/fixtures/url/resources/percent-encoding.json create mode 100644 whatwg.d.ts create mode 100644 whatwg.js diff --git a/README.md b/README.md index 2197b49e..6ad48bc6 100644 --- a/README.md +++ b/README.md @@ -801,6 +801,41 @@ do not provide sufficiently complete / non-buggy `TextDecoder` APIs. > but they are fixing them and the expected update window is short.\ > If you want to circumvent browser bugs, use full `@exodus/bytes/encoding.js` import. +### `@exodus/bytes/whatwg.js` + +WHATWG helpers + +```js +import '@exodus/bytes/encoding.js' // For full legacy multi-byte encodings support +import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js' +``` + +#### `percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false)` + +Implements [percent-encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding) +per WHATWG URL specification. + +> [!IMPORTANT] +> You must import `@exodus/bytes/encoding.js` for this API to accept legacy multi-byte encodings. + +Encodings `utf-16le`, `utf-16be`, and `replacement` are not accepted. + +[C0 control percent-encode set](https://url.spec.whatwg.org/#c0-control-percent-encode-set) is +always percent-encoded. + +`percentEncodeSet` is an addition to that, and must be a string of unique increasing codepoints +in range 0x20 - 0x7e, e.g. `' "#<>'`. 
+ +This method accepts [DOMStrings](https://webidl.spec.whatwg.org/#idl-DOMString) and converts them +to [USVStrings](https://webidl.spec.whatwg.org/#idl-USVString). +This is different from e.g. `encodeURI` and `encodeURIComponent` which throw on surrogates: +```js +> percentEncodeAfterEncoding('utf8', '\ud800', ' "#$%&+,/:;<=>?@[\\]^`{|}') // component +'%EF%BF%BD' +> encodeURIComponent('\ud800') +Uncaught URIError: URI malformed +``` + ## Changelog See [GitHub Releases](https://github.com/ExodusOSS/bytes/releases) tab diff --git a/fallback/percent.js b/fallback/percent.js new file mode 100644 index 00000000..f12613bf --- /dev/null +++ b/fallback/percent.js @@ -0,0 +1,31 @@ +import { decodeAscii, encodeLatin1 } from './latin1.js' +import { decode2string } from './_utils.js' + +const ERR = 'percentEncodeSet must be a string of unique increasing codepoints in range 0x20 - 0x7e' +const percentMap = new Map() +let hex, base + +export function percentEncoder(set, spaceAsPlus = false) { + if (typeof set !== 'string' || /[^\x20-\x7E]/.test(set)) throw new TypeError(ERR) + if (typeof spaceAsPlus !== 'boolean') throw new TypeError('spaceAsPlus must be boolean') + const id = set + +spaceAsPlus + const cached = percentMap.get(id) + if (cached) return cached + + const n = encodeLatin1(set).sort() // string checked above to be ascii + if (decodeAscii(n) !== set || new Set(n).size !== n.length) throw new TypeError(ERR) + + if (!base) { + hex = Array.from({ length: 256 }, (_, i) => `%${i.toString(16).padStart(2, '0').toUpperCase()}`) + base = hex.map((h, i) => (i < 0x20 || i > 0x7e ? 
h : String.fromCharCode(i))) + } + + const map = base.slice() // copy + for (const c of n) map[c] = hex[c] + if (spaceAsPlus) map[0x20] = '+' // overrides whatever percentEncodeSet thinks about it + + // Input is not typechecked, for internal use only + const percentEncode = (u8, start = 0, end = u8.length) => decode2string(u8, start, end, map) + percentMap.set(id, percentEncode) + return percentEncode +} diff --git a/package.json b/package.json index 1acd8539..6d65f8f2 100644 --- a/package.json +++ b/package.json @@ -70,6 +70,7 @@ "/fallback/encoding.util.js", "/fallback/hex.js", "/fallback/latin1.js", + "/fallback/percent.js", "/fallback/multi-byte.encodings.cjs", "/fallback/multi-byte.encodings.json", "/fallback/multi-byte.js", @@ -119,6 +120,8 @@ "/utf8.js", "/utf8.d.ts", "/utf8.node.js", + "/whatwg.js", + "/whatwg.d.ts", "/wif.js", "/wif.d.ts" ], @@ -199,6 +202,10 @@ "node": "./utf8.node.js", "default": "./utf8.js" }, + "./whatwg.js": { + "types": "./whatwg.d.ts", + "default": "./whatwg.js" + }, "./wif.js": { "types": "./wif.d.ts", "default": "./wif.js" diff --git a/tests/whatwg.test.js b/tests/whatwg.test.js new file mode 100644 index 00000000..f3d2ebb2 --- /dev/null +++ b/tests/whatwg.test.js @@ -0,0 +1,148 @@ +import '@exodus/bytes/encoding.js' +import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js' +import { describe, test } from 'node:test' +import { labels } from './encoding/fixtures/encodings.cjs' + +const jsuri = ' "%<>[\\]^`{|}' // https://tc39.es/ecma262/#sec-encodeuri-uri +const jsuricomponent = ' "#$%&+,/:;<=>?@[\\]^`{|}' // https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent +const fragment = ' "<>`' // https://url.spec.whatwg.org/#fragment-percent-encode-set +const query = ' "#<>' // https://url.spec.whatwg.org/#query-percent-encode-set +const specialquery = ' "#\'<>' // https://url.spec.whatwg.org/#special-query-percent-encode-set +const path = ' "#<>?^`{}' // https://url.spec.whatwg.org/#path-percent-encode-set +const 
userinfo = ' "#/:;<=>?@[\\]^`{|}' // https://url.spec.whatwg.org/#userinfo-percent-encode-set +const component = ' "#$%&+,/:;<=>?@[\\]^`{|}' // https://url.spec.whatwg.org/#component-percent-encode-set +const form = ' !"#$%&\'()+,/:;<=>?@[\\]^`{|}~' // https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set + +const sets = ['', userinfo, jsuri, jsuricomponent] +const invalid = ['replacement', 'utf-16le', 'utf-16be'] // https://encoding.spec.whatwg.org/#get-an-encoder + +const slowEngine = + process.env.EXODUS_TEST_PLATFORM === 'quickjs' || + process.env.EXODUS_TEST_PLATFORM === 'xs' || + process.env.EXODUS_TEST_PLATFORM === 'engine262' + +test('perncent-encode sets coherence', (t) => { + const eq = (a, b) => t.assert.deepStrictEqual([...a], [...b].sort()) + // https://tc39.es/ecma262/#sec-encodeuri-uri step 2 + eq(jsuricomponent, jsuri + ';/?:@&=+$,#') + // https://url.spec.whatwg.org/#fragment-percent-encode-set + eq(fragment, String.fromCharCode(0x20, 0x22, 0x3c, 0x3e, 0x60)) + // https://url.spec.whatwg.org/#query-percent-encode-set + eq(query, String.fromCharCode(0x20, 0x22, 0x23, 0x3c, 0x3e)) + // https://url.spec.whatwg.org/#special-query-percent-encode-set + eq(specialquery, query + String.fromCharCode(0x27)) + // https://url.spec.whatwg.org/#path-percent-encode-set + eq(path, query + String.fromCharCode(0x3f, 0x5e, 0x60, 0x7b, 0x7d)) + // https://url.spec.whatwg.org/#userinfo-percent-encode-set + eq(userinfo, path + String.fromCharCode(0x2f, 0x3a, 0x3b, 0x3d, 0x40, 0x5b, 0x5c, 0x5d, 0x7c)) + // https://url.spec.whatwg.org/#component-percent-encode-set + eq(component, userinfo + String.fromCharCode(0x24, 0x25, 0x26, 0x2b, 0x2c)) + t.assert.strictEqual(jsuricomponent, component) + // https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set + eq(form, component + String.fromCharCode(0x21, 0x27, 0x28, 0x29, 0x7e)) +}) + +describe('percent-encode after encoding', () => { + const f = percentEncodeAfterEncoding + 
+ // https://url.spec.whatwg.org/#example-percent-encode-operations + test('examples from spec', (t) => { + // At https://github.com/whatwg/url/commit/5c50135f8304dc8cb9bb49367b364699cc5bb031 + t.assert.strictEqual(f('Shift_JIS', ' ', userinfo), '%20') + t.assert.strictEqual(f('Shift_JIS', '≡', userinfo), '%81%DF') + t.assert.strictEqual(f('Shift_JIS', '‽', userinfo), '%26%238253%3B') + t.assert.strictEqual(f('ISO-2022-JP', '¥', userinfo), '%1B(J%5C%1B(B') + t.assert.strictEqual( + f('Shift_JIS', '1+1 ≡ 2%20‽', userinfo, true), + '1+1+%81%DF+2%20%26%238253%3B' + ) + t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1') + t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD') + t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD') + + // At https://github.com/whatwg/url/pull/896 + t.assert.strictEqual(f('Shift_JIS', ' ', specialquery), '%20') + t.assert.strictEqual(f('Shift_JIS', '≡', specialquery), '%81%DF') + t.assert.strictEqual(f('Shift_JIS', '‽', specialquery), '%26%238253%3B') + t.assert.strictEqual(f('ISO-2022-JP', '¥', specialquery), '%1B(J\\%1B(B') + t.assert.strictEqual( + f('Shift_JIS', '1+1 ≡ 2%20‽', form, true), + '1%2B1+%81%DF+2%2520%26%238253%3B' + ) + t.assert.strictEqual(f('UTF-8', '≡', userinfo), '%E2%89%A1') + t.assert.strictEqual(f('UTF-8', '‽', userinfo), '%E2%80%BD') + t.assert.strictEqual(f('UTF-8', 'Say what‽', userinfo), 'Say%20what%E2%80%BD') + }) + + // https://encoding.spec.whatwg.org/#get-an-encoder + describe('throws on unknown, utf-16 and replacement', () => { + for (const encoding of [...invalid, 'what', 'UTF-16', 'unicode']) { + test(encoding, (t) => { + for (const set of sets) { + t.assert.throws(() => f(encoding, '', set), /encoding/) + t.assert.throws(() => f(encoding, ' ', set), /encoding/) + t.assert.throws(() => f(encoding, ' ', set, true), /encoding/) + t.assert.throws(() => f(encoding, '\uFFFD', set, true), /encoding/) + } + }) + } + }) + + describe('all valid encodings are recognized', 
() => { + for (const encoding of labels) { + if (invalid.includes(encoding)) continue + test(encoding, (t) => { + for (const set of sets) { + t.assert.strictEqual(f(encoding, '', set), '') + // Even non-ASCII encodings passthrough on a lone space + t.assert.strictEqual(f(encoding, ' ', set), set.includes(' ') ? '%20' : ' ') + t.assert.strictEqual(f(encoding, ' ', set, true), '+') + } + }) + } + }) + + describe('replaces non-scalarvalue', () => { + for (const encoding of labels) { + if (invalid.includes(encoding)) continue + test(encoding, (t) => { + const a = f(encoding, '\uFFFD', userinfo) + const b = f(encoding, '\uFFFD', jsuri) + for (let cp = 0xd8_00; cp < 0xe0_00; cp++) { + const s = String.fromCodePoint(cp) + t.assert.strictEqual(f(encoding, s, userinfo), a) + t.assert.strictEqual(f(encoding, s, jsuri), b) + } + }) + } + }) + + describe('encodeURI / encodeURIComponent', () => { + describe('ASCII supersets', (t) => { + const ascii = Array.from({ length: 128 }, (_, i) => String.fromCharCode(i)).join('') + for (const encoding of labels) { + if (invalid.includes(encoding)) continue + if (encoding === 'iso-2022-jp') continue // not an ASCII superset + test(encoding, (t) => { + t.assert.strictEqual(f(encoding, ascii, jsuricomponent), encodeURIComponent(ascii)) + t.assert.strictEqual(f(encoding, ascii, jsuri), encodeURI(ascii)) + for (let i = 0; i < 128; i++) { + const s = String.fromCharCode(i) + t.assert.strictEqual(f(encoding, s, jsuricomponent), encodeURIComponent(s)) + t.assert.strictEqual(f(encoding, s, jsuri), encodeURI(s)) + } + }) + } + }) + + test('UTF-8: full Unicode', (t) => { + const MAX = slowEngine ? 
0x1_ff_ff : 0x10_ff_ff // Max Unicode codepoint + for (let cp = 0; cp <= MAX; cp++) { + if (cp >= 0xd8_00 && cp < 0xe0_00) continue + const s = String.fromCodePoint(cp) + t.assert.strictEqual(f('utf8', s, jsuricomponent), encodeURIComponent(s)) + t.assert.strictEqual(f('utf8', s, jsuri), encodeURI(s)) + } + }) + }) +}) diff --git a/tests/wpt/fixtures/url/resources/percent-encoding.json b/tests/wpt/fixtures/url/resources/percent-encoding.json new file mode 100644 index 00000000..eccd1db6 --- /dev/null +++ b/tests/wpt/fixtures/url/resources/percent-encoding.json @@ -0,0 +1,48 @@ +[ + "Tests for percent-encoding.", + { + "input": "\u2020", + "output": { + "big5": "%26%238224%3B", + "euc-kr": "%A2%D3", + "utf-8": "%E2%80%A0", + "windows-1252": "%86" + } + }, + "This uses a trailing A to prevent the URL parser from trimming the C0 control.", + { + "input": "\u000EA", + "output": { + "big5": "%0EA", + "iso-2022-jp": "%26%2365533%3BA", + "utf-8": "%0EA" + } + }, + { + "input": "\u203E\u005C", + "output": { + "iso-2022-jp": "%1B(J~%1B(B\\", + "utf-8": "%E2%80%BE\\" + } + }, + { + "input": "\uE5E5", + "output": { + "gb18030": "%26%2358853%3B", + "utf-8": "%EE%97%A5" + } + }, + { + "input": "\u2212", + "output": { + "shift_jis": "%81|", + "utf-8": "%E2%88%92" + } + }, + { + "input": "á|", + "output": { + "utf-8": "%C3%A1|" + } + } +] diff --git a/tests/wpt/mulibyte-encoder.test.js b/tests/wpt/mulibyte-encoder.test.js index 37d83b29..35ee4ee4 100644 --- a/tests/wpt/mulibyte-encoder.test.js +++ b/tests/wpt/mulibyte-encoder.test.js @@ -1,28 +1,14 @@ import { createMultibyteEncoder } from '@exodus/bytes/multi-byte.js' -import { multibyteEncoder } from '../../fallback/multi-byte.js' +import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js' +import '@exodus/bytes/encoding.js' import { encodeLatin1 } from '../../fallback/latin1.js' import { describe, test } from 'node:test' +import { readFileSync } from 'node:fs' +import { join } from 'node:path' -const { unescape } = 
globalThis - -// query percent-encode set -const querySet = (x) => x < 0x21 || x > 0x7e || x === 0x22 || x === 0x23 || x === 0x3c || x === 0x3e -const esc1 = (x) => '%' + x.toString(16).padStart(2, '0').toUpperCase() -const escArr = (u) => [...u].map((x) => (querySet(x) ? esc1(x) : String.fromCharCode(x))).join('') +const specialquery = ` "#'<>` // https://url.spec.whatwg.org/#special-query-percent-encode-set -function toUrl(encoding, input) { - let encoded = '' - let last = 0 - const escaping = multibyteEncoder(encoding, (cp, u, i) => { - encoded += `${escArr(u.subarray(last, i))}%26%23${cp}%3B` // &#cp; - last = i - return 0 // no bytes emitted - }) - - const u = escaping(input) - encoded += escArr(u.subarray(last)) - return encoded -} +const { unescape } = globalThis function testEncoder(encoding, fn) { describe(encoding, () => { @@ -38,7 +24,7 @@ function testEncoder(encoding, fn) { } // Full check - t.assert.strictEqual(toUrl(encoding, input), escaped) + t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, specialquery), escaped) }) }) }) @@ -109,3 +95,20 @@ testEncoder('iso-2022-jp', (encode) => { encode('\uFF61\uFFFD', '%1B$B!%23%1B(B%26%2365533%3B', 'Katakana U+FFFD') encode('\u0393\uFFFD', '%1B$B&%23%1B(B%26%2365533%3B', 'jis0208 U+FFFD') }) + +test('url/resources/percent-encoding.json', (t) => { + const data = JSON.parse( + readFileSync(join(import.meta.dirname, `fixtures/url/resources/percent-encoding.json`), 'utf8') + ) + + // Doc: https://github.com/web-platform-tests/wpt/blob/master/url/README.md + // > _percentEncodeSet_ set to special-query percent-encode set and _spaceAsPlus_ set to false. 
+ const set = specialquery + const spaceAsPlus = false + for (const { input, output } of data) { + if (!input && !output) continue // comment + for (const [encoding, escaped] of Object.entries(output)) { + t.assert.strictEqual(percentEncodeAfterEncoding(encoding, input, set, spaceAsPlus), escaped) + } + } +}) diff --git a/whatwg.d.ts b/whatwg.d.ts new file mode 100644 index 00000000..b0bbed7b --- /dev/null +++ b/whatwg.d.ts @@ -0,0 +1,48 @@ +/** + * WHATWG helpers + * + * ```js + * import '@exodus/bytes/encoding.js' // For full legacy multi-byte encodings support + * import { percentEncodeAfterEncoding } from '@exodus/bytes/whatwg.js' + * ``` + * + * @module @exodus/bytes/whatwg.js + */ + +/** + * Implements [percent-encode after encoding](https://url.spec.whatwg.org/#string-percent-encode-after-encoding) + * per WHATWG URL specification. + * + * > [!IMPORTANT] + * > You must import `@exodus/bytes/encoding.js` for this API to accept legacy multi-byte encodings. + * + * Encodings `utf-16le`, `utf-16be`, and `replacement` are not accepted. + * + * [C0 control percent-encode set](https://url.spec.whatwg.org/#c0-control-percent-encode-set) is + * always percent-encoded. + * + * `percentEncodeSet` is an addition to that, and must be a string of unique increasing codepoints + * in range 0x20 - 0x7e, e.g. `' "#<>'`. + * + * This method accepts [DOMStrings](https://webidl.spec.whatwg.org/#idl-DOMString) and converts them + * to [USVStrings](https://webidl.spec.whatwg.org/#idl-USVString). + * This is different from e.g. 
`encodeURI` and `encodeURIComponent` which throw on surrogates: + * ```js + * > percentEncodeAfterEncoding('utf8', '\ud800', ' "#$%&+,/:;<=>?@[\\]^`{|}') // component + * '%EF%BF%BD' + * > encodeURIComponent('\ud800') + * Uncaught URIError: URI malformed + * ``` + * + * @param encoding - The encoding label per WHATWG Encoding spec + * @param input - Input scalar-value string to encode + * @param percentEncodeSet - A string of ASCII chars to escape in addition to C0 control percent-encode set + * @param spaceAsPlus - Whether to encode space as `'+'` instead of `'%20'` or `' '` (default: false) + * @returns The percent-encoded string + */ +export function percentEncodeAfterEncoding( + encoding: string, + input: string, + percentEncodeSet: string, + spaceAsPlus?: boolean +): string; diff --git a/whatwg.js b/whatwg.js new file mode 100644 index 00000000..3c9c0e11 --- /dev/null +++ b/whatwg.js @@ -0,0 +1,76 @@ +import { utf8fromStringLoose } from '@exodus/bytes/utf8.js' +import { createSinglebyteEncoder } from '@exodus/bytes/single-byte.js' +import { isMultibyte, getMultibyteEncoder } from './fallback/encoding.js' +import { normalizeEncoding, E_ENCODING } from './fallback/encoding.api.js' +import { percentEncoder } from './fallback/percent.js' +import { encodeMap } from './fallback/single-byte.js' +import { E_STRING } from './fallback/_utils.js' + +// https://url.spec.whatwg.org/#string-percent-encode-after-encoding +// Codepoints below 0x20, 0x7F specifically, and above 0x7F (non-ASCII) are always encoded +// > A C0 control is a code point in the range U+0000 NULL to U+001F INFORMATION SEPARATOR ONE, inclusive. +// > The C0 control percent-encode set are the C0 controls and all code points greater than U+007E (~). 
/**
 * Percent-encodes `input` after converting it to bytes in `encoding`.
 *
 * Strategy, cheapest first:
 *  1. UTF-8 goes straight through `utf8fromStringLoose`.
 *  2. Any other encoding is first tried with its strict encoder; if that throws for any
 *     reason, the slow path below takes over.
 *  3. Slow path: every code point the encoding cannot represent is emitted as the
 *     already-escaped numeric character reference `%26%23<decimal>%3B` (i.e. `&#<decimal>;`),
 *     and the bytes between such code points are percent-encoded as usual.
 *
 * @param {string} encoding - Encoding label, resolved through `normalizeEncoding`
 * @param {string} input - String to encode
 * @param {string} percentEncodeSet - Forwarded to `percentEncoder`, which validates it
 * @param {boolean} [spaceAsPlus=false] - Forwarded to `percentEncoder`
 * @returns {string} The percent-encoded string
 * @throws {RangeError} If the label is not recognized, or resolves to `replacement`,
 *   `utf-16le` or `utf-16be`
 * @throws {TypeError} If `input` is not a string on the single-byte slow path (the other
 *   paths rely on the validation of the encoders they call)
 */
export function percentEncodeAfterEncoding(encoding, input, percentEncodeSet, spaceAsPlus = false) {
  const name = normalizeEncoding(encoding)
  // Ref: https://encoding.spec.whatwg.org/#get-an-encoder
  const rejected = !name || name === 'replacement' || name === 'utf-16le' || name === 'utf-16be'
  if (rejected) throw new RangeError(E_ENCODING)

  const escape = percentEncoder(percentEncodeSet, spaceAsPlus)
  if (name === 'utf-8') return escape(utf8fromStringLoose(input))

  const multibyte = isMultibyte(name)
  const makeEncoder = multibyte ? getMultibyteEncoder() : createSinglebyteEncoder
  const strict = makeEncoder(name)
  try {
    return escape(strict(input))
  } catch {
    // Strict encoding failed: fall through to the slow path that escapes unencodable code points
  }

  // Pre-escaped numeric character reference: &#cp;
  const reference = (cp) => `%26%23${cp}%3B`

  let out = ''
  let flushed = 0 // offset in the byte buffer up to which `out` is already complete

  if (multibyte) {
    // Lone surrogates stand for U+FFFD; only gb18030 can encode it
    const lone = name === 'gb18030' ? escape(strict('\uFFFD')) : reference(0xff_fd)
    // NOTE(review): the handler appears to receive the unencodable code point, the output
    // buffer and the current write offset; confirm against the multi-byte encoder
    const lenient = makeEncoder(name, (cp, bytes, offset) => {
      const isSurrogate = cp >= 0xd8_00 && cp < 0xe0_00
      out += escape(bytes, flushed, offset)
      out += isSurrogate ? lone : reference(cp)
      flushed = offset
      return 0 // no bytes emitted
    })

    const bytes = lenient(input) // has side effects on out
    return out + escape(bytes, flushed)
  }

  if (typeof input !== 'string') throw new TypeError(E_STRING) // all other paths have their own validation
  const table = encodeMap(name)
  const total = input.length
  const bytes = new Uint8Array(total)
  let pos = 0
  while (pos < total) {
    const unit = input.charCodeAt(pos)
    const byte = table[unit]
    if (byte || unit === 0) {
      // Encodable code unit (a falsy table entry is only accepted for U+0000)
      bytes[pos] = byte
      pos++
      continue
    }

    const start = pos
    let cp = unit
    if (unit >= 0xd8_00 && unit < 0xe0_00) {
      cp = 0xff_fd // lone surrogate unless a valid pair is found below
      const mayLead = unit < 0xdc_00 && pos + 1 < total
      const next = mayLead ? input.charCodeAt(pos + 1) : -1
      if (next >= 0xdc_00 && next < 0xe0_00) {
        cp = 0x1_00_00 + ((next & 0x3_ff) | ((unit & 0x3_ff) << 10))
        pos++ // the trailing surrogate is consumed together with the leading one
      }
    }

    out += `${escape(bytes, flushed, start)}${reference(cp)}`
    flushed = pos + 1 // skip current
    pos++
  }

  return out + escape(bytes, flushed)
}