From 863a794ab3fdb2caf3193a559a66d88ac9319916 Mon Sep 17 00:00:00 2001 From: Dereck Tu Date: Fri, 16 Jan 2026 13:15:22 -0500 Subject: [PATCH] fix: handle multi-byte characters Ticket: DX-2800 This commit handles arbitrarily sized characters. SWC uses byte offsets, while JS uses character offsets. This results in an index drift. #1109 had an initial fix, but upon adding more tests, it seemed that the fix was incomplete. This commit should pass the additional tests. --- packages/openapi-generator/src/comments.ts | 45 +++--- .../test/openapi/comments.test.ts | 131 ++++++++++++++++++ 2 files changed, 158 insertions(+), 18 deletions(-) diff --git a/packages/openapi-generator/src/comments.ts b/packages/openapi-generator/src/comments.ts index 36879f2d..a751ccf4 100644 --- a/packages/openapi-generator/src/comments.ts +++ b/packages/openapi-generator/src/comments.ts @@ -2,11 +2,27 @@ import { parse as parseComment, Block } from 'comment-parser'; import { Schema } from './ir'; /** - * Compute the difference between byte length and character length for a string. - * This accounts for multibyte UTF-8 characters. + * Convert a UTF-8 byte offset to a JavaScript string character offset. + * SWC (written in Rust) uses byte offsets, but JavaScript strings use + * UTF-16 code unit offsets. This function handles the conversion by + * iterating through the string and accumulating byte lengths. 
+ * + * @param str The source string + * @param byteOffset The byte offset to convert + * @returns The corresponding character offset */ -function computeByteLengthDiff(str: string): number { - return Buffer.byteLength(str, 'utf8') - str.length; +function byteOffsetToCharOffset(str: string, byteOffset: number): number { + let charCount = 0; + let byteCount = 0; + + for (const char of str) { + const charBytes = Buffer.byteLength(char, 'utf8'); + if (byteCount + charBytes > byteOffset) break; + byteCount += charBytes; + charCount++; + } + + return charCount; } export function leadingComment( @@ -18,20 +34,13 @@ export function leadingComment( // SWC uses byte offsets, but JavaScript strings use character offsets. // When there are multibyte UTF-8 characters, we need to adjust. // Calculate the byte-to-char difference for the portion of source before our slice. - const prefixLength = Math.min(start - srcSpanStart, src.length); - const prefix = src.slice(0, prefixLength); - const byteDiff = computeByteLengthDiff(prefix); - - // Adjust the slice offsets by the byte difference - const adjustedStart = start - srcSpanStart - byteDiff; - const adjustedEnd = - end - - srcSpanStart - - computeByteLengthDiff(src.slice(0, Math.min(end - srcSpanStart, src.length))); - - let commentString = src - .slice(Math.max(0, adjustedStart), Math.max(0, adjustedEnd)) - .trim(); + const startByteOffset = start - srcSpanStart; + const endByteOffset = end - srcSpanStart; + + const startCharOffset = byteOffsetToCharOffset(src, startByteOffset); + const endCharOffset = byteOffsetToCharOffset(src, endByteOffset); + + let commentString = src.slice(startCharOffset, endCharOffset).trim(); if (commentString.includes(' * ') && !/\/\*\*([\s\S]*?)\*\//.test(commentString)) { // The comment block seems to be JSDoc but was sliced incorrectly diff --git a/packages/openapi-generator/test/openapi/comments.test.ts b/packages/openapi-generator/test/openapi/comments.test.ts index 4b461a5f..7b3ef6d5 100644 --- 
a/packages/openapi-generator/test/openapi/comments.test.ts +++ b/packages/openapi-generator/test/openapi/comments.test.ts @@ -2112,6 +2112,137 @@ testCase('route with multibyte chars', ROUTE_WITH_MULTIBYTE_CHARS, { }, }); +const ROUTE_WITH_CJK_CHARS = ` +import * as t from 'io-ts'; +import * as h from '@api-ts/io-ts-http'; + +export const Body = t.type({ + /** + * 日本語の名前フィールド (Japanese name field) + * @example 山田太郎 + */ + japaneseName: t.string, + /** + * 中文名字字段 (Chinese name field) + * @example 张三 + */ + chineseName: t.string, + /** + * 한국어 이름 필드 (Korean name field) + * @example 김철수 + */ + koreanName: t.string, +}); + +/** + * Route testing CJK characters (日本語, 中文, 한국어) + * + * @operationId api.v1.cjkChars + * @tag Test Routes + */ +export const route = h.httpRoute({ + path: '/cjk-chars', + method: 'POST', + request: h.httpRequest({ + body: Body, + }), + response: { + 200: { + result: t.string + } + }, +}); +`; + +testCase('route with CJK characters', ROUTE_WITH_CJK_CHARS, { + openapi: '3.0.3', + info: { + title: 'Test', + version: '1.0.0', + }, + paths: { + '/cjk-chars': { + post: { + summary: 'Route testing CJK characters (日本語, 中文, 한국어)', + operationId: 'api.v1.cjkChars', + tags: ['Test Routes'], + parameters: [], + requestBody: { + content: { + 'application/json': { + schema: { + properties: { + japaneseName: { + type: 'string', + description: '日本語の名前フィールド (Japanese name field)', + example: '山田太郎', + }, + chineseName: { + type: 'string', + description: '中文名字字段 (Chinese name field)', + example: '张三', + }, + koreanName: { + type: 'string', + description: '한국어 이름 필드 (Korean name field)', + example: '김철수', + }, + }, + required: ['japaneseName', 'chineseName', 'koreanName'], + type: 'object', + }, + }, + }, + }, + responses: { + 200: { + description: 'OK', + content: { + 'application/json': { + schema: { + type: 'object', + properties: { + result: { + type: 'string', + }, + }, + required: ['result'], + }, + }, + }, + }, + }, + }, + }, + }, + components: { + schemas: 
{ + Body: { + title: 'Body', + type: 'object', + properties: { + japaneseName: { + type: 'string', + description: '日本語の名前フィールド (Japanese name field)', + example: '山田太郎', + }, + chineseName: { + type: 'string', + description: '中文名字字段 (Chinese name field)', + example: '张三', + }, + koreanName: { + type: 'string', + description: '한국어 이름 필드 (Korean name field)', + example: '김철수', + }, + }, + required: ['japaneseName', 'chineseName', 'koreanName'], + }, + }, + }, +}); + const ROUTE_WITH_MARKDOWN_LIST = ` import * as t from 'io-ts'; import * as h from '@api-ts/io-ts-http';