From 863a794ab3fdb2caf3193a559a66d88ac9319916 Mon Sep 17 00:00:00 2001
From: Dereck Tu <derecktu@bitgo.com>
Date: Fri, 16 Jan 2026 13:15:22 -0500
Subject: [PATCH] fix: handle multi-byte characters

Ticket: DX-2800

This commit handles characters of arbitrary byte width. SWC reports UTF-8
byte offsets, while JS strings are indexed by UTF-16 code units, which
results in index drift. #1109 had an initial fix, but additional tests
showed that it was incomplete. This commit should pass the additional tests.
---
 packages/openapi-generator/src/comments.ts    |  45 +++---
 .../test/openapi/comments.test.ts             | 131 ++++++++++++++++++
 2 files changed, 158 insertions(+), 18 deletions(-)

diff --git a/packages/openapi-generator/src/comments.ts b/packages/openapi-generator/src/comments.ts
index 36879f2d..a751ccf4 100644
--- a/packages/openapi-generator/src/comments.ts
+++ b/packages/openapi-generator/src/comments.ts
@@ -2,11 +2,27 @@ import { parse as parseComment, Block } from 'comment-parser';
 import { Schema } from './ir';
 
 /**
- * Compute the difference between byte length and character length for a string.
- * This accounts for multibyte UTF-8 characters.
+ * Convert a UTF-8 byte offset to a JavaScript string index.
+ * SWC (written in Rust) uses byte offsets, but JavaScript strings use
+ * UTF-16 code unit offsets. The string is walked one code point at a time,
+ * adding its UTF-8 byte length and its UTF-16 length (2 for astral characters).
+ *
+ * @param str The source string
+ * @param byteOffset The UTF-8 byte offset to convert
+ * @returns The UTF-16 code unit offset, suitable for `String.prototype.slice`
  */
-function computeByteLengthDiff(str: string): number {
-  return Buffer.byteLength(str, 'utf8') - str.length;
+function byteOffsetToCharOffset(str: string, byteOffset: number): number {
+  let charCount = 0;
+  let byteCount = 0;
+
+  for (const char of str) {
+    const charBytes = Buffer.byteLength(char, 'utf8');
+    if (byteCount + charBytes > byteOffset) break; // stop before overshooting
+    byteCount += charBytes;
+    charCount += char.length; // 2 when the code point is a surrogate pair
+  }
+
+  return charCount;
 }
 
 export function leadingComment(
@@ -18,20 +34,13 @@ export function leadingComment(
   // SWC uses byte offsets, but JavaScript strings use character offsets.
   // When there are multibyte UTF-8 characters, we need to adjust.
   // Calculate the byte-to-char difference for the portion of source before our slice.
-  const prefixLength = Math.min(start - srcSpanStart, src.length);
-  const prefix = src.slice(0, prefixLength);
-  const byteDiff = computeByteLengthDiff(prefix);
-
-  // Adjust the slice offsets by the byte difference
-  const adjustedStart = start - srcSpanStart - byteDiff;
-  const adjustedEnd =
-    end -
-    srcSpanStart -
-    computeByteLengthDiff(src.slice(0, Math.min(end - srcSpanStart, src.length)));
-
-  let commentString = src
-    .slice(Math.max(0, adjustedStart), Math.max(0, adjustedEnd))
-    .trim();
+  const startByteOffset = start - srcSpanStart;
+  const endByteOffset = end - srcSpanStart;
+
+  const startCharOffset = byteOffsetToCharOffset(src, startByteOffset);
+  const endCharOffset = byteOffsetToCharOffset(src, endByteOffset);
+
+  let commentString = src.slice(startCharOffset, endCharOffset).trim();
 
   if (commentString.includes(' * ') && !/\/\*\*([\s\S]*?)\*\//.test(commentString)) {
     // The comment block seems to be JSDoc but was sliced incorrectly
diff --git a/packages/openapi-generator/test/openapi/comments.test.ts b/packages/openapi-generator/test/openapi/comments.test.ts
index 4b461a5f..7b3ef6d5 100644
--- a/packages/openapi-generator/test/openapi/comments.test.ts
+++ b/packages/openapi-generator/test/openapi/comments.test.ts
@@ -2112,6 +2112,137 @@ testCase('route with multibyte chars', ROUTE_WITH_MULTIBYTE_CHARS, {
   },
 });
 
+const ROUTE_WITH_CJK_CHARS = `
+import * as t from 'io-ts';
+import * as h from '@api-ts/io-ts-http';
+
+export const Body = t.type({
+  /**
+   * 日本語の名前フィールド (Japanese name field)
+   * @example 山田太郎
+   */
+  japaneseName: t.string,
+  /**
+   * 中文名字字段 (Chinese name field)
+   * @example 张三
+   */
+  chineseName: t.string,
+  /**
+   * 한국어 이름 필드 (Korean name field)
+   * @example 김철수
+   */
+  koreanName: t.string,
+});
+
+/**
+ * Route testing CJK characters (日本語, 中文, 한국어)
+ *
+ * @operationId api.v1.cjkChars
+ * @tag Test Routes
+ */
+export const route = h.httpRoute({
+  path: '/cjk-chars',
+  method: 'POST',
+  request: h.httpRequest({
+    body: Body,
+  }),
+  response: {
+    200: {
+      result: t.string
+    }
+  },
+});
+`; // fixture source: JSDoc comments containing Japanese, Chinese and Korean text
+
+testCase('route with CJK characters', ROUTE_WITH_CJK_CHARS, { // presumably the expected output; testCase is defined outside this hunk
+  openapi: '3.0.3',
+  info: {
+    title: 'Test',
+    version: '1.0.0',
+  },
+  paths: {
+    '/cjk-chars': {
+      post: {
+        summary: 'Route testing CJK characters (日本語, 中文, 한국어)', // first line of the route's JSDoc in the fixture
+        operationId: 'api.v1.cjkChars',
+        tags: ['Test Routes'],
+        parameters: [],
+        requestBody: {
+          content: {
+            'application/json': {
+              schema: {
+                properties: {
+                  japaneseName: {
+                    type: 'string',
+                    description: '日本語の名前フィールド (Japanese name field)', // field's JSDoc text in the fixture
+                    example: '山田太郎', // field's @example value in the fixture
+                  },
+                  chineseName: {
+                    type: 'string',
+                    description: '中文名字字段 (Chinese name field)',
+                    example: '张三',
+                  },
+                  koreanName: {
+                    type: 'string',
+                    description: '한국어 이름 필드 (Korean name field)',
+                    example: '김철수',
+                  },
+                },
+                required: ['japaneseName', 'chineseName', 'koreanName'],
+                type: 'object',
+              },
+            },
+          },
+        },
+        responses: {
+          200: {
+            description: 'OK',
+            content: {
+              'application/json': {
+                schema: {
+                  type: 'object',
+                  properties: {
+                    result: {
+                      type: 'string',
+                    },
+                  },
+                  required: ['result'],
+                },
+              },
+            },
+          },
+        },
+      },
+    },
+  },
+  components: { // repeats the request body schema above, with a title added
+    schemas: {
+      Body: {
+        title: 'Body',
+        type: 'object',
+        properties: {
+          japaneseName: {
+            type: 'string',
+            description: '日本語の名前フィールド (Japanese name field)',
+            example: '山田太郎',
+          },
+          chineseName: {
+            type: 'string',
+            description: '中文名字字段 (Chinese name field)',
+            example: '张三',
+          },
+          koreanName: {
+            type: 'string',
+            description: '한국어 이름 필드 (Korean name field)',
+            example: '김철수',
+          },
+        },
+        required: ['japaneseName', 'chineseName', 'koreanName'],
+      },
+    },
+  },
+}); // NOTE(review): all non-ASCII text here is 3-byte UTF-8 / 1 UTF-16 unit; 4-byte characters are not exercised
+
 const ROUTE_WITH_MARKDOWN_LIST = `
 import * as t from 'io-ts';
 import * as h from '@api-ts/io-ts-http';