
Commit 69a3988

Import feature-extraction inference type from TEI (#781)
This PR adds a script to import `feature-extraction` inference types from [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference) (TEI). The OpenAPI spec is pulled from https://huggingface.github.io/text-embeddings-inference/openapi.json and converted into the JSON Schema format from which we generate types for the JS and Python clients. The script is largely inspired by the TGI importer script. This PR also adds the `prompt_name` input parameter that was recently added to TEI (see huggingface/text-embeddings-inference#312).

Decisions taken:

1. Keep `string` as the input type. In theory TEI can handle much more complex inputs (`Union[List[Union[List[int], int, str]], str]`), but let's keep it simple for now. Other inference tasks are also currently defined without arrays, even though InferenceAPI/Endpoints can handle them.
2. Only the input/output types of the `/embed` route are imported, as it is the closest match to the `feature-extraction` task.

**Note:** in a follow-up PR it would be nice to run this in a CI workflow that could be triggered manually to open a PR whenever new arguments are added to TGI / TEI.
1 parent 2b7457c commit 69a3988
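For context, here is a minimal sketch (not part of this commit) of what a request to TEI's `/embed` route looks like with the imported input shape, including the new `prompt_name` parameter. The base URL is a placeholder for wherever TEI is deployed.

```ts
// Sketch only: call a TEI deployment's /embed route with the
// feature-extraction input shape imported by this PR.
const TEI_URL = "http://localhost:8080"; // placeholder endpoint

async function embed(text: string): Promise<number[][]> {
	const response = await fetch(`${TEI_URL}/embed`, {
		method: "POST",
		headers: { "Content-Type": "application/json" },
		body: JSON.stringify({
			inputs: text, // kept as a plain string (decision 1 above)
			normalize: true,
			prompt_name: "query", // must be a key in the Sentence Transformers `prompts` config
		}),
	});
	// TEI returns one embedding (an array of floats) per input text.
	return (await response.json()) as number[][];
}
```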

File tree

6 files changed: +188 -22 lines

- packages/tasks/package.json
- packages/tasks/scripts/inference-tei-import.ts
- packages/tasks/scripts/inference-tgi-import.ts
- packages/tasks/src/tasks/feature-extraction/inference.ts
- packages/tasks/src/tasks/feature-extraction/spec/input.json
- packages/tasks/src/tasks/feature-extraction/spec/output.json


packages/tasks/package.json

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@
   "prepare": "pnpm run build",
   "check": "tsc",
   "inference-codegen": "tsx scripts/inference-codegen.ts && prettier --write src/tasks/*/inference.ts",
-  "inference-tgi-import": "tsx scripts/inference-tgi-import.ts && prettier --write src/tasks/text-generation/spec/*.json && prettier --write src/tasks/chat-completion/spec/*.json"
+  "inference-tgi-import": "tsx scripts/inference-tgi-import.ts && prettier --write src/tasks/text-generation/spec/*.json && prettier --write src/tasks/chat-completion/spec/*.json",
+  "inference-tei-import": "tsx scripts/inference-tei-import.ts && prettier --write src/tasks/feature-extraction/spec/*.json"
 },
 "type": "module",
 "files": [

packages/tasks/scripts/inference-tei-import.ts

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
/*
 * Fetches TEI specs and generates JSON schema for input and output of
 * text-embeddings (called feature-extraction).
 * See https://huggingface.github.io/text-embeddings-inference/
 */
import fs from "fs/promises";
import * as path from "node:path/posix";
import { existsSync as pathExists } from "node:fs";
import type { JsonObject, JsonValue } from "type-fest";

const URL = "https://huggingface.github.io/text-embeddings-inference/openapi.json";

const rootDirFinder = function (): string {
	let currentPath = path.normalize(import.meta.url);

	while (currentPath !== "/") {
		if (pathExists(path.join(currentPath, "package.json"))) {
			return currentPath;
		}

		currentPath = path.normalize(path.join(currentPath, ".."));
	}

	return "/";
};

const rootDir = rootDirFinder();
const tasksDir = path.join(rootDir, "src", "tasks");

function toCamelCase(str: string, joiner = "") {
	return str
		.split(/[-_]/)
		.map((part) => part.charAt(0).toUpperCase() + part.slice(1))
		.join(joiner);
}

async function _extractAndAdapt(task: string, mainComponentName: string, type: "input" | "output" | "stream_output") {
	console.debug(`✨ Importing`, task, type);

	console.debug("   📥 Fetching TEI specs");
	const response = await fetch(URL);
	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	const openapi = (await response.json()) as any;
	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	const components: Record<string, any> = openapi["components"]["schemas"];

	// e.g. TextGeneration
	const camelName = toCamelCase(task);
	// e.g. TextGenerationInput
	const camelFullName = camelName + toCamelCase(type);
	const mainComponent = components[mainComponentName];
	const filteredComponents: Record<string, JsonObject> = {};

	function _scan(data: JsonValue) {
		if (Array.isArray(data) || data instanceof Array) {
			for (const item of data) {
				_scan(item);
			}
		} else if (data && typeof data === "object") {
			for (const key of Object.keys(data)) {
				if (key === "$ref" && data[key] === "#/components/schemas/Input") {
					// Special case: keep input as string or string[]
					// but not Union[List[Union[List[int], int, str]], str]
					// data.delete(key);
					delete data[key];
					data["type"] = "string";
					data["description"] = "The text to embed.";
				} else if (key === "$ref" && typeof data[key] === "string") {
					// Verify reference exists
					const ref = (data[key] as string).split("/").pop() ?? "";
					if (!components[ref]) {
						throw new Error(`Reference not found in components: ${data[key]}`);
					}

					// Add reference to components to export (and scan it too)
					const newRef = camelFullName + ref.replace(camelName, "");
					if (!filteredComponents[newRef]) {
						components[ref]["title"] = newRef; // Rename title to avoid conflicts
						filteredComponents[newRef] = components[ref];
						_scan(components[ref]);
					}

					// Updating the reference to new format
					data[key] = `#/$defs/${newRef}`;
				} else {
					_scan(data[key]);
				}
			}
		}
	}

	console.debug("   📦 Packaging jsonschema");
	_scan(mainComponent);

	const prettyName = toCamelCase(task, " ") + " " + toCamelCase(type, " ");
	const inputSchema = {
		$id: `/inference/schemas/${task}/${type}.json`,
		$schema: "http://json-schema.org/draft-06/schema#",
		description:
			prettyName +
			".\n\nAuto-generated from TEI specs." +
			"\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
		title: camelFullName,
		type: mainComponent["type"],
		required: mainComponent["required"],
		properties: mainComponent["properties"],
		$defs: filteredComponents,
		items: mainComponent["items"],
	};

	const specPath = path.join(tasksDir, task, "spec", `${type}.json`);
	console.debug("   📂 Exporting", specPath);
	await fs.writeFile(specPath, JSON.stringify(inputSchema, null, 4));
}

await _extractAndAdapt("feature-extraction", "EmbedRequest", "input");
await _extractAndAdapt("feature-extraction", "EmbedResponse", "output");
console.debug("✅ All done!");
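To make the `_scan` step concrete, here is a simplified before/after sketch (not from the commit) of how it rewrites the `/embed` request component, assuming the upstream enum component is named `TruncationDirection`:

```ts
// Before: OpenAPI component as fetched from TEI's openapi.json (simplified;
// the real spec wraps enum refs in `allOf`).
const before = {
	properties: {
		inputs: { $ref: "#/components/schemas/Input" },
		truncation_direction: { $ref: "#/components/schemas/TruncationDirection" },
	},
};

// After _scan: the `Input` union is collapsed to a plain string, and other
// refs are redirected to task-prefixed `$defs` entries (the referenced
// component is copied there and re-titled to avoid name clashes).
const after = {
	properties: {
		inputs: { type: "string", description: "The text to embed." },
		truncation_direction: { $ref: "#/$defs/FeatureExtractionInputTruncationDirection" },
	},
};
```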

packages/tasks/scripts/inference-tgi-import.ts

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Fetches TGI specs and generated JSON schema for input, output and stream_output of
+ * Fetches TGI specs and generates JSON schema for input, output and stream_output of
  * text-generation and chat-completion tasks.
  * See https://huggingface.github.io/text-generation-inference/
  */

packages/tasks/src/tasks/feature-extraction/inference.ts

Lines changed: 23 additions & 5 deletions
@@ -4,19 +4,37 @@
  * Using src/scripts/inference-codegen
  */
 
-export type FeatureExtractionOutput = unknown[];
+export type FeatureExtractionOutput = Array<number[]>;
 
 /**
- * Inputs for Text Embedding inference
+ * Feature Extraction Input.
+ *
+ * Auto-generated from TEI specs.
+ * For more details, check out
+ * https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
  */
 export interface FeatureExtractionInput {
 	/**
-	 * The text to get the embeddings of
+	 * The text to embed.
 	 */
 	inputs: string;
+	normalize?: boolean;
 	/**
-	 * Additional inference parameters
+	 * The name of the prompt that should be used by for encoding. If not set, no prompt
+	 * will be applied.
+	 *
+	 * Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+	 *
+	 * For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",
+	 * ...},
+	 * then the sentence "What is the capital of France?" will be encoded as
+	 * "query: What is the capital of France?" because the prompt text will be prepended before
+	 * any text to encode.
 	 */
-	parameters?: { [key: string]: unknown };
+	prompt_name?: string;
+	truncate?: boolean;
+	truncation_direction?: FeatureExtractionInputTruncationDirection;
 	[property: string]: unknown;
 }
+
+export type FeatureExtractionInputTruncationDirection = "Left" | "Right";
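As a usage illustration (a sketch, assuming these generated types are exported from `@huggingface/tasks`), the new `Array<number[]>` output type can be consumed directly, for example to compare two embedded texts:

```ts
import type { FeatureExtractionOutput } from "@huggingface/tasks";

// Cosine similarity between two embedding vectors returned by /embed.
function cosineSimilarity(a: number[], b: number[]): number {
	const dot = a.reduce((sum, x, i) => sum + x * b[i], 0);
	const norm = (v: number[]) => Math.sqrt(v.reduce((sum, x) => sum + x * x, 0));
	return dot / (norm(a) * norm(b));
}

// FeatureExtractionOutput is Array<number[]>: one embedding per input text.
function similarityOfFirstTwo(output: FeatureExtractionOutput): number {
	return cosineSimilarity(output[0], output[1]);
}
```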

packages/tasks/src/tasks/feature-extraction/spec/input.json

Lines changed: 34 additions & 13 deletions
@@ -1,26 +1,47 @@
 {
   "$id": "/inference/schemas/feature-extraction/input.json",
   "$schema": "http://json-schema.org/draft-06/schema#",
-  "description": "Inputs for Text Embedding inference",
+  "description": "Feature Extraction Input.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
   "title": "FeatureExtractionInput",
   "type": "object",
+  "required": ["inputs"],
   "properties": {
     "inputs": {
-      "description": "The text to get the embeddings of",
-      "type": "string"
+      "type": "string",
+      "description": "The text to embed."
     },
-    "parameters": {
-      "description": "Additional inference parameters",
-      "$ref": "#/$defs/FeatureExtractionParameters"
+    "normalize": {
+      "type": "boolean",
+      "default": "true",
+      "example": "true"
+    },
+    "prompt_name": {
+      "type": "string",
+      "description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.",
+      "default": "null",
+      "example": "null",
+      "nullable": true
+    },
+    "truncate": {
+      "type": "boolean",
+      "default": "false",
+      "example": "false",
+      "nullable": true
+    },
+    "truncation_direction": {
+      "allOf": [
+        {
+          "$ref": "#/$defs/FeatureExtractionInputTruncationDirection"
+        }
+      ],
+      "default": "right"
     }
   },
   "$defs": {
-    "FeatureExtractionParameters": {
-      "title": "FeatureExtractionParameters",
-      "description": "Additional inference parameters for Feature Extraction",
-      "type": "object",
-      "properties": {}
+    "FeatureExtractionInputTruncationDirection": {
+      "type": "string",
+      "enum": ["Left", "Right"],
+      "title": "FeatureExtractionInputTruncationDirection"
     }
-  },
-  "required": ["inputs"]
+  }
 }
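As a quick sanity check of the generated spec (a sketch, not part of the commit; it assumes `ajv` is installed and that the path resolves from the `packages/tasks` root, and the exact Ajv setup may vary), the draft-06 schema can be compiled with Ajv in non-strict mode since the spec carries OpenAPI-style keywords such as `example` and `nullable`:

```ts
import { readFileSync } from "node:fs";
import { createRequire } from "node:module";
import Ajv from "ajv";

const require = createRequire(import.meta.url);
// Ajv v8 requires registering the draft-06 meta-schema explicitly.
const draft6MetaSchema = require("ajv/dist/refs/json-schema-draft-06.json");

const ajv = new Ajv({ strict: false });
ajv.addMetaSchema(draft6MetaSchema);

const schema = JSON.parse(readFileSync("src/tasks/feature-extraction/spec/input.json", "utf-8"));
const validate = ajv.compile(schema);

console.log(validate({ inputs: "What is the capital of France?", prompt_name: "query" })); // true
console.log(validate({ normalize: true })); // false: "inputs" is required
```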

packages/tasks/src/tasks/feature-extraction/spec/output.json

Lines changed: 10 additions & 2 deletions
@@ -1,7 +1,15 @@
 {
   "$id": "/inference/schemas/feature-extraction/output.json",
   "$schema": "http://json-schema.org/draft-06/schema#",
-  "description": "The embedding for the input text, as a nested list (tensor) of floats",
+  "description": "Feature Extraction Output.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
+  "title": "FeatureExtractionOutput",
   "type": "array",
-  "title": "FeatureExtractionOutput"
+  "$defs": {},
+  "items": {
+    "type": "array",
+    "items": {
+      "type": "number",
+      "format": "float"
+    }
+  }
 }
