
Commit 69a3988

Import feature-extraction inference type from TEI (#781)
This PR adds a script to import `feature-extraction` inference types from [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference) (TEI). The OpenAPI spec is pulled from https://huggingface.github.io/text-embeddings-inference/openapi.json and converted into the JSON Schema format from which we generate types for the JS and Python clients. The script is largely inspired by the TGI importer script. This PR also adds the `prompt_name` input parameter that was recently added to TEI (see huggingface/text-embeddings-inference#312).

Decisions taken:

1. Keep `string` as the input type. In theory TEI can handle much more complex inputs (`Union[List[Union[List[int], int, str]], str]`), but let's keep it simple for now. Other inference tasks are also currently defined without arrays, even though InferenceAPI/Endpoints can handle them.
2. Only the input/output types of the `/embed` route are imported, as it is the closest match to the `feature-extraction` task.

**Note:** in a follow-up PR it would be nice to run this in a CI workflow that could be triggered manually to open a PR whenever new arguments are added to TGI / TEI.
1 parent 2b7457c commit 69a3988
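For context, here is a minimal sketch (not part of this commit) of what a request to TEI's `/embed` route looks like with the imported input shape, including the new `prompt_name` parameter. The base URL is a placeholder for wherever TEI is deployed.

```ts
// Sketch only: call a TEI deployment's /embed route with the
// feature-extraction input shape imported by this PR.
const TEI_URL = "http://localhost:8080"; // placeholder endpoint

async function embed(text: string): Promise<number[][]> {
	const response = await fetch(`${TEI_URL}/embed`, {
		method: "POST",
		headers: { "Content-Type": "application/json" },
		body: JSON.stringify({
			inputs: text, // kept as a plain string (decision 1 above)
			normalize: true,
			prompt_name: "query", // must be a key in the Sentence Transformers `prompts` config
		}),
	});
	// TEI returns one embedding (an array of floats) per input text.
	return (await response.json()) as number[][];
}
```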

File tree

6 files changed: +188 -22 lines

- packages/tasks/package.json
- packages/tasks/scripts/inference-tei-import.ts
- packages/tasks/scripts/inference-tgi-import.ts
- packages/tasks/src/tasks/feature-extraction/inference.ts
- packages/tasks/src/tasks/feature-extraction/spec/input.json
- packages/tasks/src/tasks/feature-extraction/spec/output.json


packages/tasks/package.json

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@
   "prepare": "pnpm run build",
   "check": "tsc",
   "inference-codegen": "tsx scripts/inference-codegen.ts && prettier --write src/tasks/*/inference.ts",
-  "inference-tgi-import": "tsx scripts/inference-tgi-import.ts && prettier --write src/tasks/text-generation/spec/*.json && prettier --write src/tasks/chat-completion/spec/*.json"
+  "inference-tgi-import": "tsx scripts/inference-tgi-import.ts && prettier --write src/tasks/text-generation/spec/*.json && prettier --write src/tasks/chat-completion/spec/*.json",
+  "inference-tei-import": "tsx scripts/inference-tei-import.ts && prettier --write src/tasks/feature-extraction/spec/*.json"
 },
 "type": "module",
 "files": [

packages/tasks/scripts/inference-tei-import.ts

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
/*
 * Fetches TEI specs and generates JSON schema for input and output of
 * text-embeddings (called feature-extraction).
 * See https://huggingface.github.io/text-embeddings-inference/
 */
import fs from "fs/promises";
import * as path from "node:path/posix";
import { existsSync as pathExists } from "node:fs";
import type { JsonObject, JsonValue } from "type-fest";

const URL = "https://huggingface.github.io/text-embeddings-inference/openapi.json";

const rootDirFinder = function (): string {
	let currentPath = path.normalize(import.meta.url);

	while (currentPath !== "/") {
		if (pathExists(path.join(currentPath, "package.json"))) {
			return currentPath;
		}

		currentPath = path.normalize(path.join(currentPath, ".."));
	}

	return "/";
};

const rootDir = rootDirFinder();
const tasksDir = path.join(rootDir, "src", "tasks");

function toCamelCase(str: string, joiner = "") {
	return str
		.split(/[-_]/)
		.map((part) => part.charAt(0).toUpperCase() + part.slice(1))
		.join(joiner);
}

async function _extractAndAdapt(task: string, mainComponentName: string, type: "input" | "output" | "stream_output") {
	console.debug(`✨ Importing`, task, type);

	console.debug("   📥 Fetching TEI specs");
	const response = await fetch(URL);
	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	const openapi = (await response.json()) as any;
	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	const components: Record<string, any> = openapi["components"]["schemas"];

	// e.g. TextGeneration
	const camelName = toCamelCase(task);
	// e.g. TextGenerationInput
	const camelFullName = camelName + toCamelCase(type);
	const mainComponent = components[mainComponentName];
	const filteredComponents: Record<string, JsonObject> = {};

	function _scan(data: JsonValue) {
		if (Array.isArray(data) || data instanceof Array) {
			for (const item of data) {
				_scan(item);
			}
		} else if (data && typeof data === "object") {
			for (const key of Object.keys(data)) {
				if (key === "$ref" && data[key] === "#/components/schemas/Input") {
					// Special case: keep input as string or string[]
					// but not Union[List[Union[List[int], int, str]], str]
					// data.delete(key);
					delete data[key];
					data["type"] = "string";
					data["description"] = "The text to embed.";
				} else if (key === "$ref" && typeof data[key] === "string") {
					// Verify reference exists
					const ref = (data[key] as string).split("/").pop() ?? "";
					if (!components[ref]) {
						throw new Error(`Reference not found in components: ${data[key]}`);
					}

					// Add reference to components to export (and scan it too)
					const newRef = camelFullName + ref.replace(camelName, "");
					if (!filteredComponents[newRef]) {
						components[ref]["title"] = newRef; // Rename title to avoid conflicts
						filteredComponents[newRef] = components[ref];
						_scan(components[ref]);
					}

					// Updating the reference to new format
					data[key] = `#/$defs/${newRef}`;
				} else {
					_scan(data[key]);
				}
			}
		}
	}

	console.debug("   📦 Packaging jsonschema");
	_scan(mainComponent);

	const prettyName = toCamelCase(task, " ") + " " + toCamelCase(type, " ");
	const inputSchema = {
		$id: `/inference/schemas/${task}/${type}.json`,
		$schema: "http://json-schema.org/draft-06/schema#",
		description:
			prettyName +
			".\n\nAuto-generated from TEI specs." +
			"\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
		title: camelFullName,
		type: mainComponent["type"],
		required: mainComponent["required"],
		properties: mainComponent["properties"],
		$defs: filteredComponents,
		items: mainComponent["items"],
	};

	const specPath = path.join(tasksDir, task, "spec", `${type}.json`);
	console.debug("   📂 Exporting", specPath);
	await fs.writeFile(specPath, JSON.stringify(inputSchema, null, 4));
}

await _extractAndAdapt("feature-extraction", "EmbedRequest", "input");
await _extractAndAdapt("feature-extraction", "EmbedResponse", "output");
console.debug("✅ All done!");
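To make the `_scan` step concrete, here is a simplified before/after sketch (not from the commit) of how it rewrites the `/embed` request component, assuming the upstream enum component is named `TruncationDirection`:

```ts
// Before: OpenAPI component as fetched from TEI's openapi.json (simplified;
// the real spec wraps enum refs in `allOf`).
const before = {
	properties: {
		inputs: { $ref: "#/components/schemas/Input" },
		truncation_direction: { $ref: "#/components/schemas/TruncationDirection" },
	},
};

// After _scan: the `Input` union is collapsed to a plain string, and other
// refs are redirected to task-prefixed `$defs` entries (the referenced
// component is copied there and re-titled to avoid name clashes).
const after = {
	properties: {
		inputs: { type: "string", description: "The text to embed." },
		truncation_direction: { $ref: "#/$defs/FeatureExtractionInputTruncationDirection" },
	},
};
```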

packages/tasks/scripts/inference-tgi-import.ts

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Fetches TGI specs and generated JSON schema for input, output and stream_output of
+ * Fetches TGI specs and generates JSON schema for input, output and stream_output of
  * text-generation and chat-completion tasks.
  * See https://huggingface.github.io/text-generation-inference/
  */

packages/tasks/src/tasks/feature-extraction/inference.ts

Lines changed: 23 additions & 5 deletions
@@ -4,19 +4,37 @@
  * Using src/scripts/inference-codegen
  */
 
-export type FeatureExtractionOutput = unknown[];
+export type FeatureExtractionOutput = Array<number[]>;
 
 /**
- * Inputs for Text Embedding inference
+ * Feature Extraction Input.
+ *
+ * Auto-generated from TEI specs.
+ * For more details, check out
+ * https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
  */
 export interface FeatureExtractionInput {
 	/**
-	 * The text to get the embeddings of
+	 * The text to embed.
 	 */
 	inputs: string;
+	normalize?: boolean;
 	/**
-	 * Additional inference parameters
+	 * The name of the prompt that should be used by for encoding. If not set, no prompt
+	 * will be applied.
+	 *
+	 * Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
+	 *
+	 * For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",
+	 * ...},
+	 * then the sentence "What is the capital of France?" will be encoded as
+	 * "query: What is the capital of France?" because the prompt text will be prepended before
+	 * any text to encode.
 	 */
-	parameters?: { [key: string]: unknown };
+	prompt_name?: string;
+	truncate?: boolean;
+	truncation_direction?: FeatureExtractionInputTruncationDirection;
 	[property: string]: unknown;
 }
+
+export type FeatureExtractionInputTruncationDirection = "Left" | "Right";
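As a usage illustration (a sketch, assuming these generated types are exported from `@huggingface/tasks`), the new `Array<number[]>` output type can be consumed directly, for example to compare two embedded texts:

```ts
import type { FeatureExtractionOutput } from "@huggingface/tasks";

// Cosine similarity between two embedding vectors returned by /embed.
function cosineSimilarity(a: number[], b: number[]): number {
	const dot = a.reduce((sum, x, i) => sum + x * b[i], 0);
	const norm = (v: number[]) => Math.sqrt(v.reduce((sum, x) => sum + x * x, 0));
	return dot / (norm(a) * norm(b));
}

// FeatureExtractionOutput is Array<number[]>: one embedding per input text.
function similarityOfFirstTwo(output: FeatureExtractionOutput): number {
	return cosineSimilarity(output[0], output[1]);
}
```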

packages/tasks/src/tasks/feature-extraction/spec/input.json

Lines changed: 34 additions & 13 deletions
@@ -1,26 +1,47 @@
 {
   "$id": "/inference/schemas/feature-extraction/input.json",
   "$schema": "http://json-schema.org/draft-06/schema#",
-  "description": "Inputs for Text Embedding inference",
+  "description": "Feature Extraction Input.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
   "title": "FeatureExtractionInput",
   "type": "object",
+  "required": ["inputs"],
   "properties": {
     "inputs": {
-      "description": "The text to get the embeddings of",
-      "type": "string"
+      "type": "string",
+      "description": "The text to embed."
     },
-    "parameters": {
-      "description": "Additional inference parameters",
-      "$ref": "#/$defs/FeatureExtractionParameters"
+    "normalize": {
+      "type": "boolean",
+      "default": "true",
+      "example": "true"
+    },
+    "prompt_name": {
+      "type": "string",
+      "description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.",
+      "default": "null",
+      "example": "null",
+      "nullable": true
+    },
+    "truncate": {
+      "type": "boolean",
+      "default": "false",
+      "example": "false",
+      "nullable": true
+    },
+    "truncation_direction": {
+      "allOf": [
+        {
+          "$ref": "#/$defs/FeatureExtractionInputTruncationDirection"
+        }
+      ],
+      "default": "right"
     }
   },
   "$defs": {
-    "FeatureExtractionParameters": {
-      "title": "FeatureExtractionParameters",
-      "description": "Additional inference parameters for Feature Extraction",
-      "type": "object",
-      "properties": {}
+    "FeatureExtractionInputTruncationDirection": {
+      "type": "string",
+      "enum": ["Left", "Right"],
+      "title": "FeatureExtractionInputTruncationDirection"
     }
-  },
-  "required": ["inputs"]
+  }
 }
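As a quick sanity check of the generated spec (a sketch, not part of the commit; it assumes `ajv` is installed and that the path resolves from the `packages/tasks` root, and the exact Ajv setup may vary), the draft-06 schema can be compiled with Ajv in non-strict mode since the spec carries OpenAPI-style keywords such as `example` and `nullable`:

```ts
import { readFileSync } from "node:fs";
import { createRequire } from "node:module";
import Ajv from "ajv";

const require = createRequire(import.meta.url);
// Ajv v8 requires registering the draft-06 meta-schema explicitly.
const draft6MetaSchema = require("ajv/dist/refs/json-schema-draft-06.json");

const ajv = new Ajv({ strict: false });
ajv.addMetaSchema(draft6MetaSchema);

const schema = JSON.parse(readFileSync("src/tasks/feature-extraction/spec/input.json", "utf-8"));
const validate = ajv.compile(schema);

console.log(validate({ inputs: "What is the capital of France?", prompt_name: "query" })); // true
console.log(validate({ normalize: true })); // false: "inputs" is required
```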

packages/tasks/src/tasks/feature-extraction/spec/output.json

Lines changed: 10 additions & 2 deletions
@@ -1,7 +1,15 @@
 {
   "$id": "/inference/schemas/feature-extraction/output.json",
   "$schema": "http://json-schema.org/draft-06/schema#",
-  "description": "The embedding for the input text, as a nested list (tensor) of floats",
+  "description": "Feature Extraction Output.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
+  "title": "FeatureExtractionOutput",
   "type": "array",
-  "title": "FeatureExtractionOutput"
+  "$defs": {},
+  "items": {
+    "type": "array",
+    "items": {
+      "type": "number",
+      "format": "float"
+    }
+  }
 }
