elastic · Jan-Kazlouski-elastic · Dec 4, 2025 · Dec 11, 2025 · DonalEvans · Dec 12, 2025
diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
diff --git a/output/schema/schema.json b/output/schema/schema.json
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -3,7 +3,7 @@
     "transform-to-openapi": "npm run transform-to-openapi --prefix compiler --"
   },
   "dependencies": {
-    "@redocly/cli": "^1.34.5"
+    "@redocly/cli": "^1.34.6"
   },
   "version": "overlay"
 }
diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv
@@ -398,6 +398,7 @@ inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/
 inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,,
 inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,,
 inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html,
+inference-api-put-nvidia,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia,,
 inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html,
 inference-api-put-openshift-ai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openshift-ai,,
 inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,,

@@ -0,0 +1,49 @@
+{
+  "inference.put_nvidia": {
+    "documentation": {
+      "url": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia",
+      "description": "Create an Nvidia inference endpoint"
+    },
+    "stability": "stable",
+    "visibility": "public",
+    "headers": {
+      "accept": ["application/json"],
+      "content_type": ["application/json"]
+    },
+    "url": {
+      "paths": [
+        {
+          "path": "/_inference/{task_type}/{nvidia_inference_id}",
+          "methods": ["PUT"],
+          "parts": {
+            "task_type": {
+              "type": "enum",
+              "description": "The task type",
+              "options": [
+                "rerank",
+                "text_embedding",
+                "completion",
+                "chat_completion"
+              ]
+            },
+            "nvidia_inference_id": {
+              "type": "string",
+              "description": "The inference ID"
+            }
+          }
+        }
+      ]
+    },
+    "body": {
+      "description": "The inference endpoint's task and service settings",
+      "required": true
+    },
+    "params": {
+      "timeout": {
+        "type": "time",
+        "description": "Specifies the amount of time to wait for the inference endpoint to be created.",
+        "default": "30s"
+      }
+    }
+  }
+}
diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts
@@ -1809,6 +1809,86 @@ export enum MistralServiceType {
   mistral
 }
 
+export class NvidiaServiceSettings {
+  /**
+   * A valid API key for your Nvidia endpoint.
+   * It can be found in the `API Keys` section of your Nvidia account settings.
+   */
+  api_key: string
+  /**
+   * The URL of the Nvidia model endpoint.
+   */
+  url?: string
+  /**
+   * The name of the model to use for the inference task. Refer to the model's documentation for the name if needed.
+   * The service has been tested and confirmed to work with the following models:
+   *
+   * * For the `text_embedding` task - `nvidia/llama-3.2-nv-embedqa-1b-v2`.
+   * * For the `completion` and `chat_completion` tasks - `microsoft/phi-3-mini-128k-instruct`.
+   * * For the `rerank` task - `nv-rerank-qa-mistral-4b:1`.
+   *
+   * The service doesn't support the `baai/bge-m3` and `nvidia/nvclip` models for the `text_embedding` task because they don't recognize the `input_type` parameter.
+   */
+  model_id: string
+  /**
+   * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.
+   */
+  max_input_tokens?: integer
+  /**
+   * For a `text_embedding` task, the similarity measure. One of `cosine`, `dot_product`, or `l2_norm`.
+   */
+  similarity?: NvidiaSimilarityType
+  /**
+   * This setting helps to minimize the number of rate limit errors returned from the Nvidia API.
+   * By default, the `nvidia` service sets the number of requests allowed per minute to 3000.
+   */
+  rate_limit?: RateLimitSetting
+}
+
+export enum NvidiaTaskType {
+  text_embedding,
+  completion,
+  chat_completion,
+  rerank
+}
+
+export enum NvidiaServiceType {
+  nvidia
+}
+
+export enum NvidiaSimilarityType {
+  cosine,
+  dot_product,
+  l2_norm
+}
+
+export class NvidiaTaskSettings {
+  /**
+   * For a `text_embedding` task, the type of input sent to the Nvidia endpoint.
+   * Valid values are:
+   *
+   * * `ingest`: Mapped to Nvidia's `passage` value in the request. Used when generating embeddings during indexing.
+   * * `search`: Mapped to Nvidia's `query` value in the request. Used when generating embeddings during querying.
+   *
+   * IMPORTANT: If not specified, the `input_type` field in the request to the Nvidia endpoint is set to `query` by default.
+   */
+  input_type?: NvidiaInputType
+  /**
+   * For a `text_embedding` task, the method to handle inputs longer than the maximum token length.
+   * Valid values are:
+   *
+   * * `END`: When the input exceeds the maximum input token length, the end of the input is discarded.
+   * * `NONE`: When the input exceeds the maximum input token length, an error is returned.
+   * * `START`: When the input exceeds the maximum input token length, the start of the input is discarded.
+   */
+  truncate?: CohereTruncateType
+}
+
+export enum NvidiaInputType {
+  ingest,
+  search
+}
+
 export class OpenAIServiceSettings {
   /**
    * A valid API key of your OpenAI account.

diff --git a/specification/inference/_types/Services.ts b/specification/inference/_types/Services.ts
@@ -41,6 +41,7 @@ import {
   TaskTypeJinaAi,
   TaskTypeLlama,
   TaskTypeMistral,
+  TaskTypeNvidia,
   TaskTypeOpenAI,
   TaskTypeOpenShiftAi,
   TaskTypeVoyageAI,
@@ -304,6 +305,17 @@ export class InferenceEndpointInfoMistral extends InferenceEndpoint {
   task_type: TaskTypeMistral
 }
 
+export class InferenceEndpointInfoNvidia extends InferenceEndpoint {
+  /**
+   * The inference ID
+   */
+  inference_id: string
+  /**
+   * The task type
+   */
+  task_type: TaskTypeNvidia
+}
+
 export class InferenceEndpointInfoOpenAI extends InferenceEndpoint {
   /**
    * The inference Id

diff --git a/specification/inference/_types/TaskType.ts b/specification/inference/_types/TaskType.ts
@@ -140,6 +140,13 @@ export enum TaskTypeMistral {
   completion
 }
 
+export enum TaskTypeNvidia {
+  text_embedding,
+  chat_completion,
+  completion,
+  rerank
+}
+
 export enum TaskTypeOpenAI {
   text_embedding,
   chat_completion,

diff --git a/specification/inference/put/PutRequest.ts b/specification/inference/put/PutRequest.ts
@@ -49,6 +49,7 @@ import { TaskType } from '@inference/_types/TaskType'
  * * JinaAI (`rerank`, `text_embedding`)
  * * Llama (`chat_completion`, `completion`, `text_embedding`)
  * * Mistral (`chat_completion`, `completion`, `text_embedding`)
+ * * Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)
  * * OpenAI (`chat_completion`, `completion`, `text_embedding`)
  * * OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)
  * * VoyageAI (`rerank`, `text_embedding`)