diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index 8cfff46c33..2a4c761bd3 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -20780,7 +20780,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -20903,7 +20903,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -23565,6 +23565,204 @@ ] } }, + "/_inference/{task_type}/{nvidia_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an Nvidia inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `nvidia` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-nvidia", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.\nNOTE: The `chat_completion` task type only supports streaming and only through the _stream API.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.NvidiaTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "nvidia_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "externalDocs": { + "url": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config" + }, + "description": "The chunking configuration object.\nApplies only to the `text_embedding` task type.\nNot applicable to the `rerank`, `completion`, or `chat_completion` task types.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + } + ] + }, + "service": { + "description": "The type of service supported for the specified task type. 
In this case, `nvidia`.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaServiceType" + } + ] + }, + "service_settings": { + "description": "Settings used to install the inference model. These settings are specific to the `nvidia` service.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaServiceSettings" + } + ] + }, + "task_settings": { + "description": "Settings to configure the inference task.\nApplies only to the `text_embedding` task type.\nNot applicable to the `rerank`, `completion`, or `chat_completion` task types.\nThese settings are specific to the task type you specified.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaTaskSettings" + } + ] + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutNvidiaRequestExample1": { + "summary": "A text embedding task", + "description": "Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint that performs a `text_embedding` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-embeddings-url\",\n \"api_key\": \"nvidia-embeddings-token\",\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\"\n }\n}" + }, + "PutNvidiaRequestExample2": { + "summary": "A text embedding task with custom `task_settings` and no `url` parameter", + "description": "Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint that performs a `text_embedding` task, specifying custom `task_settings` and omitting the `url` parameter if the model is accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"api_key\": \"nvidia-text-embeddings-token\"\n },\n \"task_settings\": {\n \"input_type\": \"ingest\",\n \"truncate\": \"start\"\n }\n}" + }, + "PutNvidiaRequestExample3": { + "summary": "A completion task", + "description": "Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint that performs a `completion` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-completion-url\",\n \"api_key\": \"nvidia-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample4": { + "summary": "A completion task without the `url` parameter", + "description": "Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint that performs a `completion` task, omitting the `url` parameter if the model is accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample5": { + "summary": "A chat completion task", + "description": "Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint that performs a `chat_completion` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-chat-completion-url\",\n \"api_key\": \"nvidia-chat-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample6": { + "summary": "A chat completion task without the `url` parameter", + "description": "Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint that performs a `chat_completion` task, omitting the `url` parameter if the model is 
accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-chat-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample7": { + "summary": "A rerank task", + "description": "Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint that performs a `rerank` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-rerank-url\",\n \"api_key\": \"nvidia-rerank-token\",\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\"\n }\n}" + }, + "PutNvidiaRequestExample8": { + "summary": "A rerank task without the `url` parameter", + "description": "Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint that performs a `rerank` task, omitting the `url` parameter if the model is accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-rerank-token\",\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\"\n }\n}" + } + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoNvidia" + }, + "examples": { + "PutNvidiaResponseExample1": { + "summary": "A text embedding task", + "description": "A successful response when creating an Nvidia `text_embedding` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-text-embedding\",\n \"task_type\": \"text_embedding\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"url\": \"nvidia-embeddings-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n },\n \"dimensions\": 2048,\n \"similarity\": \"dot_product\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + }, + "PutNvidiaResponseExample2": { + "summary": "A text embedding task with custom `task_settings` and no `url` parameter", + "description": "A successful response when creating an Nvidia `text_embedding` inference endpoint with custom `task_settings` and no `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-text-embedding\",\n \"task_type\": \"text_embedding\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n },\n \"dimensions\": 2048,\n \"similarity\": \"dot_product\"\n },\n \"task_settings\": {\n \"input_type\": \"ingest\",\n \"truncate\": \"start\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + }, + "PutNvidiaResponseExample3": { + "summary": "A completion task", + "description": "A successful response when creating an Nvidia `completion` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-completion\",\n \"task_type\": \"completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"url\": \"nvidia-completion-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample4": { + "summary": "A completion task without the `url` parameter", + "description": "A successful response when creating an Nvidia `completion` inference endpoint without the `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-completion\",\n \"task_type\": 
\"completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample5": { + "summary": "A chat completion task", + "description": "A successful response when creating an Nvidia `chat_completion` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-chat-completion\",\n \"task_type\": \"chat_completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"url\": \"nvidia-chat-completion-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample6": { + "summary": "A chat completion task without the `url` parameter", + "description": "A successful response when creating an Nvidia `chat_completion` inference endpoint without the `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-chat-completion\",\n \"task_type\": \"chat_completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample7": { + "summary": "A rerank task", + "description": "A successful response when creating an Nvidia `rerank` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-rerank\",\n \"task_type\": \"rerank\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\",\n \"url\": \"nvidia-rerank-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample8": { + "summary": "A rerank task without the `url` parameter", + "description": "A successful response when creating an Nvidia `rerank` inference endpoint without the `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-rerank\",\n \"task_type\": \"rerank\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + } + } + } + } + } + }, + "x-state": "Generally available; Added in 9.3.0", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": "product_name" + } + ] + } + }, "/_inference/{task_type}/{openai_inference_id}": { "put": { "tags": [ @@ -104776,6 +104974,135 @@ "completion" ] }, + "inference._types.NvidiaTaskType": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion", + "rerank" + ] + }, + "inference._types.NvidiaServiceType": { + "type": "string", + "enum": [ + "nvidia" + ] + }, + "inference._types.NvidiaServiceSettings": { + "type": "object", + "properties": { + "api_key": { + "description": "A valid API key for your Nvidia endpoint.\nIt can be found in the `API Keys` section of your Nvidia account settings.", + "type": "string" + }, + "url": { + "description": "The URL of the Nvidia model endpoint.", + "type": "string" + }, + "model_id": { + "description": "The name of the model to use for the inference task.\nRefer to the model's documentation for the name if needed.\nThe service has been tested and confirmed to work with the following models:\n\n* For the `text_embedding` task: `nvidia/llama-3.2-nv-embedqa-1b-v2`.\n* For the `completion` and `chat_completion` tasks: `microsoft/phi-3-mini-128k-instruct`.\n* For the `rerank` task: `nv-rerank-qa-mistral-4b:1`.\nThe service does not support the `baai/bge-m3` and `nvidia/nvclip` models for the `text_embedding` task because they do not recognize the `input_type` 
parameter.", + "type": "string" + }, + "max_input_tokens": { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "type": "number" + }, + "similarity": { + "description": "For a `text_embedding` task, the similarity measure. One of `cosine`, `dot_product`, or `l2_norm`.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaSimilarityType" + } + ] + }, + "rate_limit": { + "description": "This setting helps to minimize the number of rate limit errors returned from the Nvidia API.\nBy default, the `nvidia` service sets the number of requests allowed per minute to 3000.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.RateLimitSetting" + } + ] + } + }, + "required": [ + "api_key", + "model_id" + ] + }, + "inference._types.NvidiaSimilarityType": { + "type": "string", + "enum": [ + "cosine", + "dot_product", + "l2_norm" + ] + }, + "inference._types.NvidiaTaskSettings": { + "type": "object", + "properties": { + "input_type": { + "description": "For a `text_embedding` task, the type of input sent to the Nvidia endpoint.\nValid values are:\n\n* `ingest`: Mapped to Nvidia's `passage` value in the request. Used when generating embeddings during indexing.\n* `search`: Mapped to Nvidia's `query` value in the request. Used when generating embeddings during querying.\n\nIMPORTANT: If not specified, the `input_type` field in the request to the Nvidia endpoint defaults to `query`.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaInputType" + } + ] + }, + "truncate": { + "description": "For a `text_embedding` task, the method to handle inputs longer than the maximum token length.\nValid values are:\n\n* `END`: When the input exceeds the maximum input token length, the end of the input is discarded.\n* `NONE`: When the input exceeds the maximum input token length, an error is returned.\n* `START`: When the input exceeds the maximum input token length, the start of the input is discarded.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.CohereTruncateType" + } + ] + } + } + }, + "inference._types.NvidiaInputType": { + "type": "string", + "enum": [ + "ingest", + "search" + ] + }, + "inference._types.InferenceEndpointInfoNvidia": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference ID", + "type": "string" + }, + "task_type": { + "description": "The task type", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.TaskTypeNvidia" + } + ] + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, + "inference._types.TaskTypeNvidia": { + "type": "string", + "enum": [ + "text_embedding", + "chat_completion", + "completion", + "rerank" + ] + }, "inference._types.OpenAITaskType": { "type": "string", "enum": [ diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index 399828704a..2a925cad99 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -11783,7 +11783,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor 
built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put", "parameters": [ { @@ -11906,7 +11906,7 @@ "inference" ], "summary": "Create an inference endpoint", - "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "description": "IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", "operationId": "inference-put-1", "parameters": [ { @@ -14568,6 +14568,204 @@ ] } }, + "/_inference/{task_type}/{nvidia_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an Nvidia inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `nvidia` service.\n\n## Required authorization\n\n* Cluster privileges: `manage_inference`\n", + "operationId": "inference-put-nvidia", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.\nNOTE: The `chat_completion` task type only supports streaming and only through the _stream API.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference._types.NvidiaTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "nvidia_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Id" + }, + "style": "simple" + }, + { + "in": "query", + "name": "timeout", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types.Duration" + }, + "style": "form" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "externalDocs": { + "url": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config" + }, + "description": "The chunking configuration object.\nApplies only to the `text_embedding` task type.\nNot applicable to the `rerank`, `completion`, or `chat_completion` task types.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceChunkingSettings" + } + ] + }, + "service": { + "description": "The type of service supported for the specified task type. 
In this case, `nvidia`.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaServiceType" + } + ] + }, + "service_settings": { + "description": "Settings used to install the inference model. These settings are specific to the `nvidia` service.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaServiceSettings" + } + ] + }, + "task_settings": { + "description": "Settings to configure the inference task.\nApplies only to the `text_embedding` task type.\nNot applicable to the `rerank`, `completion`, or `chat_completion` task types.\nThese settings are specific to the task type you specified.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaTaskSettings" + } + ] + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutNvidiaRequestExample1": { + "summary": "A text embedding task", + "description": "Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint that performs a `text_embedding` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-embeddings-url\",\n \"api_key\": \"nvidia-embeddings-token\",\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\"\n }\n}" + }, + "PutNvidiaRequestExample2": { + "summary": "A text embedding task with custom `task_settings` and no `url` parameter", + "description": "Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint that performs a `text_embedding` task, specifying custom `task_settings` and omitting the `url` parameter if the model is accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"api_key\": \"nvidia-text-embeddings-token\"\n },\n \"task_settings\": {\n \"input_type\": \"ingest\",\n \"truncate\": \"start\"\n }\n}" + }, + "PutNvidiaRequestExample3": { + "summary": "A completion task", + "description": "Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint that performs a `completion` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-completion-url\",\n \"api_key\": \"nvidia-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample4": { + "summary": "A completion task without the `url` parameter", + "description": "Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint that performs a `completion` task, omitting the `url` parameter if the model is accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample5": { + "summary": "A chat completion task", + "description": "Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint that performs a `chat_completion` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-chat-completion-url\",\n \"api_key\": \"nvidia-chat-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample6": { + "summary": "A chat completion task without the `url` parameter", + "description": "Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint that performs a `chat_completion` task, omitting the `url` parameter if the model is 
accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-chat-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample7": { + "summary": "A rerank task", + "description": "Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint that performs a `rerank` task.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-rerank-url\",\n \"api_key\": \"nvidia-rerank-token\",\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\"\n }\n}" + }, + "PutNvidiaRequestExample8": { + "summary": "A rerank task without the `url` parameter", + "description": "Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint that performs a `rerank` task, omitting the `url` parameter if the model is accessible via the default Nvidia endpoint.", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-rerank-token\",\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\"\n }\n}" + } + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types.InferenceEndpointInfoNvidia" + }, + "examples": { + "PutNvidiaResponseExample1": { + "summary": "A text embedding task", + "description": "A successful response when creating an Nvidia `text_embedding` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-text-embedding\",\n \"task_type\": \"text_embedding\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"url\": \"nvidia-embeddings-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n },\n \"dimensions\": 2048,\n \"similarity\": \"dot_product\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + }, + "PutNvidiaResponseExample2": { + "summary": "A text embedding task with custom `task_settings` and no `url` parameter", + "description": "A successful response when creating an Nvidia `text_embedding` inference endpoint with custom `task_settings` and no `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-text-embedding\",\n \"task_type\": \"text_embedding\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n },\n \"dimensions\": 2048,\n \"similarity\": \"dot_product\"\n },\n \"task_settings\": {\n \"input_type\": \"ingest\",\n \"truncate\": \"start\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + }, + "PutNvidiaResponseExample3": { + "summary": "A completion task", + "description": "A successful response when creating an Nvidia `completion` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-completion\",\n \"task_type\": \"completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"url\": \"nvidia-completion-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample4": { + "summary": "A completion task without the `url` parameter", + "description": "A successful response when creating an Nvidia `completion` inference endpoint without the `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-completion\",\n \"task_type\": 
\"completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample5": { + "summary": "A chat completion task", + "description": "A successful response when creating an Nvidia `chat_completion` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-chat-completion\",\n \"task_type\": \"chat_completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"url\": \"nvidia-chat-completion-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample6": { + "summary": "A chat completion task without the `url` parameter", + "description": "A successful response when creating an Nvidia `chat_completion` inference endpoint without the `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-chat-completion\",\n \"task_type\": \"chat_completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample7": { + "summary": "A rerank task", + "description": "A successful response when creating an Nvidia `rerank` inference endpoint.", + "value": "{\n \"inference_id\": \"nvidia-rerank\",\n \"task_type\": \"rerank\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\",\n \"url\": \"nvidia-rerank-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample8": { + "summary": "A rerank task without the `url` parameter", + "description": "A successful response when creating an Nvidia `rerank` inference endpoint without the `url` parameter.", + "value": "{\n \"inference_id\": \"nvidia-rerank\",\n \"task_type\": \"rerank\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + } + } + } + } + } + }, + "x-state": "Generally available", + "x-metaTags": [ + { + "content": "Elasticsearch, Machine Learning", + "name": "product_name" + } + ] + } + }, "/_inference/{task_type}/{openai_inference_id}": { "put": { "tags": [ @@ -68692,6 +68890,135 @@ "completion" ] }, + "inference._types.NvidiaTaskType": { + "type": "string", + "enum": [ + "text_embedding", + "completion", + "chat_completion", + "rerank" + ] + }, + "inference._types.NvidiaServiceType": { + "type": "string", + "enum": [ + "nvidia" + ] + }, + "inference._types.NvidiaServiceSettings": { + "type": "object", + "properties": { + "api_key": { + "description": "A valid API key for your Nvidia endpoint.\nIt can be found in the `API Keys` section of your Nvidia account settings.", + "type": "string" + }, + "url": { + "description": "The URL of the Nvidia model endpoint.", + "type": "string" + }, + "model_id": { + "description": "The name of the model to use for the inference task.\nRefer to the model's documentation for the name if needed.\nThe service has been tested and confirmed to work with the following models:\n\n* For the `text_embedding` task: `nvidia/llama-3.2-nv-embedqa-1b-v2`.\n* For the `completion` and `chat_completion` tasks: `microsoft/phi-3-mini-128k-instruct`.\n* For the `rerank` task: `nv-rerank-qa-mistral-4b:1`.\nThe service does not support the `baai/bge-m3` and `nvidia/nvclip` models for the `text_embedding` task because they do not recognize the `input_type` parameter.", "type": 
"string" + }, + "max_input_tokens": { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "type": "number" + }, + "similarity": { + "description": "For a `text_embedding` task, the similarity measure. One of `cosine`, `dot_product`, or `l2_norm`.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaSimilarityType" + } + ] + }, + "rate_limit": { + "description": "This setting helps to minimize the number of rate limit errors returned from the Nvidia API.\nBy default, the `nvidia` service sets the number of requests allowed per minute to 3000.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.RateLimitSetting" + } + ] + } + }, + "required": [ + "api_key", + "model_id" + ] + }, + "inference._types.NvidiaSimilarityType": { + "type": "string", + "enum": [ + "cosine", + "dot_product", + "l2_norm" + ] + }, + "inference._types.NvidiaTaskSettings": { + "type": "object", + "properties": { + "input_type": { + "description": "For a `text_embedding` task, the type of input sent to the Nvidia endpoint.\nValid values are:\n\n* `ingest`: Mapped to Nvidia's `passage` value in the request. Used when generating embeddings during indexing.\n* `search`: Mapped to Nvidia's `query` value in the request. Used when generating embeddings during querying.\n\nIMPORTANT: If not specified, the `input_type` field in the request to the Nvidia endpoint defaults to `query`.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.NvidiaInputType" + } + ] + }, + "truncate": { + "description": "For a `text_embedding` task, the method to handle inputs longer than the maximum token length.\nValid values are:\n\n* `END`: When the input exceeds the maximum input token length, the end of the input is discarded.\n* `NONE`: When the input exceeds the maximum input token length, an error is returned.\n* `START`: When the input exceeds the maximum input token length, the start of the input is discarded.", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.CohereTruncateType" + } + ] + } + } + }, + "inference._types.NvidiaInputType": { + "type": "string", + "enum": [ + "ingest", + "search" + ] + }, + "inference._types.InferenceEndpointInfoNvidia": { + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.InferenceEndpoint" + }, + { + "type": "object", + "properties": { + "inference_id": { + "description": "The inference ID", + "type": "string" + }, + "task_type": { + "description": "The task type", + "allOf": [ + { + "$ref": "#/components/schemas/inference._types.TaskTypeNvidia" + } + ] + } + }, + "required": [ + "inference_id", + "task_type" + ] + } + ] + }, + "inference._types.TaskTypeNvidia": { + "type": "string", + "enum": [ + "text_embedding", + "chat_completion", + "completion", + "rerank" + ] + }, "inference._types.OpenAITaskType": { "type": "string", "enum": [ diff --git a/output/schema/schema.json b/output/schema/schema.json index c864610e70..2d53475d0b 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -10289,7 +10289,7 @@ "visibility": "public" } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained 
models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* Nvidia (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)", "docId": "inference-api-put", "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put", "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/put-inference-api.html", @@ -11242,6 +11242,45 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "9.3.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create an Nvidia inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `nvidia` service.", + "docId": "inference-api-put-nvidia", + "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia", + "name": "inference.put_nvidia", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_nvidia" + }, + "requestBodyRequired": true, + "response": { + "name": "Response", + "namespace": "inference.put_nvidia" + }, + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{nvidia_inference_id}" + } + ] + }, { "availability": { "serverless": { "stability": "stable", "visibility": "public" }, @@ -178809,7 +178848,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L351-L410" + "specLocation": "inference/_types/Services.ts#L363-L422" }, { "kind": "interface", @@ -178868,7 +178907,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L50-L72" + "specLocation": "inference/_types/Services.ts#L51-L73" }, { "kind": "interface", @@ -178909,7 +178948,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L74-L86" + "specLocation": "inference/_types/Services.ts#L75-L87" }, { "kind": "interface", @@ -178949,7 +178988,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L88-L97" + "specLocation": "inference/_types/Services.ts#L89-L98" }, { "kind": "interface", @@ -178989,7 +179028,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L99-L108" + "specLocation": "inference/_types/Services.ts#L100-L109" }, { "kind": "interface", @@ -179029,7 
+179068,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L110-L119" + "specLocation": "inference/_types/Services.ts#L111-L120" }, { "kind": "interface", @@ -179069,7 +179108,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L121-L130" + "specLocation": "inference/_types/Services.ts#L122-L131" }, { "kind": "interface", @@ -179109,7 +179148,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L132-L141" + "specLocation": "inference/_types/Services.ts#L133-L142" }, { "kind": "interface", @@ -179149,7 +179188,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L143-L152" + "specLocation": "inference/_types/Services.ts#L144-L153" }, { "kind": "interface", @@ -179189,7 +179228,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L154-L163" + "specLocation": "inference/_types/Services.ts#L155-L164" }, { "kind": "interface", @@ -179229,7 +179268,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L165-L174" + "specLocation": "inference/_types/Services.ts#L166-L175" }, { "kind": "interface", @@ -179269,7 +179308,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L176-L185" + "specLocation": "inference/_types/Services.ts#L177-L186" }, { "kind": "interface", @@ -179309,7 +179348,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L187-L196" + "specLocation": "inference/_types/Services.ts#L188-L197" }, { "kind": "interface", @@ -179349,7 +179388,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L197-L206" + "specLocation": "inference/_types/Services.ts#L198-L207" }, { "kind": "interface", @@ -179389,7 +179428,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L219-L228" + "specLocation": "inference/_types/Services.ts#L220-L229" }, { "kind": "interface", @@ -179429,7 +179468,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L208-L217" + "specLocation": "inference/_types/Services.ts#L209-L218" }, { "kind": "interface", @@ -179469,7 +179508,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L230-L239" + "specLocation": "inference/_types/Services.ts#L231-L240" }, { "kind": "interface", @@ -179509,7 +179548,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L241-L250" + "specLocation": "inference/_types/Services.ts#L242-L251" }, { "kind": "interface", @@ -179549,7 +179588,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L252-L261" + "specLocation": "inference/_types/Services.ts#L253-L262" }, { "kind": "interface", @@ -179589,7 +179628,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L263-L272" + "specLocation": "inference/_types/Services.ts#L264-L273" }, { "kind": "interface", @@ -179629,7 +179668,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L274-L283" + "specLocation": "inference/_types/Services.ts#L275-L284" }, { "kind": "interface", @@ -179669,7 +179708,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L285-L294" + "specLocation": "inference/_types/Services.ts#L286-L295" }, { "kind": "interface", @@ -179709,7 +179748,47 @@ } } ], - "specLocation": "inference/_types/Services.ts#L296-L305" + "specLocation": "inference/_types/Services.ts#L297-L306" + }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "InferenceEndpoint", + "namespace": "inference._types" + } + }, + "name": { + "name": "InferenceEndpointInfoNvidia", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The inference ID", + "name": "inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + 
"namespace": "_builtins" + } + } + }, + { + "description": "The task type", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "TaskTypeNvidia", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/Services.ts#L308-L317" }, { "kind": "interface", @@ -179749,7 +179828,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L307-L316" + "specLocation": "inference/_types/Services.ts#L319-L328" }, { "kind": "interface", @@ -179789,7 +179868,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L318-L327" + "specLocation": "inference/_types/Services.ts#L330-L339" }, { "kind": "interface", @@ -179829,7 +179908,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L329-L338" + "specLocation": "inference/_types/Services.ts#L341-L350" }, { "kind": "interface", @@ -179869,7 +179948,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L340-L349" + "specLocation": "inference/_types/Services.ts#L352-L361" }, { "kind": "interface", @@ -180460,6 +180539,192 @@ }, "specLocation": "inference/_types/CommonTypes.ts#L1802-L1806" }, + { + "kind": "enum", + "members": [ + { + "name": "ingest" + }, + { + "name": "search" + } + ], + "name": { + "name": "NvidiaInputType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1887-L1890" + }, + { + "kind": "interface", + "name": { + "name": "NvidiaServiceSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "A valid API key for your Nvidia endpoint.\nIt can be found in the `API Keys` section of your Nvidia account settings.", + "name": "api_key", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The URL of the Nvidia model endpoint.", + "name": "url", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the model to use for the inference task.\nRefer to the model's documentation for the name if needed.\nThe service has been tested and confirmed to work with the following models:\n\n* For the `text_embedding` task: `nvidia/llama-3.2-nv-embedqa-1b-v2`.\n* For the `completion` and `chat_completion` tasks: `microsoft/phi-3-mini-128k-instruct`.\n* For the `rerank` task: `nv-rerank-qa-mistral-4b:1`.\nThe service does not support the `baai/bge-m3` and `nvidia/nvclip` models for the `text_embedding` task because they do not recognize the `input_type` parameter.", + "name": "model_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "For a `text_embedding` task, the maximum number of tokens per input before chunking occurs.", + "name": "max_input_tokens", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "For a `text_embedding` task, the similarity measure. 
One of `cosine`, `dot_product`, or `l2_norm`.", + "name": "similarity", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "NvidiaSimilarityType", + "namespace": "inference._types" + } + } + }, + { + "description": "This setting helps to minimize the number of rate limit errors returned from the Nvidia API.\nBy default, the `nvidia` service sets the number of requests allowed per minute to 3000.", + "name": "rate_limit", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "RateLimitSetting", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L1812-L1846" + }, + { + "kind": "enum", + "members": [ + { + "name": "nvidia" + } + ], + "name": { + "name": "NvidiaServiceType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1855-L1857" + }, + { + "kind": "enum", + "members": [ + { + "name": "cosine" + }, + { + "name": "dot_product" + }, + { + "name": "l2_norm" + } + ], + "name": { + "name": "NvidiaSimilarityType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1859-L1863" + }, + { + "kind": "interface", + "name": { + "name": "NvidiaTaskSettings", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "For a `text_embedding` task, the type of input sent to the Nvidia endpoint.\nValid values are:\n\n* `ingest`: Mapped to Nvidia's `passage` value in the request. Used when generating embeddings during indexing.\n* `search`: Mapped to Nvidia's `query` value in the request. Used when generating embeddings during querying.\n\nIMPORTANT: If not specified, the `input_type` field in the request to the Nvidia endpoint defaults to `query`.", + "name": "input_type", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "NvidiaInputType", + "namespace": "inference._types" + } + } + }, + { + "description": "For a `text_embedding` task, the method to handle inputs longer than the maximum token length.\nValid values are:\n\n* `END`: When the input exceeds the maximum input token length, the end of the input is discarded.\n* `NONE`: When the input exceeds the maximum input token length, an error is returned.\n* `START`: When the input exceeds the maximum input token length, the start of the input is discarded.", + "name": "truncate", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "CohereTruncateType", + "namespace": "inference._types" + } + } + } + ], + "specLocation": "inference/_types/CommonTypes.ts#L1865-L1885" + }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "completion" + }, + { + "name": "chat_completion" + }, + { + "name": "rerank" + } + ], + "name": { + "name": "NvidiaTaskType", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/CommonTypes.ts#L1848-L1853" + }, { "kind": "interface", "name": { @@ -180545,7 +180810,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1812-L1854" + "specLocation": "inference/_types/CommonTypes.ts#L1892-L1934" }, { "kind": "enum", @@ -180558,7 +180823,7 @@ "name": "OpenAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1881-L1883" + "specLocation": "inference/_types/CommonTypes.ts#L1961-L1963" }, { "kind": "interface", @@ -180588,7 +180853,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1856-L1873" + "specLocation": "inference/_types/CommonTypes.ts#L1936-L1953" }, { "kind": "enum", @@ 
-180607,7 +180872,7 @@ "name": "OpenAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1875-L1879" + "specLocation": "inference/_types/CommonTypes.ts#L1955-L1959" }, { "kind": "interface", @@ -180689,7 +180954,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1885-L1917" + "specLocation": "inference/_types/CommonTypes.ts#L1965-L1997" }, { "kind": "enum", @@ -180702,7 +180967,7 @@ "name": "OpenShiftAiServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1926-L1928" + "specLocation": "inference/_types/CommonTypes.ts#L2006-L2008" }, { "kind": "enum", @@ -180721,7 +180986,7 @@ "name": "OpenShiftAiSimilarityType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1930-L1934" + "specLocation": "inference/_types/CommonTypes.ts#L2010-L2014" }, { "kind": "interface", @@ -180755,7 +181020,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1936-L1945" + "specLocation": "inference/_types/CommonTypes.ts#L2016-L2025" }, { "kind": "enum", @@ -180777,7 +181042,7 @@ "name": "OpenShiftAiTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L1919-L1924" + "specLocation": "inference/_types/CommonTypes.ts#L1999-L2004" }, { "kind": "interface", @@ -180844,7 +181109,7 @@ } } ], - "specLocation": "inference/_types/Services.ts#L416-L445" + "specLocation": "inference/_types/Services.ts#L428-L457" }, { "kind": "interface", @@ -180992,7 +181257,7 @@ "name": "ServiceSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L412-L412", + "specLocation": "inference/_types/Services.ts#L424-L424", "type": { "kind": "user_defined_value" } @@ -181076,7 +181341,7 @@ "name": "TaskSettings", "namespace": "inference._types" }, - "specLocation": "inference/_types/Services.ts#L414-L414", + "specLocation": "inference/_types/Services.ts#L426-L426", "type": { "kind": "user_defined_value" } @@ -181462,6 +181727,28 @@ }, "specLocation": "inference/_types/TaskType.ts#L137-L141" }, + { + "kind": "enum", + "members": [ + { + "name": "text_embedding" + }, + { + "name": "chat_completion" + }, + { + "name": "completion" + }, + { + "name": "rerank" + } + ], + "name": { + "name": "TaskTypeNvidia", + "namespace": "inference._types" + }, + "specLocation": "inference/_types/TaskType.ts#L143-L148" + }, { "kind": "enum", "members": [ @@ -181479,7 +181766,7 @@ "name": "TaskTypeOpenAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L143-L147" + "specLocation": "inference/_types/TaskType.ts#L150-L154" }, { "kind": "enum", @@ -181501,7 +181788,7 @@ "name": "TaskTypeOpenShiftAi", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L149-L154" + "specLocation": "inference/_types/TaskType.ts#L156-L161" }, { "kind": "enum", @@ -181517,7 +181804,7 @@ "name": "TaskTypeVoyageAI", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L156-L159" + "specLocation": "inference/_types/TaskType.ts#L163-L166" }, { "kind": "enum", @@ -181536,7 +181823,7 @@ "name": "TaskTypeWatsonx", "namespace": "inference._types" }, - "specLocation": "inference/_types/TaskType.ts#L161-L165" + "specLocation": "inference/_types/TaskType.ts#L168-L172" }, { "kind": "interface", @@ -181804,7 +182091,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1947-L1978" + "specLocation": "inference/_types/CommonTypes.ts#L2027-L2058" }, { "kind": "enum", 
@@ -181817,7 +182104,7 @@ "name": "VoyageAIServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L2011-L2013" + "specLocation": "inference/_types/CommonTypes.ts#L2091-L2093" }, { "kind": "interface", @@ -181877,7 +182164,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L1980-L2004" + "specLocation": "inference/_types/CommonTypes.ts#L2060-L2084" }, { "kind": "enum", @@ -181893,7 +182180,7 @@ "name": "VoyageAITaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L2006-L2009" + "specLocation": "inference/_types/CommonTypes.ts#L2086-L2089" }, { "kind": "interface", @@ -181981,7 +182268,7 @@ } } ], - "specLocation": "inference/_types/CommonTypes.ts#L2015-L2053" + "specLocation": "inference/_types/CommonTypes.ts#L2095-L2133" }, { "kind": "enum", @@ -181994,7 +182281,7 @@ "name": "WatsonxServiceType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L2061-L2063" + "specLocation": "inference/_types/CommonTypes.ts#L2141-L2143" }, { "kind": "enum", @@ -182013,7 +182300,7 @@ "name": "WatsonxTaskType", "namespace": "inference._types" }, - "specLocation": "inference/_types/CommonTypes.ts#L2055-L2059" + "specLocation": "inference/_types/CommonTypes.ts#L2135-L2139" }, { "kind": "request", @@ -182740,7 +183027,7 @@ } } }, - "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)", + "description": "Create an inference endpoint.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Mistral, Azure OpenAI, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face.\nFor built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models.\nHowever, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThe following integrations are available through the inference API. 
You can find the available task types next to the integration name:\n* AI21 (`chat_completion`, `completion`)\n* AlibabaCloud AI Search (`completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Amazon Bedrock (`completion`, `text_embedding`)\n* Amazon SageMaker (`chat_completion`, `completion`, `rerank`, `sparse_embedding`, `text_embedding`)\n* Anthropic (`completion`)\n* Azure AI Studio (`completion`, `rerank`, `text_embedding`)\n* Azure OpenAI (`completion`, `text_embedding`)\n* Cohere (`completion`, `rerank`, `text_embedding`)\n* DeepSeek (`chat_completion`, `completion`)\n* Elasticsearch (`rerank`, `sparse_embedding`, `text_embedding` - this service is for built-in models and models uploaded through Eland)\n* ELSER (`sparse_embedding`)\n* Google AI Studio (`completion`, `text_embedding`)\n* Google Vertex AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* Groq (`chat_completion`)\n* Hugging Face (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* JinaAI (`rerank`, `text_embedding`)\n* Llama (`chat_completion`, `completion`, `text_embedding`)\n* Mistral (`chat_completion`, `completion`, `text_embedding`)\n* Nvidia (`chat_completion`, `completion`, `text_embedding`, `rerank`)\n* OpenAI (`chat_completion`, `completion`, `text_embedding`)\n* OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`)\n* VoyageAI (`rerank`, `text_embedding`)\n* Watsonx inference integration (`text_embedding`)", "examples": { "InferencePutExample1": { "alternatives": [ @@ -182825,7 +183112,7 @@ } } ], - "specLocation": "inference/put/PutRequest.ts#L26-L94" + "specLocation": "inference/put/PutRequest.ts#L26-L95" }, { "kind": "response", @@ -187058,6 +187345,231 @@ }, "specLocation": "inference/put_mistral/PutMistralResponse.ts#L22-L25" }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.\nApplies only to the `text_embedding` task type.\nNot applicable to the `rerank`, `completion`, or `chat_completion` task types.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/docs/explore-analyze/elastic-inference/inference-api#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `nvidia`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "NvidiaServiceType", + "namespace": "inference._types" + } + } + }, + { + "description": "Settings used to install the inference model. 
These settings are specific to the `nvidia` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "NvidiaServiceSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "Settings to configure the inference task.\nApplies only to the `text_embedding` task type.\nNot applicable to the `rerank`, `completion`, or `chat_completion` task types.\nThese settings are specific to the task type you specified.", + "name": "task_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "NvidiaTaskSettings", + "namespace": "inference._types" + } + } + } + ] + }, + "description": "Create an Nvidia inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `nvidia` service.", + "examples": { + "PutNvidiaRequestExample1": { + "description": "Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint that performs a `text_embedding` task.", + "method_request": "PUT _inference/text_embedding/nvidia-text-embedding", + "summary": "A text embedding task", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-embeddings-url\",\n \"api_key\": \"nvidia-embeddings-token\",\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\"\n }\n}" + }, + "PutNvidiaRequestExample2": { + "description": "Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint that performs a `text_embedding` task, specifying custom `task_settings` and omitting the `url` parameter if the model is accessible via the default NVIDIA endpoint.", + "method_request": "PUT _inference/text_embedding/nvidia-text-embedding", + "summary": "A text embedding task with custom `task_settings` and no `url` parameter", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"api_key\": \"nvidia-text-embeddings-token\"\n },\n \"task_settings\": {\n \"input_type\": \"ingest\",\n \"truncate\": \"start\"\n }\n}" + }, + "PutNvidiaRequestExample3": { + "description": "Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint that performs a `completion` task.", + "method_request": "PUT _inference/completion/nvidia-completion", + "summary": "A completion task", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-completion-url\",\n \"api_key\": \"nvidia-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample4": { + "description": "Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint that performs a `completion` task, omitting the `url` parameter if the model is accessible via the default NVIDIA endpoint.", + "method_request": "PUT _inference/completion/nvidia-completion", + "summary": "A completion task without `url` parameter", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample5": { + "description": "Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint that performs a `chat_completion` task.", + "method_request": "PUT _inference/chat_completion/nvidia-chat-completion", + "summary": "A chat completion task", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-chat-completion-url\",\n \"api_key\": 
\"nvidia-chat-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample6": { + "description": "Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint that performs a `chat_completion` task, omitting the `url` parameter if the model is accessible via the default NVIDIA endpoint.", + "method_request": "PUT _inference/chat_completion/nvidia-chat-completion", + "summary": "A chat completion task without `url` parameter", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-chat-completion-token\",\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\"\n }\n}" + }, + "PutNvidiaRequestExample7": { + "description": "Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint that performs a `rerank` task.", + "method_request": "PUT _inference/rerank/nvidia-rerank", + "summary": "A rerank task", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"url\": \"nvidia-rerank-url\",\n \"api_key\": \"nvidia-rerank-token\",\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\"\n }\n}" + }, + "PutNvidiaRequestExample8": { + "description": "Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint that performs a `rerank` task, omitting the `url` parameter if the model is accessible via the default NVIDIA endpoint.", + "method_request": "PUT _inference/rerank/nvidia-rerank", + "summary": "A rerank task without `url` parameter", + "value": "{\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"api_key\": \"nvidia-rerank-token\",\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\"\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "name": { + "name": "Request", + "namespace": "inference.put_nvidia" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.\nNOTE: The `chat_completion` task type only supports streaming and only through the _stream API.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "NvidiaTaskType", + "namespace": "inference._types" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "nvidia_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [ + { + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "name": "timeout", + "required": false, + "serverDefault": "30s", + "type": { + "kind": "instance_of", + "type": { + "name": "Duration", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_nvidia/PutNvidiaRequest.ts#L31-L90" + }, + { + "kind": "response", + "body": { + "kind": "value", + "codegenName": "endpoint_info", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfoNvidia", + "namespace": "inference._types" + } + } + }, + "examples": { + "PutNvidiaResponseExample1": { + "description": "A successful response when creating an Nvidia `text_embedding` inference endpoint.", + "summary": "A text embedding task", + "value": "{\n \"inference_id\": \"nvidia-text-embedding\",\n \"task_type\": \"text_embedding\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"url\": \"nvidia-embeddings-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n },\n \"dimensions\": 2048,\n \"similarity\": 
\"dot_product\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + }, + "PutNvidiaResponseExample2": { + "description": "A successful response when creating an Nvidia `text_embedding` inference endpoint with custom `task_settings` and no `url` parameter.", + "summary": "A text embedding task with custom `task_settings` and no `url` parameter", + "value": "{\n \"inference_id\": \"nvidia-text-embedding\",\n \"task_type\": \"text_embedding\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nvidia/llama-3.2-nv-embedqa-1b-v2\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n },\n \"dimensions\": 2048,\n \"similarity\": \"dot_product\"\n },\n \"task_settings\": {\n \"input_type\": \"ingest\",\n \"truncate\": \"start\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + }, + "PutNvidiaResponseExample3": { + "description": "A successful response when creating an Nvidia `completion` inference endpoint.", + "summary": "A completion task", + "value": "{\n \"inference_id\": \"nvidia-completion\",\n \"task_type\": \"completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"url\": \"nvidia-completion-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample4": { + "description": "A successful response when creating an Nvidia `completion` inference endpoint without the `url` parameter.", + "summary": "A completion task without `url` parameter", + "value": "{\n \"inference_id\": \"nvidia-completion\",\n \"task_type\": \"completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample5": { + "description": "A successful response when creating an Nvidia `chat_completion` inference endpoint.", + "summary": "A chat completion task", + "value": "{\n \"inference_id\": \"nvidia-chat-completion\",\n \"task_type\": \"chat_completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"url\": \"nvidia-chat-completion-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample6": { + "description": "A successful response when creating an Nvidia `chat_completion` inference endpoint without the `url` parameter.", + "summary": "A chat completion task without `url` parameter", + "value": "{\n \"inference_id\": \"nvidia-chat-completion\",\n \"task_type\": \"chat_completion\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"microsoft/phi-3-mini-128k-instruct\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample7": { + "description": "A successful response when creating an Nvidia `rerank` inference endpoint.", + "summary": "A rerank task", + "value": "{\n \"inference_id\": \"nvidia-rerank\",\n \"task_type\": \"rerank\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\",\n \"url\": \"nvidia-rerank-url\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + }, + "PutNvidiaResponseExample8": { + "description": "A successful response when creating an Nvidia `rerank` inference endpoint without the `url` parameter.", + "summary": "A rerank task without `url` parameter", 
+ "value": "{\n \"inference_id\": \"nvidia-rerank\",\n \"task_type\": \"rerank\",\n \"service\": \"nvidia\",\n \"service_settings\": {\n \"model_id\": \"nv-rerank-qa-mistral-4b:1\",\n \"rate_limit\": {\n \"requests_per_minute\": 3000\n }\n }\n}" + } + }, + "name": { + "name": "Response", + "namespace": "inference.put_nvidia" + }, + "specLocation": "inference/put_nvidia/PutNvidiaResponse.ts#L22-L25" + }, { "kind": "request", "attachedBehaviors": [ diff --git a/output/typescript/types.ts b/output/typescript/types.ts index 0590943900..b4b41e77ec 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -14736,6 +14736,11 @@ export interface InferenceInferenceEndpointInfoMistral extends InferenceInferenc task_type: InferenceTaskTypeMistral } +export interface InferenceInferenceEndpointInfoNvidia extends InferenceInferenceEndpoint { + inference_id: string + task_type: InferenceTaskTypeNvidia +} + export interface InferenceInferenceEndpointInfoOpenAI extends InferenceInferenceEndpoint { inference_id: string task_type: InferenceTaskTypeOpenAI @@ -14820,6 +14825,28 @@ export type InferenceMistralServiceType = 'mistral' export type InferenceMistralTaskType = 'text_embedding' | 'completion' | 'chat_completion' +export type InferenceNvidiaInputType = 'ingest' | 'search' + +export interface InferenceNvidiaServiceSettings { + api_key: string + url?: string + model_id: string + max_input_tokens?: integer + similarity?: InferenceNvidiaSimilarityType + rate_limit?: InferenceRateLimitSetting +} + +export type InferenceNvidiaServiceType = 'nvidia' + +export type InferenceNvidiaSimilarityType = 'cosine' | 'dot_product' | 'l2_norm' + +export interface InferenceNvidiaTaskSettings { + input_type?: InferenceNvidiaInputType + truncate?: InferenceCohereTruncateType +} + +export type InferenceNvidiaTaskType = 'text_embedding' | 'completion' | 'chat_completion' | 'rerank' + export interface InferenceOpenAIServiceSettings { api_key: string dimensions?: integer @@ -14939,6 +14966,8 @@ export type InferenceTaskTypeLlama = 'text_embedding' | 'chat_completion' | 'com export type InferenceTaskTypeMistral = 'text_embedding' | 'chat_completion' | 'completion' +export type InferenceTaskTypeNvidia = 'text_embedding' | 'chat_completion' | 'completion' | 'rerank' + export type InferenceTaskTypeOpenAI = 'text_embedding' | 'chat_completion' | 'completion' export type InferenceTaskTypeOpenShiftAi = 'text_embedding' | 'chat_completion' | 'completion' | 'rerank' @@ -15334,6 +15363,20 @@ export interface InferencePutMistralRequest extends RequestBase { export type InferencePutMistralResponse = InferenceInferenceEndpointInfoMistral +export interface InferencePutNvidiaRequest extends RequestBase { + task_type: InferenceNvidiaTaskType + nvidia_inference_id: Id + timeout?: Duration + body?: { + chunking_settings?: InferenceInferenceChunkingSettings + service: InferenceNvidiaServiceType + service_settings: InferenceNvidiaServiceSettings + task_settings?: InferenceNvidiaTaskSettings + } +} + +export type InferencePutNvidiaResponse = InferenceInferenceEndpointInfoNvidia + export interface InferencePutOpenaiRequest extends RequestBase { task_type: InferenceOpenAITaskType openai_inference_id: Id diff --git a/package-lock.json b/package-lock.json index c7b190e242..8487c32945 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7,7 +7,7 @@ "": { "version": "overlay", "dependencies": { - "@redocly/cli": "^1.34.5" + "@redocly/cli": "^1.34.6" } }, "node_modules/@babel/code-frame": { @@ -434,9 +434,9 @@ } }, 
"node_modules/@redocly/cli": { - "version": "1.34.5", - "resolved": "https://registry.npmjs.org/@redocly/cli/-/cli-1.34.5.tgz", - "integrity": "sha512-5IEwxs7SGP5KEXjBKLU8Ffdz9by/KqNSeBk6YUVQaGxMXK//uYlTJIPntgUXbo1KAGG2d2q2XF8y4iFz6qNeiw==", + "version": "1.34.6", + "resolved": "https://registry.npmjs.org/@redocly/cli/-/cli-1.34.6.tgz", + "integrity": "sha512-V03jtLOXLm6+wpTuFNw9+eLHE6R3wywZo4Clt9XMPnulafbJcpCFz+J0e5/4Cw4zZB087xjU7WvRdI/bZ+pHtA==", "license": "MIT", "dependencies": { "@opentelemetry/api": "1.9.0", @@ -445,8 +445,8 @@ "@opentelemetry/sdk-trace-node": "1.26.0", "@opentelemetry/semantic-conventions": "1.27.0", "@redocly/config": "^0.22.0", - "@redocly/openapi-core": "1.34.5", - "@redocly/respect-core": "1.34.5", + "@redocly/openapi-core": "1.34.6", + "@redocly/respect-core": "1.34.6", "abort-controller": "^3.0.0", "chokidar": "^3.5.1", "colorette": "^1.2.0", @@ -458,8 +458,8 @@ "handlebars": "^4.7.6", "mobx": "^6.0.4", "pluralize": "^8.0.0", - "react": "^17.0.0 || ^18.2.0 || ^19.0.0", - "react-dom": "^17.0.0 || ^18.2.0 || ^19.0.0", + "react": "^17.0.0 || ^18.2.0 || ^19.2.1", + "react-dom": "^17.0.0 || ^18.2.0 || ^19.2.1", "redoc": "2.5.0", "semver": "^7.5.2", "simple-websocket": "^9.0.0", @@ -482,9 +482,9 @@ "license": "MIT" }, "node_modules/@redocly/openapi-core": { - "version": "1.34.5", - "resolved": "https://registry.npmjs.org/@redocly/openapi-core/-/openapi-core-1.34.5.tgz", - "integrity": "sha512-0EbE8LRbkogtcCXU7liAyC00n9uNG9hJ+eMyHFdUsy9lB/WGqnEBgwjA9q2cyzAVcdTkQqTBBU1XePNnN3OijA==", + "version": "1.34.6", + "resolved": "https://registry.npmjs.org/@redocly/openapi-core/-/openapi-core-1.34.6.tgz", + "integrity": "sha512-2+O+riuIUgVSuLl3Lyh5AplWZyVMNuG2F98/o6NrutKJfW4/GTZdPpZlIphS0HGgcOHgmWcCSHj+dWFlZaGSHw==", "license": "MIT", "dependencies": { "@redocly/ajv": "^8.11.2", @@ -503,14 +503,14 @@ } }, "node_modules/@redocly/respect-core": { - "version": "1.34.5", - "resolved": "https://registry.npmjs.org/@redocly/respect-core/-/respect-core-1.34.5.tgz", - "integrity": "sha512-GheC/g/QFztPe9UA9LamooSplQuy9pe0Yr8XGTqkz0ahivLDl7svoy/LSQNn1QH3XGtLKwFYMfTwFR2TAYyh5Q==", + "version": "1.34.6", + "resolved": "https://registry.npmjs.org/@redocly/respect-core/-/respect-core-1.34.6.tgz", + "integrity": "sha512-nXFBRctoB4CPCLR2it2WxDsuAE/nLd4EnW9mQ+IUKrIFAjMv1O6rgggxkgdlyKUyenYkajJIHSKwVbRS6FwlEQ==", "license": "MIT", "dependencies": { "@faker-js/faker": "^7.6.0", "@redocly/ajv": "8.11.2", - "@redocly/openapi-core": "1.34.5", + "@redocly/openapi-core": "1.34.6", "better-ajv-errors": "^1.2.0", "colorette": "^2.0.20", "concat-stream": "^2.0.0", @@ -2274,24 +2274,24 @@ } }, "node_modules/react": { - "version": "19.2.0", - "resolved": "https://registry.npmjs.org/react/-/react-19.2.0.tgz", - "integrity": "sha512-tmbWg6W31tQLeB5cdIBOicJDJRR2KzXsV7uSK9iNfLWQ5bIZfxuPEHp7M8wiHyHnn0DD1i7w3Zmin0FtkrwoCQ==", + "version": "19.2.2", + "resolved": "https://registry.npmjs.org/react/-/react-19.2.2.tgz", + "integrity": "sha512-BdOGOY8OKRBcgoDkwqA8Q5XvOIhoNx/Sh6BnGJlet2Abt0X5BK0BDrqGyQgLhAVjD2nAg5f6o01u/OPUhG022Q==", "license": "MIT", "engines": { "node": ">=0.10.0" } }, "node_modules/react-dom": { - "version": "19.2.0", - "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.0.tgz", - "integrity": "sha512-UlbRu4cAiGaIewkPyiRGJk0imDN2T3JjieT6spoL2UeSf5od4n5LB/mQ4ejmxhCFT1tYe8IvaFulzynWovsEFQ==", + "version": "19.2.2", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.2.tgz", + "integrity": 
"sha512-fhyD2BLrew6qYf4NNtHff1rLXvzR25rq49p+FeqByOazc6TcSi2n8EYulo5C1PbH+1uBW++5S1SG7FcUU6mlDg==", "license": "MIT", "dependencies": { "scheduler": "^0.27.0" }, "peerDependencies": { - "react": "^19.2.0" + "react": "^19.2.2" } }, "node_modules/react-is": { diff --git a/package.json b/package.json index a5f1a5e0a7..ea76d681d6 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,7 @@ "transform-to-openapi": "npm run transform-to-openapi --prefix compiler --" }, "dependencies": { - "@redocly/cli": "^1.34.5" + "@redocly/cli": "^1.34.6" }, "version": "overlay" } diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv index be14073e45..a19369c61d 100644 --- a/specification/_doc_ids/table.csv +++ b/specification/_doc_ids/table.csv @@ -398,6 +398,7 @@ inference-api-put-huggingface,https://www.elastic.co/docs/api/doc/elasticsearch/ inference-api-put-jinaai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-jinaai,, inference-api-put-llama,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-llama,, inference-api-put-mistral,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-mistral,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-mistral.html, +inference-api-put-nvidia,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia,, inference-api-put-openai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/8.18/infer-service-openai.html, inference-api-put-openshift-ai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-openshift-ai,, inference-api-put-voyageai,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-voyageai,, diff --git a/specification/_json_spec/inference.put_nvidia.json b/specification/_json_spec/inference.put_nvidia.json new file mode 100644 index 0000000000..0001c99623 --- /dev/null +++ b/specification/_json_spec/inference.put_nvidia.json @@ -0,0 +1,49 @@ +{ + "inference.put_nvidia": { + "documentation": { + "url": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-nvidia", + "description": "Create an Nvidia inference endpoint" + }, + "stability": "stable", + "visibility": "public", + "headers": { + "accept": ["application/json"], + "content_type": ["application/json"] + }, + "url": { + "paths": [ + { + "path": "/_inference/{task_type}/{nvidia_inference_id}", + "methods": ["PUT"], + "parts": { + "task_type": { + "type": "enum", + "description": "The task type", + "options": [ + "rerank", + "text_embedding", + "completion", + "chat_completion" + ] + }, + "nvidia_inference_id": { + "type": "string", + "description": "The inference ID" + } + } + } + ] + }, + "body": { + "description": "The inference endpoint's task and service settings", + "required": true + }, + "params": { + "timeout": { + "type": "time", + "description": "Specifies the amount of time to wait for the inference endpoint to be created.", + "default": "30s" + } + } + } +} diff --git a/specification/inference/_types/CommonTypes.ts b/specification/inference/_types/CommonTypes.ts index 69ee9a00da..878760e2c6 100644 --- a/specification/inference/_types/CommonTypes.ts +++ b/specification/inference/_types/CommonTypes.ts @@ -1809,6 +1809,86 @@ export enum MistralServiceType { mistral } +export class NvidiaServiceSettings { + /** + * A valid API key for your 
Nvidia endpoint. + * It can be found in the `API Keys` section of your Nvidia account settings. + */ + api_key: string + /** + * The URL of the Nvidia model endpoint. + */ + url?: string + /** + * The name of the model to use for the inference task. + * Refer to the model's documentation for the name if needed. + * The service has been tested and confirmed to work with the following models: + * + * * For the `text_embedding` task - `nvidia/llama-3.2-nv-embedqa-1b-v2`. + * * For the `completion` and `chat_completion` tasks - `microsoft/phi-3-mini-128k-instruct`. + * * For the `rerank` task - `nv-rerank-qa-mistral-4b:1`. + * + * The service doesn't support the `baai/bge-m3` and `nvidia/nvclip` models for the `text_embedding` task because they don't recognize the `input_type` parameter. + */ + model_id: string + /** + * For a `text_embedding` task, the maximum number of tokens per input before chunking occurs. + */ + max_input_tokens?: integer + /** + * For a `text_embedding` task, the similarity measure. One of cosine, dot_product, l2_norm. + */ + similarity?: NvidiaSimilarityType + /** + * This setting helps to minimize the number of rate limit errors returned from the Nvidia API. + * By default, the `nvidia` service sets the number of requests allowed per minute to 3000. + */ + rate_limit?: RateLimitSetting +} + +export enum NvidiaTaskType { + text_embedding, + completion, + chat_completion, + rerank +} + +export enum NvidiaServiceType { + nvidia +} + +export enum NvidiaSimilarityType { + cosine, + dot_product, + l2_norm +} + +export class NvidiaTaskSettings { + /** + * For a `text_embedding` task, the type of input sent to the Nvidia endpoint. + * Valid values are: + * + * * `ingest`: Mapped to Nvidia's `passage` value in the request. Used when generating embeddings during indexing. + * * `search`: Mapped to Nvidia's `query` value in the request. Used when generating embeddings during querying. + * + * IMPORTANT: If not specified, the `input_type` field in the request to the Nvidia endpoint defaults to `query`. + */ + input_type?: NvidiaInputType + /** + * For a `text_embedding` task, the method to handle inputs longer than the maximum token length. + * Valid values are: + * + * * `END`: When the input exceeds the maximum input token length, the end of the input is discarded. + * * `NONE`: When the input exceeds the maximum input token length, an error is returned. + * * `START`: When the input exceeds the maximum input token length, the start of the input is discarded. + */ + truncate?: CohereTruncateType +} + +export enum NvidiaInputType { + ingest, + search +} + export class OpenAIServiceSettings { /** * A valid API key of your OpenAI account. 
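The classes above are the source of truth for the generated JSON and TypeScript artifacts earlier in this diff. As a quick illustration of how the pieces compose, here is a minimal sketch of a request object that satisfies the generated `InferencePutNvidiaRequest` shape from `output/typescript/types.ts`; the import path is hypothetical, the credential strings are the placeholders used by this PR's examples, and nothing in the sketch is part of the change itself.

```typescript
// Hypothetical import path; in practice these types ship with the
// Elasticsearch JS client's generated definitions (output/typescript/types.ts).
import type { InferencePutNvidiaRequest } from './output/typescript/types'

// A text_embedding endpoint definition mirroring PutNvidiaRequestExample2.
const request: InferencePutNvidiaRequest = {
  task_type: 'text_embedding',
  nvidia_inference_id: 'nvidia-text-embedding',
  timeout: '30s', // optional; matches the server default
  body: {
    service: 'nvidia',
    service_settings: {
      api_key: 'nvidia-text-embeddings-token', // required
      model_id: 'nvidia/llama-3.2-nv-embedqa-1b-v2', // required
      // `url` is omitted: the model is reachable via the default NVIDIA endpoint.
      similarity: 'dot_product', // 'cosine' | 'dot_product' | 'l2_norm'
    },
    task_settings: {
      // 'ingest' maps to Nvidia's 'passage' input type; 'search' maps to 'query'.
      input_type: 'ingest',
    },
  },
}
```

Note that `truncate` deliberately reuses `CohereTruncateType` rather than introducing a new enum, so truncation semantics stay aligned with the Cohere service definitions.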
diff --git a/specification/inference/_types/Services.ts b/specification/inference/_types/Services.ts index f5f759a894..fec93938c9 100644 --- a/specification/inference/_types/Services.ts +++ b/specification/inference/_types/Services.ts @@ -41,6 +41,7 @@ import { TaskTypeJinaAi, TaskTypeLlama, TaskTypeMistral, + TaskTypeNvidia, TaskTypeOpenAI, TaskTypeOpenShiftAi, TaskTypeVoyageAI, @@ -304,6 +305,17 @@ export class InferenceEndpointInfoMistral extends InferenceEndpoint { task_type: TaskTypeMistral } +export class InferenceEndpointInfoNvidia extends InferenceEndpoint { + /** + * The inference ID + */ + inference_id: string + /** + * The task type + */ + task_type: TaskTypeNvidia +} + export class InferenceEndpointInfoOpenAI extends InferenceEndpoint { /** * The inference Id diff --git a/specification/inference/_types/TaskType.ts b/specification/inference/_types/TaskType.ts index 44ddf99444..6756db56a3 100644 --- a/specification/inference/_types/TaskType.ts +++ b/specification/inference/_types/TaskType.ts @@ -140,6 +140,13 @@ export enum TaskTypeMistral { completion } +export enum TaskTypeNvidia { + text_embedding, + chat_completion, + completion, + rerank +} + export enum TaskTypeOpenAI { text_embedding, chat_completion, diff --git a/specification/inference/put/PutRequest.ts b/specification/inference/put/PutRequest.ts index c231a0e441..f65351d155 100644 --- a/specification/inference/put/PutRequest.ts +++ b/specification/inference/put/PutRequest.ts @@ -49,6 +49,7 @@ import { TaskType } from '@inference/_types/TaskType' * * JinaAI (`rerank`, `text_embedding`) * * Llama (`chat_completion`, `completion`, `text_embedding`) * * Mistral (`chat_completion`, `completion`, `text_embedding`) + * * Nvidia (`chat_completion`, `completion`, `text_embedding`, `rerank`) * * OpenAI (`chat_completion`, `completion`, `text_embedding`) * * OpenShift AI (`chat_completion`, `completion`, `rerank`, `text_embedding`) * * VoyageAI (`rerank`, `text_embedding`) diff --git a/specification/inference/put_nvidia/PutNvidiaRequest.ts b/specification/inference/put_nvidia/PutNvidiaRequest.ts new file mode 100644 index 0000000000..abf1d05f84 --- /dev/null +++ b/specification/inference/put_nvidia/PutNvidiaRequest.ts @@ -0,0 +1,90 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { RequestBase } from '@_types/Base' +import { Id } from '@_types/common' +import { Duration } from '@_types/Time' +import { + NvidiaServiceSettings, + NvidiaServiceType, + NvidiaTaskSettings, + NvidiaTaskType +} from '@inference/_types/CommonTypes' +import { InferenceChunkingSettings } from '@inference/_types/Services' + +/** + * Create an Nvidia inference endpoint. + * + * Create an inference endpoint to perform an inference task with the `nvidia` service. 
+ * @rest_spec_name inference.put_nvidia + * @availability stack since=9.3.0 stability=stable visibility=public + * @availability serverless stability=stable visibility=public + * @cluster_privileges manage_inference + * @doc_id inference-api-put-nvidia + */ +export interface Request extends RequestBase { + urls: [ + { + path: '/_inference/{task_type}/{nvidia_inference_id}' + methods: ['PUT'] + } + ] + path_parts: { + /** + * The type of the inference task that the model will perform. + * NOTE: The `chat_completion` task type only supports streaming and only through the _stream API. + */ + task_type: NvidiaTaskType + /** + * The unique identifier of the inference endpoint. + */ + nvidia_inference_id: Id + } + query_parameters: { + /** + * Specifies the amount of time to wait for the inference endpoint to be created. + * @server_default 30s + */ + timeout?: Duration + } + body: { + /** + * The chunking configuration object. + * Applies only to the `text_embedding` task type. + * Not applicable to the `rerank`, `completion`, or `chat_completion` task types. + * @ext_doc_id inference-chunking + */ + chunking_settings?: InferenceChunkingSettings + /** + * The type of service supported for the specified task type. In this case, `nvidia`. + */ + service: NvidiaServiceType + /** + * Settings used to install the inference model. These settings are specific to the `nvidia` service. + */ + service_settings: NvidiaServiceSettings + /** + * Settings to configure the inference task. + * Applies only to the `text_embedding` task type. + * Not applicable to the `rerank`, `completion`, or `chat_completion` task types. + * These settings are specific to the task type you specified. + */ + task_settings?: NvidiaTaskSettings + } +} diff --git a/specification/inference/put_nvidia/PutNvidiaResponse.ts b/specification/inference/put_nvidia/PutNvidiaResponse.ts new file mode 100644 index 0000000000..9a432cde27 --- /dev/null +++ b/specification/inference/put_nvidia/PutNvidiaResponse.ts @@ -0,0 +1,25 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { InferenceEndpointInfoNvidia } from '@inference/_types/Services' + +export class Response { + /** @codegen_name endpoint_info */ + body: InferenceEndpointInfoNvidia +} diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample1.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample1.yaml new file mode 100644 index 0000000000..88b4ab3701 --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample1.yaml @@ -0,0 +1,15 @@ +summary: A text embedding task +description: + Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint + that performs a `text_embedding` task. 
+method_request: 'PUT _inference/text_embedding/nvidia-text-embedding' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "url": "nvidia-embeddings-url", + "api_key": "nvidia-embeddings-token", + "model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample2.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample2.yaml new file mode 100644 index 0000000000..4d55845a2d --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample2.yaml @@ -0,0 +1,19 @@ +summary: A text embedding task with custom `task_settings` and no `url` parameter +description: + Run `PUT _inference/text_embedding/nvidia-text-embedding` to create an inference endpoint + that performs a `text_embedding` task, specifying custom `task_settings` and omitting the `url` parameter + if the model is accessible via the default NVIDIA endpoint. +method_request: 'PUT _inference/text_embedding/nvidia-text-embedding' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2", + "api_key": "nvidia-text-embeddings-token" + }, + "task_settings": { + "input_type": "ingest", + "truncate": "start" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample3.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample3.yaml new file mode 100644 index 0000000000..49031da4bc --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample3.yaml @@ -0,0 +1,15 @@ +summary: A completion task +description: + Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint + that performs a `completion` task. +method_request: 'PUT _inference/completion/nvidia-completion' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "url": "nvidia-completion-url", + "api_key": "nvidia-completion-token", + "model_id": "microsoft/phi-3-mini-128k-instruct" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample4.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample4.yaml new file mode 100644 index 0000000000..14ca0fab97 --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample4.yaml @@ -0,0 +1,15 @@ +summary: A completion task without `url` parameter +description: + Run `PUT _inference/completion/nvidia-completion` to create an inference endpoint + that performs a `completion` task, omitting the `url` parameter + if the model is accessible via the default NVIDIA endpoint. +method_request: 'PUT _inference/completion/nvidia-completion' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "api_key": "nvidia-completion-token", + "model_id": "microsoft/phi-3-mini-128k-instruct" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample5.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample5.yaml new file mode 100644 index 0000000000..982f53b7d2 --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample5.yaml @@ -0,0 +1,15 @@ +summary: A chat completion task +description: + Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint + that performs a `chat_completion` task. 
+method_request: 'PUT _inference/chat_completion/nvidia-chat-completion' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "url": "nvidia-chat-completion-url", + "api_key": "nvidia-chat-completion-token", + "model_id": "microsoft/phi-3-mini-128k-instruct" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample6.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample6.yaml new file mode 100644 index 0000000000..14a975c6bd --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample6.yaml @@ -0,0 +1,15 @@ +summary: A chat completion task without `url` parameter +description: + Run `PUT _inference/chat_completion/nvidia-chat-completion` to create an inference endpoint + that performs a `chat_completion` task, omitting the `url` parameter + if the model is accessible via the default NVIDIA endpoint. +method_request: 'PUT _inference/chat_completion/nvidia-chat-completion' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "api_key": "nvidia-chat-completion-token", + "model_id": "microsoft/phi-3-mini-128k-instruct" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample7.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample7.yaml new file mode 100644 index 0000000000..f701b40130 --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample7.yaml @@ -0,0 +1,15 @@ +summary: A rerank task +description: + Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint + that performs a `rerank` task. +method_request: 'PUT _inference/rerank/nvidia-rerank' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "url": "nvidia-rerank-url", + "api_key": "nvidia-rerank-token", + "model_id": "nv-rerank-qa-mistral-4b:1" + } + } diff --git a/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample8.yaml b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample8.yaml new file mode 100644 index 0000000000..7e39170f0f --- /dev/null +++ b/specification/inference/put_nvidia/examples/request/PutNvidiaRequestExample8.yaml @@ -0,0 +1,15 @@ +summary: A rerank task without `url` parameter +description: + Run `PUT _inference/rerank/nvidia-rerank` to create an inference endpoint + that performs a `rerank` task, omitting the `url` parameter + if the model is accessible via the default NVIDIA endpoint. +method_request: 'PUT _inference/rerank/nvidia-rerank' +# type: "request" +value: |- + { + "service": "nvidia", + "service_settings": { + "api_key": "nvidia-rerank-token", + "model_id": "nv-rerank-qa-mistral-4b:1" + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample1.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample1.yaml new file mode 100644 index 0000000000..79c703f0d7 --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample1.yaml @@ -0,0 +1,24 @@ +summary: A text embedding task +description: A successful response when creating an Nvidia `text_embedding` inference endpoint. 
+# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-text-embedding", + "task_type": "text_embedding", + "service": "nvidia", + "service_settings": { + "model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2", + "url": "nvidia-embeddings-url", + "rate_limit": { + "requests_per_minute": 3000 + }, + "dimensions": 2048, + "similarity": "dot_product" + }, + "chunking_settings": { + "strategy": "sentence", + "max_chunk_size": 250, + "sentence_overlap": 1 + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample2.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample2.yaml new file mode 100644 index 0000000000..e3f0bfdd0f --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample2.yaml @@ -0,0 +1,29 @@ +summary: A text embedding task with custom `task_settings` and no `url` parameter +description: + A successful response when creating an Nvidia `text_embedding` inference endpoint + with custom `task_settings` and no `url` parameter. +# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-text-embedding", + "task_type": "text_embedding", + "service": "nvidia", + "service_settings": { + "model_id": "nvidia/llama-3.2-nv-embedqa-1b-v2", + "rate_limit": { + "requests_per_minute": 3000 + }, + "dimensions": 2048, + "similarity": "dot_product" + }, + "task_settings": { + "input_type": "ingest", + "truncate": "start" + }, + "chunking_settings": { + "strategy": "sentence", + "max_chunk_size": 250, + "sentence_overlap": 1 + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample3.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample3.yaml new file mode 100644 index 0000000000..be467fc1b6 --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample3.yaml @@ -0,0 +1,17 @@ +summary: A completion task +description: A successful response when creating an Nvidia `completion` inference endpoint. +# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-completion", + "task_type": "completion", + "service": "nvidia", + "service_settings": { + "model_id": "microsoft/phi-3-mini-128k-instruct", + "url": "nvidia-completion-url", + "rate_limit": { + "requests_per_minute": 3000 + } + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample4.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample4.yaml new file mode 100644 index 0000000000..376c92b43c --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample4.yaml @@ -0,0 +1,16 @@ +summary: A completion task without `url` parameter +description: A successful response when creating an Nvidia `completion` inference endpoint without the `url` parameter. 
+# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-completion", + "task_type": "completion", + "service": "nvidia", + "service_settings": { + "model_id": "microsoft/phi-3-mini-128k-instruct", + "rate_limit": { + "requests_per_minute": 3000 + } + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample5.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample5.yaml new file mode 100644 index 0000000000..839d63b5d4 --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample5.yaml @@ -0,0 +1,17 @@ +summary: A chat completion task +description: A successful response when creating an Nvidia `chat_completion` inference endpoint. +# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-chat-completion", + "task_type": "chat_completion", + "service": "nvidia", + "service_settings": { + "model_id": "microsoft/phi-3-mini-128k-instruct", + "url": "nvidia-chat-completion-url", + "rate_limit": { + "requests_per_minute": 3000 + } + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample6.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample6.yaml new file mode 100644 index 0000000000..ef7ffc21ac --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample6.yaml @@ -0,0 +1,16 @@ +summary: A chat completion task without `url` parameter +description: A successful response when creating an Nvidia `chat_completion` inference endpoint without the `url` parameter. +# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-chat-completion", + "task_type": "chat_completion", + "service": "nvidia", + "service_settings": { + "model_id": "microsoft/phi-3-mini-128k-instruct", + "rate_limit": { + "requests_per_minute": 3000 + } + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample7.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample7.yaml new file mode 100644 index 0000000000..cef11ee4a3 --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample7.yaml @@ -0,0 +1,17 @@ +summary: A rerank task +description: A successful response when creating an Nvidia `rerank` inference endpoint. +# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-rerank", + "task_type": "rerank", + "service": "nvidia", + "service_settings": { + "model_id": "nv-rerank-qa-mistral-4b:1", + "url": "nvidia-rerank-url", + "rate_limit": { + "requests_per_minute": 3000 + } + } + } diff --git a/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample8.yaml b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample8.yaml new file mode 100644 index 0000000000..b99325ed5e --- /dev/null +++ b/specification/inference/put_nvidia/examples/response/PutNvidiaResponseExample8.yaml @@ -0,0 +1,16 @@ +summary: A rerank task without `url` parameter +description: A successful response when creating an Nvidia `rerank` inference endpoint without the `url` parameter. +# type: response +# response_code: +value: |- + { + "inference_id": "nvidia-rerank", + "task_type": "rerank", + "service": "nvidia", + "service_settings": { + "model_id": "nv-rerank-qa-mistral-4b:1", + "rate_limit": { + "requests_per_minute": 3000 + } + } + }
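End to end, the new files define a single operation: `PUT /_inference/{task_type}/{nvidia_inference_id}` with a required JSON body, answered by an `InferenceEndpointInfoNvidia` document, and guarded by the `manage_inference` cluster privilege. The sketch below replays `PutNvidiaRequestExample8` over plain `fetch` to show that wire shape; the cluster URL and Elasticsearch API key are placeholders, and error handling is reduced to a single status check.

```typescript
// Placeholder connection details; substitute real values for your cluster.
const ES_URL = 'https://localhost:9200'
const ES_API_KEY = '<elasticsearch-api-key>'

// Creates the `rerank` endpoint from PutNvidiaRequestExample8 and returns
// the endpoint document (see PutNvidiaResponseExample8 for its shape).
async function createNvidiaRerankEndpoint(): Promise<unknown> {
  const res = await fetch(`${ES_URL}/_inference/rerank/nvidia-rerank?timeout=30s`, {
    method: 'PUT',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `ApiKey ${ES_API_KEY}`,
    },
    body: JSON.stringify({
      service: 'nvidia',
      service_settings: {
        api_key: 'nvidia-rerank-token', // the NVIDIA credential, required
        model_id: 'nv-rerank-qa-mistral-4b:1',
        // `url` omitted: the model is accessible via the default NVIDIA endpoint.
      },
    }),
  })
  if (!res.ok) {
    throw new Error(`inference.put_nvidia failed with HTTP ${res.status}`)
  }
  // A successful body includes the service's default rate_limit of
  // 3000 requests per minute, as shown in the response examples above.
  return res.json()
}
```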