From c7c2a92ec82a67b07454d5702911a93257567cbe Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Mon, 3 Nov 2025 15:24:33 +0800
Subject: [PATCH 1/2] llama-stack: switches to open model configuration with
 openai remote

Signed-off-by: Adrian Cole
---
 inference-platforms/llama-stack/README.md  | 15 ++++++++-------
 .../llama-stack/docker-compose.yml         | 10 ++++++----
 inference-platforms/llama-stack/env.local  | 12 ++++--------
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/inference-platforms/llama-stack/README.md b/inference-platforms/llama-stack/README.md
index 6e8e202..c37f0aa 100644
--- a/inference-platforms/llama-stack/README.md
+++ b/inference-platforms/llama-stack/README.md
@@ -1,10 +1,9 @@
 # Llama Stack
 
-This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an
-OpenAI compatible API.
+This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
+compatible API.
 
-This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces
-and metrics from signals recorded with Llama Stack's observability SDK.
+**Note**: Telemetry is currently broken in v0.3.1, but not on main.
 
 ## Prerequisites
 
@@ -38,9 +37,10 @@ uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 
 ## Notes
 
-Here are some constraints about the LlamaStack implementation:
-* Only supports llama models (so not Qwen)
-* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks.
+* Uses the `starter` distribution with its built-in `remote::openai` provider,
+  pointing to Ollama via `OPENAI_BASE_URL` environment variable.
+* Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) as of
+  [PR #3822][prefix-pr].
 * Until [this issue][docker] resolves, running docker on Apple Silicon
   requires emulation.
 
@@ -48,4 +48,5 @@ uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html
 [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
+[prefix-pr]: https://github.com/meta-llama/llama-stack/pull/3822
 [docker]: https://github.com/llamastack/llama-stack/issues/406
diff --git a/inference-platforms/llama-stack/docker-compose.yml b/inference-platforms/llama-stack/docker-compose.yml
index e0d3b7d..28f2288 100644
--- a/inference-platforms/llama-stack/docker-compose.yml
+++ b/inference-platforms/llama-stack/docker-compose.yml
@@ -7,7 +7,7 @@ services:
     env_file:
       - env.local
     entrypoint: sh
-    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
+    command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
     extra_hosts:
       # send localhost traffic to the docker host, e.g. your laptop
       - "localhost:host-gateway"
@@ -15,9 +15,11 @@ services:
     depends_on:
       ollama-pull:
         condition: service_completed_successfully
-    image: llamastack/distribution-starter:0.2.20
+    # TODO: switch to 0.3.2 or 0.4.0
+    image: llamastack/distribution-starter:local
     container_name: llama-stack
-    platform: linux/amd64 # Force amd64 with emulation
+    # TODO: put back as published images are amd64 only
+    # platform: linux/amd64 # Force amd64 with emulation
     tty: true
     env_file:
       - env.local
@@ -26,7 +28,7 @@ services:
     # Ensure the container which specially treats localhost routes back to the
    # host machine, e.g. your laptop.
     environment:
-      - OLLAMA_URL=http://host.docker.internal:11434
+      - OPENAI_BASE_URL=http://host.docker.internal:11434/v1
       - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
     extra_hosts:
       - "host.docker.internal:host-gateway"
diff --git a/inference-platforms/llama-stack/env.local b/inference-platforms/llama-stack/env.local
index 0af6145..1fea890 100644
--- a/inference-platforms/llama-stack/env.local
+++ b/inference-platforms/llama-stack/env.local
@@ -1,14 +1,10 @@
-# Override default ENV variables for llama-stack
-OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1
+# OpenAI-compatible endpoint configuration
+OPENAI_BASE_URL=http://localhost:8321/v1
+# Models require `provider_id/` prefix, in this case `openai`
+CHAT_MODEL=openai/qwen3:0.6b
 OPENAI_API_KEY=unused
-CHAT_MODEL=llama3.2:1b
-
-# Variables used by llama-stack
-OLLAMA_URL=http://localhost:11434
-INFERENCE_MODEL=llama3.2:1b
 
 # OpenTelemetry configuration
-TELEMETRY_SINKS=otel_trace,otel_metric
 OTEL_SERVICE_NAME=llama-stack
 OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
 OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf

From 199b86b04f4de4ff8c17e68899c0208c900230b0 Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Mon, 3 Nov 2025 16:45:30 +0800
Subject: [PATCH 2/2] agent.py: adds --use-responses-api for server-side MCP

Signed-off-by: Adrian Cole
---
 inference-platforms/agent.py              | 43 +++++++++++++++++------
 inference-platforms/llama-stack/README.md | 12 +++++++
 inference-platforms/llama-stack/env.local |  3 +-
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/inference-platforms/agent.py b/inference-platforms/agent.py
index 276c3d5..11ea2cc 100644
--- a/inference-platforms/agent.py
+++ b/inference-platforms/agent.py
@@ -1,7 +1,7 @@
 # run like this: uv run --exact -q --env-file .env agent.py
 # /// script
 # dependencies = [
-#   "openai-agents",
+#   "openai-agents @ git+https://github.com/openai/openai-agents-python.git@refs/pull/2034/head",
 #   "httpx",
 #   "mcp",
 #   "elastic-opentelemetry",
@@ -16,23 +16,25 @@
 # This must precede any other imports you want to instrument!
 auto_instrumentation.initialize()
 
+import argparse
 import asyncio
 import os
 from datetime import datetime, timedelta
 
 from agents import (
     Agent,
+    HostedMCPTool,
     OpenAIProvider,
     RunConfig,
     Runner,
     Tool,
 )
 from agents.mcp import MCPServerStreamableHttp, MCPUtil
+from openai.types.responses.tool_param import Mcp
 
 
-async def run_agent(tools: list[Tool]):
-    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
-    model = OpenAIProvider(use_responses=False).get_model(model_name)
+async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
+    model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
     agent = Agent(
         name="flight-search-agent",
         model=model,
@@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):
 
 
 async def main():
+    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
+    parser.add_argument("--use-responses-api", action="store_true", help="Use the Responses API with server-side MCP instead of Chat Completions")
+    args = parser.parse_args()
+
+    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
+    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
+    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)
+
+    if args.use_responses_api:
+        # Server-side MCP via Responses API
+        tools = [
+            HostedMCPTool(
+                tool_config=Mcp(
+                    type="mcp",
+                    server_url=mcp_url,
+                    server_label="kiwi-flights",
+                    headers=mcp_headers,
+                    require_approval="never",
+                )
+            )
+        ]
+        await run_agent(tools, model_name, use_responses=True)
+        return
+
+    # Client-side MCP orchestration
     async with MCPServerStreamableHttp(
-        {
-            "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"),
-            "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h),
-            "timeout": 30.0,
-        },
+        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
         client_session_timeout_seconds=60.0,
     ) as server:
         tools = await server.list_tools()
         util = MCPUtil()
         tools = [util.to_function_tool(tool, server, False) for tool in tools]
-        await run_agent(tools)
+        await run_agent(tools, model_name, use_responses=False)
 
 
 if __name__ == "__main__":
diff --git a/inference-platforms/llama-stack/README.md b/inference-platforms/llama-stack/README.md
index c37f0aa..6f48dad 100644
--- a/inference-platforms/llama-stack/README.md
+++ b/inference-platforms/llama-stack/README.md
@@ -35,8 +35,19 @@ Or, for the OpenAI Responses API
 uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 ```
 
+### MCP Agent
+
+```bash
+uv run --exact -q --env-file env.local ../agent.py --use-responses-api
+```
+
 ## Notes
 
+* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw,
+  which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
+* Until [this PR][openai-agents-pr] merges, the agent pins its branch, which
+  fixes handling of providers that don't return token usage details.
+
 * Uses the `starter` distribution with its built-in `remote::openai` provider,
   pointing to Ollama via `OPENAI_BASE_URL` environment variable.
 * Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) as of
@@ -50,3 +61,4 @@ uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
 [prefix-pr]: https://github.com/meta-llama/llama-stack/pull/3822
 [docker]: https://github.com/llamastack/llama-stack/issues/406
+[openai-agents-pr]: https://github.com/openai/openai-agents-python/pull/2034
diff --git a/inference-platforms/llama-stack/env.local b/inference-platforms/llama-stack/env.local
index 1fea890..69ff51c 100644
--- a/inference-platforms/llama-stack/env.local
+++ b/inference-platforms/llama-stack/env.local
@@ -1,8 +1,9 @@
 # OpenAI-compatible endpoint configuration
 OPENAI_BASE_URL=http://localhost:8321/v1
+OPENAI_API_KEY=unused
 # Models require `provider_id/` prefix, in this case `openai`
 CHAT_MODEL=openai/qwen3:0.6b
-OPENAI_API_KEY=unused
+AGENT_MODEL=openai/qwen3:1.7b
 
 # OpenTelemetry configuration
 OTEL_SERVICE_NAME=llama-stack
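
To sanity-check the configuration this series lands (Llama Stack's OpenAI-compatible endpoint plus the provider-prefixed model names in `env.local`), a single chat call is enough. The sketch below is not part of the patch: it assumes the `openai` Python package, reuses the values from `env.local`, and the script name and prompt are illustrative only.

```python
# sanity_check.py (illustrative name): verify the Llama Stack endpoint from env.local.
# Run with: uv run --exact -q --with openai --env-file env.local sanity_check.py
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("OPENAI_BASE_URL", "http://localhost:8321/v1"),
    api_key=os.getenv("OPENAI_API_KEY", "unused"),  # env.local sets this to "unused"
)

reply = client.chat.completions.create(
    # provider_id/ prefix is required, e.g. openai/qwen3:0.6b
    model=os.getenv("CHAT_MODEL", "openai/qwen3:0.6b"),
    messages=[{"role": "user", "content": "Reply with one word: hello"}],
)
print(reply.choices[0].message.content)
```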