From c7c2a92ec82a67b07454d5702911a93257567cbe Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Mon, 3 Nov 2025 15:24:33 +0800
Subject: [PATCH 1/2] llama-stack: switches to open model configuration with
 openai remote

Signed-off-by: Adrian Cole
---
 inference-platforms/llama-stack/README.md  | 15 ++++++++-------
 .../llama-stack/docker-compose.yml         | 10 ++++++----
 inference-platforms/llama-stack/env.local  | 12 ++++--------
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/inference-platforms/llama-stack/README.md b/inference-platforms/llama-stack/README.md
index 6e8e202..c37f0aa 100644
--- a/inference-platforms/llama-stack/README.md
+++ b/inference-platforms/llama-stack/README.md
@@ -1,10 +1,9 @@
 # Llama Stack
 
-This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an
-OpenAI compatible API.
+This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
+compatible API.
 
-This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces
-and metrics from signals recorded with Llama Stack's observability SDK.
+**Note**: Telemetry is currently broken in v0.3.1, but not on main.
 
 ## Prerequisites
 
@@ -38,9 +37,10 @@ uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 
 ## Notes
 
-Here are some constraints about the LlamaStack implementation:
-* Only supports llama models (so not Qwen)
-* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks.
+* Uses the `starter` distribution with its built-in `remote::openai` provider,
+  pointing to Ollama via `OPENAI_BASE_URL` environment variable.
+* Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) as of
+  [PR #3822][prefix-pr].
 * Until [this issue][docker] resolves, running docker on Apple Silicon
   requires emulation.
 
@@ -48,4 +48,5 @@ uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html
 [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
+[prefix-pr]: https://github.com/meta-llama/llama-stack/pull/3822
 [docker]: https://github.com/llamastack/llama-stack/issues/406
diff --git a/inference-platforms/llama-stack/docker-compose.yml b/inference-platforms/llama-stack/docker-compose.yml
index e0d3b7d..28f2288 100644
--- a/inference-platforms/llama-stack/docker-compose.yml
+++ b/inference-platforms/llama-stack/docker-compose.yml
@@ -7,7 +7,7 @@ services:
     env_file:
       - env.local
     entrypoint: sh
-    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
+    command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
     extra_hosts:
       # send localhost traffic to the docker host, e.g. your laptop
       - "localhost:host-gateway"
@@ -15,9 +15,11 @@ services:
     depends_on:
       ollama-pull:
         condition: service_completed_successfully
-    image: llamastack/distribution-starter:0.2.20
+    # TODO: switch to 0.3.2 or 0.4.0
+    image: llamastack/distribution-starter:local
     container_name: llama-stack
-    platform: linux/amd64 # Force amd64 with emulation
+    # TODO: put back as published images are amd64 only
+    # platform: linux/amd64 # Force amd64 with emulation
     tty: true
     env_file:
       - env.local
@@ -26,7 +28,7 @@ services:
     # Ensure the container which specially treats localhost routes back to the
    # host machine, e.g. your laptop.
     environment:
-      - OLLAMA_URL=http://host.docker.internal:11434
+      - OPENAI_BASE_URL=http://host.docker.internal:11434/v1
       - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
     extra_hosts:
       - "host.docker.internal:host-gateway"
diff --git a/inference-platforms/llama-stack/env.local b/inference-platforms/llama-stack/env.local
index 0af6145..1fea890 100644
--- a/inference-platforms/llama-stack/env.local
+++ b/inference-platforms/llama-stack/env.local
@@ -1,14 +1,10 @@
-# Override default ENV variables for llama-stack
-OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1
+# OpenAI-compatible endpoint configuration
+OPENAI_BASE_URL=http://localhost:8321/v1
+# Models require `provider_id/` prefix, in this case `openai`
+CHAT_MODEL=openai/qwen3:0.6b
 OPENAI_API_KEY=unused
-CHAT_MODEL=llama3.2:1b
-
-# Variables used by llama-stack
-OLLAMA_URL=http://localhost:11434
-INFERENCE_MODEL=llama3.2:1b
 
 # OpenTelemetry configuration
-TELEMETRY_SINKS=otel_trace,otel_metric
 OTEL_SERVICE_NAME=llama-stack
 OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
 OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf

From 199b86b04f4de4ff8c17e68899c0208c900230b0 Mon Sep 17 00:00:00 2001
From: Adrian Cole
Date: Mon, 3 Nov 2025 16:45:30 +0800
Subject: [PATCH 2/2] agent.py: adds --use-responses-api for server-side MCP

Signed-off-by: Adrian Cole
---
 inference-platforms/agent.py              | 43 +++++++++++++++++------
 inference-platforms/llama-stack/README.md | 12 +++++++
 inference-platforms/llama-stack/env.local |  3 +-
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/inference-platforms/agent.py b/inference-platforms/agent.py
index 276c3d5..11ea2cc 100644
--- a/inference-platforms/agent.py
+++ b/inference-platforms/agent.py
@@ -1,7 +1,7 @@
 # run like this: uv run --exact -q --env-file .env agent.py
 # /// script
 # dependencies = [
-#   "openai-agents",
+#   "openai-agents @ git+https://github.com/openai/openai-agents-python.git@refs/pull/2034/head",
 #   "httpx",
 #   "mcp",
 #   "elastic-opentelemetry",
@@ -16,23 +16,25 @@
 # This must precede any other imports you want to instrument!
 auto_instrumentation.initialize()
 
+import argparse
 import asyncio
 import os
 from datetime import datetime, timedelta
 
 from agents import (
     Agent,
+    HostedMCPTool,
     OpenAIProvider,
     RunConfig,
     Runner,
     Tool,
 )
 from agents.mcp import MCPServerStreamableHttp, MCPUtil
+from openai.types.responses.tool_param import Mcp
 
 
-async def run_agent(tools: list[Tool]):
-    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
-    model = OpenAIProvider(use_responses=False).get_model(model_name)
+async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
+    model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
     agent = Agent(
         name="flight-search-agent",
         model=model,
@@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):
 
 
 async def main():
+    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
+    parser.add_argument("--use-responses-api", action="store_true", help="Use the Responses API with server-side MCP instead of Chat Completions")
+    args = parser.parse_args()
+
+    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
+    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
+    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)
+
+    if args.use_responses_api:
+        # Server-side MCP via Responses API
+        tools = [
+            HostedMCPTool(
+                tool_config=Mcp(
+                    type="mcp",
+                    server_url=mcp_url,
+                    server_label="kiwi-flights",
+                    headers=mcp_headers,
+                    require_approval="never",
+                )
+            )
+        ]
+        await run_agent(tools, model_name, use_responses=True)
+        return
+
+    # Client-side MCP orchestration
     async with MCPServerStreamableHttp(
-        {
-            "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"),
-            "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h),
-            "timeout": 30.0,
-        },
+        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
         client_session_timeout_seconds=60.0,
     ) as server:
         tools = await server.list_tools()
         util = MCPUtil()
         tools = [util.to_function_tool(tool, server, False) for tool in tools]
-        await run_agent(tools)
+        await run_agent(tools, model_name, use_responses=False)
 
 
 if __name__ == "__main__":
diff --git a/inference-platforms/llama-stack/README.md b/inference-platforms/llama-stack/README.md
index c37f0aa..6f48dad 100644
--- a/inference-platforms/llama-stack/README.md
+++ b/inference-platforms/llama-stack/README.md
@@ -35,8 +35,19 @@ Or, for the OpenAI Responses API
 uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 ```
 
+### MCP Agent
+
+```bash
+uv run --exact -q --env-file env.local ../agent.py --use-responses-api
+```
+
 ## Notes
 
+* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw,
+  which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
+* Until [this PR][openai-agents-pr] merges, the agent pins its branch, which
+  fixes handling of providers that don't return token usage details.
+
 * Uses the `starter` distribution with its built-in `remote::openai` provider,
   pointing to Ollama via `OPENAI_BASE_URL` environment variable.
 * Models require `provider_id/` prefix (e.g., `openai/qwen3:0.6b`) as of
@@ -50,3 +61,4 @@ uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
 [prefix-pr]: https://github.com/meta-llama/llama-stack/pull/3822
 [docker]: https://github.com/llamastack/llama-stack/issues/406
+[openai-agents-pr]: https://github.com/openai/openai-agents-python/pull/2034
diff --git a/inference-platforms/llama-stack/env.local b/inference-platforms/llama-stack/env.local
index 1fea890..69ff51c 100644
--- a/inference-platforms/llama-stack/env.local
+++ b/inference-platforms/llama-stack/env.local
@@ -1,8 +1,9 @@
 # OpenAI-compatible endpoint configuration
 OPENAI_BASE_URL=http://localhost:8321/v1
+OPENAI_API_KEY=unused
 # Models require `provider_id/` prefix, in this case `openai`
 CHAT_MODEL=openai/qwen3:0.6b
-OPENAI_API_KEY=unused
+AGENT_MODEL=openai/qwen3:1.7b
 
 # OpenTelemetry configuration
 OTEL_SERVICE_NAME=llama-stack
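
To sanity-check the configuration this series lands (Llama Stack's OpenAI-compatible endpoint plus the provider-prefixed model names in `env.local`), a single chat call is enough. The sketch below is not part of the patch: it assumes the `openai` Python package, reuses the values from `env.local`, and the script name and prompt are illustrative only.

```python
# sanity_check.py (illustrative name): verify the Llama Stack endpoint from env.local.
# Run with: uv run --exact -q --with openai --env-file env.local sanity_check.py
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("OPENAI_BASE_URL", "http://localhost:8321/v1"),
    api_key=os.getenv("OPENAI_API_KEY", "unused"),  # env.local sets this to "unused"
)

reply = client.chat.completions.create(
    # provider_id/ prefix is required, e.g. openai/qwen3:0.6b
    model=os.getenv("CHAT_MODEL", "openai/qwen3:0.6b"),
    messages=[{"role": "user", "content": "Reply with one word: hello"}],
)
print(reply.choices[0].message.content)
```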