4 changes: 4 additions & 0 deletions .envrc
@@ -0,0 +1,4 @@
#!/bin/bash

dotenv
source ~/.config/claude/env.sh
1 change: 1 addition & 0 deletions docker/prod_entrypoint.sh
@@ -2,6 +2,7 @@

if [ "$SEPARATE_HEALTH_APP" = "1" ]; then
export LITELLM_ARGS="$@"
export SUPERVISORD_STOPWAITSECS="${SUPERVISORD_STOPWAITSECS:-3600}"
exec supervisord -c /etc/supervisord.conf
fi

2 changes: 2 additions & 0 deletions docker/supervisord.conf
@@ -14,6 +14,7 @@ priority=1
exitcodes=0
stopasgroup=true
killasgroup=true
stopwaitsecs=%(ENV_SUPERVISORD_STOPWAITSECS)s
stdout_logfile=/dev/stdout
stderr_logfile=/dev/stderr
stdout_logfile_maxbytes = 0
@@ -29,6 +30,7 @@ priority=2
exitcodes=0
stopasgroup=true
killasgroup=true
stopwaitsecs=%(ENV_SUPERVISORD_STOPWAITSECS)s
stdout_logfile=/dev/stdout
stderr_logfile=/dev/stderr
stdout_logfile_maxbytes = 0
45 changes: 45 additions & 0 deletions docs/my-website/docs/pass_through/vertex_ai.md
@@ -461,3 +461,48 @@ generateContent();

</TabItem>
</Tabs>

### Using Anthropic Beta Features on Vertex AI

When using Anthropic models via the Vertex AI pass-through endpoint (e.g., Claude on Vertex), you can enable Anthropic beta features such as extended context windows.

The `anthropic-beta` header is automatically forwarded to Vertex AI when calling Anthropic models.

```bash
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-east5/publishers/anthropic/models/claude-3-5-sonnet:rawPredict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-H "anthropic-beta: context-1m-2025-08-07" \
-d '{
"anthropic_version": "vertex-2023-10-16",
"messages": [{"role": "user", "content": "Hello"}],
"max_tokens": 500
}'
```

### Forwarding Custom Headers with `x-pass-` Prefix

You can forward any custom header to the provider by prefixing it with `x-pass-`. The prefix is stripped before the header is sent to the provider.

For example:
- `x-pass-anthropic-beta: value` becomes `anthropic-beta: value`
- `x-pass-custom-header: value` becomes `custom-header: value`

This is useful when you need to send provider-specific headers that aren't in the default allowlist.

```bash
curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-east5/publishers/anthropic/models/claude-3-5-sonnet:rawPredict \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-H "x-pass-anthropic-beta: context-1m-2025-08-07" \
-H "x-pass-custom-feature: enabled" \
-d '{
"anthropic_version": "vertex-2023-10-16",
"messages": [{"role": "user", "content": "Hello"}],
"max_tokens": 500
}'
```

:::info
The `x-pass-` prefix works for all LLM pass-through endpoints, not just Vertex AI.
:::
1 change: 1 addition & 0 deletions docs/my-website/docs/proxy/config_settings.md
@@ -866,6 +866,7 @@ router_settings:
| SECRET_MANAGER_REFRESH_INTERVAL | Refresh interval in seconds for secret manager. Default is 86400 (24 hours)
| SEPARATE_HEALTH_APP | If set to '1', runs health endpoints on a separate ASGI app and port. Default: '0'.
| SEPARATE_HEALTH_PORT | Port for the separate health endpoints app. Only used if SEPARATE_HEALTH_APP=1. Default: 4001.
| SUPERVISORD_STOPWAITSECS | Upper bound timeout in seconds for graceful shutdown when SEPARATE_HEALTH_APP=1. Default: 3600 (1 hour).
| SERVER_ROOT_PATH | Root path for the server application
| SEND_USER_API_KEY_ALIAS | Flag to send user API key alias to Zscaler AI Guard. Default is False
| SEND_USER_API_KEY_TEAM_ID | Flag to send user API key team ID to Zscaler AI Guard. Default is False
5 changes: 5 additions & 0 deletions docs/my-website/docs/proxy/prod.md
@@ -277,8 +277,13 @@ Set the following environment variable(s):
```bash
SEPARATE_HEALTH_APP="1" # Default "0"
SEPARATE_HEALTH_PORT="8001" # Default "4001", Works only if `SEPARATE_HEALTH_APP` is "1"
SUPERVISORD_STOPWAITSECS="3600" # Optional: Upper bound timeout in seconds for graceful shutdown. Default: 3600 (1 hour). Only used when SEPARATE_HEALTH_APP=1.
```

**Graceful Shutdown:**

Previously, `stopwaitsecs` was unset, so supervisord fell back to its 10-second default and killed uvicorn before in-flight requests could complete. `SUPERVISORD_STOPWAITSECS` (default: 3600) sets an upper bound on the graceful shutdown window, giving uvicorn time to finish all in-flight requests.

<video controls width="100%" style={{ borderRadius: '8px', marginBottom: '1em' }}>
<source src="https://cdn.loom.com/sessions/thumbnails/b08be303331246b88fdc053940d03281-1718990992822.mp4" type="video/mp4" />
Your browser does not support the video tag.
14 changes: 14 additions & 0 deletions litellm/constants.py
@@ -1113,6 +1113,20 @@
"generateQuery/",
"optimize-prompt/",
]


# Headers that are safe to forward from incoming requests to Vertex AI
# Using an allowlist approach for security - only forward headers we explicitly trust
ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS = {
"anthropic-beta", # Required for Anthropic features like extended context windows
"content-type", # Required for request body parsing
}

# Prefix for headers that should be forwarded to the provider with the prefix stripped
# e.g., 'x-pass-anthropic-beta: value' becomes 'anthropic-beta: value'
# Works for all LLM pass-through endpoints (Vertex AI, Anthropic, Bedrock, etc.)
PASS_THROUGH_HEADER_PREFIX = "x-pass-"

BASE_MCP_ROUTE = "/mcp"

BATCH_STATUS_POLL_INTERVAL_SECONDS = int(
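Taken together, these constants describe the forwarding policy: allowlisted headers pass through as-is, and `x-pass-`-prefixed headers pass through with the prefix stripped. The sketch below illustrates that policy; `filter_forwarded_headers` and the plain-dict header shape are assumptions for illustration, not code from this PR:

```python
from litellm.constants import (
    ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS,
    PASS_THROUGH_HEADER_PREFIX,
)


def filter_forwarded_headers(request_headers: dict) -> dict:
    """Illustrative sketch: build the headers forwarded to the provider."""
    forwarded = {}
    for name, value in request_headers.items():
        lowered = name.lower()
        if lowered.startswith(PASS_THROUGH_HEADER_PREFIX):
            # 'x-pass-anthropic-beta: v' is forwarded as 'anthropic-beta: v'
            forwarded[lowered[len(PASS_THROUGH_HEADER_PREFIX):]] = value
        elif lowered in ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS:
            # Only explicitly trusted headers pass through unchanged
            forwarded[lowered] = value
    return forwarded
```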
31 changes: 31 additions & 0 deletions litellm/llms/bedrock/chat/agentcore/sse_iterator.py
@@ -85,6 +85,37 @@ def _parse_sse_line(self, line: str) -> Optional[ModelResponse]:
delta = content_block_delta.get("delta", {})
text = delta.get("text", "")

# Check for reasoning content (extended thinking)
# Format 1: {"reasoningContent": {"text": "..."}} (AgentCore)
reasoning_text = None
reasoning_content = delta.get("reasoningContent")
if isinstance(reasoning_content, dict):
reasoning_text = reasoning_content.get("text")
# Format 2: {"reasoningText": "..."} (Strands SDK flat)
if not reasoning_text:
reasoning_text = delta.get("reasoningText")

if reasoning_text:
chunk = ModelResponse(
id=f"chatcmpl-{uuid.uuid4()}",
created=0,
model=self.model,
object="chat.completion.chunk",
)

chunk.choices = [
StreamingChoices(
finish_reason=None,
index=0,
delta=Delta(
reasoning_content=reasoning_text,
role="assistant",
),
)
]

return chunk

if text:
# Return chunk with text
chunk = ModelResponse(
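For context, these are the two delta shapes the new reasoning branch recognizes — a minimal sketch, with payloads inferred from the format comments in the code rather than captured from real streams:

```python
# Format 1 (AgentCore): reasoning text nested under "reasoningContent"
agentcore_delta = {"reasoningContent": {"text": "Let me think through this..."}}

# Format 2 (Strands SDK, flat): reasoning text directly on the delta
strands_delta = {"reasoningText": "Let me think through this..."}

# Either shape yields a streaming chunk whose Delta carries
# reasoning_content instead of ordinary text content.
```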