From ad1db9fc8bfabb462005a3c9889bc14647ae0a75 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 21 Jan 2026 12:01:33 -0800 Subject: [PATCH 1/7] [Fix] LiteLLM VertexAI Pass through - ensuring incoming headers are forwarded down to target (#19524) * test_vertex_passthrough_forwards_anthropic_beta_header * add_incoming_headers --- .../llm_passthrough_endpoints.py | 31 +++++- .../test_vertex_passthrough_load_balancing.py | 98 ++++++++++++++++++- 2 files changed, 124 insertions(+), 5 deletions(-) diff --git a/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py b/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py index e48fd22bc8d0..0a94fc953421 100644 --- a/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py +++ b/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py @@ -1369,6 +1369,27 @@ def get_vertex_base_url(vertex_location: Optional[str]) -> str: return f"https://{vertex_location}-aiplatform.googleapis.com/" +def add_incoming_headers(request: Request, auth_header: str) -> dict: + """ + Build headers from incoming request, preserving headers like anthropic-beta, + while removing headers that should not be forwarded and adding authorization. + + Args: + request: The FastAPI request object + auth_header: The authorization token to add + + Returns: + dict: Headers dictionary with authorization added + """ + headers = dict(request.headers) or {} + # Remove headers that should not be forwarded + headers.pop("content-length", None) + headers.pop("host", None) + # Add/override the Authorization header + headers["Authorization"] = f"Bearer {auth_header}" + return headers + + def get_vertex_pass_through_handler( call_type: Literal["discovery", "aiplatform"], ) -> BaseVertexAIPassThroughHandler: @@ -1512,9 +1533,13 @@ async def _prepare_vertex_auth_headers( api_base="", ) - headers = { - "Authorization": f"Bearer {auth_header}", - } + # Start with incoming request headers to preserve headers like anthropic-beta + headers = dict(request.headers) or {} + # Remove headers that should not be forwarded + headers.pop("content-length", None) + headers.pop("host", None) + # Add/override the Authorization header + headers["Authorization"] = f"Bearer {auth_header}" if base_target_url is not None: base_target_url = get_vertex_pass_through_handler.update_base_target_url_with_credential_location( diff --git a/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py b/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py index ceb231eb4cb9..a6701451f204 100644 --- a/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py +++ b/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py @@ -1,9 +1,14 @@ +from unittest.mock import AsyncMock, MagicMock, patch + import pytest -from unittest.mock import MagicMock, AsyncMock, patch -from litellm.proxy.pass_through_endpoints.llm_passthrough_endpoints import _base_vertex_proxy_route + +from litellm.proxy.pass_through_endpoints.llm_passthrough_endpoints import ( + _base_vertex_proxy_route, +) from litellm.types.router import DeploymentTypedDict + @pytest.mark.asyncio async def test_vertex_passthrough_load_balancing(): """ @@ -220,3 +225,92 @@ async def test_async_get_available_deployment_for_pass_through(): assert deployment is not None assert deployment["litellm_params"]["use_in_pass_through"] is True + +@pytest.mark.asyncio +async def 
test_vertex_passthrough_forwards_anthropic_beta_header(): + """ + Test that _prepare_vertex_auth_headers forwards the anthropic-beta header + (and other important headers) from the incoming request when credentials are available. + + This test validates the fix for the issue where the 1M context window header + (anthropic-beta: context-1m-2025-08-07) was being dropped when forwarding + requests to Vertex AI. + """ + from starlette.datastructures import Headers + + from litellm.llms.vertex_ai.vertex_llm_base import VertexBase + from litellm.proxy.pass_through_endpoints.llm_passthrough_endpoints import ( + _prepare_vertex_auth_headers, + ) + + # Create a mock request with anthropic-beta header + mock_request = MagicMock() + mock_request.headers = Headers({ + "authorization": "Bearer old-token", + "anthropic-beta": "context-1m-2025-08-07", + "content-type": "application/json", + "user-agent": "test-client", + "content-length": "1234", # Should be removed + "host": "localhost:4000", # Should be removed + }) + + # Create mock vertex credentials + mock_vertex_credentials = MagicMock() + mock_vertex_credentials.vertex_project = "test-project" + mock_vertex_credentials.vertex_location = "us-central1" + mock_vertex_credentials.vertex_credentials = "test-credentials" + + # Create mock handler + mock_handler = MagicMock() + mock_handler.update_base_target_url_with_credential_location.return_value = ( + "https://us-central1-aiplatform.googleapis.com" + ) + + with patch.object( + VertexBase, + "_ensure_access_token_async", + new_callable=AsyncMock, + return_value=("test-auth-header", "test-project"), + ) as mock_ensure_token, patch.object( + VertexBase, + "_get_token_and_url", + return_value=("new-access-token", None), + ) as mock_get_token: + + # Call the function + ( + headers, + base_target_url, + headers_passed_through, + vertex_project, + vertex_location, + ) = await _prepare_vertex_auth_headers( + request=mock_request, + vertex_credentials=mock_vertex_credentials, + router_credentials=None, + vertex_project="test-project", + vertex_location="us-central1", + base_target_url="https://us-central1-aiplatform.googleapis.com", + get_vertex_pass_through_handler=mock_handler, + ) + + # Verify that the anthropic-beta header is preserved + assert "anthropic-beta" in headers + assert headers["anthropic-beta"] == "context-1m-2025-08-07" + + # Verify that other headers are preserved + assert "content-type" in headers + assert headers["content-type"] == "application/json" + assert "user-agent" in headers + + # Verify that the Authorization header was updated + assert "authorization" in headers + assert headers["authorization"] == "Bearer new-access-token" + + # Verify that content-length and host headers were removed + assert "content-length" not in headers + assert "host" not in headers + + # Verify that headers_passed_through is False (since we have credentials) + assert headers_passed_through is False + From 8c29ad41916253549afc324287544ff96ad0cae3 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Wed, 21 Jan 2026 19:12:04 -0800 Subject: [PATCH 2/7] [Fix] VertexAI Pass through - Ensure only anthropic betas are forwarded down to LLM API (#19542) * fix ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS * test_vertex_passthrough_forwards_anthropic_beta_header * fix test_vertex_passthrough_forwards_anthropic_beta_header * test_vertex_passthrough_does_not_forward_litellm_auth_token * fix utils * Using Anthropic Beta Features on Vertex AI * test_forward_headers_from_request_x_pass_prefix --- 
.../my-website/docs/pass_through/vertex_ai.md | 45 ++++++ litellm/constants.py | 14 ++ litellm/passthrough/utils.py | 16 +- .../llm_passthrough_endpoints.py | 36 ++--- proxy_config.yaml | 7 + .../test_vertex_passthrough_load_balancing.py | 153 ++++++++++++++++-- 6 files changed, 242 insertions(+), 29 deletions(-) create mode 100644 proxy_config.yaml diff --git a/docs/my-website/docs/pass_through/vertex_ai.md b/docs/my-website/docs/pass_through/vertex_ai.md index 2efef60070da..560b76543520 100644 --- a/docs/my-website/docs/pass_through/vertex_ai.md +++ b/docs/my-website/docs/pass_through/vertex_ai.md @@ -461,3 +461,48 @@ generateContent(); + +### Using Anthropic Beta Features on Vertex AI + +When using Anthropic models via Vertex AI passthrough (e.g., Claude on Vertex), you can enable Anthropic beta features like extended context windows. + +The `anthropic-beta` header is automatically forwarded to Vertex AI when calling Anthropic models. + +```bash +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-east5/publishers/anthropic/models/claude-3-5-sonnet:rawPredict \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -H "anthropic-beta: context-1m-2025-08-07" \ + -d '{ + "anthropic_version": "vertex-2023-10-16", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 500 + }' +``` + +### Forwarding Custom Headers with `x-pass-` Prefix + +You can forward any custom header to the provider by prefixing it with `x-pass-`. The prefix is stripped before the header is sent to the provider. + +For example: +- `x-pass-anthropic-beta: value` becomes `anthropic-beta: value` +- `x-pass-custom-header: value` becomes `custom-header: value` + +This is useful when you need to send provider-specific headers that aren't in the default allowlist. + +```bash +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-east5/publishers/anthropic/models/claude-3-5-sonnet:rawPredict \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -H "x-pass-anthropic-beta: context-1m-2025-08-07" \ + -H "x-pass-custom-feature: enabled" \ + -d '{ + "anthropic_version": "vertex-2023-10-16", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 500 + }' +``` + +:::info +The `x-pass-` prefix works for all LLM pass-through endpoints, not just Vertex AI. +::: diff --git a/litellm/constants.py b/litellm/constants.py index 3bdd943481ef..3f43fadd6901 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -1113,6 +1113,20 @@ "generateQuery/", "optimize-prompt/", ] + + +# Headers that are safe to forward from incoming requests to Vertex AI +# Using an allowlist approach for security - only forward headers we explicitly trust +ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS = { + "anthropic-beta", # Required for Anthropic features like extended context windows + "content-type", # Required for request body parsing +} + +# Prefix for headers that should be forwarded to the provider with the prefix stripped +# e.g., 'x-pass-anthropic-beta: value' becomes 'anthropic-beta: value' +# Works for all LLM pass-through endpoints (Vertex AI, Anthropic, Bedrock, etc.) 
+PASS_THROUGH_HEADER_PREFIX = "x-pass-" + BASE_MCP_ROUTE = "/mcp" BATCH_STATUS_POLL_INTERVAL_SECONDS = int( diff --git a/litellm/passthrough/utils.py b/litellm/passthrough/utils.py index 4bf66d498811..fbbf9cd25811 100644 --- a/litellm/passthrough/utils.py +++ b/litellm/passthrough/utils.py @@ -3,6 +3,8 @@ import httpx +from litellm.constants import PASS_THROUGH_HEADER_PREFIX + class BasePassthroughUtils: @staticmethod @@ -27,7 +29,11 @@ def forward_headers_from_request( forward_headers: Optional[bool] = False, ): """ - Helper to forward headers from original request + Helper to forward headers from original request. + + Also handles 'x-pass-' prefixed headers which are always forwarded + with the prefix stripped, regardless of forward_headers setting. + e.g., 'x-pass-anthropic-beta: value' becomes 'anthropic-beta: value' """ if forward_headers is True: # Header We Should NOT forward @@ -36,6 +42,14 @@ def forward_headers_from_request( # Combine request headers with custom headers headers = {**request_headers, **headers} + + # Always process x-pass- prefixed headers (strip prefix and forward) + for header_name, header_value in request_headers.items(): + if header_name.lower().startswith(PASS_THROUGH_HEADER_PREFIX): + # Strip the 'x-pass-' prefix to get the actual header name + actual_header_name = header_name[len(PASS_THROUGH_HEADER_PREFIX) :] + headers[actual_header_name] = header_value + return headers class CommonUtils: diff --git a/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py b/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py index 0a94fc953421..b079e1615190 100644 --- a/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py +++ b/litellm/proxy/pass_through_endpoints/llm_passthrough_endpoints.py @@ -17,7 +17,10 @@ import litellm from litellm._logging import verbose_proxy_logger -from litellm.constants import BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES +from litellm.constants import ( + ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS, + BEDROCK_AGENT_RUNTIME_PASS_THROUGH_ROUTES, +) from litellm.llms.vertex_ai.vertex_llm_base import VertexBase from litellm.proxy._types import * from litellm.proxy.auth.route_checks import RouteChecks @@ -1369,24 +1372,24 @@ def get_vertex_base_url(vertex_location: Optional[str]) -> str: return f"https://{vertex_location}-aiplatform.googleapis.com/" -def add_incoming_headers(request: Request, auth_header: str) -> dict: +def get_vertex_ai_allowed_incoming_headers(request: Request) -> dict: """ - Build headers from incoming request, preserving headers like anthropic-beta, - while removing headers that should not be forwarded and adding authorization. + Extract only the allowed headers from incoming request for Vertex AI pass-through. + + Uses an allowlist approach for security - only forwards headers we explicitly trust. + This prevents accidentally forwarding sensitive headers like the LiteLLM auth token. 
Args: request: The FastAPI request object - auth_header: The authorization token to add Returns: - dict: Headers dictionary with authorization added + dict: Headers dictionary with only allowed headers """ - headers = dict(request.headers) or {} - # Remove headers that should not be forwarded - headers.pop("content-length", None) - headers.pop("host", None) - # Add/override the Authorization header - headers["Authorization"] = f"Bearer {auth_header}" + incoming_headers = dict(request.headers) or {} + headers = {} + for header_name in ALLOWED_VERTEX_AI_PASSTHROUGH_HEADERS: + if header_name in incoming_headers: + headers[header_name] = incoming_headers[header_name] return headers @@ -1533,12 +1536,9 @@ async def _prepare_vertex_auth_headers( api_base="", ) - # Start with incoming request headers to preserve headers like anthropic-beta - headers = dict(request.headers) or {} - # Remove headers that should not be forwarded - headers.pop("content-length", None) - headers.pop("host", None) - # Add/override the Authorization header + # Use allowlist approach - only forward specific safe headers + headers = get_vertex_ai_allowed_incoming_headers(request) + # Add the Authorization header with vendor credentials headers["Authorization"] = f"Bearer {auth_header}" if base_target_url is not None: diff --git a/proxy_config.yaml b/proxy_config.yaml new file mode 100644 index 000000000000..57397181cdaa --- /dev/null +++ b/proxy_config.yaml @@ -0,0 +1,7 @@ +model_list: + - model_name: "*" + litellm_params: + model: "*" + +general_settings: + master_key: sk-1234 diff --git a/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py b/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py index a6701451f204..28b3ba0a1792 100644 --- a/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py +++ b/tests/test_litellm/proxy/pass_through_endpoints/test_vertex_passthrough_load_balancing.py @@ -294,23 +294,156 @@ async def test_vertex_passthrough_forwards_anthropic_beta_header(): get_vertex_pass_through_handler=mock_handler, ) - # Verify that the anthropic-beta header is preserved + # Verify that allowlisted headers are preserved assert "anthropic-beta" in headers assert headers["anthropic-beta"] == "context-1m-2025-08-07" - - # Verify that other headers are preserved assert "content-type" in headers assert headers["content-type"] == "application/json" - assert "user-agent" in headers - # Verify that the Authorization header was updated - assert "authorization" in headers - assert headers["authorization"] == "Bearer new-access-token" + # Verify that the Authorization header is set with vendor credentials + assert "Authorization" in headers + assert headers["Authorization"] == "Bearer new-access-token" - # Verify that content-length and host headers were removed - assert "content-length" not in headers - assert "host" not in headers + # Verify that non-allowlisted headers are NOT forwarded (security) + # Only anthropic-beta, content-type, and Authorization should be present + assert "authorization" not in headers # lowercase auth token not forwarded + assert "user-agent" not in headers # not in allowlist + assert "content-length" not in headers # not in allowlist + assert "host" not in headers # not in allowlist # Verify that headers_passed_through is False (since we have credentials) assert headers_passed_through is False + +@pytest.mark.asyncio +async def 
test_vertex_passthrough_does_not_forward_litellm_auth_token(): + """ + Test that the LiteLLM authorization header is NOT forwarded to Vertex AI. + + This test validates the fix for the issue where both the LiteLLM auth token + (lowercase 'authorization') and the Vertex AI token (uppercase 'Authorization') + were being sent, causing 401 errors on the vendor side. + + The incoming request has: + - authorization: Bearer (should NOT be forwarded) + + The outgoing request should only have: + - Authorization: Bearer (vendor credentials) + """ + from starlette.datastructures import Headers + + from litellm.llms.vertex_ai.vertex_llm_base import VertexBase + from litellm.proxy.pass_through_endpoints.llm_passthrough_endpoints import ( + _prepare_vertex_auth_headers, + ) + + # Create a mock request with ONLY the litellm auth token (no other headers) + mock_request = MagicMock() + mock_request.headers = Headers({ + "authorization": "Bearer sk-litellm-secret-key", # LiteLLM token - should NOT be forwarded + "Authorization": "Bearer sk-litellm-secret-key-uppercase", # Also try uppercase + }) + + # Create mock vertex credentials + mock_vertex_credentials = MagicMock() + mock_vertex_credentials.vertex_project = "test-project" + mock_vertex_credentials.vertex_location = "us-central1" + mock_vertex_credentials.vertex_credentials = "test-credentials" + + # Create mock handler + mock_handler = MagicMock() + mock_handler.update_base_target_url_with_credential_location.return_value = ( + "https://us-central1-aiplatform.googleapis.com" + ) + + with patch.object( + VertexBase, + "_ensure_access_token_async", + new_callable=AsyncMock, + return_value=("test-auth-header", "test-project"), + ), patch.object( + VertexBase, + "_get_token_and_url", + return_value=("vertex-access-token", None), + ): + + ( + headers, + _base_target_url, + _headers_passed_through, + _vertex_project, + _vertex_location, + ) = await _prepare_vertex_auth_headers( + request=mock_request, + vertex_credentials=mock_vertex_credentials, + router_credentials=None, + vertex_project="test-project", + vertex_location="us-central1", + base_target_url="https://us-central1-aiplatform.googleapis.com", + get_vertex_pass_through_handler=mock_handler, + ) + + # The ONLY Authorization header should be the Vertex token + assert headers["Authorization"] == "Bearer vertex-access-token" + + # The LiteLLM token should NOT be present (neither lowercase nor as a duplicate) + assert "authorization" not in headers + assert headers.get("Authorization") != "Bearer sk-litellm-secret-key" + assert headers.get("Authorization") != "Bearer sk-litellm-secret-key-uppercase" + + # Verify we only have the expected headers (Authorization + any allowlisted ones present) + # Since the request only had auth headers, only Authorization should be in output + assert set(headers.keys()) == {"Authorization"} + + +def test_forward_headers_from_request_x_pass_prefix(): + """ + Test that headers with 'x-pass-' prefix are forwarded with the prefix stripped. + + This allows users to force-forward arbitrary headers to the vendor API: + - 'x-pass-anthropic-beta: value' becomes 'anthropic-beta: value' + - 'x-pass-custom-header: value' becomes 'custom-header: value' + + This is tested on BasePassthroughUtils.forward_headers_from_request which is used + by all pass-through endpoints (not just Vertex AI). 
+ """ + from litellm.passthrough.utils import BasePassthroughUtils + + # Simulate incoming request headers + request_headers = { + "x-pass-anthropic-beta": "context-1m-2025-08-07", + "x-pass-custom-header": "custom-value", + "x-pass-another-header": "another-value", + "authorization": "Bearer sk-litellm-key", + "x-litellm-api-key": "sk-1234", + "content-type": "application/json", + } + + # Start with empty headers dict (simulating custom headers from endpoint config) + headers = {} + + # Call the method with forward_headers=False (default behavior) + # x-pass- headers should still be forwarded + result = BasePassthroughUtils.forward_headers_from_request( + request_headers=request_headers, + headers=headers, + forward_headers=False, + ) + + # Verify x-pass- prefixed headers are forwarded with prefix stripped + assert "anthropic-beta" in result + assert result["anthropic-beta"] == "context-1m-2025-08-07" + assert "custom-header" in result + assert result["custom-header"] == "custom-value" + assert "another-header" in result + assert result["another-header"] == "another-value" + + # Verify other headers are NOT forwarded (since forward_headers=False) + assert "authorization" not in result + assert "x-litellm-api-key" not in result + assert "content-type" not in result + + # Verify original x-pass- prefixed headers are NOT in output (only stripped versions) + assert "x-pass-anthropic-beta" not in result + assert "x-pass-custom-header" not in result + From 9e893b7cda0a12f4ff6625a557b3b09c11352cb9 Mon Sep 17 00:00:00 2001 From: Alexsander Hamir Date: Tue, 20 Jan 2026 10:44:31 -0800 Subject: [PATCH 3/7] Fix: Handle PostgreSQL cached plan errors during rolling deployments (#19424) --- litellm/proxy/utils.py | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 9ea2ea7d5c92..fcb678ef02a9 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -2214,6 +2214,45 @@ async def get_generic_data( raise e + async def _query_first_with_cached_plan_fallback( + self, sql_query: str + ) -> Optional[dict]: + """ + Execute a query with automatic fallback for PostgreSQL cached plan errors. + + This handles the "cached plan must not change result type" error that occurs + during rolling deployments when schema changes are applied while old pods + still have cached query plans expecting the old schema. + + Args: + sql_query: SQL query string to execute + + Returns: + Query result or None + + Raises: + Original exception if not a cached plan error + """ + try: + return await self.db.query_first(query=sql_query) + except Exception as e: + error_str = str(e) + if "cached plan must not change result type" in error_str: + # Force PostgreSQL to re-plan by invalidating the cache + # Add a unique comment to make the query different + sql_query_retry = sql_query.replace( + "SELECT", + f"SELECT /* cache_invalidated_{int(time.time() * 1000)} */" + ) + verbose_proxy_logger.warning( + "PostgreSQL cached plan error detected for token lookup, " + "retrying with fresh plan. This may occur during rolling deployments " + "when schema changes are applied." 
+ ) + return await self.db.query_first(query=sql_query_retry) + else: + raise + @backoff.on_exception( backoff.expo, Exception, # base exception to catch for the backoff @@ -2545,7 +2584,7 @@ async def get_data( # noqa: PLR0915 WHERE v.token = '{token}' """ - response = await self.db.query_first(query=sql_query) + response = await self._query_first_with_cached_plan_fallback(sql_query) if response is not None: if response["team_models"] is None: From 790a5ce0b323c1eefa70c2df25b2780097aa3f80 Mon Sep 17 00:00:00 2001 From: Alexsander Hamir Date: Tue, 20 Jan 2026 12:17:06 -0800 Subject: [PATCH 4/7] Fix in-flight request termination on SIGTERM when health-check runs in a separate process (#19427) --- docker/prod_entrypoint.sh | 1 + docker/supervisord.conf | 2 ++ docs/my-website/docs/proxy/config_settings.md | 1 + docs/my-website/docs/proxy/prod.md | 5 +++++ 4 files changed, 9 insertions(+) diff --git a/docker/prod_entrypoint.sh b/docker/prod_entrypoint.sh index 1fc09d2c8648..28d1bdcc2942 100644 --- a/docker/prod_entrypoint.sh +++ b/docker/prod_entrypoint.sh @@ -2,6 +2,7 @@ if [ "$SEPARATE_HEALTH_APP" = "1" ]; then export LITELLM_ARGS="$@" + export SUPERVISORD_STOPWAITSECS="${SUPERVISORD_STOPWAITSECS:-3600}" exec supervisord -c /etc/supervisord.conf fi diff --git a/docker/supervisord.conf b/docker/supervisord.conf index c6855fe652b9..9e9890e214f6 100644 --- a/docker/supervisord.conf +++ b/docker/supervisord.conf @@ -14,6 +14,7 @@ priority=1 exitcodes=0 stopasgroup=true killasgroup=true +stopwaitsecs=%(ENV_SUPERVISORD_STOPWAITSECS)s stdout_logfile=/dev/stdout stderr_logfile=/dev/stderr stdout_logfile_maxbytes = 0 @@ -29,6 +30,7 @@ priority=2 exitcodes=0 stopasgroup=true killasgroup=true +stopwaitsecs=%(ENV_SUPERVISORD_STOPWAITSECS)s stdout_logfile=/dev/stdout stderr_logfile=/dev/stderr stdout_logfile_maxbytes = 0 diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index b941f21b33e9..53d9c7759721 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -866,6 +866,7 @@ router_settings: | SECRET_MANAGER_REFRESH_INTERVAL | Refresh interval in seconds for secret manager. Default is 86400 (24 hours) | SEPARATE_HEALTH_APP | If set to '1', runs health endpoints on a separate ASGI app and port. Default: '0'. | SEPARATE_HEALTH_PORT | Port for the separate health endpoints app. Only used if SEPARATE_HEALTH_APP=1. Default: 4001. +| SUPERVISORD_STOPWAITSECS | Upper bound timeout in seconds for graceful shutdown when SEPARATE_HEALTH_APP=1. Default: 3600 (1 hour). | SERVER_ROOT_PATH | Root path for the server application | SEND_USER_API_KEY_ALIAS | Flag to send user API key alias to Zscaler AI Guard. Default is False | SEND_USER_API_KEY_TEAM_ID | Flag to send user API key team ID to Zscaler AI Guard. Default is False diff --git a/docs/my-website/docs/proxy/prod.md b/docs/my-website/docs/proxy/prod.md index 9216b0fbf30d..a42d91a7d5f4 100644 --- a/docs/my-website/docs/proxy/prod.md +++ b/docs/my-website/docs/proxy/prod.md @@ -277,8 +277,13 @@ Set the following environment variable(s): ```bash SEPARATE_HEALTH_APP="1" # Default "0" SEPARATE_HEALTH_PORT="8001" # Default "4001", Works only if `SEPARATE_HEALTH_APP` is "1" +SUPERVISORD_STOPWAITSECS="3600" # Optional: Upper bound timeout in seconds for graceful shutdown. Default: 3600 (1 hour). Only used when SEPARATE_HEALTH_APP=1. 
``` + +**Graceful Shutdown:** + +Previously, `stopwaitsecs` was not set, so supervisord fell back to its 10-second default and terminated in-flight requests on shutdown. `SUPERVISORD_STOPWAITSECS` (default: 3600) sets an upper bound for graceful shutdown, giving uvicorn time to finish serving in-flight requests before the process exits. +
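For reference, a minimal way to exercise the new settings end to end is to pass them as environment variables when starting the proxy container: `prod_entrypoint.sh` exports `SUPERVISORD_STOPWAITSECS` before launching supervisord, and both programs in `supervisord.conf` pick it up through `%(ENV_SUPERVISORD_STOPWAITSECS)s`. The sketch below is illustrative only; the image tag, ports, and config path are assumptions, not part of these patches.

```bash
# Illustrative sketch (not part of the patches above): run the proxy with the
# health endpoints on a separate port and a 1-hour graceful-shutdown window.
# Image tag, host ports, and config path are assumptions for this example.
docker run \
  -e SEPARATE_HEALTH_APP="1" \
  -e SEPARATE_HEALTH_PORT="8001" \
  -e SUPERVISORD_STOPWAITSECS="3600" \
  -v "$(pwd)/proxy_config.yaml:/app/config.yaml" \
  -p 4000:4000 -p 8001:8001 \
  ghcr.io/berriai/litellm:main-latest \
  --config /app/config.yaml
```

With this setup, a SIGTERM to the container makes supervisord wait up to `SUPERVISORD_STOPWAITSECS` seconds for uvicorn to drain in-flight requests, instead of escalating to SIGKILL after the previous 10-second default.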