@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
model=model_name,
input="What is 13 * 24? Use python to calculate the result.",
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
extra_body={"enable_response_messages": True},
temperature=0.0,
)

@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
# make sure the correct math is in the final output
assert response.output[3].type == "message"
assert "312" in response.output[3].content[0].text

# test raw input_messages / output_messages
assert len(response.input_messages) == 1
assert len(response.output_messages) == 3
assert "312" in response.output_messages[2]["message"]
28 changes: 28 additions & 0 deletions vllm/entrypoints/context.py
@@ -267,12 +267,40 @@ def __init__(
self.chat_template = chat_template
self.chat_template_content_format = chat_template_content_format

self.input_messages: list[ResponseRawMessageAndToken] = []
self.output_messages: list[ResponseRawMessageAndToken] = []

def append_output(self, output: RequestOutput) -> None:
self.num_prompt_tokens = len(output.prompt_token_ids or [])
self.num_cached_tokens = output.num_cached_tokens or 0
self.num_output_tokens += len(output.outputs[0].token_ids or [])
self.parser.process(output.outputs[0])

# only store if enable_response_messages is True, save memory
if self.request.enable_response_messages:
output_prompt = output.prompt or ""
output_prompt_token_ids = output.prompt_token_ids or []
if len(self.input_messages) == 0:
self.input_messages.append(
ResponseRawMessageAndToken(
message=output_prompt,
tokens=output_prompt_token_ids,
)
)
else:
self.output_messages.append(
ResponseRawMessageAndToken(
message=output_prompt,
tokens=output_prompt_token_ids,
)
)
self.output_messages.append(
ResponseRawMessageAndToken(
message=output.outputs[0].text,
tokens=output.outputs[0].token_ids,
)
)

def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None:
self.parser.response_messages.extend(output)

38 changes: 37 additions & 1 deletion vllm/entrypoints/openai/parser/responses_parser.py
@@ -3,14 +3,19 @@
import logging
from collections.abc import Callable

from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_input_item import McpCall
Review comment on lines +6 to +10 (P1): Import McpCall output schema for parsed tool calls

The parser now converts MCP tool call outputs into McpCall objects inside make_response_output_items_from_parsable_context, but the import here pulls McpCall from response_input_item. That class models request inputs and (unlike the output version used elsewhere, e.g. harmony_utils) does not carry the output field this method populates, so parsing an MCP call will either raise a validation/TypeError or drop the tool output when assembling the response. This should import the output-item McpCall to correctly serialize tool results for parsable contexts.
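
A sketch of the import direction the comment points at; the module path below matches the output-item McpCall import that this same PR removes from vllm/entrypoints/responses_utils.py:

# Suggested fix per the review comment above (not part of the diff): import the
# output-item McpCall, which carries the `output` field populated by the parser.
from openai.types.responses.response_output_item import McpCall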

from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_output_text import ResponseOutputText
from openai.types.responses.response_reasoning_item import (
Content,
ResponseReasoningItem,
)

from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser
from vllm.outputs import CompletionOutput
@@ -111,6 +116,37 @@ def process(self, output: CompletionOutput) -> "ResponsesParser":

return self

def make_response_output_items_from_parsable_context(
(Author comment on this line: just a refactor, no functional changes.)

self,
) -> list[ResponseOutputItem]:
"""Given a list of sentences, construct ResponseOutput Items."""
response_messages = self.response_messages[self.num_init_messages :]
output_messages: list[ResponseOutputItem] = []
for message in response_messages:
if not isinstance(message, ResponseFunctionToolCallOutputItem):
output_messages.append(message)
else:
if len(output_messages) == 0:
raise ValueError(
"Cannot have a FunctionToolCallOutput before FunctionToolCall."
)
if isinstance(output_messages[-1], ResponseFunctionToolCall):
mcp_message = McpCall(
id=f"{MCP_PREFIX}{random_uuid()}",
arguments=output_messages[-1].arguments,
name=output_messages[-1].name,
server_label=output_messages[
-1
].name, # TODO: store the server label
type="mcp_call",
status="completed",
output=message.output,
# TODO: support error output
)
output_messages[-1] = mcp_message

return output_messages
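
Schematically, the method folds each tool-call/tool-output pair into a single MCP call item while passing other items through unchanged (item shapes abbreviated; values taken from the test above):

# Before: [ReasoningItem, FunctionToolCall(name="python", arguments="..."),
#          FunctionToolCallOutputItem(output="312"), OutputMessage("... 312 ...")]
# After:  [ReasoningItem,
#          McpCall(name="python", arguments="...", output="312", status="completed"),
#          OutputMessage("... 312 ...")]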


def get_responses_parser_for_simple_context(
*,
13 changes: 3 additions & 10 deletions vllm/entrypoints/openai/serving_responses.py
@@ -100,7 +100,6 @@
construct_input_messages,
construct_tool_dicts,
extract_tool_types,
make_response_output_items_from_parsable_context,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
@@ -651,17 +650,11 @@ async def responses_full_generator(
else:
status = "incomplete"
elif isinstance(context, ParsableContext):
response_messages = context.parser.response_messages[
context.parser.num_init_messages :
]
output = make_response_output_items_from_parsable_context(response_messages)
output = context.parser.make_response_output_items_from_parsable_context()

# TODO: context for non-gptoss models doesn't use messages
# so we can't get them out yet
if request.enable_response_messages:
raise NotImplementedError(
"enable_response_messages is currently only supported for gpt-oss"
)
input_messages = context.input_messages
output_messages = context.output_messages

# TODO: Calculate usage.
# assert final_res.prompt_token_ids is not None
33 changes: 0 additions & 33 deletions vllm/entrypoints/responses_utils.py
@@ -16,7 +16,6 @@
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_output_item import McpCall
from openai.types.responses.response_output_message import ResponseOutputMessage
from openai.types.responses.response_reasoning_item import ResponseReasoningItem
from openai.types.responses.tool import Tool
@@ -27,38 +26,6 @@
ChatCompletionMessageParam,
ResponseInputOutputItem,
)
from vllm.utils import random_uuid


def make_response_output_items_from_parsable_context(
response_messages: list[ResponseInputOutputItem],
) -> list[ResponseOutputItem]:
"""Given a list of sentences, construct ResponseOutput Items."""
output_messages: list[ResponseOutputItem] = []
for message in response_messages:
if not isinstance(message, ResponseFunctionToolCallOutputItem):
output_messages.append(message)
else:
if len(output_messages) == 0:
raise ValueError(
"Cannot have a FunctionToolCallOutput before FunctionToolCall."
)
if isinstance(output_messages[-1], ResponseFunctionToolCall):
mcp_message = McpCall(
id=f"{MCP_PREFIX}{random_uuid()}",
arguments=output_messages[-1].arguments,
name=output_messages[-1].name,
server_label=output_messages[
-1
].name, # TODO: store the server label
type=f"{MCP_PREFIX}call",
status="completed",
output=message.output,
# TODO: support error output
)
output_messages[-1] = mcp_message

return output_messages


def construct_input_messages(