553 lines
20 KiB
Python
553 lines
20 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
|
|
Harmony ↔ Responses API conversion utilities.
|
|
|
|
Handles two directions:
|
|
1. Response Input → Harmony Messages (input parsing)
|
|
2. Harmony Messages → Response Output Items (output parsing)
|
|
"""
|
|
|
|
import json
|
|
|
|
from openai.types.responses import (
|
|
ResponseFunctionToolCall,
|
|
ResponseOutputItem,
|
|
ResponseOutputMessage,
|
|
ResponseOutputText,
|
|
ResponseReasoningItem,
|
|
)
|
|
from openai.types.responses.response_function_web_search import (
|
|
ActionFind,
|
|
ActionOpenPage,
|
|
ActionSearch,
|
|
ResponseFunctionWebSearch,
|
|
)
|
|
from openai.types.responses.response_output_item import McpCall
|
|
from openai.types.responses.response_reasoning_item import (
|
|
Content as ResponseReasoningTextContent,
|
|
)
|
|
from openai_harmony import Author, Message, Role, StreamableParser, TextContent
|
|
|
|
from vllm.entrypoints.openai.parser.harmony_utils import (
|
|
BUILTIN_TOOL_TO_MCP_SERVER_LABEL,
|
|
flatten_chat_text_content,
|
|
)
|
|
from vllm.entrypoints.openai.responses.protocol import (
|
|
ResponseInputOutputItem,
|
|
ResponsesRequest,
|
|
)
|
|
from vllm.logger import init_logger
|
|
from vllm.utils import random_uuid
|
|
|
|
logger = init_logger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 1. Private helpers for input parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _parse_harmony_format_message(chat_msg: dict) -> Message:
|
|
"""Reconstruct a Message from Harmony-format dict,
|
|
preserving channel, recipient, and content_type."""
|
|
author_dict = chat_msg["author"]
|
|
role = author_dict.get("role")
|
|
name = author_dict.get("name")
|
|
|
|
raw_content = chat_msg.get("content", "")
|
|
if isinstance(raw_content, list):
|
|
# TODO: Support refusal and non-text content types.
|
|
contents = [TextContent(text=c.get("text", "")) for c in raw_content]
|
|
elif isinstance(raw_content, str):
|
|
contents = [TextContent(text=raw_content)]
|
|
else:
|
|
contents = [TextContent(text="")]
|
|
|
|
if name:
|
|
msg = Message.from_author_and_contents(Author.new(Role(role), name), contents)
|
|
else:
|
|
msg = Message.from_role_and_contents(Role(role), contents)
|
|
|
|
channel = chat_msg.get("channel")
|
|
if channel:
|
|
msg = msg.with_channel(channel)
|
|
recipient = chat_msg.get("recipient")
|
|
if recipient:
|
|
msg = msg.with_recipient(recipient)
|
|
content_type = chat_msg.get("content_type")
|
|
if content_type:
|
|
msg = msg.with_content_type(content_type)
|
|
|
|
return msg
|
|
|
|
|
|
def _parse_chat_format_message(chat_msg: dict) -> list[Message]:
|
|
"""Parse an OpenAI chat-format dict into Harmony messages."""
|
|
role = chat_msg.get("role")
|
|
if role is None:
|
|
raise ValueError(f"Message has no 'role' key: {chat_msg}")
|
|
|
|
# Assistant message with tool calls
|
|
tool_calls = chat_msg.get("tool_calls")
|
|
if role == "assistant" and tool_calls:
|
|
msgs: list[Message] = []
|
|
for call in tool_calls:
|
|
func = call.get("function", {})
|
|
name = func.get("name", "")
|
|
arguments = func.get("arguments", "") or ""
|
|
msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
|
|
msg = msg.with_channel("commentary")
|
|
msg = msg.with_recipient(f"functions.{name}")
|
|
msg = msg.with_content_type("json")
|
|
msgs.append(msg)
|
|
return msgs
|
|
|
|
# Tool role message (tool output)
|
|
if role == "tool":
|
|
name = chat_msg.get("name", "")
|
|
if name and not name.startswith("functions."):
|
|
name = f"functions.{name}"
|
|
content = chat_msg.get("content", "") or ""
|
|
content = flatten_chat_text_content(content)
|
|
# NOTE: .with_recipient("assistant") is required on tool messages
|
|
# to match parse_chat_input_to_harmony_message behavior and ensure
|
|
# proper routing in the Harmony protocol.
|
|
msg = (
|
|
Message.from_author_and_content(Author.new(Role.TOOL, name), content)
|
|
.with_channel("commentary")
|
|
.with_recipient("assistant")
|
|
)
|
|
return [msg]
|
|
|
|
# Default: user/assistant/system messages
|
|
content = chat_msg.get("content", "")
|
|
if isinstance(content, str):
|
|
contents = [TextContent(text=content)]
|
|
else:
|
|
# TODO: Support refusal.
|
|
contents = [TextContent(text=c.get("text", "")) for c in content]
|
|
msg = Message.from_role_and_contents(role, contents)
|
|
return [msg]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 2. Public input parsing functions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def response_input_to_harmony(
|
|
response_msg: ResponseInputOutputItem,
|
|
prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
|
|
) -> Message:
|
|
"""Convert a single ResponseInputOutputItem into a Harmony Message."""
|
|
if not isinstance(response_msg, dict):
|
|
response_msg = response_msg.model_dump()
|
|
if "type" not in response_msg or response_msg["type"] == "message":
|
|
role = response_msg["role"]
|
|
content = response_msg["content"]
|
|
# Add prefix for developer messages.
|
|
# <|start|>developer<|message|># Instructions {instructions}<|end|>
|
|
text_prefix = "Instructions:\n" if role == "developer" else ""
|
|
if isinstance(content, str):
|
|
msg = Message.from_role_and_content(role, text_prefix + content)
|
|
else:
|
|
contents = [TextContent(text=text_prefix + c["text"]) for c in content]
|
|
msg = Message.from_role_and_contents(role, contents)
|
|
if role == "assistant":
|
|
msg = msg.with_channel("final")
|
|
elif response_msg["type"] == "function_call_output":
|
|
call_id = response_msg["call_id"]
|
|
call_response: ResponseFunctionToolCall | None = None
|
|
for prev_response in reversed(prev_responses):
|
|
if (
|
|
isinstance(prev_response, ResponseFunctionToolCall)
|
|
and prev_response.call_id == call_id
|
|
):
|
|
call_response = prev_response
|
|
break
|
|
if call_response is None:
|
|
raise ValueError(f"No call message found for {call_id}")
|
|
msg = Message.from_author_and_content(
|
|
Author.new(Role.TOOL, f"functions.{call_response.name}"),
|
|
response_msg["output"],
|
|
)
|
|
elif response_msg["type"] == "reasoning":
|
|
content = response_msg["content"]
|
|
assert len(content) == 1
|
|
msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
|
|
elif response_msg["type"] == "function_call":
|
|
msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
|
|
msg = msg.with_channel("commentary")
|
|
msg = msg.with_recipient(f"functions.{response_msg['name']}")
|
|
msg = msg.with_content_type("json")
|
|
else:
|
|
raise ValueError(f"Unknown input type: {response_msg['type']}")
|
|
return msg
|
|
|
|
|
|
def response_previous_input_to_harmony(chat_msg) -> list[Message]:
|
|
"""Parse a message from request.previous_input_messages
|
|
into Harmony messages.
|
|
|
|
Supports both OpenAI chat format ({"role": "..."}) and
|
|
Harmony format ({"author": {"role": "..."}}).
|
|
"""
|
|
if not isinstance(chat_msg, dict):
|
|
chat_msg = chat_msg.model_dump(exclude_none=True)
|
|
|
|
if "author" in chat_msg and isinstance(chat_msg.get("author"), dict):
|
|
return [_parse_harmony_format_message(chat_msg)]
|
|
|
|
return _parse_chat_format_message(chat_msg)
|
|
|
|
|
|
def construct_harmony_previous_input_messages(
|
|
request: ResponsesRequest,
|
|
) -> list[Message]:
|
|
"""Build a Harmony message list from request.previous_input_messages.
|
|
|
|
Filters out system/developer messages to match OpenAI behavior where
|
|
instructions are always taken from the most recent Responses API request.
|
|
"""
|
|
messages: list[Message] = []
|
|
if request.previous_input_messages:
|
|
for message in request.previous_input_messages:
|
|
# Handle both Message objects and dictionary inputs
|
|
if isinstance(message, Message):
|
|
message_role = message.author.role
|
|
if message_role == Role.SYSTEM or message_role == Role.DEVELOPER:
|
|
continue
|
|
messages.append(message)
|
|
else:
|
|
harmony_messages = response_previous_input_to_harmony(message)
|
|
for harmony_msg in harmony_messages:
|
|
message_role = harmony_msg.author.role
|
|
if message_role == Role.SYSTEM or message_role == Role.DEVELOPER:
|
|
continue
|
|
messages.append(harmony_msg)
|
|
return messages
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3. Private helpers for output parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem:
|
|
"""Parse browser tool calls (search, open, find) into web search items."""
|
|
if len(message.content) != 1:
|
|
raise ValueError("Invalid number of contents in browser message")
|
|
content = message.content[0]
|
|
|
|
# Parse JSON args (with retry detection)
|
|
try:
|
|
browser_call = json.loads(content.text)
|
|
except json.JSONDecodeError:
|
|
logger.warning(
|
|
"Invalid JSON in browser tool call, using error placeholder: %s",
|
|
content.text,
|
|
)
|
|
json_retry_output_message = (
|
|
f"Invalid JSON args, caught and retried: {content.text}"
|
|
)
|
|
browser_call = {
|
|
"query": json_retry_output_message,
|
|
"url": json_retry_output_message,
|
|
"pattern": json_retry_output_message,
|
|
}
|
|
|
|
# Create appropriate action based on recipient
|
|
if recipient == "browser.search":
|
|
action = ActionSearch(
|
|
query=f"cursor:{browser_call.get('query', '')}", type="search"
|
|
)
|
|
elif recipient == "browser.open":
|
|
action = ActionOpenPage(
|
|
url=f"cursor:{browser_call.get('url', '')}", type="open_page"
|
|
)
|
|
elif recipient == "browser.find":
|
|
action = ActionFind(
|
|
pattern=browser_call.get("pattern", ""),
|
|
url=f"cursor:{browser_call.get('url', '')}",
|
|
type="find",
|
|
)
|
|
else:
|
|
raise ValueError(f"Unknown browser action: {recipient}")
|
|
|
|
return ResponseFunctionWebSearch(
|
|
id=f"ws_{random_uuid()}",
|
|
action=action,
|
|
status="completed",
|
|
type="web_search_call",
|
|
)
|
|
|
|
|
|
def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
|
|
"""Parse function calls into function tool call items."""
|
|
function_name = recipient.split(".")[-1]
|
|
output_items = []
|
|
for content in message.content:
|
|
random_id = random_uuid()
|
|
response_item = ResponseFunctionToolCall(
|
|
arguments=content.text,
|
|
call_id=f"call_{random_id}",
|
|
type="function_call",
|
|
name=function_name,
|
|
id=f"fc_{random_id}",
|
|
)
|
|
output_items.append(response_item)
|
|
return output_items
|
|
|
|
|
|
def _parse_reasoning(message: Message) -> list[ResponseOutputItem]:
|
|
"""Parse reasoning/analysis content into reasoning items."""
|
|
output_items = []
|
|
for content in message.content:
|
|
reasoning_item = ResponseReasoningItem(
|
|
id=f"rs_{random_uuid()}",
|
|
summary=[],
|
|
type="reasoning",
|
|
content=[
|
|
ResponseReasoningTextContent(text=content.text, type="reasoning_text")
|
|
],
|
|
status=None,
|
|
)
|
|
output_items.append(reasoning_item)
|
|
return output_items
|
|
|
|
|
|
def _parse_final_message(message: Message) -> ResponseOutputItem:
|
|
"""Parse final channel messages into output message items."""
|
|
contents = []
|
|
for content in message.content:
|
|
output_text = ResponseOutputText(
|
|
text=content.text,
|
|
annotations=[], # TODO
|
|
type="output_text",
|
|
logprobs=None, # TODO
|
|
)
|
|
contents.append(output_text)
|
|
return ResponseOutputMessage(
|
|
id=f"msg_{random_uuid()}",
|
|
content=contents,
|
|
role=message.author.role,
|
|
status="completed",
|
|
type="message",
|
|
)
|
|
|
|
|
|
def _parse_mcp_recipient(recipient: str) -> tuple[str, str]:
|
|
"""Parse MCP recipient into (server_label, tool_name).
|
|
|
|
For dotted recipients like "repo_browser.list":
|
|
- server_label: "repo_browser" (namespace/server)
|
|
- tool_name: "list" (specific tool)
|
|
|
|
For simple recipients like "filesystem":
|
|
- server_label: "filesystem"
|
|
- tool_name: "filesystem"
|
|
"""
|
|
if "." in recipient:
|
|
server_label = recipient.split(".")[0]
|
|
tool_name = recipient.split(".")[-1]
|
|
else:
|
|
server_label = recipient
|
|
tool_name = recipient
|
|
return server_label, tool_name
|
|
|
|
|
|
def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]:
|
|
"""Parse MCP calls into MCP call items."""
|
|
# Handle built-in tools that need server_label mapping
|
|
if recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
|
|
server_label = BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient]
|
|
tool_name = recipient
|
|
else:
|
|
server_label, tool_name = _parse_mcp_recipient(recipient)
|
|
|
|
output_items = []
|
|
for content in message.content:
|
|
response_item = McpCall(
|
|
arguments=content.text,
|
|
type="mcp_call",
|
|
name=tool_name,
|
|
server_label=server_label,
|
|
id=f"mcp_{random_uuid()}",
|
|
status="completed",
|
|
)
|
|
output_items.append(response_item)
|
|
return output_items
|
|
|
|
|
|
def _parse_message_no_recipient(
|
|
message: Message,
|
|
) -> list[ResponseOutputItem]:
|
|
"""Parse a Harmony message with no recipient based on its channel."""
|
|
if message.channel == "analysis":
|
|
return _parse_reasoning(message)
|
|
|
|
if message.channel in ("commentary", "final"):
|
|
# Per Harmony format, preambles (commentary with no recipient) and
|
|
# final channel content are both intended to be shown to end-users.
|
|
# See: https://cookbook.openai.com/articles/openai-harmony
|
|
return [_parse_final_message(message)]
|
|
|
|
raise ValueError(f"Unknown channel: {message.channel}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 4. Public output parsing functions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
|
|
"""Parse a Harmony message into a list of output response items.
|
|
|
|
This is the main dispatcher that routes based on channel and recipient.
|
|
"""
|
|
if message.author.role != "assistant":
|
|
# This is a message from a tool to the assistant (e.g., search result).
|
|
# Don't include it in the final output for now. This aligns with
|
|
# OpenAI's behavior on models like o4-mini.
|
|
return []
|
|
|
|
output_items: list[ResponseOutputItem] = []
|
|
recipient = message.recipient
|
|
|
|
if recipient is not None:
|
|
# Browser tool calls (browser.search, browser.open, browser.find)
|
|
if recipient.startswith("browser."):
|
|
output_items.append(_parse_browser_tool_call(message, recipient))
|
|
|
|
# Function calls (should only happen on commentary channel)
|
|
elif message.channel == "commentary" and recipient.startswith("functions."):
|
|
output_items.extend(_parse_function_call(message, recipient))
|
|
|
|
# Built-in MCP tools (python, browser, container)
|
|
elif recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
|
|
output_items.extend(_parse_reasoning(message))
|
|
|
|
# All other recipients are MCP calls
|
|
else:
|
|
output_items.extend(_parse_mcp_call(message, recipient))
|
|
|
|
# No recipient - handle based on channel for non-tool messages
|
|
else:
|
|
output_items.extend(_parse_message_no_recipient(message))
|
|
|
|
return output_items
|
|
|
|
|
|
def parser_state_to_response_output(
|
|
parser: StreamableParser,
|
|
) -> list[ResponseOutputItem]:
|
|
"""Extract in-progress response items from incomplete parser state.
|
|
|
|
Called when the parser has buffered content that hasn't formed a
|
|
complete message yet (e.g., generation was cut short).
|
|
"""
|
|
if not parser.current_content:
|
|
return []
|
|
if parser.current_role != Role.ASSISTANT:
|
|
return []
|
|
current_recipient = parser.current_recipient
|
|
if current_recipient is not None and current_recipient.startswith("browser."):
|
|
return []
|
|
|
|
if current_recipient and parser.current_channel in ("commentary", "analysis"):
|
|
if current_recipient.startswith("functions."):
|
|
rid = random_uuid()
|
|
return [
|
|
ResponseFunctionToolCall(
|
|
arguments=parser.current_content,
|
|
call_id=f"call_{rid}",
|
|
type="function_call",
|
|
name=current_recipient.split(".")[-1],
|
|
id=f"fc_{rid}",
|
|
status="in_progress",
|
|
)
|
|
]
|
|
# Built-in MCP tools (python, browser, container)
|
|
elif current_recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL:
|
|
return [
|
|
ResponseReasoningItem(
|
|
id=f"rs_{random_uuid()}",
|
|
summary=[],
|
|
type="reasoning",
|
|
content=[
|
|
ResponseReasoningTextContent(
|
|
text=parser.current_content, type="reasoning_text"
|
|
)
|
|
],
|
|
status=None,
|
|
)
|
|
]
|
|
# All other recipients are MCP calls
|
|
else:
|
|
rid = random_uuid()
|
|
server_label, tool_name = _parse_mcp_recipient(current_recipient)
|
|
return [
|
|
McpCall(
|
|
arguments=parser.current_content,
|
|
type="mcp_call",
|
|
name=tool_name,
|
|
server_label=server_label,
|
|
id=f"mcp_{rid}",
|
|
status="in_progress",
|
|
)
|
|
]
|
|
|
|
if parser.current_channel == "commentary":
|
|
# Per Harmony format, preambles (commentary with no recipient) are
|
|
# intended to be shown to end-users, unlike analysis channel content.
|
|
output_text = ResponseOutputText(
|
|
text=parser.current_content,
|
|
annotations=[],
|
|
type="output_text",
|
|
logprobs=None,
|
|
)
|
|
return [
|
|
ResponseOutputMessage(
|
|
id=f"msg_{random_uuid()}",
|
|
content=[output_text],
|
|
role="assistant",
|
|
status="incomplete",
|
|
type="message",
|
|
)
|
|
]
|
|
|
|
if parser.current_channel == "analysis":
|
|
return [
|
|
ResponseReasoningItem(
|
|
id=f"rs_{random_uuid()}",
|
|
summary=[],
|
|
type="reasoning",
|
|
content=[
|
|
ResponseReasoningTextContent(
|
|
text=parser.current_content, type="reasoning_text"
|
|
)
|
|
],
|
|
status=None,
|
|
)
|
|
]
|
|
|
|
if parser.current_channel == "final":
|
|
output_text = ResponseOutputText(
|
|
text=parser.current_content,
|
|
annotations=[], # TODO
|
|
type="output_text",
|
|
logprobs=None, # TODO
|
|
)
|
|
text_item = ResponseOutputMessage(
|
|
id=f"msg_{random_uuid()}",
|
|
content=[output_text],
|
|
role="assistant",
|
|
# if the parser still has messages (ie if the generator got cut
|
|
# abruptly), this should be incomplete
|
|
status="incomplete",
|
|
type="message",
|
|
)
|
|
return [text_item]
|
|
|
|
return []
|