# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Harmony ↔ Responses API conversion utilities. Handles two directions: 1. Response Input → Harmony Messages (input parsing) 2. Harmony Messages → Response Output Items (output parsing) """ import json from openai.types.responses import ( ResponseFunctionToolCall, ResponseOutputItem, ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem, ) from openai.types.responses.response_function_web_search import ( ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch, ) from openai.types.responses.response_output_item import McpCall from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent, ) from openai_harmony import Author, Message, Role, StreamableParser, TextContent from vllm.entrypoints.openai.parser.harmony_utils import ( BUILTIN_TOOL_TO_MCP_SERVER_LABEL, flatten_chat_text_content, ) from vllm.entrypoints.openai.responses.protocol import ( ResponseInputOutputItem, ResponsesRequest, ) from vllm.logger import init_logger from vllm.utils import random_uuid logger = init_logger(__name__) # --------------------------------------------------------------------------- # 1. Private helpers for input parsing # --------------------------------------------------------------------------- def _parse_harmony_format_message(chat_msg: dict) -> Message: """Reconstruct a Message from Harmony-format dict, preserving channel, recipient, and content_type.""" author_dict = chat_msg["author"] role = author_dict.get("role") name = author_dict.get("name") raw_content = chat_msg.get("content", "") if isinstance(raw_content, list): # TODO: Support refusal and non-text content types. contents = [TextContent(text=c.get("text", "")) for c in raw_content] elif isinstance(raw_content, str): contents = [TextContent(text=raw_content)] else: contents = [TextContent(text="")] if name: msg = Message.from_author_and_contents(Author.new(Role(role), name), contents) else: msg = Message.from_role_and_contents(Role(role), contents) channel = chat_msg.get("channel") if channel: msg = msg.with_channel(channel) recipient = chat_msg.get("recipient") if recipient: msg = msg.with_recipient(recipient) content_type = chat_msg.get("content_type") if content_type: msg = msg.with_content_type(content_type) return msg def _parse_chat_format_message(chat_msg: dict) -> list[Message]: """Parse an OpenAI chat-format dict into Harmony messages.""" role = chat_msg.get("role") if role is None: raise ValueError(f"Message has no 'role' key: {chat_msg}") # Assistant message with tool calls tool_calls = chat_msg.get("tool_calls") if role == "assistant" and tool_calls: msgs: list[Message] = [] for call in tool_calls: func = call.get("function", {}) name = func.get("name", "") arguments = func.get("arguments", "") or "" msg = Message.from_role_and_content(Role.ASSISTANT, arguments) msg = msg.with_channel("commentary") msg = msg.with_recipient(f"functions.{name}") msg = msg.with_content_type("json") msgs.append(msg) return msgs # Tool role message (tool output) if role == "tool": name = chat_msg.get("name", "") if name and not name.startswith("functions."): name = f"functions.{name}" content = chat_msg.get("content", "") or "" content = flatten_chat_text_content(content) # NOTE: .with_recipient("assistant") is required on tool messages # to match parse_chat_input_to_harmony_message behavior and ensure # proper routing in the Harmony protocol. msg = ( Message.from_author_and_content(Author.new(Role.TOOL, name), content) .with_channel("commentary") .with_recipient("assistant") ) return [msg] # Default: user/assistant/system messages content = chat_msg.get("content", "") if isinstance(content, str): contents = [TextContent(text=content)] else: # TODO: Support refusal. contents = [TextContent(text=c.get("text", "")) for c in content] msg = Message.from_role_and_contents(role, contents) return [msg] # --------------------------------------------------------------------------- # 2. Public input parsing functions # --------------------------------------------------------------------------- def response_input_to_harmony( response_msg: ResponseInputOutputItem, prev_responses: list[ResponseOutputItem | ResponseReasoningItem], ) -> Message: """Convert a single ResponseInputOutputItem into a Harmony Message.""" if not isinstance(response_msg, dict): response_msg = response_msg.model_dump() if "type" not in response_msg or response_msg["type"] == "message": role = response_msg["role"] content = response_msg["content"] # Add prefix for developer messages. # <|start|>developer<|message|># Instructions {instructions}<|end|> text_prefix = "Instructions:\n" if role == "developer" else "" if isinstance(content, str): msg = Message.from_role_and_content(role, text_prefix + content) else: contents = [TextContent(text=text_prefix + c["text"]) for c in content] msg = Message.from_role_and_contents(role, contents) if role == "assistant": msg = msg.with_channel("final") elif response_msg["type"] == "function_call_output": call_id = response_msg["call_id"] call_response: ResponseFunctionToolCall | None = None for prev_response in reversed(prev_responses): if ( isinstance(prev_response, ResponseFunctionToolCall) and prev_response.call_id == call_id ): call_response = prev_response break if call_response is None: raise ValueError(f"No call message found for {call_id}") msg = Message.from_author_and_content( Author.new(Role.TOOL, f"functions.{call_response.name}"), response_msg["output"], ) elif response_msg["type"] == "reasoning": content = response_msg["content"] assert len(content) == 1 msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"]) elif response_msg["type"] == "function_call": msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"]) msg = msg.with_channel("commentary") msg = msg.with_recipient(f"functions.{response_msg['name']}") msg = msg.with_content_type("json") else: raise ValueError(f"Unknown input type: {response_msg['type']}") return msg def response_previous_input_to_harmony(chat_msg) -> list[Message]: """Parse a message from request.previous_input_messages into Harmony messages. Supports both OpenAI chat format ({"role": "..."}) and Harmony format ({"author": {"role": "..."}}). """ if not isinstance(chat_msg, dict): chat_msg = chat_msg.model_dump(exclude_none=True) if "author" in chat_msg and isinstance(chat_msg.get("author"), dict): return [_parse_harmony_format_message(chat_msg)] return _parse_chat_format_message(chat_msg) def construct_harmony_previous_input_messages( request: ResponsesRequest, ) -> list[Message]: """Build a Harmony message list from request.previous_input_messages. Filters out system/developer messages to match OpenAI behavior where instructions are always taken from the most recent Responses API request. """ messages: list[Message] = [] if request.previous_input_messages: for message in request.previous_input_messages: # Handle both Message objects and dictionary inputs if isinstance(message, Message): message_role = message.author.role if message_role == Role.SYSTEM or message_role == Role.DEVELOPER: continue messages.append(message) else: harmony_messages = response_previous_input_to_harmony(message) for harmony_msg in harmony_messages: message_role = harmony_msg.author.role if message_role == Role.SYSTEM or message_role == Role.DEVELOPER: continue messages.append(harmony_msg) return messages # --------------------------------------------------------------------------- # 3. Private helpers for output parsing # --------------------------------------------------------------------------- def _parse_browser_tool_call(message: Message, recipient: str) -> ResponseOutputItem: """Parse browser tool calls (search, open, find) into web search items.""" if len(message.content) != 1: raise ValueError("Invalid number of contents in browser message") content = message.content[0] # Parse JSON args (with retry detection) try: browser_call = json.loads(content.text) except json.JSONDecodeError: logger.warning( "Invalid JSON in browser tool call, using error placeholder: %s", content.text, ) json_retry_output_message = ( f"Invalid JSON args, caught and retried: {content.text}" ) browser_call = { "query": json_retry_output_message, "url": json_retry_output_message, "pattern": json_retry_output_message, } # Create appropriate action based on recipient if recipient == "browser.search": action = ActionSearch( query=f"cursor:{browser_call.get('query', '')}", type="search" ) elif recipient == "browser.open": action = ActionOpenPage( url=f"cursor:{browser_call.get('url', '')}", type="open_page" ) elif recipient == "browser.find": action = ActionFind( pattern=browser_call.get("pattern", ""), url=f"cursor:{browser_call.get('url', '')}", type="find", ) else: raise ValueError(f"Unknown browser action: {recipient}") return ResponseFunctionWebSearch( id=f"ws_{random_uuid()}", action=action, status="completed", type="web_search_call", ) def _parse_function_call(message: Message, recipient: str) -> list[ResponseOutputItem]: """Parse function calls into function tool call items.""" function_name = recipient.split(".")[-1] output_items = [] for content in message.content: random_id = random_uuid() response_item = ResponseFunctionToolCall( arguments=content.text, call_id=f"call_{random_id}", type="function_call", name=function_name, id=f"fc_{random_id}", ) output_items.append(response_item) return output_items def _parse_reasoning(message: Message) -> list[ResponseOutputItem]: """Parse reasoning/analysis content into reasoning items.""" output_items = [] for content in message.content: reasoning_item = ResponseReasoningItem( id=f"rs_{random_uuid()}", summary=[], type="reasoning", content=[ ResponseReasoningTextContent(text=content.text, type="reasoning_text") ], status=None, ) output_items.append(reasoning_item) return output_items def _parse_final_message(message: Message) -> ResponseOutputItem: """Parse final channel messages into output message items.""" contents = [] for content in message.content: output_text = ResponseOutputText( text=content.text, annotations=[], # TODO type="output_text", logprobs=None, # TODO ) contents.append(output_text) return ResponseOutputMessage( id=f"msg_{random_uuid()}", content=contents, role=message.author.role, status="completed", type="message", ) def _parse_mcp_recipient(recipient: str) -> tuple[str, str]: """Parse MCP recipient into (server_label, tool_name). For dotted recipients like "repo_browser.list": - server_label: "repo_browser" (namespace/server) - tool_name: "list" (specific tool) For simple recipients like "filesystem": - server_label: "filesystem" - tool_name: "filesystem" """ if "." in recipient: server_label = recipient.split(".")[0] tool_name = recipient.split(".")[-1] else: server_label = recipient tool_name = recipient return server_label, tool_name def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]: """Parse MCP calls into MCP call items.""" # Handle built-in tools that need server_label mapping if recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL: server_label = BUILTIN_TOOL_TO_MCP_SERVER_LABEL[recipient] tool_name = recipient else: server_label, tool_name = _parse_mcp_recipient(recipient) output_items = [] for content in message.content: response_item = McpCall( arguments=content.text, type="mcp_call", name=tool_name, server_label=server_label, id=f"mcp_{random_uuid()}", status="completed", ) output_items.append(response_item) return output_items def _parse_message_no_recipient( message: Message, ) -> list[ResponseOutputItem]: """Parse a Harmony message with no recipient based on its channel.""" if message.channel == "analysis": return _parse_reasoning(message) if message.channel in ("commentary", "final"): # Per Harmony format, preambles (commentary with no recipient) and # final channel content are both intended to be shown to end-users. # See: https://cookbook.openai.com/articles/openai-harmony return [_parse_final_message(message)] raise ValueError(f"Unknown channel: {message.channel}") # --------------------------------------------------------------------------- # 4. Public output parsing functions # --------------------------------------------------------------------------- def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]: """Parse a Harmony message into a list of output response items. This is the main dispatcher that routes based on channel and recipient. """ if message.author.role != "assistant": # This is a message from a tool to the assistant (e.g., search result). # Don't include it in the final output for now. This aligns with # OpenAI's behavior on models like o4-mini. return [] output_items: list[ResponseOutputItem] = [] recipient = message.recipient if recipient is not None: # Browser tool calls (browser.search, browser.open, browser.find) if recipient.startswith("browser."): output_items.append(_parse_browser_tool_call(message, recipient)) # Function calls (should only happen on commentary channel) elif message.channel == "commentary" and recipient.startswith("functions."): output_items.extend(_parse_function_call(message, recipient)) # Built-in MCP tools (python, browser, container) elif recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL: output_items.extend(_parse_reasoning(message)) # All other recipients are MCP calls else: output_items.extend(_parse_mcp_call(message, recipient)) # No recipient - handle based on channel for non-tool messages else: output_items.extend(_parse_message_no_recipient(message)) return output_items def parser_state_to_response_output( parser: StreamableParser, ) -> list[ResponseOutputItem]: """Extract in-progress response items from incomplete parser state. Called when the parser has buffered content that hasn't formed a complete message yet (e.g., generation was cut short). """ if not parser.current_content: return [] if parser.current_role != Role.ASSISTANT: return [] current_recipient = parser.current_recipient if current_recipient is not None and current_recipient.startswith("browser."): return [] if current_recipient and parser.current_channel in ("commentary", "analysis"): if current_recipient.startswith("functions."): rid = random_uuid() return [ ResponseFunctionToolCall( arguments=parser.current_content, call_id=f"call_{rid}", type="function_call", name=current_recipient.split(".")[-1], id=f"fc_{rid}", status="in_progress", ) ] # Built-in MCP tools (python, browser, container) elif current_recipient in BUILTIN_TOOL_TO_MCP_SERVER_LABEL: return [ ResponseReasoningItem( id=f"rs_{random_uuid()}", summary=[], type="reasoning", content=[ ResponseReasoningTextContent( text=parser.current_content, type="reasoning_text" ) ], status=None, ) ] # All other recipients are MCP calls else: rid = random_uuid() server_label, tool_name = _parse_mcp_recipient(current_recipient) return [ McpCall( arguments=parser.current_content, type="mcp_call", name=tool_name, server_label=server_label, id=f"mcp_{rid}", status="in_progress", ) ] if parser.current_channel == "commentary": # Per Harmony format, preambles (commentary with no recipient) are # intended to be shown to end-users, unlike analysis channel content. output_text = ResponseOutputText( text=parser.current_content, annotations=[], type="output_text", logprobs=None, ) return [ ResponseOutputMessage( id=f"msg_{random_uuid()}", content=[output_text], role="assistant", status="incomplete", type="message", ) ] if parser.current_channel == "analysis": return [ ResponseReasoningItem( id=f"rs_{random_uuid()}", summary=[], type="reasoning", content=[ ResponseReasoningTextContent( text=parser.current_content, type="reasoning_text" ) ], status=None, ) ] if parser.current_channel == "final": output_text = ResponseOutputText( text=parser.current_content, annotations=[], # TODO type="output_text", logprobs=None, # TODO ) text_item = ResponseOutputMessage( id=f"msg_{random_uuid()}", content=[output_text], role="assistant", # if the parser still has messages (ie if the generator got cut # abruptly), this should be incomplete status="incomplete", type="message", ) return [text_item] return []