395 lines
14 KiB
Python
395 lines
14 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import datetime
|
|
from collections.abc import Iterable, Sequence
|
|
from typing import Literal
|
|
|
|
from openai.types.responses.tool import Tool
|
|
from openai_harmony import (
|
|
Author,
|
|
Conversation,
|
|
DeveloperContent,
|
|
HarmonyEncodingName,
|
|
Message,
|
|
ReasoningEffort,
|
|
Role,
|
|
StreamableParser,
|
|
SystemContent,
|
|
TextContent,
|
|
ToolDescription,
|
|
load_harmony_encoding,
|
|
)
|
|
|
|
from vllm import envs
|
|
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
|
|
from vllm.logger import init_logger
|
|
|
|
logger = init_logger(__name__)

# Translates the reasoning-effort strings accepted by callers into the
# corresponding openai_harmony ReasoningEffort members.
REASONING_EFFORT = dict(
    high=ReasoningEffort.HIGH,
    medium=ReasoningEffort.MEDIUM,
    low=ReasoningEffort.LOW,
)

# Module-level cache for the Harmony encoding; filled in by get_encoding()
# the first time it is called.
_harmony_encoding = None
|
|
|
|
# Builtin tools that should be included in the system message when they
# are available and requested by the user, mapped to their MCP server
# label. Tool args are provided by the MCP tool descriptions, and the
# output of each tool is stringified.
BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = dict(
    python="code_interpreter",
    browser="web_search_preview",
    container="container",
)

# MCP server labels of the builtin tools, derived from the canonical
# mapping so that the two can never drift apart.
MCP_BUILTIN_TOOLS: set[str] = {*BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values()}
|
|
|
|
|
|
def has_custom_tools(tool_types: set[str]) -> bool:
    """
    Report whether the given tool types include at least one custom tool.

    A tool type is considered custom when it is not one of the MCP builtin
    tool labels listed in ``MCP_BUILTIN_TOOLS``. An empty set therefore
    yields ``False``.
    """
    custom_tool_types = tool_types - MCP_BUILTIN_TOOLS
    return bool(custom_tool_types)
|
|
|
|
|
|
def get_encoding():
    """
    Return the Harmony GPT-OSS encoding, loading it on the first call.

    The loaded encoding is stored in the module-level ``_harmony_encoding``
    so that later calls reuse the same object.
    """
    global _harmony_encoding
    if _harmony_encoding is not None:
        return _harmony_encoding
    _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    return _harmony_encoding
|
|
|
|
|
|
def get_system_message(
    model_identity: str | None = None,
    reasoning_effort: Literal["high", "medium", "low"] | None = None,
    start_date: str | None = None,
    browser_description: str | None = None,
    python_description: str | None = None,
    container_description: str | None = None,
    instructions: str | None = None,
    with_custom_tools: bool = False,
) -> Message:
    """
    Build the Harmony system message for a conversation.

    Args:
        model_identity: Replaces the model identity of the system content
            when given.
        reasoning_effort: Looked up in ``REASONING_EFFORT`` and applied to
            the system content when given.
        start_date: Conversation start date. When omitted,
            ``VLLM_SYSTEM_START_DATE`` is used, falling back to today's
            local date formatted as ``YYYY-MM-DD``.
        browser_description: Browser tool description, registered through
            ``with_tools`` when given.
        python_description: Python tool description, registered through
            ``with_tools`` when given.
        container_description: Container tool description, registered
            through ``with_tools`` when given.
        instructions: Appended to the model identity on a new line (or used
            as the identity when there is none), but only when
            ``VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS`` is enabled.
        with_custom_tools: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        A message with the system role wrapping the assembled content.
    """
    content = SystemContent.new()

    if model_identity is not None:
        content = content.with_model_identity(model_identity)

    if instructions is not None and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
        identity = content.model_identity
        if identity:
            identity = f"{identity}\n{instructions}"
        else:
            identity = instructions
        content = content.with_model_identity(identity)

    if reasoning_effort is not None:
        effort = REASONING_EFFORT[reasoning_effort]
        content = content.with_reasoning_effort(effort)

    if start_date is None:
        # NOTE(woosuk): This brings non-determinism in vLLM.
        # Set VLLM_SYSTEM_START_DATE to pin it.
        start_date = envs.VLLM_SYSTEM_START_DATE
        if not start_date:
            start_date = datetime.datetime.now().strftime("%Y-%m-%d")
    content = content.with_conversation_start_date(start_date)

    # Register the builtin tools in a fixed order: browser, python, container.
    for tool_description in (
        browser_description,
        python_description,
        container_description,
    ):
        if tool_description is not None:
            content = content.with_tools(tool_description)

    return Message.from_role_and_content(Role.SYSTEM, content)
|
|
|
|
|
|
def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
    """
    Convert a function tool into a Harmony ``ToolDescription``.

    Chat Completion tools carry their name, description and parameters on
    ``tool.function``; any other tool is read from the tool object itself.
    """
    spec = tool.function if isinstance(tool, ChatCompletionToolsParam) else tool
    return ToolDescription.new(
        name=spec.name,
        description=spec.description,
        parameters=spec.parameters,
    )
|
|
|
|
|
|
def get_developer_message(
    instructions: str | None = None,
    tools: list[Tool | ChatCompletionToolsParam] | None = None,
) -> Message:
    """
    Build the Harmony developer message for a conversation.

    Args:
        instructions: Added as developer instructions, unless
            ``VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS`` is enabled.
        tools: Tools from the request. Tools of type ``function`` are
            described in the developer message; the builtin types
            (``web_search_preview``, ``code_interpreter``, ``container``)
            are skipped here.

    Returns:
        A message with the developer role wrapping the assembled content.

    Raises:
        ValueError: If a tool has a type other than the ones listed above.
    """
    content = DeveloperContent.new()

    if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
        content = content.with_instructions(instructions)

    if tools is not None:
        skipped_builtin_types = (
            "web_search_preview",
            "code_interpreter",
            "container",
        )
        function_tools: list[Tool | ChatCompletionToolsParam] = []
        for tool in tools:
            tool_type = tool.type
            if tool_type == "function":
                function_tools.append(tool)
            elif tool_type not in skipped_builtin_types:
                raise ValueError(f"tool type {tool_type} not supported")

        if function_tools:
            descriptions = list(map(create_tool_definition, function_tools))
            content = content.with_function_tools(descriptions)

    return Message.from_role_and_content(Role.DEVELOPER, content)
|
|
|
|
|
|
def get_user_message(content: str) -> Message:
    """Wrap ``content`` in a Harmony message authored by the user role."""
    user_msg = Message.from_role_and_content(Role.USER, content)
    return user_msg
|
|
|
|
|
|
def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]:
    """
    Parse a list of messages from request.messages in the Chat Completion API to
    Harmony messages.

    Args:
        chat_msgs: Chat messages, each either a dict or a Pydantic model.
            Models are converted with ``model_dump(exclude_none=True)``
            before they are inspected.

    Returns:
        The Harmony messages for the whole conversation, with analysis
        messages that precede the last assistant ``final`` message removed.
    """
    # Normalize once so that both passes below see plain dicts.
    normalized_msgs = [
        chat_msg
        if isinstance(chat_msg, dict)
        else chat_msg.model_dump(exclude_none=True)
        for chat_msg in chat_msgs
    ]

    # Collect tool id to name mappings for tool response recipient values.
    # `or` guards cover keys that are present but explicitly set to None.
    tool_id_names: dict[str, str] = {}
    for chat_msg in normalized_msgs:
        for tool_call in chat_msg.get("tool_calls") or []:
            function = tool_call.get("function") or {}
            tool_id_names[tool_call.get("id")] = function.get("name")

    msgs: list[Message] = []
    for chat_msg in normalized_msgs:
        msgs.extend(parse_chat_input_to_harmony_message(chat_msg, tool_id_names))

    return auto_drop_analysis_messages(msgs)
|
|
|
|
|
|
def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]:
    """
    Remove analysis messages that belong to already-answered turns.

    Harmony models expect analysis messages (the raw chain of thought) to be
    dropped once an assistant message to the final channel has been produced
    from them. The openai-harmony library does this when the very last
    assistant message is to the final channel, but not for longer multi-turn
    conversations in which the client sent back reasoning from earlier turns
    and several assistant final messages exist.

    This function locates the last assistant message on the final channel
    and drops every analysis message before it. Analysis messages at or
    after that position are kept, as is everything when no such final
    message exists. The input list is not modified.
    """
    # Index of the last assistant/final message, or -1 when there is none.
    cutoff = next(
        (
            idx
            for idx in range(len(msgs) - 1, -1, -1)
            if msgs[idx].author.role == "assistant" and msgs[idx].channel == "final"
        ),
        -1,
    )

    return [
        msg
        for idx, msg in enumerate(msgs)
        if idx >= cutoff or msg.channel != "analysis"
    ]
|
|
|
|
|
|
def flatten_chat_text_content(content: str | list | None) -> str | None:
|
|
"""
|
|
Extract the text parts from a chat message content field and flatten them
|
|
into a single string.
|
|
"""
|
|
if isinstance(content, list):
|
|
return "".join(
|
|
item.get("text", "")
|
|
for item in content
|
|
if isinstance(item, dict) and item.get("type") == "text"
|
|
)
|
|
return content
|
|
|
|
|
|
def parse_chat_input_to_harmony_message(
    chat_msg, tool_id_names: dict[str, str] | None = None
) -> list[Message]:
    """
    Parse a message from request.messages in the Chat Completion API to
    Harmony messages.

    Args:
        chat_msg: A chat message as a dict or a Pydantic model. Models are
            converted with ``model_dump(exclude_none=True)``.
        tool_id_names: Mapping from tool call id to function name, used to
            name the author of tool output messages.

    Returns:
        Zero or more Harmony messages:

        * Assistant message with tool calls: an optional commentary message
          for the text content, an optional analysis message for
          ``reasoning``, then one commentary message per tool call
          addressed to ``functions.<name>``.
        * Tool message: one commentary message authored by the tool and
          addressed to the assistant.
        * Any other assistant message: an optional analysis message for
          ``reasoning`` and, when any content part has text, a message on
          the final channel.
        * Any other role: one message with the content, even when empty.
    """
    tool_id_names = tool_id_names or {}

    if not isinstance(chat_msg, dict):
        # Handle Pydantic models
        chat_msg = chat_msg.model_dump(exclude_none=True)

    role = chat_msg.get("role")
    msgs: list[Message] = []

    # Assistant message with tool calls
    tool_calls = chat_msg.get("tool_calls") or []
    if role == "assistant" and tool_calls:
        content = flatten_chat_text_content(chat_msg.get("content"))
        if content:
            commentary_msg = Message.from_role_and_content(
                Role.ASSISTANT, content
            ).with_channel("commentary")
            msgs.append(commentary_msg)

        reasoning = chat_msg.get("reasoning")
        if reasoning:
            analysis_msg = Message.from_role_and_content(
                Role.ASSISTANT, reasoning
            ).with_channel("analysis")
            msgs.append(analysis_msg)

        for call in tool_calls:
            # `or` guards cover keys that are present but set to None.
            func = call.get("function") or {}
            name = func.get("name") or ""
            arguments = func.get("arguments") or ""
            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
            msg = msg.with_channel("commentary")
            msg = msg.with_recipient(f"functions.{name}")
            # Officially, this should be `<|constrain|>json` but there is not clear
            # evidence that improves accuracy over `json` and some anecdotes to the
            # contrary. Further testing of the different content_types is needed.
            msg = msg.with_content_type("json")
            msgs.append(msg)
        return msgs

    # Tool role message (tool output)
    if role == "tool":
        tool_call_id = chat_msg.get("tool_call_id", "")
        name = tool_id_names.get(tool_call_id, "")
        content = flatten_chat_text_content(chat_msg.get("content") or "")

        msg = (
            Message.from_author_and_content(
                Author.new(Role.TOOL, f"functions.{name}"), content
            )
            .with_channel("commentary")
            .with_recipient("assistant")
        )
        return [msg]

    # Non-tool reasoning content
    reasoning = chat_msg.get("reasoning")
    if role == "assistant" and reasoning:
        analysis_msg = Message.from_role_and_content(
            Role.ASSISTANT, reasoning
        ).with_channel("analysis")
        msgs.append(analysis_msg)

    # Default: user/assistant/system messages with content.
    # A missing or None content is treated as an empty string.
    content = chat_msg.get("content") or ""
    if isinstance(content, str):
        contents = [TextContent(text=content)]
    else:
        # TODO: Support refusal.
        contents = [TextContent(text=c.get("text") or "") for c in content]

    if role == "assistant":
        # Only add assistant messages if they have content, as reasoning or tool
        # calling assistant messages were already added above. Every part is
        # checked so that an empty leading part does not hide later text.
        if any(part.text for part in contents):
            msg = Message.from_role_and_contents(role, contents)
            # Send non-tool assistant messages to the final channel
            msg = msg.with_channel("final")
            msgs.append(msg)
    else:
        # For user/system/developer messages, add them directly even if no content.
        msgs.append(Message.from_role_and_contents(role, contents))

    return msgs
|
|
|
|
|
|
def render_for_completion(messages: list[Message]) -> list[int]:
    """
    Render Harmony messages into prompt token ids for a completion.

    The messages are wrapped in a ``Conversation`` and rendered with the
    shared encoding, with the assistant as the role that speaks next.
    """
    conversation = Conversation.from_messages(messages)
    encoding = get_encoding()
    return encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)
|
|
|
|
|
|
def get_stop_tokens_for_assistant_actions() -> list[int]:
    """Return the shared encoding's stop tokens for assistant actions."""
    encoding = get_encoding()
    return encoding.stop_tokens_for_assistant_actions()
|
|
|
|
|
|
def get_streamable_parser_for_assistant() -> StreamableParser:
    """Create a new ``StreamableParser`` on the shared encoding for the assistant role."""
    encoding = get_encoding()
    return StreamableParser(encoding, role=Role.ASSISTANT)
|
|
|
|
|
|
def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser:
    """
    Feed generated token ids through a fresh assistant parser.

    Returns the parser itself rather than a list of messages, so callers
    can read both its completed messages and its in-progress state.
    """
    parser = get_streamable_parser_for_assistant()
    feed = parser.process
    for token_id in token_ids:
        feed(token_id)
    return parser
|
|
|
|
|
|
def parse_chat_output(
    token_ids: Sequence[int],
) -> tuple[str | None, str | None, bool]:
    """
    Parse the output of a Harmony chat completion into reasoning and final content.

    Note that when the `openai` tool parser is used, serving_chat only uses this
    for the reasoning content and gets the final content from the tool call parser.

    When the `openai` tool parser is not enabled, or when `GptOssReasoningParser`
    is in use, this needs to return the final content without any tool calls
    parsed.

    Messages are sorted by channel:
      - analysis channel: hidden reasoning
      - commentary channel without recipient (preambles): visible to user
      - final channel: visible to user
      - commentary with recipient (tool calls): handled separately by tool parser

    Returns:
        A ``(reasoning, final_content, is_tool_call)`` tuple. Multiple texts
        of the same kind are joined with newlines. Empty reasoning or final
        content is returned as None instead of an empty string, and
        ``is_tool_call`` is currently always False.
    """
    parser = parse_output_into_messages(token_ids)
    is_tool_call = False  # TODO: update this when tool call is supported

    reasoning_texts: list[str] = []
    final_texts: list[str] = []

    # Completed messages: only the first content part of each is used.
    for msg in parser.messages:
        channel = msg.channel
        if channel == "analysis":
            reasoning_texts.append(msg.content[0].text)
        elif channel == "final" or (channel == "commentary" and not msg.recipient):
            final_texts.append(msg.content[0].text)

    # Partial message still being parsed: pick its destination from the
    # channel first, and only then read the pending content.
    current_channel = parser.current_channel
    if current_channel == "analysis":
        destination = reasoning_texts
    elif current_channel == "final" or (
        current_channel == "commentary" and not parser.current_recipient
    ):
        # Preambles (commentary without recipient) are visible to user
        destination = final_texts
    else:
        destination = None

    if destination is not None:
        pending = parser.current_content
        if pending:
            destination.append(pending)

    # Flatten multiple messages into a single string; return None instead of
    # an empty string since existing callers check for None.
    reasoning: str | None = "\n".join(reasoning_texts) or None
    final_content: str | None = "\n".join(final_texts) or None

    return reasoning, final_content, is_tool_call
|