Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
394
vllm/entrypoints/openai/parser/harmony_utils.py
Normal file
394
vllm/entrypoints/openai/parser/harmony_utils.py
Normal file
@@ -0,0 +1,394 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
from collections.abc import Iterable, Sequence
|
||||
from typing import Literal
|
||||
|
||||
from openai.types.responses.tool import Tool
|
||||
from openai_harmony import (
|
||||
Author,
|
||||
Conversation,
|
||||
DeveloperContent,
|
||||
HarmonyEncodingName,
|
||||
Message,
|
||||
ReasoningEffort,
|
||||
Role,
|
||||
StreamableParser,
|
||||
SystemContent,
|
||||
TextContent,
|
||||
ToolDescription,
|
||||
load_harmony_encoding,
|
||||
)
|
||||
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
REASONING_EFFORT = {
|
||||
"high": ReasoningEffort.HIGH,
|
||||
"medium": ReasoningEffort.MEDIUM,
|
||||
"low": ReasoningEffort.LOW,
|
||||
}
|
||||
|
||||
_harmony_encoding = None
|
||||
|
||||
# Builtin tools that should be included in the system message when
|
||||
# they are available and requested by the user.
|
||||
# Tool args are provided by MCP tool descriptions. Output
|
||||
# of the tools are stringified.
|
||||
BUILTIN_TOOL_TO_MCP_SERVER_LABEL: dict[str, str] = {
|
||||
"python": "code_interpreter",
|
||||
"browser": "web_search_preview",
|
||||
"container": "container",
|
||||
}
|
||||
|
||||
# Derive MCP_BUILTIN_TOOLS from the canonical mapping
|
||||
MCP_BUILTIN_TOOLS: set[str] = set(BUILTIN_TOOL_TO_MCP_SERVER_LABEL.values())
|
||||
|
||||
|
||||
def has_custom_tools(tool_types: set[str]) -> bool:
|
||||
"""
|
||||
Checks if the given tool types are custom tools
|
||||
(i.e. any tool other than MCP buildin tools)
|
||||
"""
|
||||
return not tool_types.issubset(MCP_BUILTIN_TOOLS)
|
||||
|
||||
|
||||
def get_encoding():
|
||||
global _harmony_encoding
|
||||
if _harmony_encoding is None:
|
||||
_harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
|
||||
return _harmony_encoding
|
||||
|
||||
|
||||
def get_system_message(
|
||||
model_identity: str | None = None,
|
||||
reasoning_effort: Literal["high", "medium", "low"] | None = None,
|
||||
start_date: str | None = None,
|
||||
browser_description: str | None = None,
|
||||
python_description: str | None = None,
|
||||
container_description: str | None = None,
|
||||
instructions: str | None = None,
|
||||
with_custom_tools: bool = False,
|
||||
) -> Message:
|
||||
sys_msg_content = SystemContent.new()
|
||||
if model_identity is not None:
|
||||
sys_msg_content = sys_msg_content.with_model_identity(model_identity)
|
||||
if instructions is not None and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
|
||||
current_identity = sys_msg_content.model_identity
|
||||
new_identity = (
|
||||
f"{current_identity}\n{instructions}" if current_identity else instructions
|
||||
)
|
||||
sys_msg_content = sys_msg_content.with_model_identity(new_identity)
|
||||
if reasoning_effort is not None:
|
||||
sys_msg_content = sys_msg_content.with_reasoning_effort(
|
||||
REASONING_EFFORT[reasoning_effort]
|
||||
)
|
||||
if start_date is None:
|
||||
# NOTE(woosuk): This brings non-determinism in vLLM.
|
||||
# Set VLLM_SYSTEM_START_DATE to pin it.
|
||||
start_date = envs.VLLM_SYSTEM_START_DATE or datetime.datetime.now().strftime(
|
||||
"%Y-%m-%d"
|
||||
)
|
||||
sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
|
||||
if browser_description is not None:
|
||||
sys_msg_content = sys_msg_content.with_tools(browser_description)
|
||||
if python_description is not None:
|
||||
sys_msg_content = sys_msg_content.with_tools(python_description)
|
||||
if container_description is not None:
|
||||
sys_msg_content = sys_msg_content.with_tools(container_description)
|
||||
sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
|
||||
return sys_msg
|
||||
|
||||
|
||||
def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
|
||||
if isinstance(tool, ChatCompletionToolsParam):
|
||||
return ToolDescription.new(
|
||||
name=tool.function.name,
|
||||
description=tool.function.description,
|
||||
parameters=tool.function.parameters,
|
||||
)
|
||||
return ToolDescription.new(
|
||||
name=tool.name,
|
||||
description=tool.description,
|
||||
parameters=tool.parameters,
|
||||
)
|
||||
|
||||
|
||||
def get_developer_message(
|
||||
instructions: str | None = None,
|
||||
tools: list[Tool | ChatCompletionToolsParam] | None = None,
|
||||
) -> Message:
|
||||
dev_msg_content = DeveloperContent.new()
|
||||
if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
|
||||
dev_msg_content = dev_msg_content.with_instructions(instructions)
|
||||
if tools is not None:
|
||||
function_tools: list[Tool | ChatCompletionToolsParam] = []
|
||||
for tool in tools:
|
||||
if tool.type in (
|
||||
"web_search_preview",
|
||||
"code_interpreter",
|
||||
"container",
|
||||
):
|
||||
pass
|
||||
|
||||
elif tool.type == "function":
|
||||
function_tools.append(tool)
|
||||
else:
|
||||
raise ValueError(f"tool type {tool.type} not supported")
|
||||
if function_tools:
|
||||
function_tool_descriptions = [
|
||||
create_tool_definition(tool) for tool in function_tools
|
||||
]
|
||||
dev_msg_content = dev_msg_content.with_function_tools(
|
||||
function_tool_descriptions
|
||||
)
|
||||
dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
|
||||
return dev_msg
|
||||
|
||||
|
||||
def get_user_message(content: str) -> Message:
|
||||
return Message.from_role_and_content(Role.USER, content)
|
||||
|
||||
|
||||
def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]:
|
||||
"""
|
||||
Parse a list of messages from request.messages in the Chat Completion API to
|
||||
Harmony messages.
|
||||
"""
|
||||
msgs: list[Message] = []
|
||||
tool_id_names: dict[str, str] = {}
|
||||
|
||||
# Collect tool id to name mappings for tool response recipient values
|
||||
for chat_msg in chat_msgs:
|
||||
for tool_call in chat_msg.get("tool_calls", []):
|
||||
tool_id_names[tool_call.get("id")] = tool_call.get("function", {}).get(
|
||||
"name"
|
||||
)
|
||||
|
||||
for chat_msg in chat_msgs:
|
||||
msgs.extend(parse_chat_input_to_harmony_message(chat_msg, tool_id_names))
|
||||
|
||||
msgs = auto_drop_analysis_messages(msgs)
|
||||
return msgs
|
||||
|
||||
|
||||
def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]:
|
||||
"""
|
||||
Harmony models expect the analysis messages (representing raw chain of thought) to
|
||||
be dropped after an assistant message to the final channel is produced from the
|
||||
reasoning of those messages.
|
||||
|
||||
The openai-harmony library does this if the very last assistant message is to the
|
||||
final channel, but it does not handle the case where we're in longer multi-turn
|
||||
conversations and the client gave us reasoning content from previous turns of
|
||||
the conversation with multiple assistant messages to the final channel in the
|
||||
conversation.
|
||||
|
||||
So, we find the index of the last assistant message to the final channel and drop
|
||||
all analysis messages that precede it, leaving only the analysis messages that
|
||||
are relevant to the current part of the conversation.
|
||||
"""
|
||||
last_assistant_final_index = -1
|
||||
for i in range(len(msgs) - 1, -1, -1):
|
||||
msg = msgs[i]
|
||||
if msg.author.role == "assistant" and msg.channel == "final":
|
||||
last_assistant_final_index = i
|
||||
break
|
||||
|
||||
cleaned_msgs: list[Message] = []
|
||||
for i, msg in enumerate(msgs):
|
||||
if i < last_assistant_final_index and msg.channel == "analysis":
|
||||
continue
|
||||
cleaned_msgs.append(msg)
|
||||
|
||||
return cleaned_msgs
|
||||
|
||||
|
||||
def flatten_chat_text_content(content: str | list | None) -> str | None:
|
||||
"""
|
||||
Extract the text parts from a chat message content field and flatten them
|
||||
into a single string.
|
||||
"""
|
||||
if isinstance(content, list):
|
||||
return "".join(
|
||||
item.get("text", "")
|
||||
for item in content
|
||||
if isinstance(item, dict) and item.get("type") == "text"
|
||||
)
|
||||
return content
|
||||
|
||||
|
||||
def parse_chat_input_to_harmony_message(
|
||||
chat_msg, tool_id_names: dict[str, str] | None = None
|
||||
) -> list[Message]:
|
||||
"""
|
||||
Parse a message from request.messages in the Chat Completion API to
|
||||
Harmony messages.
|
||||
"""
|
||||
tool_id_names = tool_id_names or {}
|
||||
|
||||
if not isinstance(chat_msg, dict):
|
||||
# Handle Pydantic models
|
||||
chat_msg = chat_msg.model_dump(exclude_none=True)
|
||||
|
||||
role = chat_msg.get("role")
|
||||
msgs: list[Message] = []
|
||||
|
||||
# Assistant message with tool calls
|
||||
tool_calls = chat_msg.get("tool_calls", [])
|
||||
|
||||
if role == "assistant" and tool_calls:
|
||||
content = flatten_chat_text_content(chat_msg.get("content"))
|
||||
if content:
|
||||
commentary_msg = Message.from_role_and_content(Role.ASSISTANT, content)
|
||||
commentary_msg = commentary_msg.with_channel("commentary")
|
||||
msgs.append(commentary_msg)
|
||||
|
||||
reasoning = chat_msg.get("reasoning")
|
||||
if reasoning:
|
||||
analysis_msg = Message.from_role_and_content(Role.ASSISTANT, reasoning)
|
||||
analysis_msg = analysis_msg.with_channel("analysis")
|
||||
msgs.append(analysis_msg)
|
||||
|
||||
for call in tool_calls:
|
||||
func = call.get("function", {})
|
||||
name = func.get("name", "")
|
||||
arguments = func.get("arguments", "") or ""
|
||||
msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
|
||||
msg = msg.with_channel("commentary")
|
||||
msg = msg.with_recipient(f"functions.{name}")
|
||||
# Officially, this should be `<|constrain|>json` but there is not clear
|
||||
# evidence that improves accuracy over `json` and some anecdotes to the
|
||||
# contrary. Further testing of the different content_types is needed.
|
||||
msg = msg.with_content_type("json")
|
||||
msgs.append(msg)
|
||||
return msgs
|
||||
|
||||
# Tool role message (tool output)
|
||||
if role == "tool":
|
||||
tool_call_id = chat_msg.get("tool_call_id", "")
|
||||
name = tool_id_names.get(tool_call_id, "")
|
||||
content = chat_msg.get("content", "") or ""
|
||||
content = flatten_chat_text_content(content)
|
||||
|
||||
msg = (
|
||||
Message.from_author_and_content(
|
||||
Author.new(Role.TOOL, f"functions.{name}"), content
|
||||
)
|
||||
.with_channel("commentary")
|
||||
.with_recipient("assistant")
|
||||
)
|
||||
return [msg]
|
||||
|
||||
# Non-tool reasoning content
|
||||
reasoning = chat_msg.get("reasoning")
|
||||
if role == "assistant" and reasoning:
|
||||
analysis_msg = Message.from_role_and_content(Role.ASSISTANT, reasoning)
|
||||
analysis_msg = analysis_msg.with_channel("analysis")
|
||||
msgs.append(analysis_msg)
|
||||
|
||||
# Default: user/assistant/system messages with content
|
||||
content = chat_msg.get("content") or ""
|
||||
if content is None:
|
||||
content = ""
|
||||
if isinstance(content, str):
|
||||
contents = [TextContent(text=content)]
|
||||
else:
|
||||
# TODO: Support refusal.
|
||||
contents = [TextContent(text=c.get("text", "")) for c in content]
|
||||
|
||||
# Only add assistant messages if they have content, as reasoning or tool calling
|
||||
# assistant messages were already added above.
|
||||
if role == "assistant" and contents and contents[0].text:
|
||||
msg = Message.from_role_and_contents(role, contents)
|
||||
# Send non-tool assistant messages to the final channel
|
||||
msg = msg.with_channel("final")
|
||||
msgs.append(msg)
|
||||
# For user/system/developer messages, add them directly even if no content.
|
||||
elif role != "assistant":
|
||||
msg = Message.from_role_and_contents(role, contents)
|
||||
msgs.append(msg)
|
||||
|
||||
return msgs
|
||||
|
||||
|
||||
def render_for_completion(messages: list[Message]) -> list[int]:
|
||||
conversation = Conversation.from_messages(messages)
|
||||
token_ids = get_encoding().render_conversation_for_completion(
|
||||
conversation, Role.ASSISTANT
|
||||
)
|
||||
return token_ids
|
||||
|
||||
|
||||
def get_stop_tokens_for_assistant_actions() -> list[int]:
|
||||
return get_encoding().stop_tokens_for_assistant_actions()
|
||||
|
||||
|
||||
def get_streamable_parser_for_assistant() -> StreamableParser:
|
||||
return StreamableParser(get_encoding(), role=Role.ASSISTANT)
|
||||
|
||||
|
||||
def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser:
|
||||
parser = get_streamable_parser_for_assistant()
|
||||
for token_id in token_ids:
|
||||
parser.process(token_id)
|
||||
return parser
|
||||
|
||||
|
||||
def parse_chat_output(
|
||||
token_ids: Sequence[int],
|
||||
) -> tuple[str | None, str | None, bool]:
|
||||
"""
|
||||
Parse the output of a Harmony chat completion into reasoning and final content.
|
||||
Note that when the `openai` tool parser is used, serving_chat only uses this
|
||||
for the reasoning content and gets the final content from the tool call parser.
|
||||
|
||||
When the `openai` tool parser is not enabled, or when `GptOssReasoningParser` is
|
||||
in use,this needs to return the final content without any tool calls parsed.
|
||||
|
||||
Empty reasoning or final content is returned as None instead of an empty string.
|
||||
"""
|
||||
parser = parse_output_into_messages(token_ids)
|
||||
output_msgs = parser.messages
|
||||
is_tool_call = False # TODO: update this when tool call is supported
|
||||
|
||||
# Get completed messages from the parser
|
||||
# - analysis channel: hidden reasoning
|
||||
# - commentary channel without recipient (preambles): visible to user
|
||||
# - final channel: visible to user
|
||||
# - commentary with recipient (tool calls): handled separately by tool parser
|
||||
reasoning_texts = [
|
||||
msg.content[0].text for msg in output_msgs if msg.channel == "analysis"
|
||||
]
|
||||
final_texts = [
|
||||
msg.content[0].text
|
||||
for msg in output_msgs
|
||||
if msg.channel == "final" or (msg.channel == "commentary" and not msg.recipient)
|
||||
]
|
||||
|
||||
# Extract partial messages from the parser
|
||||
if parser.current_channel == "analysis" and parser.current_content:
|
||||
reasoning_texts.append(parser.current_content)
|
||||
elif parser.current_channel == "final" and parser.current_content:
|
||||
final_texts.append(parser.current_content)
|
||||
elif (
|
||||
parser.current_channel == "commentary"
|
||||
and not parser.current_recipient
|
||||
and parser.current_content
|
||||
):
|
||||
# Preambles (commentary without recipient) are visible to user
|
||||
final_texts.append(parser.current_content)
|
||||
|
||||
# Flatten multiple messages into a single string
|
||||
reasoning: str | None = "\n".join(reasoning_texts)
|
||||
final_content: str | None = "\n".join(final_texts)
|
||||
|
||||
# Return None instead of empty string since existing callers check for None
|
||||
reasoning = reasoning or None
|
||||
final_content = final_content or None
|
||||
|
||||
return reasoning, final_content, is_tool_call
|
||||
Reference in New Issue
Block a user