Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
171
vllm/entrypoints/openai/chat_completion/stream_harmony.py
Normal file
171
vllm/entrypoints/openai/chat_completion/stream_harmony.py
Normal file
@@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Harmony-specific streaming delta extraction for chat completions.
|
||||
|
||||
This module handles the extraction of DeltaMessage objects from
|
||||
harmony parser state during streaming chat completions.
|
||||
"""
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
from openai_harmony import StreamableParser
|
||||
|
||||
from vllm.entrypoints.chat_utils import make_tool_call_id
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
DeltaFunctionCall,
|
||||
DeltaMessage,
|
||||
DeltaToolCall,
|
||||
)
|
||||
|
||||
|
||||
class TokenState(NamedTuple):
|
||||
channel: str | None
|
||||
recipient: str | None
|
||||
text: str
|
||||
|
||||
|
||||
def extract_harmony_streaming_delta(
|
||||
harmony_parser: StreamableParser,
|
||||
token_states: list[TokenState],
|
||||
prev_recipient: str | None,
|
||||
include_reasoning: bool,
|
||||
) -> tuple[DeltaMessage | None, bool]:
|
||||
"""
|
||||
Extract a DeltaMessage from harmony parser state during streaming.
|
||||
|
||||
Args:
|
||||
harmony_parser: The StreamableParser instance tracking parse state
|
||||
token_states: List of TokenState tuples for each token
|
||||
prev_recipient: Previous recipient for detecting tool call transitions
|
||||
include_reasoning: Whether to include reasoning content
|
||||
|
||||
Returns:
|
||||
A tuple of (DeltaMessage or None, tools_streamed_flag)
|
||||
"""
|
||||
|
||||
if not token_states:
|
||||
return None, False
|
||||
|
||||
tools_streamed = False
|
||||
|
||||
# Group consecutive tokens with same channel/recipient
|
||||
groups: list[TokenState] = []
|
||||
|
||||
current_channel = token_states[0].channel
|
||||
current_recipient = token_states[0].recipient
|
||||
current_text = token_states[0].text
|
||||
|
||||
for i in range(1, len(token_states)):
|
||||
state = token_states[i]
|
||||
if state.channel == current_channel and state.recipient == current_recipient:
|
||||
current_text += state.text
|
||||
else:
|
||||
groups.append(TokenState(current_channel, current_recipient, current_text))
|
||||
current_channel = state.channel
|
||||
current_recipient = state.recipient
|
||||
current_text = state.text
|
||||
|
||||
groups.append(TokenState(current_channel, current_recipient, current_text))
|
||||
|
||||
# Process each group and create delta messages
|
||||
delta_message = None
|
||||
combined_content = ""
|
||||
combined_reasoning = ""
|
||||
tool_messages = []
|
||||
content_encountered = False
|
||||
|
||||
# Calculate base_index once before the loop
|
||||
# This counts completed tool calls in messages
|
||||
base_index = 0
|
||||
for msg in harmony_parser.messages:
|
||||
if (
|
||||
(msg.channel == "commentary" or msg.channel == "analysis")
|
||||
and msg.recipient
|
||||
and msg.recipient.startswith("functions.")
|
||||
):
|
||||
base_index += 1
|
||||
|
||||
# If there's an ongoing tool call from previous chunk,
|
||||
# the next new tool call starts at base_index + 1
|
||||
if prev_recipient and prev_recipient.startswith("functions."):
|
||||
next_tool_index = base_index + 1
|
||||
# Ongoing call is at base_index
|
||||
ongoing_tool_index = base_index
|
||||
else:
|
||||
# No ongoing call, next new call is at base_index
|
||||
next_tool_index = base_index
|
||||
ongoing_tool_index = None
|
||||
|
||||
for group in groups:
|
||||
if group.channel == "final":
|
||||
combined_content += group.text
|
||||
content_encountered = True
|
||||
elif (
|
||||
(group.channel == "commentary" or group.channel == "analysis")
|
||||
and group.recipient
|
||||
and group.recipient.startswith("functions.")
|
||||
):
|
||||
opened_new_call = False
|
||||
if prev_recipient != group.recipient:
|
||||
# New tool call - emit the opening message
|
||||
tool_name = group.recipient.split("functions.", 1)[1]
|
||||
tool_messages.append(
|
||||
DeltaToolCall(
|
||||
id=make_tool_call_id(),
|
||||
type="function",
|
||||
function=DeltaFunctionCall(
|
||||
name=tool_name,
|
||||
arguments="",
|
||||
),
|
||||
index=next_tool_index,
|
||||
)
|
||||
)
|
||||
opened_new_call = True
|
||||
prev_recipient = group.recipient
|
||||
# Increment for subsequent new tool calls
|
||||
next_tool_index += 1
|
||||
|
||||
if group.text:
|
||||
# Stream arguments for the ongoing tool call
|
||||
if opened_new_call:
|
||||
# Just opened in this group
|
||||
tool_call_index = next_tool_index - 1
|
||||
else:
|
||||
# Continuing from previous chunk
|
||||
# If ongoing_tool_index is None here, it means
|
||||
# we're continuing a call but prev_recipient
|
||||
# wasn't a function. Use base_index.
|
||||
tool_call_index = (
|
||||
ongoing_tool_index
|
||||
if ongoing_tool_index is not None
|
||||
else base_index
|
||||
)
|
||||
tool_messages.append(
|
||||
DeltaToolCall(
|
||||
index=tool_call_index,
|
||||
function=DeltaFunctionCall(arguments=group.text),
|
||||
)
|
||||
)
|
||||
elif group.channel == "commentary" and group.recipient is None:
|
||||
# Tool call preambles meant to be shown to the user
|
||||
combined_content += group.text
|
||||
content_encountered = True
|
||||
elif group.channel == "analysis" and include_reasoning:
|
||||
combined_reasoning += group.text
|
||||
|
||||
# Combine all non-empty fields into a single message
|
||||
if content_encountered or combined_reasoning or tool_messages:
|
||||
delta_kwargs: dict[str, str | list[DeltaToolCall]] = {}
|
||||
if content_encountered:
|
||||
delta_kwargs["content"] = combined_content
|
||||
if combined_reasoning:
|
||||
delta_kwargs["reasoning"] = combined_reasoning
|
||||
if tool_messages:
|
||||
delta_kwargs["tool_calls"] = tool_messages
|
||||
tools_streamed = True
|
||||
delta_message = DeltaMessage(**delta_kwargs)
|
||||
else:
|
||||
delta_message = None
|
||||
|
||||
return delta_message, tools_streamed
|
||||
Reference in New Issue
Block a user