Add minimal vLLM 0.16.1 build repo for BI-V150

2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions
--- a/vllm/entrypoints/openai/chat_completion/stream_harmony.py
+++ b/vllm/entrypoints/openai/chat_completion/stream_harmony.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Harmony-specific streaming delta extraction for chat completions.
+
+This module handles the extraction of DeltaMessage objects from
+harmony parser state during streaming chat completions.
+"""
+
+from typing import NamedTuple
+
+from openai_harmony import StreamableParser
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
+
+
+class TokenState(NamedTuple):
+    """Parser snapshot recorded for one streamed token.
+
+    Consecutive entries sharing the same ``channel`` and ``recipient`` are
+    merged by ``extract_harmony_streaming_delta`` before deltas are built.
+    """
+
+    # Channel associated with the token (this module checks for "final",
+    # "commentary" and "analysis"); None when no channel is set.
+    channel: str | None
+    # Recipient associated with the token; values starting with "functions."
+    # are treated as tool calls. None when there is no recipient.
+    recipient: str | None
+    # Text fragment contributed by this token.
+    text: str
+
+
+def _is_function_call(channel: str | None, recipient: str | None) -> bool:
+    """Return True if a channel/recipient pair denotes a function tool call."""
+    return (
+        channel in ("commentary", "analysis")
+        and bool(recipient)
+        and recipient.startswith("functions.")
+    )
+
+
+def extract_harmony_streaming_delta(
+    harmony_parser: StreamableParser,
+    token_states: list[TokenState],
+    prev_recipient: str | None,
+    include_reasoning: bool,
+) -> tuple[DeltaMessage | None, bool]:
+    """
+    Extract a DeltaMessage from harmony parser state during streaming.
+
+    Consecutive tokens sharing a channel and recipient are merged into
+    groups, and each group is routed as follows (first match wins):
+
+    * ``final`` channel -> ``content``
+    * ``commentary``/``analysis`` channel with a ``functions.*`` recipient
+      -> ``tool_calls`` (an opening entry when the recipient changes,
+      followed by argument fragments)
+    * ``commentary`` channel without a recipient -> ``content``
+    * ``analysis`` channel -> ``reasoning`` (only if ``include_reasoning``)
+
+    Args:
+        harmony_parser: The StreamableParser instance tracking parse state.
+            Only its ``messages`` are read, to count the tool calls that
+            have already been completed.
+        token_states: List of TokenState tuples for each token
+        prev_recipient: Previous recipient for detecting tool call
+            transitions. It is only rebound locally; the caller's value is
+            not modified.
+        include_reasoning: Whether to include reasoning content
+
+    Returns:
+        A tuple of (DeltaMessage or None, tools_streamed_flag). The message
+        is None when the chunk produced no content, reasoning or tool-call
+        output; the flag is True when at least one tool-call delta was
+        emitted.
+    """
+    if not token_states:
+        return None, False
+
+    # Group consecutive tokens with same channel/recipient
+    groups: list[TokenState] = []
+    first = token_states[0]
+    channel, recipient, text = first.channel, first.recipient, first.text
+    for state in token_states[1:]:
+        if state.channel == channel and state.recipient == recipient:
+            text += state.text
+        else:
+            groups.append(TokenState(channel, recipient, text))
+            channel, recipient, text = state.channel, state.recipient, state.text
+    groups.append(TokenState(channel, recipient, text))
+
+    # Number of tool-call messages already present in the parser's messages.
+    base_index = sum(
+        1
+        for msg in harmony_parser.messages
+        if _is_function_call(msg.channel, msg.recipient)
+    )
+
+    # If a tool call is still in progress from the previous chunk it sits at
+    # base_index and the next new call goes to base_index + 1; otherwise the
+    # next new call takes base_index itself.
+    active_tool_index: int | None
+    if prev_recipient and prev_recipient.startswith("functions."):
+        active_tool_index = base_index
+        next_tool_index = base_index + 1
+    else:
+        active_tool_index = None
+        next_tool_index = base_index
+
+    combined_content = ""
+    combined_reasoning = ""
+    tool_messages: list[DeltaToolCall] = []
+    # Tracked separately from combined_content so that an empty-text content
+    # group still yields content="" in the delta.
+    content_encountered = False
+
+    for group in groups:
+        if group.channel == "final":
+            combined_content += group.text
+            content_encountered = True
+        elif _is_function_call(group.channel, group.recipient):
+            if prev_recipient != group.recipient:
+                # New tool call - emit the opening message
+                tool_name = group.recipient.split("functions.", 1)[1]
+                tool_messages.append(
+                    DeltaToolCall(
+                        id=make_tool_call_id(),
+                        type="function",
+                        function=DeltaFunctionCall(
+                            name=tool_name,
+                            arguments="",
+                        ),
+                        index=next_tool_index,
+                    )
+                )
+                # Remember which call is now active so that argument
+                # fragments in this group AND in later groups of the same
+                # chunk are attributed to it, not to the call that was
+                # active before the loop started.
+                active_tool_index = next_tool_index
+                prev_recipient = group.recipient
+                # Increment for subsequent new tool calls
+                next_tool_index += 1
+
+            if group.text:
+                # Stream arguments for the currently active tool call.
+                # active_tool_index is always set on this path (either from
+                # prev_recipient or from the opening above); the base_index
+                # fallback is purely defensive.
+                tool_messages.append(
+                    DeltaToolCall(
+                        index=(
+                            active_tool_index
+                            if active_tool_index is not None
+                            else base_index
+                        ),
+                        function=DeltaFunctionCall(arguments=group.text),
+                    )
+                )
+        elif group.channel == "commentary" and group.recipient is None:
+            # Tool call preambles meant to be shown to the user
+            combined_content += group.text
+            content_encountered = True
+        elif group.channel == "analysis" and include_reasoning:
+            combined_reasoning += group.text
+
+    if not (content_encountered or combined_reasoning or tool_messages):
+        return None, False
+
+    # Combine all non-empty fields into a single message
+    delta_kwargs: dict[str, str | list[DeltaToolCall]] = {}
+    if content_encountered:
+        delta_kwargs["content"] = combined_content
+    if combined_reasoning:
+        delta_kwargs["reasoning"] = combined_reasoning
+    if tool_messages:
+        delta_kwargs["tool_calls"] = tool_messages
+
+    return DeltaMessage(**delta_kwargs), bool(tool_messages)