update

2026-04-09 11:23:47 +08:00
parent 8082d5f4b2
commit 72387e4fa8
1885 changed files with 611521 additions and 1 deletions
--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -0,0 +1,541 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from abc import abstractmethod
+from collections.abc import Sequence
+from functools import cached_property
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+    ToolChoiceFunction,
+)
+from openai.types.responses.response_output_text import Logprob
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from pydantic import TypeAdapter
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedToolChoiceParam,
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaMessage,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    FunctionDefinition,
+)
+from vllm.entrypoints.openai.responses.protocol import (
+    ResponsesRequest,
+)
+from vllm.logger import init_logger
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers.abstract_tool_parser import ToolParser
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)
+
+
+class Parser:
+    """
+    Abstract Parser class that unifies ReasoningParser and ToolParser into
+    a single interface for parsing model output.
+
+    This class provides a unified way to handle both reasoning extraction
+    (e.g., chain-of-thought content in <think> tags) and tool call extraction
+    (e.g., function calls in XML/JSON format) from model outputs.
+
+    Subclasses can either:
+    1. Override the abstract methods directly for custom parsing logic
+    2. Set `reasoning_parser` and `tool_parser` properties to delegate to
+       existing parser implementations
+
+    Class Attributes:
+        reasoning_parser_cls: The ReasoningParser class to use (for compatibility
+            with code that needs the class, not instance).
+        tool_parser_cls: The ToolParser class to use (for compatibility with
+            code that needs the class, not instance).
+    """
+
+    # Class-level parser classes for compatibility with existing patterns
+    # Subclasses should override these if they use specific parser classes
+    reasoning_parser_cls: type[ReasoningParser] | None = None
+    tool_parser_cls: type[ToolParser] | None = None
+
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        """
+        Initialize the Parser.
+
+        Args:
+            tokenizer: The tokenizer used by the model. This is required for
+                token-based parsing operations.
+        """
+        self.model_tokenizer = tokenizer
+        self._reasoning_parser: ReasoningParser | None = None
+        self._tool_parser: ToolParser | None = None
+
+    @cached_property
+    def vocab(self) -> dict[str, int]:
+        """Get the vocabulary mapping from tokens to IDs."""
+        return self.model_tokenizer.get_vocab()
+
+    @property
+    def reasoning_parser(self) -> ReasoningParser | None:
+        """The underlying reasoning parser, if any."""
+        return self._reasoning_parser
+
+    @reasoning_parser.setter
+    def reasoning_parser(self, parser: ReasoningParser | None) -> None:
+        self._reasoning_parser = parser
+
+    @property
+    def tool_parser(self) -> ToolParser | None:
+        """The underlying tool parser, if any."""
+        return self._tool_parser
+
+    @tool_parser.setter
+    def tool_parser(self, parser: ToolParser | None) -> None:
+        self._tool_parser = parser
+
+    # ========== Reasoning Parser Methods ==========
+
+    @abstractmethod
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        """
+        Check if the reasoning content ends in the input_ids.
+
+        Used by structured engines like `xgrammar` to check if the
+        reasoning content ends in the model output.
+
+        Args:
+            input_ids: The token IDs of the model output.
+
+        Returns:
+            True if the reasoning content ends in the input_ids.
+        """
+
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        """
+        Check if the reasoning content ends during a decode step.
+
+        Args:
+            input_ids: The entire model output token IDs.
+            delta_ids: The last few computed tokens at the current decode step.
+
+        Returns:
+            True if the reasoning content ends in the delta_ids.
+        """
+        return self.is_reasoning_end(input_ids)
+
+    @abstractmethod
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract content token IDs from the input_ids.
+
+        This extracts the non-reasoning content (e.g., everything after
+        the </think> tag).
+
+        Args:
+            input_ids: The token IDs of the model output.
+
+        Returns:
+            The extracted content token IDs.
+        """
+
+    @abstractmethod
+    def extract_response_outputs(
+        self,
+        model_output: str,
+        request: ResponsesRequest,
+        enable_auto_tools: bool = False,
+        tool_call_id_type: str = "random",
+        logprobs: list[Logprob] | None = None,
+    ) -> list[ResponseOutputItem]:
+        """
+        Extract reasoning, content, and tool calls from a complete
+        model-generated string and return as ResponseOutputItem objects.
+
+        Used for non-streaming responses where we have the entire model
+        response available before sending to the client.
+
+        Args:
+            model_output: The complete model-generated string.
+            request: The request object used to generate the output.
+            enable_auto_tools: Whether to enable automatic tool call parsing.
+            tool_call_id_type: Type of tool call ID generation ("random", etc).
+            logprobs: Pre-computed logprobs for the output text, if any.
+
+        Returns:
+            A list of ResponseOutputItem objects.
+        """
+
+    @abstractmethod
+    def extract_reasoning(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest | ResponsesRequest,
+    ) -> tuple[str | None, str | None]:
+        """
+        Extract reasoning content from a complete model-generated string.
+
+        Used for non-streaming responses where we have the entire model
+        response available before sending to the client.
+
+        Args:
+            model_output: The complete model-generated string.
+            request: The request object used to generate the output.
+
+        Returns:
+            A tuple of (reasoning_content, response_content).
+        """
+
+    @abstractmethod
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        """
+        Extract reasoning content from a streaming delta message.
+
+        Args:
+            previous_text: Text from all previous tokens.
+            current_text: Text including the current delta.
+            delta_text: The new text in this delta.
+            previous_token_ids: Token IDs from previous generation.
+            current_token_ids: All token IDs including current.
+            delta_token_ids: The new token IDs in this delta.
+
+        Returns:
+            A DeltaMessage with reasoning and/or content fields, or None.
+        """
+
+    # ========== Tool Parser Methods ==========
+
+    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        """
+        Adjust the request parameters for tool calling.
+
+        Can be overridden by subclasses to modify request parameters
+        (e.g., setting structured output schemas for tool calling).
+
+        Args:
+            request: The original request.
+
+        Returns:
+            The adjusted request.
+        """
+        return request
+
+    @abstractmethod
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """
+        Extract tool calls from a complete model-generated string.
+
+        Used for non-streaming responses.
+
+        Args:
+            model_output: The complete model-generated string.
+            request: The request object used to generate the output.
+
+        Returns:
+            ExtractedToolCallInformation containing the tool calls.
+        """
+
+    @abstractmethod
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        """
+        Extract tool calls from a streaming delta message.
+
+        Args:
+            previous_text: Text from all previous tokens.
+            current_text: Text including the current delta.
+            delta_text: The new text in this delta.
+            previous_token_ids: Token IDs from previous generation.
+            current_token_ids: All token IDs including current.
+            delta_token_ids: The new token IDs in this delta.
+            request: The request object.
+
+        Returns:
+            A DeltaMessage with tool_calls field, or None.
+        """
+
+
+class DelegatingParser(Parser):
+    """
+    A Parser implementation that delegates to separate ReasoningParser and
+    ToolParser instances.
+
+    This is the recommended base class for creating model-specific parsers
+    that combine existing reasoning and tool parser implementations.
+    Subclasses should set `self._reasoning_parser` and `self._tool_parser`
+    in their `__init__` method.
+
+    If either parser is None, the corresponding methods will return default
+    values (no reasoning extraction, no tool calls).
+    """
+
+    def extract_reasoning(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest | ResponsesRequest,
+    ) -> tuple[str | None, str | None]:
+        if self._reasoning_parser is None:
+            return None, model_output
+        return self._reasoning_parser.extract_reasoning(model_output, request)
+
+    def extract_response_outputs(
+        self,
+        model_output: str,
+        request: ResponsesRequest,
+        enable_auto_tools: bool = False,
+        tool_call_id_type: str = "random",
+        logprobs: list[Logprob] | None = None,
+    ) -> list[ResponseOutputItem]:
+        # First extract reasoning
+        reasoning, content = self.extract_reasoning(model_output, request)
+
+        # Then parse tool calls from the content
+        tool_calls, content = self._parse_tool_calls(
+            request=request,
+            content=content,
+            enable_auto_tools=enable_auto_tools,
+        )
+
+        # Build output items
+        outputs: list[ResponseOutputItem] = []
+
+        # Add reasoning item if present
+        if reasoning:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
+                ],
+                status=None,  # NOTE: Only the last output item has status.
+            )
+            outputs.append(reasoning_item)
+
+        # Add message item if there's content
+        if content:
+            res_text_part = ResponseOutputText(
+                text=content,
+                annotations=[],
+                type="output_text",
+                logprobs=logprobs,
+            )
+            message_item = ResponseOutputMessage(
+                id=f"msg_{random_uuid()}",
+                content=[res_text_part],
+                role="assistant",
+                status="completed",
+                type="message",
+            )
+            outputs.append(message_item)
+
+        if tool_calls:
+            # We use a simple counter for history_tool_call_count because
+            # we don't track the history of tool calls in the Responses API yet.
+            # This means that the tool call index will start from 0 for each
+            # request.
+            for history_tool_call_cnt, tool_call in enumerate(tool_calls):
+                tool_call_item = ResponseFunctionToolCall(
+                    id=f"fc_{random_uuid()}",
+                    call_id=tool_call.id
+                    if tool_call.id
+                    else make_tool_call_id(
+                        id_type=tool_call_id_type,
+                        func_name=tool_call.name,
+                        idx=history_tool_call_cnt,
+                    ),
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                )
+                outputs.append(tool_call_item)
+
+        return outputs
+
+    def _parse_tool_calls(
+        self,
+        request: ResponsesRequest,
+        content: str | None,
+        enable_auto_tools: bool,
+    ) -> tuple[list[FunctionCall], str | None]:
+        """
+        TODO(qandrew): merge _parse_tool_calls_from_content
+        for ChatCompletions into this function
+        Parse tool calls from content based on request tool_choice settings.
+
+        Returns:
+            A tuple of (function_calls, remaining_content) if tool calls
+            were parsed
+        """
+        function_calls: list[FunctionCall] = []
+
+        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
+            # Forced Function Call (Responses API style)
+            assert content is not None
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.name, arguments=content)
+            )
+            return function_calls, None  # Clear content since tool is called.
+
+        if request.tool_choice and isinstance(
+            request.tool_choice, ChatCompletionNamedToolChoiceParam
+        ):
+            # Forced Function Call (Chat Completion API style)
+            assert content is not None
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.function.name, arguments=content)
+            )
+            return function_calls, None  # Clear content since tool is called.
+
+        if request.tool_choice == "required":
+            # Required tool calls - parse JSON
+            assert content is not None
+            tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
+            function_calls.extend(
+                FunctionCall(
+                    name=tool_call.name,
+                    arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
+                )
+                for tool_call in tool_calls
+            )
+            return function_calls, None  # Clear content since tool is called.
+
+        if (
+            self._tool_parser is not None
+            and enable_auto_tools
+            and (request.tool_choice == "auto" or request.tool_choice is None)
+        ):
+            # Automatic Tool Call Parsing
+            tool_call_info = self._tool_parser.extract_tool_calls(
+                content if content is not None else "",
+                request=request,  # type: ignore
+            )
+            if tool_call_info is not None and tool_call_info.tools_called:
+                function_calls.extend(
+                    FunctionCall(
+                        id=tool_call.id,
+                        name=tool_call.function.name,
+                        arguments=tool_call.function.arguments,
+                    )
+                    for tool_call in tool_call_info.tool_calls
+                )
+                remaining_content = tool_call_info.content
+                if remaining_content and remaining_content.strip() == "":
+                    remaining_content = None
+                return function_calls, remaining_content
+
+        # No tool calls
+        return [], content
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        if self._reasoning_parser is None:
+            return DeltaMessage(content=delta_text)
+        return self._reasoning_parser.extract_reasoning_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+        )
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        if self._tool_parser is None:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=model_output
+            )
+        return self._tool_parser.extract_tool_calls(model_output, request)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        if self._tool_parser is None:
+            return None
+        return self._tool_parser.extract_tool_calls_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+            request,
+        )
+
+
+class _WrappedParser(DelegatingParser):
+    """
+    A DelegatingParser subclass that instantiates parsers from class attributes.
+
+    This class is used to dynamically create a parser that wraps individual
+    ReasoningParser and ToolParser classes. The class attributes
+    `reasoning_parser_cls` and `tool_parser_cls` should be set before
+    instantiation.
+
+    Usage:
+        _WrappedParser.reasoning_parser_cls = MyReasoningParser
+        _WrappedParser.tool_parser_cls = MyToolParser
+        parser = _WrappedParser(tokenizer)
+    """
+
+    reasoning_parser_cls: type[ReasoningParser] | None = None
+    tool_parser_cls: type[ToolParser] | None = None
+
+    def __init__(self, tokenizer: TokenizerLike):
+        super().__init__(tokenizer)
+        # Instantiate the underlying parsers from class attributes
+        if self.__class__.reasoning_parser_cls is not None:
+            self._reasoning_parser = self.__class__.reasoning_parser_cls(tokenizer)
+        if self.__class__.tool_parser_cls is not None:
+            self._tool_parser = self.__class__.tool_parser_cls(tokenizer)