init src 0.9.2

2026-01-09 15:09:53 +08:00
parent 0eb2c0a4b3
commit 41d98d4359
1438 changed files with 417605 additions and 683 deletions
--- a/vllm/entrypoints/openai/tool_parsers/init.py
+++ b/vllm/entrypoints/openai/tool_parsers/init.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .abstract_tool_parser import ToolParser, ToolParserManager
+from .deepseekv3_tool_parser import DeepSeekV3ToolParser
+from .glm4_moe_tool_parser import Glm4MoeModelToolParser
+from .granite_20b_fc_tool_parser import Granite20bFCToolParser
+from .granite_tool_parser import GraniteToolParser
+from .hermes_tool_parser import Hermes2ProToolParser
+from .internlm2_tool_parser import Internlm2ToolParser
+from .jamba_tool_parser import JambaToolParser
+from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
+from .llama_tool_parser import Llama3JsonToolParser
+from .minimax_tool_parser import MinimaxToolParser
+from .mistral_tool_parser import MistralToolParser
+from .phi4mini_tool_parser import Phi4MiniJsonToolParser
+from .pythonic_tool_parser import PythonicToolParser
+from .step3_tool_parser import Step3ToolParser
+from .xlam_tool_parser import xLAMToolParser
+
+__all__ = [
+    "ToolParser", 
+    "ToolParserManager", 
+    "Granite20bFCToolParser",
+    "GraniteToolParser", 
+    "Hermes2ProToolParser", 
+    "MistralToolParser",
+    "Internlm2ToolParser", 
+    "Llama3JsonToolParser", 
+    "JambaToolParser",
+    "Llama4PythonicToolParser", 
+    "PythonicToolParser", 
+    "Phi4MiniJsonToolParser",
+    "DeepSeekV3ToolParser", 
+    "Step3ToolParser",
+    "xLAMToolParser", 
+    "MinimaxToolParser",
+    "Glm4MoeModelToolParser",
+]
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -0,0 +1,164 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+from collections.abc import Sequence
+from functools import cached_property
+from typing import Callable, Optional, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage,
+                                              ExtractedToolCallInformation)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import import_from_path, is_list_of
+
+logger = init_logger(__name__)
+
+
+class ToolParser:
+    """
+    Abstract ToolParser class that should not be used directly. Provided
+    properties and methods should be used in
+    derived classes.
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        self.prev_tool_call_arr: list[dict] = []
+        # the index of the tool call that is currently being parsed
+        self.current_tool_id: int = -1
+        self.current_tool_name_sent: bool = False
+        self.streamed_args_for_tool: list[str] = []
+
+        self.model_tokenizer = tokenizer
+
+    @cached_property
+    def vocab(self) -> dict[str, int]:
+        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
+        # whereas all tokenizers have .get_vocab()
+        return self.model_tokenizer.get_vocab()
+
+    def adjust_request(
+            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        """
+        Static method that used to adjust the request parameters.
+        """
+        return request
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Static method that should be implemented for extracting tool calls from
+        a complete model-generated string.
+        Used for non-streaming responses where we have the entire model response
+        available before sending to the client.
+        Static because it's stateless.
+        """
+        raise NotImplementedError(
+            "AbstractToolParser.extract_tool_calls has not been implemented!")
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """
+        Instance method that should be implemented for extracting tool calls
+        from an incomplete response; for use when handling tool calls and
+        streaming. Has to be an instance method because  it requires state -
+        the current tokens/diffs, but also the information about what has
+        previously been parsed and extracted (see constructor)
+        """
+        raise NotImplementedError(
+            "AbstractToolParser.extract_tool_calls_streaming has not been "
+            "implemented!")
+
+
+class ToolParserManager:
+    tool_parsers: dict[str, type] = {}
+
+    @classmethod
+    def get_tool_parser(cls, name) -> type:
+        """
+        Get tool parser by name which is registered by `register_module`.
+
+        Raise a KeyError exception if the name is not registered.
+        """
+        if name in cls.tool_parsers:
+            return cls.tool_parsers[name]
+
+        raise KeyError(f"tool helper: '{name}' not found in tool_parsers")
+
+    @classmethod
+    def _register_module(cls,
+                         module: type,
+                         module_name: Optional[Union[str, list[str]]] = None,
+                         force: bool = True) -> None:
+        if not issubclass(module, ToolParser):
+            raise TypeError(
+                f'module must be subclass of ToolParser, but got {type(module)}'
+            )
+        if module_name is None:
+            module_name = module.__name__
+        if isinstance(module_name, str):
+            module_name = [module_name]
+        for name in module_name:
+            if not force and name in cls.tool_parsers:
+                existed_module = cls.tool_parsers[name]
+                raise KeyError(f'{name} is already registered '
+                               f'at {existed_module.__module__}')
+            cls.tool_parsers[name] = module
+
+    @classmethod
+    def register_module(
+            cls,
+            name: Optional[Union[str, list[str]]] = None,
+            force: bool = True,
+            module: Union[type, None] = None) -> Union[type, Callable]:
+        """
+        Register module with the given name or name list. it can be used as a
+        decoder(with module as None) or normal function(with module as not 
+        None).
+        """
+        if not isinstance(force, bool):
+            raise TypeError(f'force must be a boolean, but got {type(force)}')
+
+        # raise the error ahead of time
+        if not (name is None or isinstance(name, str)
+                or is_list_of(name, str)):
+            raise TypeError(
+                'name must be None, an instance of str, or a sequence of str, '
+                f'but got {type(name)}')
+
+        # use it as a normal method: x.register_module(module=SomeClass)
+        if module is not None:
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+
+        # use it as a decorator: @x.register_module()
+        def _register(module):
+            cls._register_module(module=module, module_name=name, force=force)
+            return module
+
+        return _register
+
+    @classmethod
+    def import_tool_parser(cls, plugin_path: str) -> None:
+        """
+        Import a user-defined tool parser by the path of the tool parser define
+        file.
+        """
+        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
+
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            logger.exception("Failed to load module '%s' from %s.",
+                             module_name, plugin_path)
+            return
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -0,0 +1,370 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+from typing import Union
+
+import regex as re
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("deepseek_v3")
+class DeepSeekV3ToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = (
+            [])  # map what has been streamed for each tool so far to a list
+
+        self.tool_calls_start_token: str = "<｜tool▁calls▁begin｜>"
+        self.tool_calls_end_token: str = "<｜tool▁calls▁end｜>"
+
+        self.tool_call_start_token: str = "<｜tool▁call▁begin｜>"
+        self.tool_call_end_token: str = "<｜tool▁call▁end｜>"
+
+        self.tool_call_regex = re.compile(
+            r"<｜tool▁call▁begin｜>(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*)\n```<｜tool▁call▁end｜>"
+        )
+
+        self.stream_tool_call_portion_regex = re.compile(
+            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*[^\n`])"
+        )
+
+        self.stream_tool_call_name_regex = re.compile(
+            r"(?P<type>.*)<｜tool▁sep｜>(?P<function_name>.*)\n")
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+        self.tool_calls_start_token_id = self.vocab.get(
+            self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(
+            self.tool_calls_end_token)
+
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+
+        if (self.tool_calls_start_token_id is None
+                or self.tool_calls_end_token_id is None):
+            raise RuntimeError(
+                "DeepSeek-V3 Tool parser could not locate tool call start/end "
+                "tokens in the tokenizer!")
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        else:
+            try:
+                # there are two possible captures - between tags, or between a
+                # tag and end-of-string so the result of
+                # findall is an array of tuples where one is a function call and
+                # the other is None
+                function_call_tuples = self.tool_call_regex.findall(
+                    model_output)
+
+                tool_calls = []
+                for match in function_call_tuples:
+                    tool_type, function_name, function_args = match
+                    tool_calls.append(
+                        ToolCall(
+                            type=tool_type,
+                            function=FunctionCall(name=function_name,
+                                                  arguments=function_args),
+                        ))
+
+                content = model_output[:model_output.
+                                       find(self.tool_calls_start_token)]
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if content else None,
+                )
+
+            except Exception:
+                logger.exception(
+                    "Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(tools_called=False,
+                                                    tool_calls=[],
+                                                    content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        logger.debug("delta_text: %s", delta_text)
+        logger.debug("delta_token_ids: %s", delta_token_ids)
+        # check to see if we should be streaming a tool call - is there a
+        if self.tool_calls_start_token_id not in current_token_ids:
+            logger.debug("No tool call tokens found!")
+            return DeltaMessage(content=delta_text)
+        delta_text = delta_text.replace(self.tool_calls_start_token,
+                                        "").replace(self.tool_calls_end_token,
+                                                    "")
+        try:
+
+            # figure out where we are in the parsing by counting tool call
+            # start & end tags
+            prev_tool_start_count = previous_token_ids.count(
+                self.tool_call_start_token_id)
+            prev_tool_end_count = previous_token_ids.count(
+                self.tool_call_end_token_id)
+            cur_tool_start_count = current_token_ids.count(
+                self.tool_call_start_token_id)
+            cur_tool_end_count = current_token_ids.count(
+                self.tool_call_end_token_id)
+            tool_call_portion = None
+            text_portion = None
+
+            # case: if we're generating text, OR rounding out a tool call
+            if (cur_tool_start_count == cur_tool_end_count
+                    and prev_tool_end_count == cur_tool_end_count
+                    and self.tool_call_end_token not in delta_text):
+                logger.debug("Generating text content! skipping tool parsing.")
+                return DeltaMessage(content=delta_text)
+
+            if self.tool_call_end_token in delta_text:
+                logger.debug("tool_call_end_token in delta_text")
+                full_text = current_text + delta_text
+                tool_call_portion = full_text.split(
+                    self.tool_call_start_token)[-1].split(
+                        self.tool_call_end_token)[0].rstrip()
+                delta_text = delta_text.split(
+                    self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(
+                    self.tool_call_end_token)[-1].lstrip()
+
+            # case -- we're starting a new tool call
+            if (cur_tool_start_count > cur_tool_end_count
+                    and cur_tool_start_count > prev_tool_start_count):
+                if len(delta_token_ids) > 1:
+                    tool_call_portion = current_text.split(
+                        self.tool_call_start_token)[-1]
+                else:
+                    tool_call_portion = None
+                    delta = None
+
+                text_portion = None
+
+                # set cursors and state appropriately
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("Starting on a new tool %s", self.current_tool_id)
+
+            # case -- we're updating an existing tool call
+            elif (cur_tool_start_count > cur_tool_end_count
+                  and cur_tool_start_count == prev_tool_start_count):
+
+                # get the portion of the text that's the tool call
+                tool_call_portion = current_text.split(
+                    self.tool_call_start_token)[-1]
+                text_portion = None
+
+            # case -- the current tool call is being closed.
+            elif (cur_tool_start_count == cur_tool_end_count
+                  and cur_tool_end_count >= prev_tool_end_count):
+                if self.prev_tool_call_arr is None or len(
+                        self.prev_tool_call_arr) == 0:
+                    logger.debug(
+                        "attempting to close tool call, but no tool call")
+                    return None
+                diff = self.prev_tool_call_arr[self.current_tool_id].get(
+                    "arguments")
+                if diff:
+                    diff = (diff.encode("utf-8").decode("unicode_escape")
+                            if diff is str else diff)
+                    if '"}' not in delta_text:
+                        return None
+                    end_loc = delta_text.rindex('"}')
+                    diff = delta_text[:end_loc] + '"}'
+                    logger.debug(
+                        "Finishing tool and found diff that had not "
+                        "been streamed yet: %s",
+                        diff,
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=diff).model_dump(exclude_none=True),
+                        )
+                    ])
+
+            # case -- otherwise we're just generating text
+            else:
+                text = delta_text.replace(self.tool_call_start_token, "")
+                text = text.replace(self.tool_call_end_token, "")
+                delta = DeltaMessage(tool_calls=[], content=text)
+                return delta
+
+            current_tool_call = dict()
+            if tool_call_portion:
+                current_tool_call_matches = (
+                    self.stream_tool_call_portion_regex.match(
+                        tool_call_portion))
+                if current_tool_call_matches:
+                    tool_type, tool_name, tool_args = (
+                        current_tool_call_matches.groups())
+                    current_tool_call["name"] = tool_name
+                    current_tool_call["arguments"] = tool_args
+                else:
+                    current_tool_call_name_matches = (
+                        self.stream_tool_call_name_regex.match(
+                            tool_call_portion))
+                    if current_tool_call_name_matches:
+                        tool_type, tool_name = (
+                            current_tool_call_name_matches.groups())
+                        current_tool_call["name"] = tool_name
+                        current_tool_call["arguments"] = ""
+                    else:
+                        logger.debug("Not enough token")
+                        return None
+
+            # case - we haven't sent the tool name yet. If it's available, send
+            #   it. otherwise, wait until it's available.
+            if not self.current_tool_name_sent:
+                if current_tool_call is None:
+                    return None
+                function_name: Union[str, None] = current_tool_call.get("name")
+                if function_name:
+                    self.current_tool_name_sent = True
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            type="function",
+                            id=random_tool_call_id(),
+                            function=DeltaFunctionCall(
+                                name=function_name).model_dump(
+                                    exclude_none=True),
+                        )
+                    ])
+                else:
+                    return None
+
+            # case -- otherwise, send the tool call delta
+
+            # if the tool call portion is None, send the delta as text
+            if tool_call_portion is None:
+                # if there's text but not tool calls, send that -
+                # otherwise None to skip chunk
+                delta = (DeltaMessage(
+                    content=delta_text) if text_portion is not None else None)
+                return delta
+
+            # now, the nitty-gritty of tool calls
+            # now we have the portion to parse as tool call.
+
+            logger.debug("Trying to parse current tool call with ID %s",
+                         self.current_tool_id)
+
+            # if we're starting a new tool call, push an empty object in as
+            #   a placeholder for the arguments
+            if len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+
+            # main logic for tool parsing here - compare prev. partially-parsed
+            #   JSON to the current partially-parsed JSON
+            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                "arguments")
+            cur_arguments = current_tool_call.get("arguments")
+
+            logger.debug("diffing old arguments: %s", prev_arguments)
+            logger.debug("against new ones: %s", cur_arguments)
+
+            # case -- no arguments have been created yet. skip sending a delta.
+            if not cur_arguments and not prev_arguments:
+                logger.debug("Skipping text %s - no arguments", delta_text)
+                delta = None
+
+            # case -- prev arguments are defined, but non are now.
+            #   probably impossible, but not a fatal error - just keep going
+            elif not cur_arguments and prev_arguments:
+                logger.error("should be impossible to have arguments reset "
+                             "mid-call. skipping streaming anything.")
+                delta = None
+
+            # case -- we now have the first info about arguments available from
+            #   autocompleting the JSON
+            elif cur_arguments and not prev_arguments:
+
+                delta = DeltaMessage(tool_calls=[
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        function=DeltaFunctionCall(
+                            arguments=cur_arguments).model_dump(
+                                exclude_none=True),
+                    )
+                ])
+                self.streamed_args_for_tool[
+                    self.current_tool_id] = cur_arguments
+
+            # last case -- we have an update to existing arguments.
+            elif cur_arguments and prev_arguments:
+                if (isinstance(delta_text, str)
+                        and cur_arguments != prev_arguments
+                        and len(cur_arguments) > len(prev_arguments)
+                        and cur_arguments.startswith(prev_arguments)):
+                    delta_arguments = cur_arguments[len(prev_arguments):]
+                    logger.debug("got diff %s", delta_text)
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=delta_arguments).model_dump(
+                                    exclude_none=True),
+                        )
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] = cur_arguments
+                else:
+                    delta = None
+
+            # handle saving the state for the current tool into
+            # the "prev" list for use in diffing for the next iteration
+            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
+                self.prev_tool_call_arr[
+                    self.current_tool_id] = current_tool_call
+            else:
+                self.prev_tool_call_arr.append(current_tool_call)
+
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            return None  # do not stream a delta. skip this token ID.
--- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -0,0 +1,402 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# code modified from deepseekv3_tool_parser.py
+
+from collections.abc import Sequence
+from typing import Union
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("glm4_moe")
+class Glm4MoeModelToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        self.current_tool_name_sent = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id = -1
+        self.streamed_args_for_tool: list[str] = []
+        self.tool_call_start_token = "<tool_call>"
+        self.tool_call_end_token = "</tool_call>"
+
+        self.tool_calls_start_token = self.tool_call_start_token
+
+        # Updated regex for the XML-based format
+        self.tool_call_regex = re.compile(
+            r"<tool_call>\s*"
+            r"(?P<function_name>[^\n<]+)\s*"  # 函数名（到换行或 <）
+            r"(?P<arguments>(?:\s*<arg_key>[^<]+</arg_key>\s*"
+            r"<arg_value>[^<]*</arg_value>\s*)*)\s*"
+            r"</tool_call>",
+            re.DOTALL,
+        )
+
+        # Regex for parsing individual arguments
+        self.arg_regex = re.compile(
+            r"<arg_key>(?P<key>[^<]+)</arg_key>\s*<arg_value>(?P<value>[^<]*)</arg_value>",
+            re.DOTALL,
+        )
+
+        # Streaming regex
+        self.stream_tool_call_portion_regex = re.compile(
+            r"(?P<function_name>[^\n<]+)\s*"
+            r"(?P<arguments>(?:\s*<arg_key>[^<]+</arg_key>\s*"
+            r"<arg_value>[^<]*</arg_value>\s*)*)",
+            re.DOTALL,
+        )
+
+        # For streaming, we also need a regex to match just the function name
+        self.stream_tool_call_name_regex = re.compile(
+            r"(?P<function_name>[^\n<]+)",
+            re.DOTALL,
+        )
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+
+    def _parse_arguments(self, args_text: str) -> str:
+        """Parse XML-based arguments into JSON format."""
+        if not args_text or not args_text.strip():
+            return "{}"
+
+        args_dict = {}
+        matches = self.arg_regex.findall(args_text)
+
+        for key, value in matches:
+            args_dict[key.strip()] = value.strip()
+
+        import json
+        return json.dumps(args_dict, ensure_ascii=False)
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            # Find all tool calls in the output
+            function_call_matches = self.tool_call_regex.findall(model_output)
+
+            logger.debug("function_call_matches: %s", function_call_matches)
+
+            if not function_call_matches:
+                return ExtractedToolCallInformation(
+                    tools_called=False,
+                    tool_calls=[],
+                    content=model_output,
+                )
+
+            tool_calls = []
+            for i, match in enumerate(function_call_matches):
+                function_name, function_args_xml = match
+                function_name = function_name.strip()
+
+                # Parse XML arguments to JSON
+                function_args_json = self._parse_arguments(function_args_xml)
+
+                tool_calls.append(
+                    ToolCall(
+                        id=f"call_{i}",
+                        type='function',
+                        function=FunctionCall(name=function_name,
+                                              arguments=function_args_json),
+                    ))
+
+            # Extract content before the first tool call
+            content = model_output[:model_output.find(self.
+                                                      tool_calls_start_token)]
+            return ExtractedToolCallInformation(
+                tools_called=bool(tool_calls),
+                tool_calls=tool_calls,
+                content=content.strip() if content.strip() else None,
+            )
+
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        logger.debug("delta_text: %s", delta_text)
+        logger.debug("delta_token_ids: %s", delta_token_ids)
+        # check to see if we should be streaming a tool call - is there a
+        if self.tool_call_start_token_id not in current_token_ids:
+            logger.debug("No tool call tokens found!")
+            return DeltaMessage(content=delta_text)
+        delta_text = delta_text.replace(self.tool_calls_start_token,
+                                        "").replace(self.tool_call_end_token,
+                                                    "")
+        try:
+
+            # figure out where we are in the parsing by counting tool call
+            # start & end tags
+            prev_tool_start_count = previous_token_ids.count(
+                self.tool_call_start_token_id)
+            prev_tool_end_count = previous_token_ids.count(
+                self.tool_call_end_token_id)
+            cur_tool_start_count = current_token_ids.count(
+                self.tool_call_start_token_id)
+            cur_tool_end_count = current_token_ids.count(
+                self.tool_call_end_token_id)
+            tool_call_portion = None
+            text_portion = None
+
+            # case: if we're generating text, OR rounding out a tool call
+            if (cur_tool_start_count == cur_tool_end_count
+                    and prev_tool_end_count == cur_tool_end_count
+                    and self.tool_call_end_token not in delta_text):
+                logger.debug("Generating text content! skipping tool parsing.")
+                return DeltaMessage(content=delta_text)
+
+            if self.tool_call_end_token in delta_text:
+                logger.debug("tool_call_end_token in delta_text")
+                full_text = current_text + delta_text
+                tool_call_portion = full_text.split(
+                    self.tool_call_start_token)[-1].split(
+                        self.tool_call_end_token)[0].rstrip()
+                delta_text = delta_text.split(
+                    self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(
+                    self.tool_call_end_token)[-1].lstrip()
+
+            # case -- we're starting a new tool call
+            if (cur_tool_start_count > cur_tool_end_count
+                    and cur_tool_start_count > prev_tool_start_count):
+                if len(delta_token_ids) > 1:
+                    tool_call_portion = current_text.split(
+                        self.tool_call_start_token)[-1]
+                else:
+                    tool_call_portion = None
+                    delta = None
+
+                text_portion = None
+
+                # set cursors and state appropriately
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("Starting on a new tool %s", self.current_tool_id)
+
+            # case -- we're updating an existing tool call
+            elif (cur_tool_start_count > cur_tool_end_count
+                  and cur_tool_start_count == prev_tool_start_count):
+
+                # get the portion of the text that's the tool call
+                tool_call_portion = current_text.split(
+                    self.tool_call_start_token)[-1]
+                text_portion = None
+
+            # case -- the current tool call is being closed.
+            elif (cur_tool_start_count == cur_tool_end_count
+                  and cur_tool_end_count >= prev_tool_end_count):
+                if self.prev_tool_call_arr is None or len(
+                        self.prev_tool_call_arr) == 0:
+                    logger.debug(
+                        "attempting to close tool call, but no tool call")
+                    return None
+                diff = self.prev_tool_call_arr[self.current_tool_id].get(
+                    "arguments")
+                if diff:
+                    diff = (diff.encode("utf-8").decode("unicode_escape")
+                            if diff is str else diff)
+                    if '"}' not in delta_text:
+                        return None
+                    end_loc = delta_text.rindex('"}')
+                    diff = delta_text[:end_loc] + '"}'
+                    logger.debug(
+                        "Finishing tool and found diff that had not "
+                        "been streamed yet: %s",
+                        diff,
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=diff).model_dump(exclude_none=True),
+                        )
+                    ])
+
+            # case -- otherwise we're just generating text
+            else:
+                text = delta_text.replace(self.tool_call_start_token, "")
+                text = text.replace(self.tool_call_end_token, "")
+                delta = DeltaMessage(tool_calls=[], content=text)
+                return delta
+
+            current_tool_call = dict()
+            if tool_call_portion:
+                current_tool_call_matches = (
+                    self.stream_tool_call_portion_regex.match(
+                        tool_call_portion))
+                if current_tool_call_matches:
+                    tool_id, tool_args = (current_tool_call_matches.groups())
+                    tool_name = tool_id.split('.')[1].split(':')[0]
+                    current_tool_call['id'] = tool_id
+                    current_tool_call["name"] = tool_name
+                    current_tool_call["arguments"] = tool_args
+                else:
+                    current_tool_call_name_matches = (
+                        self.stream_tool_call_name_regex.match(
+                            tool_call_portion))
+                    if current_tool_call_name_matches:
+                        tool_id_str, = current_tool_call_name_matches.groups()
+                        tool_name = tool_id_str.split('.')[1].split(':')[0]
+                        current_tool_call['id'] = tool_id_str
+                        current_tool_call["name"] = tool_name
+                        current_tool_call["arguments"] = ""
+                    else:
+                        logger.debug("Not enough token")
+                        return None
+
+            # case - we haven't sent the tool name yet. If it's available, send
+            #   it. otherwise, wait until it's available.
+            if not self.current_tool_name_sent:
+                if current_tool_call is None:
+                    return None
+                function_name: Union[str, None] = current_tool_call.get("name")
+                tool_id = current_tool_call.get("id")
+                if function_name:
+                    self.current_tool_name_sent = True
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            type="function",
+                            id=tool_id,
+                            function=DeltaFunctionCall(
+                                name=function_name).model_dump(
+                                    exclude_none=True),
+                        )
+                    ])
+                else:
+                    return None
+
+            # case -- otherwise, send the tool call delta
+
+            # if the tool call portion is None, send the delta as text
+            if tool_call_portion is None:
+                # if there's text but not tool calls, send that -
+                # otherwise None to skip chunk
+                delta = (DeltaMessage(
+                    content=delta_text) if text_portion is not None else None)
+                return delta
+
+            # now, the nitty-gritty of tool calls
+            # now we have the portion to parse as tool call.
+
+            logger.debug("Trying to parse current tool call with ID %s",
+                         self.current_tool_id)
+
+            # if we're starting a new tool call, push an empty object in as
+            #   a placeholder for the arguments
+            if len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+
+            # main logic for tool parsing here - compare prev. partially-parsed
+            #   JSON to the current partially-parsed JSON
+            prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
+                "arguments")
+            cur_arguments = current_tool_call.get("arguments")
+
+            logger.debug("diffing old arguments: %s", prev_arguments)
+            logger.debug("against new ones: %s", cur_arguments)
+
+            # case -- no arguments have been created yet. skip sending a delta.
+            if not cur_arguments and not prev_arguments:
+                logger.debug("Skipping text %s - no arguments", delta_text)
+                delta = None
+
+            # case -- prev arguments are defined, but non are now.
+            #   probably impossible, but not a fatal error - just keep going
+            elif not cur_arguments and prev_arguments:
+                logger.error("should be impossible to have arguments reset "
+                             "mid-call. skipping streaming anything.")
+                delta = None
+
+            # case -- we now have the first info about arguments available from
+            #   autocompleting the JSON
+            elif cur_arguments and not prev_arguments:
+
+                delta = DeltaMessage(tool_calls=[
+                    DeltaToolCall(
+                        index=self.current_tool_id,
+                        function=DeltaFunctionCall(
+                            arguments=cur_arguments).model_dump(
+                                exclude_none=True),
+                    )
+                ])
+                self.streamed_args_for_tool[
+                    self.current_tool_id] = cur_arguments
+
+            # last case -- we have an update to existing arguments.
+            elif cur_arguments and prev_arguments:
+                if (isinstance(delta_text, str)
+                        and cur_arguments != prev_arguments
+                        and len(cur_arguments) > len(prev_arguments)
+                        and cur_arguments.startswith(prev_arguments)):
+                    delta_arguments = cur_arguments[len(prev_arguments):]
+                    logger.debug("got diff %s", delta_text)
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=self.current_tool_id,
+                            function=DeltaFunctionCall(
+                                arguments=delta_arguments).model_dump(
+                                    exclude_none=True),
+                        )
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] = cur_arguments
+                else:
+                    delta = None
+
+            # handle saving the state for the current tool into
+            # the "prev" list for use in diffing for the next iteration
+            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
+                self.prev_tool_call_arr[
+                    self.current_tool_id] = current_tool_call
+            else:
+                self.prev_tool_call_arr.append(current_tool_call)
+
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            return None  # do not stream a delta. skip this token ID.
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -0,0 +1,259 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from json import JSONDecoder
+from typing import Union
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.utils import (consume_space,
+                                                        find_common_prefix,
+                                                        is_complete_json,
+                                                        partial_json_loads)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("granite-20b-fc")
+class Granite20bFCToolParser(ToolParser):
+    """
+    Tool call parser for the granite-20b-functioncalling model intended
+    for use with the examples/tool_chat_template_granite20b_fc.jinja
+    template.
+
+    Used when --enable-auto-tool-choice --tool-call-parser granite-20-fc
+    are all set
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        self.bot_token = "<function_call>"
+        self.tool_start_token = self.bot_token
+        self.tool_call_regex = re.compile(r"<function_call>\s*")
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        if self.tool_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        dec = JSONDecoder()
+        try:
+            matches = list(self.tool_call_regex.finditer(model_output))
+            logger.debug("Found %d tool call matches", len(matches))
+
+            raw_function_calls = []
+
+            for i, match in enumerate(matches):
+                # position after the <function_call> tag
+                start_of_json = match.end()
+                # end_index == the start of the next function call
+                # (if exists)
+                next_function_call_start = (matches[i + 1].start() if i +
+                                            1 < len(matches) else None)
+
+                raw_function_calls.append(
+                    dec.raw_decode(
+                        model_output[start_of_json:next_function_call_start])
+                    [0])
+
+            logger.debug("Extracted %d tool calls", len(raw_function_calls))
+            tool_calls = [
+                ToolCall(
+                    type="function",
+                    function=FunctionCall(
+                        name=function_call["name"],
+                        # function call args are JSON but as a string
+                        arguments=json.dumps(function_call["arguments"],
+                                             ensure_ascii=False),
+                    ),
+                ) for function_call in raw_function_calls
+            ]
+
+            content = model_output[:model_output.find(self.bot_token)]
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=content if content else None,
+            )
+
+        except Exception as e:
+            logger.error("Error in extracting tool call from response %s", e)
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        if len(current_text) < len(
+                self.bot_token) and self.bot_token.startswith(current_text):
+            return None
+
+        if not current_text.startswith(self.bot_token):
+            return DeltaMessage(content=delta_text)
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+            tool_call_arr = []
+            is_complete = []
+            try:
+                start_idx = len(self.bot_token)
+                start_idx = consume_space(start_idx, current_text)
+
+                while start_idx < len(current_text):
+                    (obj,
+                     end_idx) = partial_json_loads(current_text[start_idx:],
+                                                   flags)
+                    is_complete.append(
+                        is_complete_json(current_text[start_idx:start_idx +
+                                                      end_idx]))
+                    start_idx += end_idx
+                    start_idx = consume_space(start_idx, current_text)
+                    start_idx += len(self.bot_token)
+                    start_idx = consume_space(start_idx, current_text)
+                    tool_call_arr.append(obj)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # select as the current tool call the one we're on the state at
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > 0 else {}
+
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if len(tool_call_arr) == 0:
+                return None
+
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    cur_arguments = current_tool_call.get("arguments")
+                    if cur_arguments:
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
+                        sent = len(
+                            self.streamed_args_for_tool[self.current_tool_id])
+                        argument_diff = cur_args_json[sent:]
+
+                        logger.debug("got arguments diff: %s", argument_diff)
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+                # re-set stuff pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            elif not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+                cur_arguments = current_tool_call.get("arguments")
+                delta = None
+
+                if cur_arguments:
+                    sent = len(
+                        self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[
+                        self.current_tool_id].get("arguments")
+
+                    argument_diff = None
+                    if is_complete[self.current_tool_id]:
+                        argument_diff = cur_args_json[sent:]
+                    elif prev_arguments:
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
+                        if cur_args_json != prev_args_json:
+
+                            prefix = find_common_prefix(
+                                prev_args_json, cur_args_json)
+                            argument_diff = prefix[sent:]
+
+                    if argument_diff is not None:
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+
+        except Exception as e:
+            logger.error("Error trying to handle streaming tool call: %s", e)
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Union
+
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.utils import (consume_space,
+                                                        find_common_prefix,
+                                                        is_complete_json,
+                                                        partial_json_loads)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("granite")
+class GraniteToolParser(ToolParser):
+    """
+    Tool call parser for the granite 3.0 models. Intended
+    for use with the examples/tool_chat_template_granite.jinja
+    template.
+
+    Used when --enable-auto-tool-choice --tool-call-parser granite
+    are all set
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        # for granite 3.0, the token `<|tool_call|>`
+        self.bot_token = "<|tool_call|>"
+        # for granite 3.1, the string `<tool_call>`
+        self.bot_string = "<tool_call>"
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        stripped = model_output.strip()\
+                    .removeprefix(self.bot_token)\
+                    .removeprefix(self.bot_string)\
+                    .lstrip()
+        if not stripped or stripped[0] != '[':
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+        try:
+            raw_function_calls = json.loads(stripped)
+            if not isinstance(raw_function_calls, list):
+                raise Exception(
+                    f"Expected dict or list, got {type(raw_function_calls)}")
+
+            logger.debug("Extracted %d tool calls", len(raw_function_calls))
+            tool_calls = [
+                ToolCall(
+                    type="function",
+                    function=FunctionCall(
+                        name=function_call["name"],
+                        # function call args are JSON but as a string
+                        arguments=json.dumps(function_call["arguments"],
+                                             ensure_ascii=False),
+                    ),
+                ) for function_call in raw_function_calls
+            ]
+
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=None,
+            )
+
+        except Exception as e:
+            logger.error("Error in extracting tool call from response %s", e)
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        start_idx = consume_space(0, current_text)
+        if current_text[start_idx:].startswith(self.bot_token):
+            start_idx = consume_space(start_idx + len(self.bot_token),
+                                      current_text)
+        if current_text[start_idx:].startswith(self.bot_string):
+            start_idx = consume_space(start_idx + len(self.bot_string),
+                                      current_text)
+        if not current_text or start_idx >= len(current_text)\
+            or current_text[start_idx] != '[':
+            return DeltaMessage(content=delta_text)
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+            tool_call_arr = None
+            is_complete = None
+            try:
+                tool_calls, end_idx = partial_json_loads(
+                    current_text[start_idx:], flags)
+                if type(tool_calls) is list:
+                    tool_call_arr = tool_calls
+                else:
+                    return DeltaMessage(content=delta_text)
+
+                is_complete = [True] * len(tool_calls)
+                if not is_complete_json(
+                        current_text[start_idx:start_idx + end_idx]):
+                    is_complete[-1] = False
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if not tool_call_arr:
+                return None
+
+            # select as the current tool call the one we're on the state at
+            current_tool_call: dict = tool_call_arr[self.current_tool_id]
+
+            delta = None
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            if len(tool_call_arr) > self.current_tool_id + 1:
+
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    cur_arguments = current_tool_call.get("arguments")
+                    if cur_arguments:
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
+                        sent = len(
+                            self.streamed_args_for_tool[self.current_tool_id])
+                        argument_diff = cur_args_json[sent:]
+
+                        logger.debug("got arguments diff: %s", argument_diff)
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+
+                # re-set stuff pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            elif not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+                cur_arguments = current_tool_call.get("arguments")
+
+                if cur_arguments:
+                    sent = len(
+                        self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[
+                        self.current_tool_id].get("arguments")
+
+                    argument_diff = None
+                    if is_complete[self.current_tool_id]:
+                        argument_diff = cur_args_json[sent:]
+                    elif prev_arguments:
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
+                        if cur_args_json != prev_args_json:
+                            prefix = find_common_prefix(
+                                prev_args_json, cur_args_json)
+                            argument_diff = prefix[sent:]
+
+                    if argument_diff is not None:
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+
+        except Exception as e:
+            logger.error("Error trying to handle streaming tool call: %s", e)
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -0,0 +1,371 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Union
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("hermes")
+class Hermes2ProToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        if isinstance(self.model_tokenizer, MistralTokenizer):
+            logger.error(
+                "Detected Mistral tokenizer when using a Hermes model")
+            self.model_tokenizer = self.model_tokenizer.tokenizer
+
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = [
+        ]  # map what has been streamed for each tool so far to a list
+
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+
+        self.tool_call_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL)
+        self.scratch_pad_regex = re.compile(
+            r"<scratch_pad>(.*?)</scratch_pad>", re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+        if (self.tool_call_start_token_id is None
+                or self.tool_call_end_token_id is None):
+            raise RuntimeError(
+                "Hermes 2 Pro Tool parser could not locate tool call start/end "
+                "tokens in the tokenizer!")
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        else:
+
+            try:
+                # there are two possible captures - between tags, or between a
+                # tag and end-of-string so the result of
+                # findall is an array of tuples where one is a function call and
+                # the other is None
+                function_call_tuples = (
+                    self.tool_call_regex.findall(model_output))
+
+                # load the JSON, and then use it to build the Function and
+                # Tool Call
+                raw_function_calls = [
+                    json.loads(match[0] if match[0] else match[1])
+                    for match in function_call_tuples
+                ]
+                tool_calls = [
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=function_call["name"],
+                            # function call args are JSON but as a string
+                            arguments=json.dumps(function_call["arguments"],
+                                                 ensure_ascii=False)))
+                    for function_call in raw_function_calls
+                ]
+
+                content = model_output[:model_output.
+                                       find(self.tool_call_start_token)]
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if content else None)
+
+            except Exception:
+                logger.exception(
+                    "Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(tools_called=False,
+                                                    tool_calls=[],
+                                                    content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        logger.debug("delta_text: %s", delta_text)
+        logger.debug("delta_token_ids: %s", delta_token_ids)
+        # check to see if we should be streaming a tool call - is there a
+        if self.tool_call_start_token_id not in current_token_ids:
+            logger.debug("No tool call tokens found!")
+            return DeltaMessage(content=delta_text)
+
+        try:
+
+            # figure out where we are in the parsing by counting tool call
+            # start & end tags
+            prev_tool_start_count = previous_token_ids.count(
+                self.tool_call_start_token_id)
+            prev_tool_end_count = previous_token_ids.count(
+                self.tool_call_end_token_id)
+            cur_tool_start_count = current_token_ids.count(
+                self.tool_call_start_token_id)
+            cur_tool_end_count = current_token_ids.count(
+                self.tool_call_end_token_id)
+            tool_call_portion = None
+            text_portion = None
+
+            # case: if we're generating text, OR rounding out a tool call
+            if (cur_tool_start_count == cur_tool_end_count
+                    and prev_tool_end_count == cur_tool_end_count
+                    and self.tool_call_end_token not in delta_text):
+                logger.debug("Generating text content! skipping tool parsing.")
+                return DeltaMessage(content=delta_text)
+
+            if self.tool_call_end_token in delta_text:
+                logger.debug("tool_call_end_token in delta_text")
+                full_text = current_text + delta_text
+                tool_call_portion = full_text.split(
+                    self.tool_call_start_token)[-1].split(
+                        self.tool_call_end_token)[0].rstrip()
+                delta_text = delta_text.split(
+                    self.tool_call_end_token)[0].rstrip()
+                text_portion = delta_text.split(
+                    self.tool_call_end_token)[-1].lstrip()
+
+            # case: if tool open & close tag counts don't match, we're doing
+            # imaginary "else" block here
+            # something with tools with this diff.
+            # flags for partial JSON parting. exported constants from
+            # "Allow" are handled via BIT MASK
+            flags = Allow.ALL if self.current_tool_name_sent \
+                else Allow.ALL & ~Allow.STR
+
+            # case -- we're starting a new tool call
+            if (cur_tool_start_count > cur_tool_end_count
+                    and cur_tool_start_count > prev_tool_start_count):
+                if len(delta_token_ids) > 1:
+                    tool_call_portion = current_text.split(
+                        self.tool_call_start_token)[-1]
+                else:
+                    tool_call_portion = None
+                    delta = None
+
+                text_portion = None
+
+                # set cursors and state appropriately
+                self.current_tool_id += 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("Starting on a new tool %s", self.current_tool_id)
+
+            # case -- we're updating an existing tool call
+            elif (cur_tool_start_count > cur_tool_end_count
+                  and cur_tool_start_count == prev_tool_start_count):
+
+                # get the portion of the text that's the tool call
+                tool_call_portion = current_text.split(
+                    self.tool_call_start_token)[-1]
+                text_portion = None
+
+            # case -- the current tool call is being closed.
+            elif (cur_tool_start_count == cur_tool_end_count
+                  and cur_tool_end_count >= prev_tool_end_count):
+                if (self.prev_tool_call_arr is None
+                        or len(self.prev_tool_call_arr) == 0):
+                    logger.debug(
+                        "attempting to close tool call, but no tool call")
+                    return None
+                diff = self.prev_tool_call_arr[self.current_tool_id].get(
+                    "arguments")
+                if diff:
+                    diff = diff.encode('utf-8').decode(
+                        'unicode_escape') if diff is str else diff
+                    if ('"}' not in delta_text):
+                        return None
+                    end_loc = delta_text.rindex('"}')
+                    diff = delta_text[:end_loc] + '"}'
+                    logger.debug(
+                        "Finishing tool and found diff that had not "
+                        "been streamed yet: %s", diff)
+                    self.streamed_args_for_tool[self.current_tool_id] \
+                        += diff
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=diff).model_dump(
+                                              exclude_none=True))
+                    ])
+
+            # case -- otherwise we're just generating text
+            else:
+                text = delta_text.replace(self.tool_call_start_token, "")
+                text = text.replace(self.tool_call_end_token, "")
+                delta = DeltaMessage(tool_calls=[], content=text)
+                return delta
+
+            try:
+
+                current_tool_call = partial_json_parser.loads(
+                    tool_call_portion or "{}",
+                    flags) if tool_call_portion else None
+                logger.debug("Parsed tool call %s", current_tool_call)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+            except json.decoder.JSONDecodeError:
+                logger.debug("unable to parse JSON")
+                return None
+
+            # case - we haven't sent the tool name yet. If it's available, send
+            #   it. otherwise, wait until it's available.
+            if not self.current_tool_name_sent:
+                if (current_tool_call is None):
+                    return None
+                function_name: Union[str, None] = current_tool_call.get("name")
+                if function_name:
+                    self.current_tool_name_sent = True
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                else:
+                    return None
+            # case -- otherwise, send the tool call delta
+
+            # if the tool call portion is None, send the delta as text
+            if tool_call_portion is None:
+                # if there's text but not tool calls, send that -
+                # otherwise None to skip chunk
+                delta = DeltaMessage(content=delta_text) \
+                    if text_portion is not None else None
+                return delta
+
+            # now, the nitty-gritty of tool calls
+            # now we have the portion to parse as tool call.
+
+            logger.debug("Trying to parse current tool call with ID %s",
+                         self.current_tool_id)
+
+            # if we're starting a new tool call, push an empty object in as
+            #   a placeholder for the arguments
+            if len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+
+            # main logic for tool parsing here - compare prev. partially-parsed
+            #   JSON to the current partially-parsed JSON
+            prev_arguments = (
+                self.prev_tool_call_arr[self.current_tool_id].get("arguments"))
+            cur_arguments = current_tool_call.get("arguments")
+
+            logger.debug("diffing old arguments: %s", prev_arguments)
+            logger.debug("against new ones: %s", cur_arguments)
+
+            # case -- no arguments have been created yet. skip sending a delta.
+            if not cur_arguments and not prev_arguments:
+                logger.debug("Skipping text %s - no arguments", delta_text)
+                delta = None
+
+            # case -- prev arguments are defined, but non are now.
+            #   probably impossible, but not a fatal error - just keep going
+            elif not cur_arguments and prev_arguments:
+                logger.error("should be impossible to have arguments reset "
+                             "mid-call. skipping streaming anything.")
+                delta = None
+
+            # case -- we now have the first info about arguments available from
+            #   autocompleting the JSON
+            elif cur_arguments and not prev_arguments:
+
+                cur_arguments_json = json.dumps(cur_arguments,
+                                                ensure_ascii=False)
+                logger.debug("finding %s in %s", delta_text,
+                             cur_arguments_json)
+
+                # get the location where previous args differ from current
+                if (delta_text not in cur_arguments_json[:-2]):
+                    return None
+                args_delta_start_loc = cur_arguments_json[:-2]. \
+                                           rindex(delta_text) + \
+                                           len(delta_text)
+
+                # use that to find the actual delta
+                arguments_delta = cur_arguments_json[:args_delta_start_loc]
+                logger.debug("First tokens in arguments received: %s",
+                             arguments_delta)
+
+                delta = DeltaMessage(tool_calls=[
+                    DeltaToolCall(index=self.current_tool_id,
+                                  function=DeltaFunctionCall(
+                                      arguments=arguments_delta).model_dump(
+                                          exclude_none=True))
+                ])
+                self.streamed_args_for_tool[self.current_tool_id] \
+                    += arguments_delta
+
+            # last case -- we have an update to existing arguments.
+            elif cur_arguments and prev_arguments:
+                if isinstance(delta_text, str) and len(delta_text.rstrip(
+                )) >= 1 and delta_text.rstrip()[-1] == '}':
+                    delta_text = delta_text.rstrip()[:-1]
+
+                logger.debug("got diff %s", delta_text)
+
+                delta = DeltaMessage(tool_calls=[
+                    DeltaToolCall(index=self.current_tool_id,
+                                  function=DeltaFunctionCall(
+                                      arguments=delta_text).model_dump(
+                                          exclude_none=True))
+                ])
+                self.streamed_args_for_tool[self.current_tool_id] \
+                    += delta_text
+
+            # handle saving the state for the current tool into
+            # the "prev" list for use in diffing for the next iteration
+            if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
+                self.prev_tool_call_arr[self.current_tool_id] = \
+                    current_tool_call
+            else:
+                self.prev_tool_call_arr.append(current_tool_call)
+
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            return None  # do not stream a delta. skip this token ID.
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -0,0 +1,216 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Union
+
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    extract_intermediate_diff)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module(["internlm"])
+class Internlm2ToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        self.position = 0
+
+    def adjust_request(
+            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        if request.tools and request.tool_choice != 'none':
+            # do not skip special tokens because internlm use the special
+            # tokens to indicated the start and end of the tool calls
+            # information.
+            request.skip_special_tokens = False
+        return request
+
+    def get_arguments(self, obj):
+        if "parameters" in obj:
+            return obj.get("parameters")
+        elif "arguments" in obj:
+            return obj.get("arguments")
+        return None
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        if '<|action_start|>' not in current_text:
+            self.position = len(current_text)
+            return DeltaMessage(content=delta_text)
+        # if the tool call is sended, return a empty delta message
+        # to make sure the finish_reason will be send correctly.
+        if self.current_tool_id > 0:
+            return DeltaMessage(content='')
+
+        last_pos = self.position
+        if '<|action_start|><|plugin|>' not in current_text[last_pos:]:
+            return None
+
+        new_delta = current_text[last_pos:]
+        text, action = new_delta.split('<|action_start|><|plugin|>')
+
+        if len(text) > 0:
+            self.position = self.position + len(text)
+            return DeltaMessage(content=text)
+
+        action = action.strip()
+        action = action.split('<|action_end|>'.strip())[0]
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+
+        try:
+            parsable_arr = action
+
+            # tool calls are generated in an object in inernlm2
+            # it's not support parallel tool calls
+            try:
+                tool_call_arr: dict = partial_json_parser.loads(
+                    parsable_arr, flags)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            if not self.current_tool_name_sent:
+                function_name = tool_call_arr.get("name")
+                if function_name:
+                    self.current_tool_id = self.current_tool_id + 1
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                    self.streamed_args_for_tool.append("")
+                else:
+                    delta = None
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+                prev_arguments = self.get_arguments(
+                    self.prev_tool_call_arr[self.current_tool_id])
+                cur_arguments = self.get_arguments(tool_call_arr)
+
+                # not arguments generated
+                if not cur_arguments and not prev_arguments:
+                    delta = None
+                # will never happen
+                elif not cur_arguments and prev_arguments:
+                    logger.error(
+                        "INVARIANT - impossible to have arguments reset "
+                        "mid-arguments")
+                    delta = None
+                # first time to get parameters
+                elif cur_arguments and not prev_arguments:
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)
+
+                    arguments_delta = cur_arguments_json[:cur_arguments_json.
+                                                         index(delta_text) +
+                                                         len(delta_text)]
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=arguments_delta).
+                                      model_dump(exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] += arguments_delta
+                # both prev and cur parameters, send the increase parameters
+                elif cur_arguments and prev_arguments:
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)
+
+                    argument_diff = extract_intermediate_diff(
+                        cur_args_json, prev_args_json)
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=argument_diff).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] += argument_diff
+
+            # check to see if the name is defined and has been sent. if so,
+            # stream the name - otherwise keep waiting
+            # finish by setting old and returning None as base case
+            tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
+            self.prev_tool_call_arr = [tool_call_arr]
+            return delta
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        text = model_output
+        tools = request.tools
+        if '<|action_start|><|plugin|>' in text:
+            text, action = text.split('<|action_start|><|plugin|>')
+            action = action.split('<|action_end|>'.strip())[0]
+            action = action[action.find('{'):]
+            action_dict = json.loads(action)
+            name, parameters = action_dict['name'], json.dumps(
+                action_dict.get('parameters', action_dict.get('arguments',
+                                                              {})),
+                ensure_ascii=False)
+
+            if not tools or name not in [t.function.name for t in tools]:
+                ExtractedToolCallInformation(tools_called=False,
+                                             tool_calls=[],
+                                             content=text)
+
+            tool_calls = [
+                ToolCall(
+                    function=FunctionCall(name=name, arguments=parameters))
+            ]
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=text if len(text) > 0 else None)
+
+        return ExtractedToolCallInformation(tools_called=False,
+                                            tool_calls=[],
+                                            content=text)
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Union
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    extract_intermediate_diff)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizers import MistralTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("jamba")
+class JambaToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        if isinstance(self.model_tokenizer, MistralTokenizer):
+            raise ValueError(
+                "Detected a MistralTokenizer tokenizer when using a Jamba model"
+            )
+
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = [
+        ]  # map what has been streamed for each tool so far to a list
+
+        self.tool_calls_start_token: str = "<tool_calls>"
+        self.tool_calls_end_token: str = "</tool_calls>"
+
+        self.tool_calls_regex = re.compile(
+            rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}",
+            re.DOTALL)
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+        self.tool_calls_start_token_id = self.vocab.get(
+            self.tool_calls_start_token)
+        self.tool_calls_end_token_id = self.vocab.get(
+            self.tool_calls_end_token)
+        if (self.tool_calls_start_token_id is None
+                or self.tool_calls_end_token_id is None):
+            raise RuntimeError(
+                "Jamba Tool parser could not locate tool calls start/end "
+                "tokens in the tokenizer!")
+
+    def adjust_request(
+            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        if request.tools and request.tool_choice != 'none':
+            # do not skip special tokens because jamba use the special
+            # tokens to indicate the start and end of the tool calls
+            # information.
+            request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        else:
+
+            try:
+                # use a regex to find the tool call between the tags
+                function_calls = self.tool_calls_regex.findall(model_output)[0]
+
+                # load the JSON, and then use it to build the Function and
+                # Tool Call
+                raw_function_calls = json.loads(function_calls)
+                tool_calls = [
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=function_call["name"],
+                            # function call args are JSON but as a string
+                            arguments=json.dumps(function_call["arguments"],
+                                                 ensure_ascii=False),
+                        )) for function_call in raw_function_calls
+                ]
+
+                content = model_output[:model_output.
+                                       find(self.tool_calls_start_token)]
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if
+                    (len(content) > 0 and content != " ") else None)
+
+            except Exception:
+                logger.exception(
+                    "Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(tools_called=False,
+                                                    tool_calls=[],
+                                                    content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        # if the tool call token is not in the tokens generated so far, append
+        # output to contents since it's not a tool
+        if self.tool_calls_start_token not in current_text:
+            return DeltaMessage(content=delta_text)
+
+        # if the tool call token ID IS in the tokens generated so far, that
+        # means we're parsing as tool calls now
+
+        # handle if we detected the start of tool calls token which means
+        # the start of tool calling
+        if (self.tool_calls_start_token_id in delta_token_ids
+                and len(delta_token_ids) == 1):
+            # if it's the only token, return None, so we don't send a chat
+            # completion and don't send a control token
+            return None
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+
+            # Extract the tool calls between the special tool call tokens
+            parsable_arr = current_text.split(
+                self.tool_calls_start_token)[-1].split(
+                    self.tool_calls_end_token)[0]
+
+            # tool calls are generated in an array, so do partial JSON
+            # parsing on the entire array
+            try:
+                tool_call_arr: list[dict] = partial_json_parser.loads(
+                    parsable_arr, flags)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # select as the current tool call the one we're on the state at
+
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > 0 else {}
+
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if len(tool_call_arr) == 0:
+                return None
+
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    diff: Union[str, None] = current_tool_call.get("arguments")
+
+                    if diff:
+                        diff = json.dumps(diff, ensure_ascii=False).replace(
+                            self.streamed_args_for_tool[self.current_tool_id],
+                            "")
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=diff).model_dump(
+                                                  exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+                # re-set stuff pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # case: update an existing tool - this is handled below
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+
+                prev_arguments = self.prev_tool_call_arr[
+                    self.current_tool_id].get("arguments")
+                cur_arguments = current_tool_call.get("arguments")
+
+                new_text = delta_text.replace("\'", "\"")
+
+                if not cur_arguments and not prev_arguments:
+
+                    delta = None
+                elif not cur_arguments and prev_arguments:
+                    logger.error(
+                        "INVARIANT - impossible to have arguments reset "
+                        "mid-arguments")
+                    delta = None
+                elif cur_arguments and not prev_arguments:
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)
+                    logger.debug("finding %s in %s", new_text,
+                                 cur_arguments_json)
+
+                    arguments_delta = cur_arguments_json[:cur_arguments_json.
+                                                         index(new_text) +
+                                                         len(new_text)]
+                    logger.debug("First tokens in arguments received: %s",
+                                 arguments_delta)
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=arguments_delta).
+                                      model_dump(exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] += arguments_delta
+
+                elif cur_arguments and prev_arguments:
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)
+                    logger.debug("Searching for diff between \n%s\n%s",
+                                 cur_args_json, prev_args_json)
+
+                    argument_diff = extract_intermediate_diff(
+                        cur_args_json, prev_args_json)
+                    logger.debug("got arguments diff: %s", argument_diff)
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=argument_diff).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] += argument_diff
+                else:
+                    # try parsing it with regular JSON - if it works we're
+                    # at the end, and we need to send the difference between
+                    # tokens streamed so far and the valid JSON
+                    delta = None
+
+            # check to see if the name is defined and has been sent. if so,
+            # stream the name - otherwise keep waiting
+            # finish by setting old and returning None as base case
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -0,0 +1,316 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any, Union
+
+import regex as re
+from transformers import PreTrainedTokenizerBase
+
+import vllm.envs as envs
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class _UnexpectedAstError(Exception):
+    pass
+
+
+@ToolParserManager.register_module("llama4_pythonic")
+class Llama4PythonicToolParser(ToolParser):
+    """
+    Toolcall parser for Llama4 that produce tool calls in a pythonic style
+    Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
+    """
+    # TODO(mdepinet): Possible future improvements:
+    #   1. Support text + tools separated by either <|python_tag|> or \n\n
+    #   2. Support tools outside of a list (or separated by a semicolon).
+    #      This depends on item 1 for consistent streaming.
+    # Neither of these are necessary for e.g. ToolACE, but both would help make
+    # Llama3.2 models more reliable.
+
+    TOOL_CALL_REGEX = re.compile(
+        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+        re.DOTALL)
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+    # Rename for readability. This is NOT a tool id.
+    @property
+    def current_tool_index(self) -> int:
+        return self.current_tool_id
+
+    @current_tool_index.setter
+    def current_tool_index(self, value: int) -> None:
+        self.current_tool_id = value
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+
+        # remove <|python_start|> and <|python_end|>
+        # as Llama 4 model sometime will output those tokens
+        if model_output.startswith("<|python_start|>"):
+            model_output = model_output[len("<|python_start|>"):]
+            model_output = model_output.replace("<|python_end|>", "")
+
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+
+        if not is_tool_call_pattern:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            module = ast.parse(model_output)
+            parsed = getattr(module.body[0], "value", None)
+            if isinstance(parsed, ast.List) and all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=[
+                        _handle_single_tool(e)  # type: ignore
+                        for e in parsed.elts
+                    ],
+                    content=None)
+            else:
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # Treat as regular text
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        if not current_text.startswith("[") and not current_text.startswith(
+                "<|python_start|>"):
+            return DeltaMessage(content=delta_text)
+
+        try:
+            # remove <|python_start|> and <|python_end|>
+            if current_text.startswith("<|python_start|>"):
+                current_text = current_text[len("<|python_start|>"):]
+            if current_text.endswith("<|python_end|>"):
+                current_text = current_text[:current_text.
+                                            rfind("<|python_end|>")]
+            valid_and_added_text = _make_valid_python(current_text)
+            if valid_and_added_text is None:
+                return None
+            valid_text, added_text = valid_and_added_text
+
+            module = ast.parse(valid_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not isinstance(parsed, ast.List) or not all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+            tool_calls = [
+                _handle_single_tool(e)  # type: ignore
+                for e in parsed.elts
+            ]
+
+            tool_deltas = []
+            for index, new_call in enumerate(tool_calls):
+                if index < self.current_tool_index:
+                    continue
+
+                self.current_tool_index = index
+                if len(self.streamed_args_for_tool) == index:
+                    self.streamed_args_for_tool.append("")
+
+                new_call_complete = index < len(
+                    tool_calls) - 1 or ")]" not in added_text
+                if new_call_complete:
+                    self.current_tool_index += 1
+
+                withheld_suffix = (added_text[:-2]
+                                   if not new_call_complete else "")
+                if not new_call_complete and added_text[-2] == ")":
+                    # Function call is incomplete. Withhold the closing bracket.
+                    withheld_suffix = withheld_suffix + "}"
+                # Strings get single quotes in the model-produced string.
+                # JSON requires double quotes.
+                withheld_suffix = withheld_suffix.replace("'", '"')
+                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
+                                            new_call, index, withheld_suffix)
+
+                if delta is not None:
+                    tool_deltas.append(delta)
+                    if (delta.function is not None
+                            and delta.function.arguments is not None):
+                        self.streamed_args_for_tool[
+                            index] += delta.function.arguments
+
+        # HACK: serving_chat.py inspects the internal state of tool parsers
+        # when determining it's final streaming delta, automatically
+        # adding autocompleted JSON.
+        # These two lines avoid that nonsense while ensuring finish_reason
+        # is set to tool_calls when at least one tool is called.
+            if tool_deltas and not self.prev_tool_call_arr:
+                self.prev_tool_call_arr = [{"arguments": {}}]
+
+            if tool_deltas:
+                return DeltaMessage(tool_calls=tool_deltas)
+            elif not added_text and self.current_tool_id > 0:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage(content='')
+            else:
+                return None
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
+
+
+def _get_parameter_value(val: ast.expr) -> Any:
+    if isinstance(val, ast.Constant):
+        return val.value
+    elif isinstance(val, ast.Dict):
+        if not all(isinstance(k, ast.Constant) for k in val.keys):
+            raise _UnexpectedAstError(
+                "Dict tool call arguments must have literal keys")
+        return {
+            k.value: _get_parameter_value(v)  # type: ignore
+            for k, v in zip(val.keys, val.values)
+        }
+    elif isinstance(val, ast.List):
+        return [_get_parameter_value(v) for v in val.elts]
+    else:
+        raise _UnexpectedAstError("Tool call arguments must be literals")
+
+
+def _handle_single_tool(call: ast.Call) -> ToolCall:
+    if not isinstance(call.func, ast.Name):
+        raise _UnexpectedAstError("Invalid tool call name")
+    function_name = call.func.id
+    arguments = {}
+    for keyword in call.keywords:
+        arguments[keyword.arg] = _get_parameter_value(keyword.value)
+    return ToolCall(type="function",
+                    function=FunctionCall(name=function_name,
+                                          arguments=json.dumps(arguments)))
+
+
+def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+    bracket_stack = []
+    for index, char in enumerate(text):
+        if char in {"[", "(", "{"}:
+            bracket_stack.append(char)
+        elif char == "]":
+            if not bracket_stack or bracket_stack.pop() != "[":
+                raise _UnexpectedAstError("Mismatched square brackets")
+        elif char == ")":
+            if not bracket_stack or bracket_stack.pop() != "(":
+                raise _UnexpectedAstError("Mismatched parentheses")
+        elif char == "}":
+            if not bracket_stack or bracket_stack.pop() != "{":
+                raise _UnexpectedAstError("Mismatched curly braces")
+        elif char in {"'", '"'}:
+            if bracket_stack and bracket_stack[-1] == char:
+                if index > 0 and text[index - 1] == "\\":
+                    # Treat an escaped quote as a regular character
+                    pass
+                else:
+                    bracket_stack.pop()
+            elif bracket_stack and bracket_stack[-1] in {"'", '"'}:
+                # Double quote within a single quote string or vice versa.
+                pass
+            else:
+                bracket_stack.append(char)
+
+    text = text.rstrip()
+    if text.endswith("=") or text.endswith(":"):
+        # Since we have no type information for this property/parameter value,
+        # we can't fill in a valid value.
+        return None
+    if bracket_stack and bracket_stack[-1] == "{":
+        trailing_dict_text = text[:text.rfind("{")]
+        num_keys = trailing_dict_text.count(":")
+        num_values = trailing_dict_text.count(",")
+        if num_keys <= num_values:
+            return None  # Incomplete property name within parameter value
+    if bracket_stack and bracket_stack[-1] == "(":
+        trailing_params_text = text[:text.rfind("(")]
+        num_full_param_names = trailing_params_text.count("=")
+        num_full_param_values = trailing_params_text.count(",")
+        if num_full_param_names <= num_full_param_values:
+            return None  # Incomplete parameter name
+    if text.endswith(","):
+        text = text[:-1]
+    if bracket_stack and bracket_stack[-1] == "[" and not text.endswith(
+            "[") and not text.endswith(")"):
+        return None  # Incomplete function name
+
+    added_text = ""
+    for char in reversed(bracket_stack):
+        if char == "[":
+            added_text += "]"
+        elif char == "(":
+            added_text += ")"
+        elif char == "{":
+            added_text += "}"
+        elif char == "'":
+            added_text += "'"
+        elif char == '"':
+            added_text += '"'
+
+    return text + added_text, added_text
+
+
+def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall,
+                        index: int,
+                        withheld_suffix: str) -> Union[DeltaToolCall, None]:
+    new_call_args = new_call.function.arguments
+    if withheld_suffix:
+        assert new_call_args.endswith(withheld_suffix)
+        new_call_args = new_call_args[:-len(withheld_suffix)]
+    if not previously_sent_args:
+        return DeltaToolCall(id=new_call.id,
+                             type="function",
+                             index=index,
+                             function=DeltaFunctionCall(
+                                 name=new_call.function.name,
+                                 arguments=new_call_args,
+                             ))
+
+    arg_diff = new_call_args[len(previously_sent_args):]
+    return DeltaToolCall(
+        id=None, index=index, function=DeltaFunctionCall(
+            arguments=arg_diff)) if arg_diff else None
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -0,0 +1,267 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from json import JSONDecoder
+from typing import Union
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix,
+                                                        is_complete_json,
+                                                        partial_json_loads)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("llama3_json")
+@ToolParserManager.register_module("llama4_json")
+class Llama3JsonToolParser(ToolParser):
+    """
+    Tool call parser for Llama 3.1 models intended for use with the
+    examples/tool_chat_template_llama.jinja template.
+
+    Used when --enable-auto-tool-choice --tool-call-parser llama3_json 
+    are all set
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+        # initialize properties used for state when parsing tool calls in
+        # streaming mode
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.current_tool_name_sent: bool = False
+        self.streamed_args_for_tool: list[str] = [
+        ]  # map what has been streamed for each tool so far to a list
+        self.bot_token = "<|python_tag|>"
+        self.bot_token_id = tokenizer.encode(self.bot_token,
+                                             add_special_tokens=False)[0]
+        self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL)
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+        # case -- if a tool call token is not present, return a text response
+        if not (model_output.startswith(self.bot_token)
+                or model_output.startswith('{')):
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            # load the JSON, and then use it to build the Function and
+            # Tool Call
+            dec = JSONDecoder()
+            function_call_arr = []
+
+            # depending on the prompt format the Llama model may or may not
+            # prefix the output with the <|python_tag|> token
+            start_idx = len(self.bot_token) if model_output.startswith(
+                self.bot_token) else 0
+            while start_idx < len(model_output):
+                (obj, end_idx) = dec.raw_decode(model_output[start_idx:])
+                start_idx += end_idx + len('; ')
+                function_call_arr.append(obj)
+
+            tool_calls: list[ToolCall] = [
+                ToolCall(
+                    type="function",
+                    function=FunctionCall(
+                        name=raw_function_call["name"],
+                        # function call args are JSON but as a string
+                        arguments=json.dumps(raw_function_call["arguments"] \
+                                if "arguments" in raw_function_call \
+                                else raw_function_call["parameters"],
+                                ensure_ascii=False)))
+                for raw_function_call in function_call_arr
+            ]
+
+            # get any content before  the tool call
+            ret = ExtractedToolCallInformation(tools_called=True,
+                                               tool_calls=tool_calls,
+                                               content=None)
+            return ret
+
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # return information to just treat the tool call as regular JSON
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        if not (current_text.startswith(self.bot_token)
+                or current_text.startswith('{')):
+            return DeltaMessage(content=delta_text)
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+            tool_call_arr = []
+            is_complete = []
+            try:
+                # depending on the prompt format the Llama model may or may not
+                # prefix the output with the <|python_tag|> token
+                start_idx = len(self.bot_token) if current_text.startswith(
+                    self.bot_token) else 0
+                while start_idx < len(current_text):
+                    (obj,
+                     end_idx) = partial_json_loads(current_text[start_idx:],
+                                                   flags)
+                    is_complete.append(
+                        is_complete_json(current_text[start_idx:start_idx +
+                                                      end_idx]))
+                    start_idx += end_idx + len('; ')
+                    # depending on the prompt Llama can use
+                    # either arguments or parameters
+                    if "parameters" in obj:
+                        assert "arguments" not in obj, \
+                            "model generated both parameters and arguments"
+                        obj["arguments"] = obj["parameters"]
+                    tool_call_arr.append(obj)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # select as the current tool call the one we're on the state at
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > 0 else {}
+
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if len(tool_call_arr) == 0:
+                return None
+
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    cur_arguments = current_tool_call.get("arguments")
+                    if cur_arguments:
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
+                        sent = len(
+                            self.streamed_args_for_tool[self.current_tool_id])
+                        argument_diff = cur_args_json[sent:]
+
+                        logger.debug("got arguments diff: %s", argument_diff)
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+                # re-set stuff pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            elif not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+                cur_arguments = current_tool_call.get("arguments")
+                delta = None
+
+                if cur_arguments:
+                    sent = len(
+                        self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_arguments = self.prev_tool_call_arr[
+                        self.current_tool_id].get("arguments")
+
+                    argument_diff = None
+                    if is_complete[self.current_tool_id]:
+                        argument_diff = cur_args_json[sent:]
+                    elif prev_arguments:
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
+                        if cur_args_json != prev_args_json:
+
+                            prefix = find_common_prefix(
+                                prev_args_json, cur_args_json)
+                            argument_diff = prefix[sent:]
+
+                    if argument_diff is not None:
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -0,0 +1,369 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Union
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("minimax")
+class MinimaxToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        self.current_tool_name_sent: bool = False
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.streamed_args_for_tool: list[str] = []
+
+        self.tool_call_start_token: str = "<tool_calls>"
+        self.tool_call_end_token: str = "</tool_calls>"
+
+        self.tool_call_regex = re.compile(
+            r"<tool_calls>(.*?)</tool_calls>|<tool_calls>(.*)", re.DOTALL)
+
+        # Add regex pattern for thinking tag
+        self.thinking_tag_pattern = r"<think>(.*?)</think>"
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+
+        self.tool_call_start_token_id = self.vocab.get(
+            self.tool_call_start_token)
+        self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
+
+        if (self.tool_call_start_token_id is None
+                or self.tool_call_end_token_id is None):
+            logger.warning(
+                "Minimax Tool parser could not locate tool call start/end "
+                "tokens in the tokenizer. Falling back to string matching.")
+
+    def preprocess_model_output(self, model_output: str) -> str:
+        """
+        Remove tool calls from within thinking tags to avoid processing them.
+        """
+
+        def remove_tool_calls_from_think(match):
+            think_content = match.group(1)
+            # Remove tool_calls from within the think tag
+            cleaned_content = re.sub(r"<tool_calls>.*?</tool_calls>",
+                                     "",
+                                     think_content,
+                                     flags=re.DOTALL)
+            return f"<think>{cleaned_content}</think>"
+
+        # Process thinking tags and remove tool_calls from within them
+        processed_output = re.sub(self.thinking_tag_pattern,
+                                  remove_tool_calls_from_think,
+                                  model_output,
+                                  flags=re.DOTALL)
+
+        return processed_output
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+
+        # Preprocess to remove tool calls from thinking tags
+        processed_output = self.preprocess_model_output(model_output)
+
+        if self.tool_call_start_token not in processed_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            function_call_tuples = (
+                self.tool_call_regex.findall(processed_output))
+
+            raw_function_calls = []
+            for match in function_call_tuples:
+                tool_call_content = match[0] if match[0] else match[1]
+                if tool_call_content.strip():
+                    lines = tool_call_content.strip().split('\n')
+                    for line in lines:
+                        line = line.strip()
+                        if line and line.startswith('{') and line.endswith(
+                                '}'):
+                            try:
+                                parsed_call = json.loads(line)
+                                raw_function_calls.append(parsed_call)
+                            except json.JSONDecodeError:
+                                continue
+
+            tool_calls = []
+            for function_call in raw_function_calls:
+                if "name" in function_call and "arguments" in function_call:
+                    tool_calls.append(
+                        ToolCall(type="function",
+                                 function=FunctionCall(
+                                     name=function_call["name"],
+                                     arguments=json.dumps(
+                                         function_call["arguments"],
+                                         ensure_ascii=False))))
+
+            # Extract content before the first valid tool call
+            # Find the position in processed output, then map back to original
+            processed_pos = processed_output.find(self.tool_call_start_token)
+            if processed_pos != -1:
+                # Get the content before tool calls in processed output
+                processed_content = processed_output[:processed_pos].strip()
+
+                if processed_content:
+                    # Find the end of this content in the original output
+                    # Look for the last non-empty line of processed content
+                    lines = processed_content.split('\n')
+                    for line in reversed(lines):
+                        line = line.strip()
+                        if line:
+                            # Find this line in original output
+                            pos = model_output.find(line)
+                            if pos != -1:
+                                content = model_output[:pos + len(line)]
+                                break
+                    else:
+                        content = ""
+                else:
+                    content = ""
+            else:
+                content = model_output
+
+            return ExtractedToolCallInformation(
+                tools_called=len(tool_calls) > 0,
+                tool_calls=tool_calls,
+                content=content.strip() if content.strip() else None)
+
+        except Exception:
+            logger.exception(
+                "An unexpected error occurred during tool call extraction.")
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        logger.debug("delta_text: %s", delta_text)
+        logger.debug("delta_token_ids: %s", delta_token_ids)
+
+        # Preprocess to remove tool calls from thinking tags
+        processed_current_text = self.preprocess_model_output(current_text)
+
+        if self.tool_call_start_token not in processed_current_text:
+            return DeltaMessage(content=delta_text)
+
+        if (self.tool_call_start_token_id is not None
+                and self.tool_call_start_token_id in delta_token_ids
+                and len(delta_token_ids) == 1):
+            return None
+
+        original_tool_call_start_pos = current_text.find(
+            self.tool_call_start_token)
+        if original_tool_call_start_pos > 0:
+            delta_start_pos = len(current_text) - len(delta_text)
+            if delta_start_pos < original_tool_call_start_pos:
+                content_part = delta_text
+                if delta_start_pos + len(
+                        delta_text) > original_tool_call_start_pos:
+                    content_part = delta_text[:original_tool_call_start_pos -
+                                              delta_start_pos]
+                if content_part:
+                    return DeltaMessage(content=content_part)
+
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+
+        try:
+            parsable_content = processed_current_text.split(
+                self.tool_call_start_token)[-1].split(
+                    self.tool_call_end_token)[0]
+
+            tool_call_arr = []
+            if parsable_content.strip():
+                lines = parsable_content.strip().split('\n')
+                for line in lines:
+                    line = line.strip()
+                    if line and (line.startswith('{') or '"name"' in line):
+                        try:
+                            if line.endswith('}'):
+                                parsed_call = json.loads(line)
+                                tool_call_arr.append(parsed_call)
+                            else:
+                                parsed_call = partial_json_parser.loads(
+                                    line, flags)
+                                if parsed_call and isinstance(
+                                        parsed_call, dict):
+                                    tool_call_arr.append(parsed_call)
+                        except (json.JSONDecodeError, partial_json_parser.core.
+                                exceptions.MalformedJSON):
+                            continue
+
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > self.current_tool_id >= 0 else {}
+
+            if len(tool_call_arr) == 0:
+                return None
+
+            # Starting a new tool in the array
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+
+                # Handle any missed arguments from previous tool
+                if self.current_tool_id >= 0 and self.current_tool_id < len(
+                        self.prev_tool_call_arr):
+                    prev_tool_call = self.prev_tool_call_arr[
+                        self.current_tool_id]
+                    diff_arguments = prev_tool_call.get("arguments")
+
+                    if diff_arguments:
+                        diff_arguments_json = json.dumps(diff_arguments,
+                                                         ensure_ascii=False)
+                        already_streamed = self.streamed_args_for_tool[
+                            self.
+                            current_tool_id] if self.current_tool_id < len(
+                                self.streamed_args_for_tool) else ""
+
+                        if diff_arguments_json != already_streamed:
+                            diff = diff_arguments_json[len(already_streamed):]
+                            delta = DeltaMessage(tool_calls=[
+                                DeltaToolCall(index=self.current_tool_id,
+                                              function=DeltaFunctionCall(
+                                                  arguments=diff).model_dump(
+                                                      exclude_none=True))
+                            ])
+                            if self.current_tool_id < len(
+                                    self.streamed_args_for_tool):
+                                self.streamed_args_for_tool[
+                                    self.current_tool_id] = diff_arguments_json
+                        else:
+                            delta = None
+                    else:
+                        delta = None
+                else:
+                    delta = None
+
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # Send tool name if not sent yet
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=random_tool_call_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+
+            # Stream arguments
+            else:
+                prev_arguments = None
+                if (self.current_tool_id < len(self.prev_tool_call_arr)
+                        and self.prev_tool_call_arr[self.current_tool_id]):
+                    prev_arguments = self.prev_tool_call_arr[
+                        self.current_tool_id].get("arguments")
+
+                cur_arguments = current_tool_call.get("arguments")
+
+                if not cur_arguments and not prev_arguments:
+                    delta = None
+                elif not cur_arguments and prev_arguments:
+                    logger.error(
+                        "Arguments reset mid-call, skipping streaming")
+                    delta = None
+                elif cur_arguments and not prev_arguments:
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)
+                    logger.debug("First tokens in arguments received: %s",
+                                 cur_arguments_json)
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=cur_arguments_json).
+                                      model_dump(exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] = cur_arguments_json
+
+                elif cur_arguments and prev_arguments:
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)
+
+                    logger.debug("Searching for diff between \n%s\n%s",
+                                 cur_args_json, prev_args_json)
+
+                    already_streamed = self.streamed_args_for_tool[
+                        self.current_tool_id] if self.current_tool_id < len(
+                            self.streamed_args_for_tool) else ""
+
+                    if cur_args_json.startswith(already_streamed):
+                        argument_diff = cur_args_json[len(already_streamed):]
+                    elif cur_args_json != already_streamed:
+                        argument_diff = cur_args_json
+                        self.streamed_args_for_tool[self.current_tool_id] = ""
+                    else:
+                        argument_diff = ""
+
+                    if argument_diff:
+                        logger.debug("got arguments diff: %s", argument_diff)
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=argument_diff).
+                                          model_dump(exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += argument_diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+
+        except Exception:
+            logger.exception("An unexpected error occurred",
+                             "during streaming tool call handling.")
+            return None
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -0,0 +1,369 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from random import choices
+from string import ascii_letters, digits
+from typing import Union
+
+import partial_json_parser
+import regex as re
+from partial_json_parser.core.options import Allow
+from pydantic import Field
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    extract_intermediate_diff)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+
+logger = init_logger(__name__)
+
+ALPHANUMERIC = ascii_letters + digits
+
+
+class MistralToolCall(ToolCall):
+    id: str = Field(
+        default_factory=lambda: MistralToolCall.generate_random_id())
+
+    @staticmethod
+    def generate_random_id():
+        # Mistral Tool Call Ids must be alphanumeric with a length of 9.
+        # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299
+        return "".join(choices(ALPHANUMERIC, k=9))
+
+    @staticmethod
+    def is_valid_id(id: str) -> bool:
+        return id.isalnum() and len(id) == 9
+
+
+def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool:
+    return isinstance(model_tokenizer, MistralTokenizer) \
+        and model_tokenizer.version >= 11
+
+
+@ToolParserManager.register_module("mistral")
+class MistralToolParser(ToolParser):
+    """
+    Tool call parser for Mistral 7B Instruct v0.3, intended for use with
+    - [`mistral_common`](https://github.com/mistralai/mistral-common/)
+    - the examples/tool_chat_template_mistral.jinja template.
+
+    Used when --enable-auto-tool-choice --tool-call-parser mistral are all set
+    """
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        if not isinstance(self.model_tokenizer, MistralTokenizer):
+            logger.info("Non-Mistral tokenizer detected when using a Mistral "
+                        "model...")
+
+        # initialize properties used for state when parsing tool calls in
+        # streaming mode
+        self.prev_tool_call_arr: list[dict] = []
+        self.current_tool_id: int = -1
+        self.current_tool_name_sent: bool = False
+        self.streamed_args_for_tool: list[str] = [
+        ]  # map what has been streamed for each tool so far to a list
+        self.bot_token = "[TOOL_CALLS]"
+        self.bot_token_id = self.vocab.get(self.bot_token)
+        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        if _is_fn_name_regex_support(self.model_tokenizer):
+            self.fn_name_regex = re.compile(
+                r'([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)', re.DOTALL)
+        else:
+            self.fn_name_regex = None
+
+        if self.bot_token_id is None:
+            raise RuntimeError(
+                "Mistral Tool Parser could not locate the tool call token in "
+                "the tokenizer!")
+
+    def adjust_request(
+            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        if not isinstance(
+                self.model_tokenizer, MistralTokenizer
+        ) and request.tools and request.tool_choice != 'none':
+            # Do not skip special tokens when using chat template
+            # with Mistral parser as TOOL_CALL token is needed
+            # for tool detection.
+            # Note: we don't want skip_special_tokens=False
+            # with MistralTokenizer as it is incompatible
+            request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response. Requires
+        find-and-replacing single quotes with double quotes for JSON parsing,
+        make sure your tool call arguments don't ever include quotes!
+        """
+
+        # case -- if a tool call token is not present, return a text response
+        if self.bot_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        # first remove the BOT token
+        tool_content = model_output.replace(self.bot_token, "").strip()
+
+        try:
+            # we first try to directly load the json as parsing very nested
+            # jsons is difficult
+            try:
+                if self.fn_name_regex:
+                    matches = self.fn_name_regex.findall(tool_content)
+
+                    function_call_arr = []
+                    for match in matches:
+                        fn_name = match[0]
+                        args = match[1]
+
+                        # fn_name is encoded outside serialized json dump
+                        # only arguments are serialized
+                        function_call_arr.append({
+                            "name": fn_name,
+                            "arguments": json.loads(args)
+                        })
+                else:
+                    function_call_arr = json.loads(tool_content)
+            except json.JSONDecodeError:
+                # use a regex to find the part corresponding to the tool call.
+                # NOTE: This use case should not happen if the model is trained
+                # correctly. It's a easy possible fix so it's included, but
+                # can be brittle for very complex / highly nested tool calls
+                raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
+                function_call_arr = json.loads(raw_tool_call)
+
+            # Tool Call
+            tool_calls: list[MistralToolCall] = [
+                MistralToolCall(
+                    type="function",
+                    function=FunctionCall(
+                        name=raw_function_call["name"],
+                        # function call args are JSON but as a string
+                        arguments=json.dumps(raw_function_call["arguments"],
+                                             ensure_ascii=False)))
+                for raw_function_call in function_call_arr
+            ]
+
+            # get any content before  the tool call
+            content = model_output.split(self.bot_token)[0]
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=content if len(content) > 0 else None)
+
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # return information to just treat the tool call as regular JSON
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=tool_content)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        # if the tool call token is not in the tokens generated so far, append
+        # output to contents since it's not a tool
+        if self.bot_token not in current_text:
+            return DeltaMessage(content=delta_text)
+
+        # if the tool call token ID IS in the tokens generated so far, that
+        # means we're parsing as tool calls now
+
+        # handle if we detected the BOT token which means the start of tool
+        # calling
+        if (self.bot_token_id in delta_token_ids
+                and len(delta_token_ids) == 1):
+            # if it's the only token, return None, so we don't send a chat
+            # completion any don't send a control token
+            return None
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/ function name at once.
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+
+            # replace BOT token with empty string, and convert single quotes
+            # to double to allow parsing as JSON since mistral uses single
+            # quotes instead of double for tool calls
+            parsable_arr = current_text.split(self.bot_token)[-1]
+
+            # tool calls are generated in an array, so do partial JSON
+            # parsing on the entire array
+            try:
+                tool_call_arr: list[dict] = partial_json_parser.loads(
+                    parsable_arr, flags)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # select as the current tool call the one we're on the state at
+
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > 0 else {}
+
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if len(tool_call_arr) == 0:
+                return None
+
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    diff: Union[str, None] = current_tool_call.get("arguments")
+
+                    if diff:
+                        diff = json.dumps(diff, ensure_ascii=False).replace(
+                            self.streamed_args_for_tool[self.current_tool_id],
+                            "")
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=diff).model_dump(
+                                                  exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+                # re-set stuff pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # case: update an existing tool - this is handled below
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=MistralToolCall.generate_random_id(),
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+
+                prev_arguments = self.prev_tool_call_arr[
+                    self.current_tool_id].get("arguments")
+                cur_arguments = current_tool_call.get("arguments")
+
+                new_text = delta_text.replace("\'", "\"")
+                if ('"}' in new_text):
+                    new_text = new_text[:new_text.rindex('"}')]
+
+                if not cur_arguments and not prev_arguments:
+
+                    delta = None
+                elif not cur_arguments and prev_arguments:
+                    logger.error(
+                        "INVARIANT - impossible to have arguments reset "
+                        "mid-arguments")
+                    delta = None
+                elif cur_arguments and not prev_arguments:
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)[:-2]
+                    logger.debug("finding %s in %s", new_text,
+                                 cur_arguments_json)
+
+                    if (new_text not in cur_arguments_json):
+                        return None
+                    arguments_delta = cur_arguments_json[:cur_arguments_json.
+                                                         rindex(new_text) +
+                                                         len(new_text)]
+                    logger.debug("First tokens in arguments received: %s",
+                                 arguments_delta)
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=arguments_delta).
+                                      model_dump(exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] += arguments_delta
+
+                elif cur_arguments and prev_arguments:
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)
+                    logger.debug("Searching for diff between \n%s\n%s",
+                                 cur_args_json, prev_args_json)
+
+                    argument_diff = extract_intermediate_diff(
+                        cur_args_json, prev_args_json)
+                    logger.debug("got arguments diff: %s", argument_diff)
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      function=DeltaFunctionCall(
+                                          arguments=argument_diff).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.streamed_args_for_tool[
+                        self.current_tool_id] += argument_diff
+                else:
+                    # try parsing it with regular JSON - if it works we're
+                    # at the end, and we need to send the difference between
+                    # tokens streamed so far and the valid JSON
+                    delta = None
+
+            # check to see if the name is defined and has been sent. if so,
+            # stream the name - otherwise keep waiting
+            # finish by setting old and returning None as base case
+            self.prev_tool_call_arr = tool_call_arr
+            return delta
+
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from collections.abc import Sequence
+from typing import Any, Optional
+
+import regex as re
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("phi4_mini_json")
+class Phi4MiniJsonToolParser(ToolParser):
+    """
+    Tool call parser for phi-4-mini models intended for use with the
+    examples/tool_chat_template_llama.jinja template.
+
+    Used when --enable-auto-tool-choice --tool-call-parser phi4_mini_json  
+    are all set
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
+        super().__init__(tokenizer)
+
+        # initialize properties used for state when parsing tool calls in
+        # streaming mode
+        self.prev_tool_call_arr: list[dict[str, Any]] = []
+        self.current_tool_id: int = -1
+        self.current_tool_name_sent: bool = False
+        self.streamed_args_for_tool: list[str] = [
+        ]  # map what has been streamed for each tool so far to a list
+        self.bot_token: str = "functools"
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+        logger.debug("Model output: %s", model_output)
+
+        pattern = r'functools\[(.*?)\]'
+        matches = re.search(pattern, model_output, re.DOTALL)
+
+        if not matches:
+            logger.debug("No function calls found")
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            function_call_arr: list[dict[str, Any]] = []
+            try:
+                json_content = '[' + matches.group(1) + ']'
+
+                function_call_arr = json.loads(json_content)
+                logger.debug("Successfully extracted %d function calls",
+                             len(function_call_arr))
+            except json.JSONDecodeError as e:
+                logger.error(
+                    "Failed to parse function calls from model output. "
+                    "Error: %s", str(e))
+
+            tool_calls: list[ToolCall] = [
+                ToolCall(
+                    id=random_tool_call_id(),
+                    type="function",
+                    function=FunctionCall(
+                        name=raw_function_call["name"],
+                        # function call args are JSON but as a string
+                        arguments=json.dumps(
+                            raw_function_call["arguments"]
+                            if "arguments" in raw_function_call else
+                            raw_function_call["parameters"],
+                            ensure_ascii=False),
+                    )) for raw_function_call in function_call_arr
+            ]
+
+            # get any content before the tool call
+            ret = ExtractedToolCallInformation(tools_called=True,
+                                               tool_calls=tool_calls,
+                                               content=None)
+            return ret
+
+        except Exception:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Optional[DeltaMessage]:
+
+        return None
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any, Union
+
+import regex as re
+from transformers import PreTrainedTokenizerBase
+
+import vllm.envs as envs
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class _UnexpectedAstError(Exception):
+    pass
+
+
+@ToolParserManager.register_module("pythonic")
+class PythonicToolParser(ToolParser):
+    """
+    Tool call parser for models that produce tool calls in a pythonic style,
+    such as Llama 3.2 and Llama 4 models.
+
+    Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set
+    """
+    # TODO(mdepinet): Possible future improvements:
+    #   1. Support text + tools separated by either <|python_tag|> or \n\n
+    #   2. Support tools outside of a list (or separated by a semicolon).
+    #      This depends on item 1 for consistent streaming.
+    # Neither of these are necessary for e.g. ToolACE, but both would help make
+    # Llama3.2 models more reliable.
+
+    TOOL_CALL_REGEX = re.compile(
+        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+        re.DOTALL)
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+    # Rename for readability. This is NOT a tool id.
+    @property
+    def current_tool_index(self) -> int:
+        return self.current_tool_id
+
+    @current_tool_index.setter
+    def current_tool_index(self, value: int) -> None:
+        self.current_tool_id = value
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = self.TOOL_CALL_REGEX.match(
+                model_output,
+                timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None
+        except TimeoutError:
+            logger.warning(
+                "Regex timeout occurred when matching tool call pattern.")
+            logger.debug("Regex timeout occurred when matching user input: %s",
+                         model_output)
+
+        if not is_tool_call_pattern:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            module = ast.parse(model_output)
+            parsed = getattr(module.body[0], "value", None)
+            if isinstance(parsed, ast.List) and all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=[
+                        _handle_single_tool(e)  # type: ignore
+                        for e in parsed.elts
+                    ],
+                    content=None)
+            else:
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # Treat as regular text
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        if not current_text.startswith("["):
+            return DeltaMessage(content=delta_text)
+
+        try:
+            valid_and_added_text = _make_valid_python(current_text)
+            if valid_and_added_text is None:
+                return None
+            valid_text, added_text = valid_and_added_text
+
+            module = ast.parse(valid_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not isinstance(parsed, ast.List) or not all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+            tool_calls = [
+                _handle_single_tool(e)  # type: ignore
+                for e in parsed.elts
+            ]
+
+            tool_deltas = []
+            for index, new_call in enumerate(tool_calls):
+                if index < self.current_tool_index:
+                    continue
+
+                self.current_tool_index = index
+                if len(self.streamed_args_for_tool) == index:
+                    self.streamed_args_for_tool.append("")
+
+                new_call_complete = index < len(
+                    tool_calls) - 1 or ")]" not in added_text
+                if new_call_complete:
+                    self.current_tool_index += 1
+
+                withheld_suffix = (added_text[:-2]
+                                   if not new_call_complete else "")
+                if not new_call_complete and added_text[-2] == ")":
+                    # Function call is incomplete. Withhold the closing bracket.
+                    withheld_suffix = withheld_suffix + "}"
+                # Strings get single quotes in the model-produced string.
+                # JSON requires double quotes.
+                withheld_suffix = withheld_suffix.replace("'", '"')
+                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
+                                            new_call, index, withheld_suffix)
+
+                if delta is not None:
+                    tool_deltas.append(delta)
+                    if (delta.function is not None
+                            and delta.function.arguments is not None):
+                        self.streamed_args_for_tool[
+                            index] += delta.function.arguments
+
+            # HACK: serving_chat.py inspects the internal state of tool parsers
+            # when determining it's final streaming delta, automatically
+            # adding autocompleted JSON.
+            # These two lines avoid that nonsense while ensuring finish_reason
+            # is set to tool_calls when at least one tool is called.
+            if tool_deltas and not self.prev_tool_call_arr:
+                self.prev_tool_call_arr = [{"arguments": {}}]
+
+            if tool_deltas:
+                return DeltaMessage(tool_calls=tool_deltas)
+            elif not added_text and self.current_tool_id > 0:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage(content='')
+            else:
+                return None
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
+
+
+def _get_parameter_value(val: ast.expr) -> Any:
+    if isinstance(val, ast.Constant):
+        return val.value
+    elif isinstance(val, ast.Dict):
+        if not all(isinstance(k, ast.Constant) for k in val.keys):
+            raise _UnexpectedAstError(
+                "Dict tool call arguments must have literal keys")
+        return {
+            k.value: _get_parameter_value(v)  # type: ignore
+            for k, v in zip(val.keys, val.values)
+        }
+    elif isinstance(val, ast.List):
+        return [_get_parameter_value(v) for v in val.elts]
+    else:
+        raise _UnexpectedAstError("Tool call arguments must be literals")
+
+
+def _handle_single_tool(call: ast.Call) -> ToolCall:
+    if not isinstance(call.func, ast.Name):
+        raise _UnexpectedAstError("Invalid tool call name")
+    function_name = call.func.id
+    arguments = {}
+    for keyword in call.keywords:
+        arguments[keyword.arg] = _get_parameter_value(keyword.value)
+    return ToolCall(
+        type="function",
+        function=FunctionCall(name=function_name,
+                              arguments=json.dumps(arguments,
+                                                   ensure_ascii=False)),
+    )
+
+
+def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+    bracket_stack = []
+    for index, char in enumerate(text):
+        if char in {"[", "(", "{"}:
+            bracket_stack.append(char)
+        elif char == "]":
+            if not bracket_stack or bracket_stack.pop() != "[":
+                raise _UnexpectedAstError("Mismatched square brackets")
+        elif char == ")":
+            if not bracket_stack or bracket_stack.pop() != "(":
+                raise _UnexpectedAstError("Mismatched parentheses")
+        elif char == "}":
+            if not bracket_stack or bracket_stack.pop() != "{":
+                raise _UnexpectedAstError("Mismatched curly braces")
+        elif char in {"'", '"'}:
+            if bracket_stack and bracket_stack[-1] == char:
+                if index > 0 and text[index - 1] == "\\":
+                    # Treat an escaped quote as a regular character
+                    pass
+                else:
+                    bracket_stack.pop()
+            elif bracket_stack and bracket_stack[-1] in {"'", '"'}:
+                # Double quote within a single quote string or vice versa.
+                pass
+            else:
+                bracket_stack.append(char)
+
+    text = text.rstrip()
+    if text.endswith("=") or text.endswith(":"):
+        # Since we have no type information for this property/parameter value,
+        # we can't fill in a valid value.
+        return None
+    if bracket_stack and bracket_stack[-1] == "{":
+        trailing_dict_text = text[:text.rfind("{")]
+        num_keys = trailing_dict_text.count(":")
+        num_values = trailing_dict_text.count(",")
+        if num_keys <= num_values:
+            return None  # Incomplete property name within parameter value
+    if bracket_stack and bracket_stack[-1] == "(":
+        trailing_params_text = text[:text.rfind("(")]
+        num_full_param_names = trailing_params_text.count("=")
+        num_full_param_values = trailing_params_text.count(",")
+        if num_full_param_names <= num_full_param_values:
+            return None  # Incomplete parameter name
+    if text.endswith(","):
+        text = text[:-1]
+    if bracket_stack and bracket_stack[-1] == "[" and not text.endswith(
+            "[") and not text.endswith(")"):
+        return None  # Incomplete function name
+
+    added_text = ""
+    for char in reversed(bracket_stack):
+        if char == "[":
+            added_text += "]"
+        elif char == "(":
+            added_text += ")"
+        elif char == "{":
+            added_text += "}"
+        elif char == "'":
+            added_text += "'"
+        elif char == '"':
+            added_text += '"'
+
+    return text + added_text, added_text
+
+
+def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall,
+                        index: int,
+                        withheld_suffix: str) -> Union[DeltaToolCall, None]:
+    new_call_args = new_call.function.arguments
+    if withheld_suffix:
+        assert new_call_args.endswith(withheld_suffix)
+        new_call_args = new_call_args[:-len(withheld_suffix)]
+    if not previously_sent_args:
+        return DeltaToolCall(id=new_call.id,
+                             type="function",
+                             index=index,
+                             function=DeltaFunctionCall(
+                                 name=new_call.function.name,
+                                 arguments=new_call_args,
+                             ))
+
+    arg_diff = new_call_args[len(previously_sent_args):]
+    return DeltaToolCall(
+        id=None, index=index, function=DeltaFunctionCall(
+            arguments=arg_diff)) if arg_diff else None
--- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
@@ -0,0 +1,296 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import contextlib
+import json
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+import regex as re
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module(["step3"])
+class Step3ToolParser(ToolParser):
+    """
+    Tool parser for a model that uses a specific XML-like format for tool calls.
+    This version uses a robust, stateful, cursor-based streaming parser and
+    consolidates tool arguments into a single message.
+    """
+
+    TOOL_CALLS_BEGIN = "<｜tool_calls_begin｜>"
+    TOOL_CALLS_END = "<｜tool_calls_end｜>"
+    TOOL_CALL_BEGIN = "<｜tool_call_begin｜>"
+    TOOL_CALL_END = "<｜tool_call_end｜>"
+    TOOL_SEP = "<｜tool_sep｜>"
+    SPECIAL_TOKENS = [
+        TOOL_CALLS_BEGIN, TOOL_CALLS_END, TOOL_CALL_BEGIN, TOOL_CALL_END
+    ]
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+        self.position = 0
+        # Explicit state flags for robust streaming
+        self.tool_block_started = False
+        self.tool_block_finished = False
+
+    def adjust_request(
+            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        if request.tools and request.tool_choice != 'none':
+            request.skip_special_tokens = False
+        return request
+
+    @staticmethod
+    def _parse_steptml_invoke(
+            action_text: str
+    ) -> tuple[Optional[str], Optional[dict[str, str]]]:
+        func_name_match = re.search(r'<steptml:invoke name="([^"]+)">',
+                                    action_text)
+        if not func_name_match:
+            return None, None
+        func_name = func_name_match.group(1)
+
+        params: dict[str, str] = {}
+        param_matches = re.findall(
+            r'<steptml:parameter name="([^"]+)">([^<]*)</steptml:parameter>',
+            action_text)
+        for name, value in param_matches:
+            params[name] = value.strip()
+        return func_name, params
+
+    def _cast_arguments(
+        self,
+        func_name: str,
+        params: dict[str, Any],
+        request: ChatCompletionRequest,
+    ) -> dict[str, Any]:
+        for tool in request.tools or []:
+            if tool.function.name == func_name:
+                schema = tool.function.parameters or {}
+                properties = schema.get("properties", {})
+                for key, value in params.items():
+                    if not isinstance(value, str):
+                        continue
+                    prop = properties.get(key, {})
+                    typ = prop.get("type")
+                    if typ == "string":
+                        params[key] = value.strip()
+                    elif typ == "integer":
+                        with contextlib.suppress(ValueError):
+                            params[key] = int(value)
+                    elif typ == "number":
+                        with contextlib.suppress(ValueError):
+                            params[key] = float(value)
+                    elif typ == "boolean":
+                        lower_val = value.lower()
+                        params[key] = lower_val == "true" if lower_val in (
+                            "true", "false") else value
+                    elif typ == "null":
+                        params[key] = None if value.lower(
+                        ) == "null" else value
+                break
+        return params
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        # The main loop processes the stream from the last known position.
+        while True:
+            if self.position >= len(current_text):
+                return None  # We've processed the entire stream.
+
+            unprocessed_text = current_text[self.position:]
+
+            # STATE: After all tools are done, all subsequent text is content.
+            if self.tool_block_finished:
+                self.position = len(current_text)
+                return DeltaMessage(content=unprocessed_text)
+
+            # STATE: Before the tool block has started.
+            if not self.tool_block_started:
+                if unprocessed_text.startswith(self.TOOL_CALLS_BEGIN):
+                    self.position += len(self.TOOL_CALLS_BEGIN)
+                    self.tool_block_started = True
+                    continue  # Token consumed, re-loop.
+
+                start_pos = unprocessed_text.find(self.TOOL_CALLS_BEGIN)
+                if start_pos == -1:
+                    if self.TOOL_CALLS_BEGIN.startswith(
+                            unprocessed_text.strip()) and unprocessed_text:
+                        return None  # It's a prefix, wait.
+                    self.position = len(current_text)
+                    return DeltaMessage(content=unprocessed_text)
+                else:
+                    content = unprocessed_text[:start_pos]
+                    self.position += len(content)
+                    return DeltaMessage(content=content)
+
+            # STATE: Inside the main tool block.
+            offset = len(unprocessed_text) - len(unprocessed_text.lstrip())
+            unprocessed_text = unprocessed_text.lstrip()
+            self.position += offset
+
+            if unprocessed_text.startswith(self.TOOL_CALLS_END):
+                self.position += len(self.TOOL_CALLS_END)
+                self.tool_block_finished = True
+                self.current_tool_id = -1
+                continue
+
+            # Check if we are between tool calls.
+            tool_finished = (
+                self.current_tool_id != -1 and
+                self.prev_tool_call_arr[self.current_tool_id].get("finished"))
+            if self.current_tool_id == -1 or tool_finished:
+                if unprocessed_text.startswith(self.TOOL_CALL_BEGIN):
+                    self.position += len(self.TOOL_CALL_BEGIN)
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                    else:
+                        self.current_tool_id += 1
+                    self.current_tool_name_sent = False
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    self.prev_tool_call_arr[
+                        self.current_tool_id]["finished"] = False
+                    continue
+
+                if self.TOOL_CALL_BEGIN.startswith(unprocessed_text):
+                    return None
+
+            # STATE: Parsing an active tool call.
+            if self.current_tool_id != -1 and not self.prev_tool_call_arr[
+                    self.current_tool_id].get("finished", False):
+                end_tool_pos = unprocessed_text.find(self.TOOL_CALL_END)
+                if end_tool_pos == -1:
+                    tool_body = unprocessed_text
+                else:
+                    tool_body = unprocessed_text[:end_tool_pos]
+
+                if end_tool_pos == -1 and self.TOOL_CALL_END.startswith(
+                        tool_body):
+                    return None
+
+                function_name, arguments = self._parse_steptml_invoke(
+                    tool_body)
+                if not function_name:
+                    return None
+
+                tool_call_arr = {
+                    "name": function_name,
+                    "parameters": arguments or {}
+                }
+
+                # Send the function name as soon as it's parsed.
+                if not self.current_tool_name_sent:
+                    self.current_tool_name_sent = True
+                    self.prev_tool_call_arr[self.current_tool_id].update(
+                        tool_call_arr)
+                    return DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=f"chatcmpl-tool-{random_uuid()}",
+                                      function=DeltaFunctionCall(
+                                          name=function_name))
+                    ])
+
+                # Update our internal state with the latest parsed arguments.
+                self.prev_tool_call_arr[
+                    self.current_tool_id].update(  # noqa: E501
+                        tool_call_arr)
+
+                # Only send arguments when the tool call is complete.
+                if end_tool_pos != -1:
+                    self.position += end_tool_pos + len(self.TOOL_CALL_END)
+                    self.prev_tool_call_arr[
+                        self.current_tool_id]["finished"] = True
+
+                    final_args = self._cast_arguments(
+                        function_name,
+                        tool_call_arr.get("parameters", {}),  # type: ignore
+                        request)
+                    if final_args:
+                        final_args_json = json.dumps(final_args,
+                                                     ensure_ascii=False)
+                        return DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=final_args_json))
+                        ])
+
+                # If tool is not finished, return None to wait for more tokens.
+                return None
+
+            return None
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+        if self.TOOL_CALLS_BEGIN not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        pre_text, rest = model_output.split(self.TOOL_CALLS_BEGIN, 1)
+        if self.TOOL_CALLS_END not in rest:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        tool_block, post_text = rest.split(self.TOOL_CALLS_END, 1)
+        content = (pre_text + post_text).strip()
+
+        tool_calls: list[ToolCall] = []
+        call_parts = tool_block.split(self.TOOL_CALL_BEGIN)
+
+        for part in call_parts:
+            if not part or self.TOOL_CALL_END not in part:
+                continue
+
+            call_content = part.split(self.TOOL_CALL_END, 1)[0]
+            if self.TOOL_SEP not in call_content:
+                continue
+
+            type_part, invoke_part = call_content.split(self.TOOL_SEP, 1)
+            if type_part.strip() != "function":
+                continue
+
+            function_name, params_dict = self._parse_steptml_invoke(
+                invoke_part)
+
+            if function_name and params_dict is not None:
+                params_dict = self._cast_arguments(function_name, params_dict,
+                                                   request)
+                params_str = json.dumps(params_dict, ensure_ascii=False)
+                tool_calls.append(
+                    ToolCall(function=FunctionCall(name=function_name,
+                                                   arguments=params_str)))
+        if tool_calls:
+            return ExtractedToolCallInformation(
+                tools_called=True,
+                tool_calls=tool_calls,
+                content=content if content else None)
+        return ExtractedToolCallInformation(tools_called=False,
+                                            tool_calls=[],
+                                            content=model_output)
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from json import JSONDecodeError, JSONDecoder
+from typing import Any
+
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+
+def find_common_prefix(s1: str, s2: str) -> str:
+    """
+    Finds a common prefix that is shared between two strings, if there is one.
+    Order of arguments is NOT important.
+
+    This function is provided as a UTILITY for extracting information from JSON
+    generated by partial_json_parser, to help in ensuring that the right tokens
+    are returned in streaming, so that close-quotes, close-brackets and
+    close-braces are not returned prematurely.
+
+    e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') ->
+    '{"fruit": "ap'
+    """
+    prefix = ''
+    min_length = min(len(s1), len(s2))
+    for i in range(0, min_length):
+        if s1[i] == s2[i]:
+            prefix += s1[i]
+        else:
+            break
+    return prefix
+
+
+def find_common_suffix(s1: str, s2: str) -> str:
+    """
+    Finds a common suffix shared between two strings, if there is one. Order of
+    arguments is NOT important.
+    Stops when the suffix ends OR it hits an alphanumeric character
+
+    e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}'
+    """
+    suffix = ''
+    min_length = min(len(s1), len(s2))
+    for i in range(1, min_length + 1):
+        if s1[-i] == s2[-i] and not s1[-i].isalnum():
+            suffix = s1[-i] + suffix
+        else:
+            break
+    return suffix
+
+
+def extract_intermediate_diff(curr: str, old: str) -> str:
+    """
+    Given two strings, extract the difference in the middle between two strings
+    that are known to have a common prefix and/or suffix.
+
+    This function is provided as a UTILITY for extracting information from JSON
+    generated by partial_json_parser, to help in ensuring that the right tokens
+    are returned in streaming, so that close-quotes, close-brackets and
+    close-braces are not returned prematurely. The order of arguments IS
+    important - the new version of the partially-parsed JSON must be the first
+    argument, and the secnod argument must be from the previous generation.
+
+    What it returns, is tokens that should be streamed to the client.
+
+    e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}')
+        -> 'ple'
+
+    """
+    suffix = find_common_suffix(curr, old)
+
+    old = old[::-1].replace(suffix[::-1], '', 1)[::-1]
+    prefix = find_common_prefix(curr, old)
+    diff = curr
+    if len(suffix):
+        diff = diff[::-1].replace(suffix[::-1], '', 1)[::-1]
+
+    if len(prefix):
+        # replace the prefix only once in case it's mirrored
+        diff = diff.replace(prefix, '', 1)
+
+    return diff
+
+
+def find_all_indices(string: str, substring: str) -> list[int]:
+    """
+    Find all (starting) indices of a substring in a given string. Useful for
+    tool call extraction
+    """
+    indices = []
+    index = -1
+    while True:
+        index = string.find(substring, index + 1)
+        if index == -1:
+            break
+        indices.append(index)
+    return indices
+
+
+# partial_json_parser doesn't support extra data and
+# JSONDecoder.raw_decode doesn't support partial JSON
+def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
+    try:
+        return (partial_json_parser.loads(input_str, flags), len(input_str))
+    except JSONDecodeError as e:
+        if "Extra data" in e.msg:
+            dec = JSONDecoder()
+            return dec.raw_decode(input_str)
+        raise
+
+
+def is_complete_json(input_str: str) -> bool:
+    try:
+        json.loads(input_str)
+        return True
+    except JSONDecodeError:
+        return False
+
+
+def consume_space(i: int, s: str) -> int:
+    while i < len(s) and s[i].isspace():
+        i += 1
+    return i
--- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py
@@ -0,0 +1,466 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+import json
+from collections.abc import Sequence
+from typing import Any, Optional, Union
+
+import regex as re
+
+from vllm.entrypoints.chat_utils import random_tool_call_id
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)
+
+
+@ToolParserManager.register_module("xlam")
+class xLAMToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        # Initialize state for streaming mode
+        self.prev_tool_calls: list[dict] = []
+        self.current_tool_id = -1
+        self.current_tool_name_sent = False
+        self.streamed_args: list[str] = [
+        ]  # Track arguments sent for each tool
+
+        # For backward compatibility with tests
+        self.current_tools_sent: list[bool] = []
+
+        # For backward compatibility with serving code
+        self.prev_tool_call_arr = []
+
+        # Regex patterns for preprocessing
+        self.json_code_block_patterns = [
+            r"```(?:json)?\s*([\s\S]*?)```",
+            r"\[TOOL_CALLS\]([\s\S]*?)(?=\n|$)",
+            r"<tool_call>([\s\S]*?)</tool_call>",
+        ]
+        self.thinking_tag_pattern = r"</think>([\s\S]*)"
+
+        # Define streaming state type to be initialized later
+        self.streaming_state: dict[str, Any] = {
+            "current_tool_index": -1,
+            "tool_ids": [],
+            "sent_tools": [],
+        }
+
+    def preprocess_model_output(
+            self, model_output: str) -> tuple[Optional[str], Optional[str]]:
+        """
+        Preprocess the model output to extract content and potential tool calls.
+        Returns:
+            Tuple of (content, potential_tool_calls_json)
+        """
+        # Check for thinking tag
+        thinking_match = re.search(self.thinking_tag_pattern, model_output)
+        if thinking_match:
+            content = model_output[:thinking_match.start() +
+                                   len("</think>")].strip()
+            thinking_content = thinking_match.group(1).strip()
+
+            # Try to parse the thinking content as JSON
+            try:
+                json.loads(thinking_content)
+                return content, thinking_content
+            except json.JSONDecodeError:
+                # If can't parse as JSON, look for JSON code blocks
+                for json_pattern in self.json_code_block_patterns:
+                    json_matches = re.findall(json_pattern, thinking_content)
+                    if json_matches:
+                        for json_str in json_matches:
+                            try:
+                                json.loads(json_str)
+                                return content, json_str
+                            except json.JSONDecodeError:
+                                continue
+
+        # Check for JSON code blocks in the entire output
+        for json_pattern in self.json_code_block_patterns:
+            json_matches = re.findall(json_pattern, model_output)
+            if json_matches:
+                for json_str in json_matches:
+                    try:
+                        json.loads(json_str)
+                        # Extract content by removing the JSON code block
+                        content = re.sub(json_pattern, "",
+                                         model_output).strip()
+                        return content, json_str
+                    except json.JSONDecodeError:
+                        continue
+
+        # If the entire output is a valid JSON array or looks like one, treat it as tool calls
+        if model_output.strip().startswith("["):
+            try:
+                json.loads(model_output)
+                return None, model_output
+            except json.JSONDecodeError:
+                # Even if it's not valid JSON yet, it might be a tool call in progress
+                if ("{" in model_output and "name" in model_output
+                        and "arguments" in model_output):
+                    return None, model_output
+
+        # If no tool calls found, return the original output as content
+        return model_output, None
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract tool calls from a complete model output.
+        """
+        try:
+            # Preprocess the model output
+            content, potential_tool_calls = self.preprocess_model_output(
+                model_output)
+
+            if not potential_tool_calls:
+                return ExtractedToolCallInformation(tools_called=False,
+                                                    tool_calls=[],
+                                                    content=content)
+
+            # Parse the potential tool calls as JSON
+            tool_calls_data = json.loads(potential_tool_calls)
+
+            # Ensure it's an array
+            if not isinstance(tool_calls_data, list):
+                logger.debug("Tool calls data is not an array")
+                return ExtractedToolCallInformation(
+                    tools_called=False,
+                    tool_calls=[],
+                    content=content or model_output,
+                )
+
+            tool_calls: list[ToolCall] = []
+
+            for idx, call in enumerate(tool_calls_data):
+                if (not isinstance(call, dict) or "name" not in call
+                        or "arguments" not in call):
+                    logger.debug("Invalid tool call format at index %d", idx)
+                    continue
+
+                tool_call = ToolCall(
+                    id=f"call_{idx}_{random_uuid()}",
+                    type="function",
+                    function=FunctionCall(
+                        name=call["name"],
+                        arguments=(json.dumps(call["arguments"]) if isinstance(
+                            call["arguments"], dict) else call["arguments"]),
+                    ),
+                )
+                tool_calls.append(tool_call)
+
+            return ExtractedToolCallInformation(
+                tools_called=len(tool_calls) > 0,
+                tool_calls=tool_calls,
+                content=content,
+            )
+
+        except Exception as e:
+            logger.exception("Error extracting tool calls: %s", str(e))
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract tool calls for streaming mode.
+        """
+        # Simplify detection: if it begins with "[" treat it as a function call
+        is_function_call = (current_text.strip().startswith("["))
+
+        # If not a function call, return normal content
+        if not is_function_call:
+            return DeltaMessage(content=delta_text)
+
+        try:
+            # Initialize streaming state if not exists
+            if not hasattr(self, "streaming_state"):
+                self.streaming_state = {
+                    "current_tool_index": -1,
+                    "tool_ids": [],
+                    "sent_tools": [],  # Track complete state of each tool
+                }
+
+            # Try parsing as JSON to check for complete tool calls
+            try:
+                parsed_tools = json.loads(current_text)
+                if isinstance(parsed_tools, list):
+                    # Update our tool array for next time
+                    self.prev_tool_call_arr = parsed_tools
+            except json.JSONDecodeError:
+                # Not complete JSON yet, use regex for partial parsing
+                pass
+
+            # Check for test-specific state setup (current_tools_sent)
+            # This handles the case where tests manually set current_tools_sent
+            if (hasattr(self, "current_tools_sent")  # type: ignore
+                    and len(self.current_tools_sent) > 0):
+                # If current_tools_sent is set to [False], it means the test wants us to send the name
+                if (len(self.current_tools_sent) == 1
+                        and self.current_tools_sent[0] is False):
+                    # Extract the function name using regex
+                    name_pattern = r'"name"\s*:\s*"([^"]+)"'
+                    name_match = re.search(name_pattern, current_text)
+                    if name_match:
+                        function_name = name_match.group(1)
+
+                        # The test expects us to send just the name first
+                        tool_id = random_tool_call_id()
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(
+                                index=0,
+                                type="function",
+                                id=tool_id,
+                                function=DeltaFunctionCall(
+                                    name=function_name).model_dump(
+                                        exclude_none=True),  # type: ignore
+                            )
+                        ])
+                        # Update state to reflect that we've sent the name
+                        self.current_tools_sent = [True]
+                        self.current_tool_id = 0
+                        self.streaming_state["current_tool_index"] = 0
+                        if len(self.streaming_state["sent_tools"]) == 0:
+                            self.streaming_state["sent_tools"].append({
+                                "sent_name":
+                                True,
+                                "sent_arguments_prefix":
+                                False,
+                                "sent_arguments":
+                                "",
+                            })
+                        else:
+                            self.streaming_state["sent_tools"][0][
+                                "sent_name"] = True
+                        self.current_tool_name_sent = True
+                        return delta
+
+            # Use regex to identify tool calls in the output
+            name_pattern = r'"name"\s*:\s*"([^"]+)"'
+            name_matches = list(re.finditer(name_pattern, current_text))
+            tool_count = len(name_matches)
+
+            # If no tools found yet, return
+            if tool_count == 0:
+                return None
+
+            # Ensure our state arrays are large enough
+            while len(self.streaming_state["sent_tools"]) < tool_count:
+                self.streaming_state["sent_tools"].append({
+                    "sent_name":
+                    False,
+                    "sent_arguments_prefix":
+                    False,
+                    "sent_arguments":
+                    "",
+                })
+
+            while len(self.streaming_state["tool_ids"]) < tool_count:
+                self.streaming_state["tool_ids"].append(None)
+
+            # Determine if we need to move to a new tool
+            current_idx = self.streaming_state["current_tool_index"]
+
+            # If we haven't processed any tool yet or current tool is complete, move to next
+            if current_idx == -1 or current_idx < tool_count - 1:
+                next_idx = current_idx + 1
+
+                # If tool at next_idx has not been sent yet
+                if (next_idx < tool_count
+                        and not self.streaming_state["sent_tools"][next_idx]
+                    ["sent_name"]):
+                    # Update indexes
+                    self.streaming_state["current_tool_index"] = next_idx
+                    self.current_tool_id = (
+                        next_idx  # For backward compatibility
+                    )
+                    current_idx = next_idx
+
+                    # Extract the tool name
+                    tool_name = name_matches[current_idx].group(1)
+
+                    # Generate ID and send tool name
+                    tool_id = f"call_{current_idx}_{random_uuid()}"
+                    self.streaming_state["tool_ids"][current_idx] = tool_id
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(
+                            index=current_idx,
+                            type="function",
+                            id=tool_id,
+                            function=DeltaFunctionCall(
+                                name=tool_name).model_dump(
+                                    exclude_none=True),  # type: ignore
+                        )
+                    ])
+                    self.streaming_state["sent_tools"][current_idx][
+                        "sent_name"] = True
+                    self.current_tool_name_sent = (
+                        True  # For backward compatibility
+                    )
+
+                    # Keep track of streamed args for backward compatibility
+                    while len(self.streamed_args) <= current_idx:
+                        self.streamed_args.append("")
+
+                    return delta
+
+            # Process arguments for the current tool
+            if current_idx >= 0 and current_idx < tool_count:
+                # Support both regular and empty argument objects
+                # First, check for the empty arguments case: "arguments": {}
+                empty_args_pattern = (
+                    r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*\{\s*\}')
+                empty_args_match = re.search(empty_args_pattern, current_text)
+
+                # Check if this tool has empty arguments
+                if empty_args_match and empty_args_match.start() > 0:
+                    # Find which tool this empty arguments belongs to
+                    empty_args_tool_idx = 0
+                    for i in range(tool_count):
+                        if i == current_idx:
+                            # If this is our current tool and it has empty arguments
+                            if not self.streaming_state["sent_tools"][
+                                    current_idx]["sent_arguments_prefix"]:
+                                # Send empty object
+                                self.streaming_state["sent_tools"][
+                                    current_idx][
+                                        "sent_arguments_prefix"] = True
+                                self.streaming_state["sent_tools"][
+                                    current_idx]["sent_arguments"] = "{}"
+
+                                # Update streamed_args for backward compatibility
+                                while len(self.streamed_args) <= current_idx:
+                                    self.streamed_args.append("")
+                                self.streamed_args[current_idx] += "{}"
+
+                                delta = DeltaMessage(tool_calls=[
+                                    DeltaToolCall(
+                                        index=current_idx,
+                                        function=DeltaFunctionCall(
+                                            arguments="{}").
+                                        model_dump(
+                                            exclude_none=True),  # type: ignore
+                                    )
+                                ])
+
+                                # Move to next tool if available
+                                if current_idx < tool_count - 1:
+                                    self.streaming_state[
+                                        "current_tool_index"] += 1
+                                    self.current_tool_id = self.streaming_state[
+                                        "current_tool_index"]
+
+                                return delta
+
+                # Extract arguments for current tool using regex for non-empty arguments
+                args_pattern = r'"name"\s*:\s*"[^"]+"\s*,\s*"arguments"\s*:\s*(\{(?:[^{}]|(?:\{[^{}]*\}))*\})'
+                args_matches = list(re.finditer(args_pattern, current_text))
+
+                if current_idx < len(args_matches):
+                    args_text = args_matches[current_idx].group(1)
+
+                    # Handle transition between tools
+                    is_last_tool = current_idx == tool_count - 1
+
+                    # Find where the arguments for our current tool end
+                    if not is_last_tool:
+                        # If we have more tools after this one, try to find the complete argument block
+                        next_tool_pos = current_text.find(
+                            "},{", args_matches[current_idx].start())
+                        if next_tool_pos != -1:
+                            args_end_pos = (next_tool_pos + 1
+                                            )  # +1 to include the '}'
+                            args_text = (current_text[args_matches[current_idx]
+                                                      .start():args_end_pos].
+                                         split('"arguments":')[1].strip())
+
+                    # If arguments haven't been sent yet
+                    sent_args = self.streaming_state["sent_tools"][
+                        current_idx]["sent_arguments"]
+
+                    # If we haven't sent the opening bracket yet
+                    if not self.streaming_state["sent_tools"][current_idx][
+                            "sent_arguments_prefix"] and args_text.startswith(
+                                "{"):
+                        self.streaming_state["sent_tools"][current_idx][
+                            "sent_arguments_prefix"] = True
+                        self.streaming_state["sent_tools"][current_idx][
+                            "sent_arguments"] = "{"
+
+                        # Update streamed_args for backward compatibility
+                        while len(self.streamed_args) <= current_idx:
+                            self.streamed_args.append("")
+                        self.streamed_args[current_idx] += "{"
+
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(
+                                index=current_idx,
+                                function=DeltaFunctionCall(
+                                    arguments="{").model_dump(
+                                        exclude_none=True),  # type: ignore  
+                            )
+                        ])
+                        return delta
+
+                    # If we need to send more arguments
+                    if args_text.startswith(sent_args):
+                        # Calculate what part of arguments we need to send
+                        args_diff = args_text[len(sent_args):]
+
+                        if args_diff:
+                            # Update our state
+                            self.streaming_state["sent_tools"][current_idx][
+                                "sent_arguments"] = args_text
+
+                            # Update streamed_args for backward compatibility
+                            while len(self.streamed_args) <= current_idx:
+                                self.streamed_args.append("")
+                            self.streamed_args[current_idx] += args_diff
+
+                            delta = DeltaMessage(tool_calls=[
+                                DeltaToolCall(
+                                    index=current_idx,
+                                    function=DeltaFunctionCall(
+                                        arguments=args_diff).model_dump(
+                                            exclude_none=True),  # type: ignore
+                                )
+                            ])
+                            return delta
+
+                    # If the tool's arguments are complete, check if we need to move to the next tool
+                    if args_text.endswith("}") and args_text == sent_args:
+                        # This tool is complete, move to the next one in the next iteration
+                        if current_idx < tool_count - 1:
+                            self.streaming_state["current_tool_index"] += 1
+                            self.current_tool_id = self.streaming_state[
+                                "current_tool_index"]  # For compatibility
+
+            # If we got here, we couldn't determine what to stream next
+            return None
+
+        except Exception as e:
+            logger.exception(f"Error in streaming tool calls: {e}")
+            # If we encounter an error, just return the delta text as regular content
+            return DeltaMessage(content=delta_text)