Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
from .utils import ARGS, CONFIGS, ServerConfig
# CLI flags that narrow down which model configs the suite exercises.
def pytest_addoption(parser):
    """Register the ``--models`` and ``--extended`` command line options."""
    option_specs = [
        (
            ("--models",),
            {"nargs": "+", "help": "Specify one or more models to test"},
        ),
        (
            ("--extended",),
            {
                "action": "store_true",
                "default": False,
                "help": "invoke extended tests requiring large GPUs",
            },
        ),
    ]
    for flags, kwargs in option_specs:
        parser.addoption(*flags, **kwargs)
# Session-scoped fixture, parametrized once per entry in CONFIGS. Downloads
# the model ahead of time and yields the matching server config.
@pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request):
    """Yield the server config selected by ``request.param``.

    Honors the ``--models`` / ``--extended`` CLI options and skips configs
    that cannot run on the current platform.
    """
    run_extended = request.config.getoption("--extended")
    selected_models = request.config.getoption("--models")

    def _is_selected(key):
        # respect an explicit --models allow-list when one was given
        if selected_models is not None and key not in selected_models:
            return False
        # "extended" configs only run when --extended was passed
        return run_extended or not CONFIGS[key].get("extended", False)

    config_key = request.param
    if not _is_selected(config_key):
        pytest.skip(f"Skipping config '{config_key}'")

    config = CONFIGS[config_key]

    if current_platform.is_rocm() and not config.get("supports_rocm", True):
        pytest.skip(
            "The {} model can't be tested on the ROCm platform".format(config["model"])
        )

    # pre-fetch model weights and tokenizer so server startup doesn't time out
    snapshot_download(config["model"])
    yield config
# Spin up one OpenAI-compatible vLLM server per server_config parametrization.
@pytest.fixture(scope="session")
def server(request, server_config: ServerConfig):
    """Yield a running RemoteOpenAIServer built from ``server_config``."""
    launch_args = ARGS + server_config["arguments"]
    with RemoteOpenAIServer(
        server_config["model"], launch_args, max_wait_seconds=480
    ) as remote_server:
        yield remote_server
# Async OpenAI client wired to the running test server; function-scoped so
# each test gets a fresh client session.
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    async with server.get_async_client() as async_client:
        yield async_client

View File

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
from .utils import ARGS, CONFIGS, ServerConfig
# Package-scoped fixture, parametrized once per entry in CONFIGS. Downloads
# the model up front and yields the corresponding config.
@pytest.fixture(scope="package", params=CONFIGS.keys())
def server_config(request):
    """Yield the server config selected by ``request.param``."""
    selected = CONFIGS[request.param]
    rocm_unsupported = current_platform.is_rocm() and not selected.get(
        "supports_rocm", True
    )
    if rocm_unsupported:
        pytest.skip(
            "The {} model can't be tested on the ROCm platform".format(
                selected["model"]
            )
        )
    # pre-fetch weights and tokenizer so the server fixture starts quickly
    snapshot_download(selected["model"])
    yield selected
# Launch one OpenAI-compatible vLLM server per server_config parametrization.
@pytest.fixture(scope="package")
def server(request, server_config: ServerConfig):
    """Yield a running RemoteOpenAIServer built from ``server_config``."""
    launch_args = ARGS + server_config["arguments"]
    with RemoteOpenAIServer(
        server_config["model"], launch_args, max_wait_seconds=480
    ) as remote_server:
        yield remote_server
# Async OpenAI client wired to the running test server; function-scoped so
# each test gets a fresh client session.
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    async with server.get_async_client() as async_client:
        yield async_client

View File

@@ -0,0 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
from tests.tool_use.utils import MESSAGES_ASKING_FOR_TOOLS, WEATHER_TOOL
# test: a tool_choice with mistral-tokenizer results in an ID of length 9
@pytest.mark.asyncio
async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
    """Force a tool call via a named ``tool_choice`` and verify the mistral
    tokenizer produces a 9-character tool-call ID."""
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_TOOLS,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL],
        tool_choice=WEATHER_TOOL,
        logprobs=False,
    )

    choice = chat_completion.choices[0]

    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
    assert choice.message.role == "assistant"
    # a named tool_choice must produce exactly one tool call. The previous
    # assertion permitted tool_calls to be None, which made the indexing
    # below raise TypeError instead of failing as a clean assertion.
    assert choice.message.tool_calls is not None
    assert len(choice.message.tool_calls) == 1
    assert len(choice.message.tool_calls[0].id) == 9  # length of 9 for mistral

View File

@@ -0,0 +1,32 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing_extensions import TypedDict
class ServerConfig(TypedDict, total=False):
    """Launch/test configuration for one model server (all keys optional)."""

    model: str  # HF model id to serve
    arguments: list[str]  # extra CLI args appended to the base ARGS
    system_prompt: str | None  # optional system prompt prepended to chats
    supports_parallel: bool | None  # False if parallel tool calls unsupported
    supports_rocm: bool | None  # False if the model cannot run on ROCm
# Base CLI arguments shared by every server configuration.
ARGS: list[str] = ["--max-model-len", "1024"]

# Model-specific server configurations, keyed by a short config name.
CONFIGS: dict[str, ServerConfig] = {
    "mistral": {
        "model": "mistralai/Mistral-7B-Instruct-v0.3",
        "arguments": [
            "--tokenizer-mode",
            "mistral",
            '--ignore-patterns="consolidated.safetensors"',
        ],
        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
        " that you have would be helpful to answer a user query, "
        "call the tool. Otherwise, answer the user's query directly "
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally.",
    },
}

View File

@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
def test_chat_completion_request_with_no_tools():
    """``tool_choice`` must default to "none" whenever no usable tools are
    supplied: tools key absent, explicitly None, or an empty list."""
    base_payload = {
        "messages": [{"role": "user", "content": "Hello"}],
        "model": "facebook/opt-125m",
    }
    # tools key not present / tools key is None / tools key present but empty
    for extra in ({}, {"tools": None}, {"tools": []}):
        request = ChatCompletionRequest.model_validate({**base_payload, **extra})
        assert request.tool_choice == "none"
@pytest.mark.parametrize("tool_choice", ["auto", "required"])
def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
    """An explicit ``tool_choice`` without usable tools must be rejected,
    whether the tools key is missing entirely or explicitly None."""
    base_payload = {
        "messages": [{"role": "user", "content": "Hello"}],
        "model": "facebook/opt-125m",
        "tool_choice": tool_choice,
    }
    # first without a tools key at all, then with tools set to None
    for extra in ({}, {"tools": None}):
        with pytest.raises(
            ValueError, match="When using `tool_choice`, `tools` must be set."
        ):
            ChatCompletionRequest.model_validate({**base_payload, **extra})

View File

@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
from .utils import (
MESSAGES_WITHOUT_TOOLS,
WEATHER_TOOL,
ServerConfig,
ensure_system_prompt,
)
# test: make sure chat completions without tools provided work even when tools
# are enabled. This makes sure tool call chat templates work, AND that the tool
# parser stream processing doesn't change the output of the model.
@pytest.mark.asyncio
async def test_chat_completion_without_tools(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """Chat completion with no tools: expect plain text, no tool calls, and
    the streamed output must reassemble to the non-streamed output."""
    models = await client.models.list()
    model_name: str = models.data[0].id
    # non-streaming request first; its output is the reference for streaming
    chat_completion = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
    )
    choice = chat_completion.choices[0]
    stop_reason = chat_completion.choices[0].finish_reason
    output_text = chat_completion.choices[0].message.content

    # check to make sure we got text
    assert output_text is not None
    assert len(output_text) > 0
    assert stop_reason != "tool_calls"

    # check to make sure no tool calls were returned
    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0

    # make the same request, streaming
    stream = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    # assemble streamed chunks
    async for chunk in stream:
        delta = chunk.choices[0].delta

        # make sure the role is assistant and streamed exactly once
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
            # streamed finish reason must agree with the non-streamed one
            assert chunk.choices[0].finish_reason == choice.finish_reason

        # make sure tool call chunks aren't being streamed
        assert not delta.tool_calls or len(delta.tool_calls) == 0

    # make sure the role was sent, only 1 finish reason was sent, that chunks
    # were in fact sent, and that the chunks match non-streaming
    assert role_sent
    assert finish_reason_count == 1
    assert len(chunks)
    assert "".join(chunks) == output_text
# test: conversation with tools enabled and provided that should not invoke
# tools, to make sure we can still get normal chat completion responses
# and that they won't be parsed as tools
@pytest.mark.asyncio
async def test_chat_completion_with_tools(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """Tools are offered but irrelevant: the reply must be plain text with no
    tool calls, and the streamed output must match the non-streamed output."""
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        tools=[WEATHER_TOOL],
        logprobs=False,
    )
    choice = chat_completion.choices[0]
    stop_reason = chat_completion.choices[0].finish_reason
    output_text = chat_completion.choices[0].message.content

    # check to make sure we got text
    assert output_text is not None
    assert stop_reason != "tool_calls"
    assert len(output_text) > 0

    # check to make sure no tool calls were returned
    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0

    # make the same request, streaming
    stream = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
        tools=[WEATHER_TOOL],
        stream=True,
    )

    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    # assemble streamed chunks
    async for chunk in stream:
        delta = chunk.choices[0].delta

        # make sure the role is "assistant" and is streamed exactly once;
        # the sibling no-tools test already enforced single-shot role
        # streaming, so this test now does too for consistency
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1

        # make sure tool call chunks aren't being streamed
        assert not delta.tool_calls or len(delta.tool_calls) == 0

    # make sure the role was sent, only 1 finish reason was sent, that chunks
    # were in fact sent, and that the chunks match non-streaming
    assert role_sent
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert chunk.choices[0].finish_reason != "tool_calls"
    assert len(chunks)
    assert "".join(chunks) == output_text

View File

@@ -0,0 +1,271 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
from .utils import (
MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
SEARCH_TOOL,
WEATHER_TOOL,
ServerConfig,
)
# test: getting the model to generate parallel tool calls (streaming/not)
# when requested. NOTE that not all models may support this, so some exclusions
# may be added in the future. e.g. llama 3.1 models are not designed to support
# parallel tool calls.
@pytest.mark.asyncio
async def test_parallel_tool_calls(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """Request two weather lookups; expect two parsable tool calls, and the
    streamed tool-call deltas must reassemble to the non-streamed calls."""
    if not server_config.get("supports_parallel", True):
        pytest.skip(
            "The {} model doesn't support parallel tool calls".format(
                server_config["model"]
            )
        )
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
        temperature=0,
        max_completion_tokens=200,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
    )

    choice = chat_completion.choices[0]
    stop_reason = chat_completion.choices[0].finish_reason
    non_streamed_tool_calls = chat_completion.choices[0].message.tool_calls

    # make sure 2 tool calls are present
    assert choice.message.role == "assistant"
    assert non_streamed_tool_calls is not None
    assert len(non_streamed_tool_calls) == 2

    for tool_call in non_streamed_tool_calls:
        # make sure the tool includes a function and ID
        assert tool_call.type == "function"
        assert tool_call.function is not None
        assert isinstance(tool_call.id, str)
        assert len(tool_call.id) >= 9

        # make sure the weather tool was called correctly
        assert tool_call.function.name == WEATHER_TOOL["function"]["name"]
        assert isinstance(tool_call.function.arguments, str)

        parsed_arguments = json.loads(tool_call.function.arguments)
        assert isinstance(parsed_arguments, dict)
        assert isinstance(parsed_arguments.get("city"), str)
        assert isinstance(parsed_arguments.get("state"), str)

    assert stop_reason == "tool_calls"

    # make the same request, streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
        temperature=0,
        max_completion_tokens=200,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
        stream=True,
    )

    # accumulators used to reassemble the streamed tool calls
    role_name: str | None = None
    finish_reason_count: int = 0
    tool_call_names: list[str] = []
    tool_call_args: list[str] = []
    tool_call_idx: int = -1
    tool_call_id_count: int = 0

    async for chunk in stream:
        # if there's a finish reason make sure it's tools
        if chunk.choices[0].finish_reason:
            finish_reason_count += 1
            assert chunk.choices[0].finish_reason == "tool_calls"

        # if a role is being streamed make sure it wasn't already set to
        # something else
        if chunk.choices[0].delta.role:
            assert not role_name or role_name == "assistant"
            role_name = "assistant"

        # if a tool call is streamed make sure there's exactly one
        # (based on the request parameters
        streamed_tool_calls = chunk.choices[0].delta.tool_calls

        if streamed_tool_calls and len(streamed_tool_calls) > 0:
            # make sure only one diff is present - correct even for parallel
            assert len(streamed_tool_calls) == 1
            tool_call = streamed_tool_calls[0]

            # if a new tool is being called, set up empty arguments
            if tool_call.index != tool_call_idx:
                tool_call_idx = tool_call.index
                tool_call_args.append("")

            # if a tool call ID is streamed, make sure one hasn't been already
            if tool_call.id:
                tool_call_id_count += 1
                assert isinstance(tool_call.id, str) and (len(tool_call.id) >= 9)

            # if parts of the function start being streamed
            if tool_call.function:
                # if the function name is defined, set it. it should be streamed
                # IN ENTIRETY, exactly one time.
                if tool_call.function.name:
                    assert isinstance(tool_call.function.name, str)
                    tool_call_names.append(tool_call.function.name)

                if tool_call.function.arguments:
                    # make sure they're a string and then add them to the list
                    assert isinstance(tool_call.function.arguments, str)
                    tool_call_args[tool_call.index] += tool_call.function.arguments

    assert finish_reason_count == 1
    assert role_name == "assistant"

    assert len(non_streamed_tool_calls) == len(tool_call_names) == len(tool_call_args)

    for i in range(2):
        assert non_streamed_tool_calls[i].function.name == tool_call_names[i]
        streamed_args = json.loads(tool_call_args[i])
        non_streamed_args = json.loads(non_streamed_tool_calls[i].function.arguments)
        # compare as dicts, not strings, to ignore formatting differences
        assert streamed_args == non_streamed_args
# test: providing parallel tool calls back to the model to get a response
# (streaming/not)
@pytest.mark.asyncio
async def test_parallel_tool_calls_with_results(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """Feed two tool results back in; expect a plain-text answer reflecting
    both, with the streamed output matching the non-streamed output."""
    if not server_config.get("supports_parallel", True):
        pytest.skip(
            "The {} model doesn't support parallel tool calls".format(
                server_config["model"]
            )
        )
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
        temperature=0,
        max_completion_tokens=200,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
    )

    choice = chat_completion.choices[0]

    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
    assert choice.message.role == "assistant"
    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0
    assert choice.message.content is not None
    assert "98" in choice.message.content  # Dallas temp in tool response
    assert "78" in choice.message.content  # Orlando temp in tool response

    # repeat the request with streaming enabled
    stream = await client.chat.completions.create(
        messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE,
        temperature=0,
        max_completion_tokens=200,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
        stream=True,
    )

    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    async for chunk in stream:
        delta = chunk.choices[0].delta

        # role must be streamed exactly once, and be "assistant"
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
            # streamed finish reason must match the non-streamed one
            assert chunk.choices[0].finish_reason == choice.finish_reason

        # no tool-call deltas should appear in this conversation
        assert not delta.tool_calls or len(delta.tool_calls) == 0

    assert role_sent
    assert finish_reason_count == 1
    assert len(chunks)
    assert "".join(chunks) == choice.message.content
@pytest.mark.asyncio
async def test_parallel_tool_calls_false(client: openai.AsyncOpenAI):
    """
    Ensure only one tool call is returned when parallel_tool_calls is False.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
        temperature=0,
        max_completion_tokens=200,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
        parallel_tool_calls=False,
    )

    stop_reason = chat_completion.choices[0].finish_reason
    non_streamed_tool_calls = chat_completion.choices[0].message.tool_calls

    # make sure only 1 tool call is present; assert not-None first so a
    # missing tool_calls list fails as an assertion, not a TypeError
    assert non_streamed_tool_calls is not None
    assert len(non_streamed_tool_calls) == 1
    assert stop_reason == "tool_calls"

    # make the same request, streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS,
        temperature=0,
        max_completion_tokens=200,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
        parallel_tool_calls=False,
        stream=True,
    )

    finish_reason_count: int = 0
    tool_call_id_count: int = 0

    async for chunk in stream:
        # if there's a finish reason make sure it's tools
        if chunk.choices[0].finish_reason:
            finish_reason_count += 1
            assert chunk.choices[0].finish_reason == "tool_calls"

        streamed_tool_calls = chunk.choices[0].delta.tool_calls
        if streamed_tool_calls and len(streamed_tool_calls) > 0:
            tool_call = streamed_tool_calls[0]
            if tool_call.id:
                tool_call_id_count += 1

    # make sure only 1 streaming tool call is present
    assert tool_call_id_count == 1
    assert finish_reason_count == 1

View File

@@ -0,0 +1,201 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
from .utils import (
MESSAGES_ASKING_FOR_TOOLS,
MESSAGES_WITH_TOOL_RESPONSE,
SEARCH_TOOL,
WEATHER_TOOL,
)
# test: request a chat completion that should return tool calls, so we know they
# are parsable
@pytest.mark.asyncio
async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
    """Ask a question that requires the weather tool; validate the parsed
    tool call, then validate the streamed deltas reassemble to the same call."""
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_TOOLS,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
    )

    choice = chat_completion.choices[0]
    stop_reason = chat_completion.choices[0].finish_reason
    tool_calls = chat_completion.choices[0].message.tool_calls

    # make sure a tool call is present
    assert choice.message.role == "assistant"
    assert tool_calls is not None
    assert len(tool_calls) == 1
    assert tool_calls[0].type == "function"
    assert tool_calls[0].function is not None
    assert isinstance(tool_calls[0].id, str)
    assert len(tool_calls[0].id) >= 9

    # make sure the weather tool was called (classic example) with arguments
    assert tool_calls[0].function.name == WEATHER_TOOL["function"]["name"]
    assert tool_calls[0].function.arguments is not None
    assert isinstance(tool_calls[0].function.arguments, str)

    # make sure the arguments parse properly
    parsed_arguments = json.loads(tool_calls[0].function.arguments)
    assert isinstance(parsed_arguments, dict)
    assert isinstance(parsed_arguments.get("city"), str)
    assert isinstance(parsed_arguments.get("state"), str)
    assert parsed_arguments.get("city") == "Dallas"
    assert parsed_arguments.get("state") == "TX"

    assert stop_reason == "tool_calls"

    # accumulators used to reassemble the streamed tool call
    function_name: str | None = None
    function_args_str: str = ""
    tool_call_id: str | None = None
    role_name: str | None = None
    finish_reason_count: int = 0

    # make the same request, streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=MESSAGES_ASKING_FOR_TOOLS,
        temperature=0,
        max_completion_tokens=100,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
        stream=True,
    )

    async for chunk in stream:
        assert chunk.choices[0].index == 0

        if chunk.choices[0].finish_reason:
            finish_reason_count += 1
            assert chunk.choices[0].finish_reason == "tool_calls"

        # if a role is being streamed make sure it wasn't already set to
        # something else
        if chunk.choices[0].delta.role:
            assert not role_name or role_name == "assistant"
            role_name = "assistant"

        # if a tool call is streamed make sure there's exactly one
        # (based on the request parameters
        streamed_tool_calls = chunk.choices[0].delta.tool_calls

        if streamed_tool_calls and len(streamed_tool_calls) > 0:
            assert len(streamed_tool_calls) == 1
            tool_call = streamed_tool_calls[0]

            # if a tool call ID is streamed, make sure one hasn't been already
            if tool_call.id:
                assert not tool_call_id
                tool_call_id = tool_call.id

            # if parts of the function start being streamed
            if tool_call.function:
                # if the function name is defined, set it. it should be streamed
                # IN ENTIRETY, exactly one time.
                if tool_call.function.name:
                    assert function_name is None
                    assert isinstance(tool_call.function.name, str)
                    function_name = tool_call.function.name
                if tool_call.function.arguments:
                    assert isinstance(tool_call.function.arguments, str)
                    function_args_str += tool_call.function.arguments

    assert finish_reason_count == 1
    assert role_name == "assistant"
    assert isinstance(tool_call_id, str) and (len(tool_call_id) >= 9)

    # validate the name and arguments
    assert function_name == WEATHER_TOOL["function"]["name"]
    assert function_name == tool_calls[0].function.name
    assert isinstance(function_args_str, str)

    # validate arguments
    streamed_args = json.loads(function_args_str)
    assert isinstance(streamed_args, dict)
    assert isinstance(streamed_args.get("city"), str)
    assert isinstance(streamed_args.get("state"), str)
    assert streamed_args.get("city") == "Dallas"
    assert streamed_args.get("state") == "TX"

    # make sure everything matches non-streaming except for ID
    assert function_name == tool_calls[0].function.name
    assert choice.message.role == role_name
    assert choice.message.tool_calls[0].function.name == function_name

    # compare streamed with non-streamed args dict-wise, not string-wise
    # because character-to-character comparison might not work e.g. the tool
    # call parser adding extra spaces or something like that. we care about the
    # dicts matching not byte-wise match
    assert parsed_arguments == streamed_args
# test: providing tools and results back to model to get a non-tool response
# (streaming/not)
@pytest.mark.asyncio
async def test_tool_call_with_results(client: openai.AsyncOpenAI):
    """Feed a tool result back in; expect a plain-text answer reflecting it,
    with the streamed output matching the non-streamed output."""
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_WITH_TOOL_RESPONSE,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
    )

    choice = chat_completion.choices[0]

    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
    assert choice.message.role == "assistant"
    assert choice.message.tool_calls is None or len(choice.message.tool_calls) == 0
    assert choice.message.content is not None
    assert "98" in choice.message.content  # the temperature from the response

    # repeat the request with streaming enabled
    stream = await client.chat.completions.create(
        messages=MESSAGES_WITH_TOOL_RESPONSE,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL, SEARCH_TOOL],
        logprobs=False,
        stream=True,
    )

    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    async for chunk in stream:
        delta = chunk.choices[0].delta

        # role must be streamed exactly once, and be "assistant"
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
            # streamed finish reason must match the non-streamed one
            assert chunk.choices[0].finish_reason == choice.finish_reason

        # no tool-call deltas should appear in this conversation
        assert not delta.tool_calls or len(delta.tool_calls) == 0

    assert role_sent
    assert finish_reason_count == 1
    assert len(chunks)
    assert "".join(chunks) == choice.message.content

View File

@@ -0,0 +1,330 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from copy import deepcopy
from unittest.mock import MagicMock
import pytest
import regex as re
from pydantic import TypeAdapter
from vllm.entrypoints.openai.protocol import (
ChatCompletionToolsParam,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.tool_parsers.utils import get_json_schema_from_tools
# mark every test in this module with the cpu_test marker
pytestmark = pytest.mark.cpu_test

# Two strict function tools used to build the required-tool-choice JSON
# schema under test: one with a single required parameter, one with two.
EXAMPLE_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "The city to find the weather for"
                        ", e.g. 'San Francisco'",
                    },
                },
                "required": ["city"],
                "additionalProperties": False,
            },
        },
        "strict": True,
    },
    {
        "type": "function",
        "function": {
            "name": "get_forecast",
            "description": "Get the weather forecast for a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "The city to get the forecast for, e.g. "
                        "'New York'",
                    },
                    "days": {
                        "type": "integer",
                        "description": "Number of days to get the forecast for (1-7)",
                    },
                },
                "required": ["city", "days"],
                "additionalProperties": False,
            },
        },
        "strict": True,
    },
]
def _compile_and_check(
    tools: list[ChatCompletionToolsParam], sample_output, should_match: bool
):
    """Build the required-tool-choice JSON schema for ``tools``, compile it
    into a regex, and assert whether ``sample_output`` fully matches it.

    Args:
        tools: validated tool definitions the schema is derived from.
        sample_output: candidate tool-call payload; serialized to JSON and
            matched in full against the schema-derived regex.
        should_match: expected result of the full match.
    """
    schema = get_json_schema_from_tools(tools=tools, tool_choice="required")
    assert isinstance(schema, dict)

    # use build_regex_from_schema used in JSONLogitsProcessor to create Guide;
    # imported lazily because outlines_core is an optional dependency
    from outlines_core.json_schema import build_regex_from_schema

    regex = build_regex_from_schema(json.dumps(schema))
    compiled = re.compile(regex)
    matches = compiled.fullmatch(json.dumps(sample_output)) is not None
    assert matches == should_match
# (sample_output, should_match) pairs the generated schema must ACCEPT:
# a single call, repeated calls to one tool, and mixed parallel calls.
VALID_TOOL_OUTPUTS = [
    ([{"name": "get_current_weather", "parameters": {"city": "Vienna"}}], True),
    (
        [
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
            {"name": "get_current_weather", "parameters": {"city": "Berlin"}},
        ],
        True,
    ),
    ([{"name": "get_forecast", "parameters": {"city": "Vienna", "days": 7}}], True),
    (
        [
            {"name": "get_forecast", "parameters": {"city": "Vienna", "days": 7}},
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
        ],
        True,
    ),
    (
        [
            {"name": "get_forecast", "parameters": {"city": "Vienna", "days": 7}},
            {"name": "get_current_weather", "parameters": {"city": "Vienna"}},
            {"name": "get_forecast", "parameters": {"city": "Berlin", "days": 7}},
            {"name": "get_current_weather", "parameters": {"city": "Berlin"}},
        ],
        True,
    ),
]

# just the accepted payloads, reused to drive the streaming test below
VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
@pytest.mark.parametrize(
    "sample_output, should_match",
    VALID_TOOL_OUTPUTS
    + [
        (None, False),
        ([], False),  # empty list cannot be generated
        ({}, False),  # empty object cannot be generated
        ([{}], False),  # list with empty object cannot be generated
        (
            [
                {  # function without required parameters cannot be generated
                    "name": "get_current_weather"
                }
            ],
            False,
        ),
        (
            [
                {  # function without required parameters cannot be generated
                    "name": "get_current_weather",
                    "parameters": {},
                }
            ],
            False,
        ),
        (
            [
                {  # function without required parameters cannot be generated
                    "name": "get_current_weather",
                    "parameters": None,
                }
            ],
            False,
        ),
        (
            {  # tool call without lists cannot be generated
                "name": "get_current_weather",
                "parameters": {"city": "Vienna"},
            },
            False,
        ),
        (
            [
                {  # tool call with extra parameters cannot be generated
                    "name": "get_current_weather",
                    "parameters": {"city": "Vienna", "extra": "value"},
                }
            ],
            False,
        ),
        (
            [
                {  # tool call where parameters are first cannot be generated
                    "parameters": {"city": "Vienna"},
                    "name": "get_current_weather",
                }
            ],
            False,
        ),
        (
            [
                {  # tool call without all required parameters cannot be generated
                    "name": "get_forecast",
                    "parameters": {"city": "Vienna"},
                }
            ],
            False,
        ),
        (  # tool call with incorrect name/parameters cannot be generated
            [{"name": "get_weather", "parameters": {"city": "Vienna", "days": 7}}],
            False,
        ),
        (  # tool call with both valid and empty function cannot be generated
            [{"name": "get_current_weather", "parameters": {"city": "Vienna"}}, {}],
            False,
        ),
    ],
)
def test_structured_outputs_json(sample_output, should_match):
    """The schema built from EXAMPLE_TOOLS must accept exactly the valid
    tool-call payloads and reject every malformed variant."""
    _compile_and_check(
        tools=TypeAdapter(list[ChatCompletionToolsParam]).validate_python(
            EXAMPLE_TOOLS
        ),
        sample_output=sample_output,
        should_match=should_match,
    )
def update_parameters_none(tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
    """Clear the tool's function parameters (set to ``None``), in place."""
    setattr(tool.function, "parameters", None)
    return tool
def update_parameters_empty_dict(
    tool: ChatCompletionToolsParam,
) -> ChatCompletionToolsParam:
    """Replace the tool's function parameters with an empty dict, in place."""
    tool.function.parameters = dict()
    return tool
@pytest.mark.parametrize(
    "sample_output, should_match",
    [
        (None, False),
        ([], False),  # empty list cannot be generated
        ({}, False),  # empty object cannot be generated
        ([{}], False),  # list with empty object cannot be generated
        (
            [
                {  # function without required parameters cannot be generated
                    "name": "get_current_weather"
                }
            ],
            False,
        ),
        (
            [
                {  # function without required parameters cannot be generated
                    "name": "get_current_weather",
                    "parameters": None,
                }
            ],
            False,
        ),
        (
            [
                {  # function with extra parameters cannot be generated
                    "name": "get_current_weather",
                    "parameters": {"extra": "value"},
                }
            ],
            False,
        ),
        (
            [
                {  # only function with empty parameters object is valid
                    "name": "get_current_weather",
                    "parameters": {},
                }
            ],
            True,
        ),
    ],
)
@pytest.mark.parametrize(
    "update_parameters", [update_parameters_none, update_parameters_empty_dict]
)
def test_structured_outputs_json_without_parameters(
    sample_output, should_match, update_parameters
):
    """A tool whose parameters are None or {} must only accept a call with an
    empty parameters object, regardless of which normalization produced it."""
    updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
    tools = TypeAdapter(list[ChatCompletionToolsParam]).validate_python(updated_tools)
    tools = list(map(update_parameters, tools))
    # sanity check: every tool really has empty/None parameters after update
    assert all(
        [
            tool.function.parameters is None or tool.function.parameters == {}
            for tool in tools
        ]
    )
    _compile_and_check(
        tools=tools, sample_output=sample_output, should_match=should_match
    )
@pytest.mark.parametrize("output", VALID_TOOLS)
@pytest.mark.parametrize("empty_params", [False, True])
@pytest.mark.parametrize("delta_len", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
def test_streaming_output_valid(output, empty_params, delta_len):
    """Feed a valid tool-call JSON to the required-streaming extractor in
    chunks of ``delta_len`` characters and check the deltas reassemble
    losslessly into the original document."""
    self = MagicMock()  # stands in for the OpenAIServingChat instance

    output = deepcopy(output)
    if empty_params:
        output = [{"name": o["name"], "parameters": {}} for o in output]
    output_json = json.dumps(output)

    previous_text = ""
    function_name_returned = False
    messages = []
    # stream the serialized output delta_len characters at a time
    for i in range(0, len(output_json), delta_len):
        delta_text = output_json[i : i + delta_len]
        current_text = previous_text + delta_text

        delta_message, function_name_returned = (
            OpenAIServingChat.extract_tool_call_required_streaming(
                self,
                previous_text=previous_text,
                current_text=current_text,
                delta_text=delta_text,
                function_name_returned=function_name_returned,
            )
        )

        if delta_message:
            messages.append(delta_message)

        previous_text = current_text

    assert len(messages) > 0

    # rebuild the JSON document from the streamed deltas: a name delta opens
    # a new object, argument-only deltas append to the current one
    combined_messages = "["
    for message in messages:
        if message.tool_calls[0].function.name:
            if len(combined_messages) > 1:
                combined_messages += "},"

            combined_messages += (
                '{"name": "'
                + message.tool_calls[0].function.name
                + '", "parameters": '
                + message.tool_calls[0].function.arguments
            )
        else:
            combined_messages += message.tool_calls[0].function.arguments
    combined_messages += "}]"

    # the reassembled payload must match both semantically and byte-wise
    assert json.loads(combined_messages) == output
    assert json.dumps(json.loads(combined_messages)) == output_json

375
tests/tool_use/utils.py Normal file
View File

@@ -0,0 +1,375 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from copy import deepcopy
from typing import Any
from openai.types.chat import ChatCompletionMessageParam, ChatCompletionToolParam
from typing_extensions import TypedDict
from tests.utils import VLLM_PATH
class ServerConfig(TypedDict, total=False):
    """Per-model configuration for a tool-use test server (all keys optional)."""

    # HF model id to serve.
    model: str
    # Extra CLI arguments appended to the universal ARGS list.
    arguments: list[str]
    # Optional system prompt injected into test conversations.
    system_prompt: str | None
    # Whether the model supports parallel tool calls.
    supports_parallel: bool | None
    # Whether the model can be tested on ROCm (treated as True when absent).
    supports_rocm: bool | None
    extended: bool | None  # tests do not run in CI automatically
def patch_system_prompt(
    messages: list[dict[str, Any]], system_prompt: str
) -> list[dict[str, Any]]:
    """Return a deep copy of *messages* whose first entry is a system message
    carrying *system_prompt* — overwriting an existing leading system message,
    otherwise prepending a new one. The input list is left untouched."""
    patched = deepcopy(messages)
    first = patched[0]
    if first["role"] == "system":
        first["content"] = system_prompt
    else:
        patched.insert(0, {"role": "system", "content": system_prompt})
    return patched
def ensure_system_prompt(
    messages: list[dict[str, Any]], config: ServerConfig
) -> list[dict[str, Any]]:
    """Apply the config's system prompt to *messages* when one is set;
    otherwise pass the messages through unchanged."""
    if system_prompt := config.get("system_prompt"):
        return patch_system_prompt(messages, system_prompt)
    return messages
# universal args for all models go here. also good if you need to test locally
# and change type or KV cache quantization or something.
ARGS: list[str] = [
    # every config in this file exercises automatic tool choice
    "--enable-auto-tool-choice",
    "--max-model-len",
    "1024",
    "--max-num-seqs",
    "256",
]
# Server configurations keyed by a short model nickname; the conftest fixtures
# select entries via the --models / --extended pytest options. Entries with
# "extended": True are skipped unless pytest is invoked with --extended.
CONFIGS: dict[str, ServerConfig] = {
    "hermes": {
        "model": "NousResearch/Hermes-3-Llama-3.1-8B",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "hermes",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja"),
        ],
        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
        " that you have would be helpful to answer a user query, "
        "call the tool. Otherwise, answer the user's query directly "
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally.",
    },
    "llama": {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "llama3_json",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja"),
        ],
        "supports_parallel": False,
    },
    "llama3.2": {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "llama3_json",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja"),
        ],
        "supports_parallel": False,
    },
    # Extended: uses tensor parallelism across 4 devices (-tp 4).
    "llama4": {
        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "llama4_pythonic",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_llama4_pythonic.jinja"),
            "-tp",
            "4",
        ],
        "supports_parallel": False,
        "extended": True,
    },
    # Extended: same model as "llama4" but with the JSON tool-call parser.
    "llama4_json": {
        "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "-tp",
            "4",
            "--distributed-executor-backend",
            "mp",
            "--tool-call-parser",
            "llama4_json",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_llama4_json.jinja"),
        ],
        "supports_parallel": True,
        "extended": True,
    },
    "mistral-7b": {
        "model": "mistralai/Mistral-7B-Instruct-v0.3",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tokenizer_mode",
            "hf",
            "--load_format",
            "hf",
            "--config_format",
            "hf",
            "--tool-call-parser",
            "mistral",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
            '--ignore-patterns="consolidated.safetensors"',
        ],
        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
        " that you have would be helpful to answer a user query, "
        "call the tool. Otherwise, answer the user's query directly "
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally.",
        "supports_parallel": True,
    },
    # Extended: loads native mistral-format weights with -tp 4.
    "mistral-small-3.2": {
        "model": "mistralai/Mistral-Small-3.2-24B-Instruct-2506",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "mistral",
            "--tokenizer-mode",
            "mistral",
            "--config-format",
            "mistral",
            "--load-format",
            "mistral",
            "--tensor-parallel-size",
            "4",
            '--ignore-patterns="consolidated.safetensors"',
        ],
        "system_prompt": "You are a helpful assistant with access to tools. If a tool"
        " that you have would be helpful to answer a user query, "
        "call the tool. Otherwise, answer the user's query directly "
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally.",
        "supports_parallel": True,
        "extended": True,
    },
    # FIXME: This test currently fails, need to debug why.
    # "granite20b": {
    #     "model": "mbayser/granite-20b-functioncalling-FP8-KV",
    #     "arguments": [
    #         "--tool-call-parser",
    #         "granite-20b-fc",
    #         "--chat-template",
    #         str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja"),
    #         "--max_num_seqs",
    #         "1",
    #         "--enforce-eager",
    #         "--cpu-offload-gb",
    #         "20",
    #     ],
    #     "supports_parallel": False,
    #     "supports_rocm": False,
    # },
    "granite-3.0-8b": {
        "model": "ibm-granite/granite-3.0-8b-instruct",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "granite",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_granite.jinja"),
        ],
    },
    "granite-3.1-8b": {
        "model": "ibm-granite/granite-3.1-8b-instruct",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "granite",
        ],
        "supports_parallel": True,
    },
    "internlm": {
        "model": "internlm/internlm2_5-7b-chat",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "internlm",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_internlm2_tool.jinja"),
            "--trust_remote_code",
        ],
        "supports_parallel": False,
    },
    "toolACE": {
        "model": "Team-ACE/ToolACE-8B",
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
            "--tool-call-parser",
            "pythonic",
            "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja"),
        ],
        "supports_parallel": True,
    },
}
# Sample weather-lookup tool definition used across the tool-use tests.
# NOTE: the schema declares no "required" list, so all properties are optional.
WEATHER_TOOL: ChatCompletionToolParam = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "The city to find the weather for, "
                    "e.g. 'San Francisco'",
                },
                "state": {
                    "type": "string",
                    "description": "must the two-letter abbreviation for the state "
                    "that the city is in, e.g. 'CA' which would "
                    "mean 'California'",
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"],
                },
            },
        },
    },
}
# Sample web-search tool definition; unlike WEATHER_TOOL it marks its single
# property ("search_term") as required.
SEARCH_TOOL: ChatCompletionToolParam = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the internet and get a summary of the top "
        "10 webpages. Should only be used if you don't know "
        "the answer to a user query, and the results are likely"
        "to be able to be found with a web search",
        "parameters": {
            "type": "object",
            "properties": {
                "search_term": {
                    "type": "string",
                    "description": "The term to use in the search. This should"
                    "ideally be keywords to search for, not a"
                    "natural-language question",
                }
            },
            "required": ["search_term"],
        },
    },
}
# Conversation where no tool call is relevant — the model should answer directly.
MESSAGES_WITHOUT_TOOLS: list[ChatCompletionMessageParam] = [
    {"role": "user", "content": "Hi! How are you?"},
    {"role": "assistant", "content": "I'm doing great! How can I assist you?"},
    {"role": "user", "content": "Can you tell me a joke please?"},
]
# Prompt that should trigger a single call to the weather tool.
MESSAGES_ASKING_FOR_TOOLS: list[ChatCompletionMessageParam] = [
    {"role": "user", "content": "What is the weather in Dallas, Texas in Fahrenheit?"}
]
# Full round-trip: user question, assistant tool call, and the tool's response —
# the model is expected to summarize the tool result next.
MESSAGES_WITH_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [
    {"role": "user", "content": "What is the weather in Dallas, Texas in Fahrenheit?"},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
                "type": "function",
                "function": {
                    "name": WEATHER_TOOL["function"]["name"],
                    "arguments": '{"city": "Dallas", "state": "TX", '
                    '"unit": "fahrenheit"}',
                },
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
        "content": "The weather in Dallas is 98 degrees fahrenheit, with partly"
        "cloudy skies and a low chance of rain.",
    },
]
# Prompt that should trigger two parallel weather-tool calls (Dallas + Orlando).
MESSAGES_ASKING_FOR_PARALLEL_TOOLS: list[ChatCompletionMessageParam] = [
    {
        "role": "user",
        "content": "What is the weather in Dallas, Texas and Orlando, Florida in "
        "Fahrenheit?",
    }
]
# Parallel round-trip: two assistant tool calls followed by both tool responses,
# matched to the calls by tool_call_id.
MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [
    {
        "role": "user",
        "content": "What is the weather in Dallas, Texas and Orlando, Florida in "
        "Fahrenheit?",
    },
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
                "type": "function",
                "function": {
                    "name": WEATHER_TOOL["function"]["name"],
                    "arguments": '{"city": "Dallas", "state": "TX", '
                    '"unit": "fahrenheit"}',
                },
            },
            {
                "id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b",
                "type": "function",
                "function": {
                    "name": WEATHER_TOOL["function"]["name"],
                    # NOTE(review): "Fl" is not the canonical two-letter "FL"
                    # casing described by WEATHER_TOOL — confirm this is
                    # intentional sample data.
                    "arguments": '{"city": "Orlando", "state": "Fl", '
                    '"unit": "fahrenheit"}',
                },
            },
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295",
        "content": "The weather in Dallas TX is 98 degrees fahrenheit with mostly "
        "cloudy skies and a chance of rain in the evening.",
    },
    {
        "role": "tool",
        "tool_call_id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b",
        "content": "The weather in Orlando FL is 78 degrees fahrenheit with clear"
        "skies.",
    },
]