Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,203 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
@pytest.fixture
def sample_prompts():
    """Short text prompts shared across structured-output tests."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    return prompts
@pytest.fixture
def sample_token_ids():
    """Prompt token-id lists of increasing length (1 to 4 tokens)."""
    token_ids = [
        [0],
        [0, 1],
        [0, 2, 1],
        [0, 3, 1, 2],
    ]
    return token_ids
@pytest.fixture
def sample_regex():
    """Regex matching a dotted-quad IPv4 address (each octet 0-255)."""
    octet = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    # Three dot-terminated octets followed by a final bare octet.
    return rf"({octet}\.){{3}}{octet}"
@pytest.fixture
def sample_json_schema():
    """JSON schema for a person record; exercises nested objects, arrays,
    string length limits, and required-field constraints."""
    return {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
            "skills": {
                "type": "array",
                # Each skill is a short string; at least three are required.
                "items": {"type": "string", "maxLength": 10},
                "minItems": 3,
            },
            "work_history": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "company": {"type": "string"},
                        "duration": {"type": "number"},
                        "position": {"type": "string"},
                    },
                    # "duration" is intentionally optional here.
                    "required": ["company", "position"],
                },
            },
        },
        "required": ["name", "age", "skills", "work_history"],
    }
@pytest.fixture
def sample_complex_json_schema():
    """JSON schema combining numeric ranges, regex patterns, and per-item
    constraints inside arrays."""
    return {
        "type": "object",
        "properties": {
            "score": {
                "type": "integer",
                "minimum": 0,
                "maximum": 100,  # Numeric range
            },
            "grade": {
                "type": "string",
                "pattern": "^[A-D]$",  # Regex pattern
            },
            "email": {
                "type": "string",
                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
            },
            "tags": {
                "type": "array",
                "items": {
                    "type": "string",
                    # Combining length and pattern restrictions
                    "pattern": "^[a-z]{1,10}$",
                },
            },
        },
        "required": ["score", "grade", "email", "tags"],
    }
@pytest.fixture
def sample_definition_json_schema():
    """JSON schema using $defs / $ref indirection (pydantic-style
    "MathReasoning" schema with a reusable Step definition)."""
    return {
        "$defs": {
            "Step": {
                "properties": {
                    "explanation": {"title": "Explanation", "type": "string"},
                    "output": {"title": "Output", "type": "string"},
                },
                "required": ["explanation", "output"],
                "title": "Step",
                "type": "object",
            }
        },
        "properties": {
            "steps": {
                # Each array element resolves through the $defs reference.
                "items": {"$ref": "#/$defs/Step"},
                "title": "Steps",
                "type": "array",
            },
            "final_answer": {"title": "Final Answer", "type": "string"},
        },
        "required": ["steps", "final_answer"],
        "title": "MathReasoning",
        "type": "object",
    }
@pytest.fixture
def sample_enum_json_schema():
    """JSON schema exercising enum constraints: top-level, nested-object,
    numeric, and array-item enums."""
    return {
        "type": "object",
        "properties": {
            "status": {
                "type": "string",
                "enum": ["active", "inactive", "pending"],  # Literal values using enum
            },
            "priority": {
                "type": "string",
                "enum": ["low", "medium", "high", "critical"],
            },
            "category": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "enum": ["bug", "feature", "improvement"],
                    },
                    "severity": {
                        "type": "integer",
                        "enum": [1, 2, 3, 4, 5],  # Enum can also contain numbers
                    },
                },
                "required": ["type", "severity"],
            },
            "flags": {
                "type": "array",
                "items": {
                    "type": "string",
                    "enum": ["urgent", "blocked", "needs_review", "approved"],
                },
            },
        },
        "required": ["status", "priority", "category", "flags"],
    }
@pytest.fixture
def sample_structured_outputs_choices():
    """Programming-language names used as a closed choice set."""
    return [
        "Python",
        "Java",
        "JavaScript",
        "C++",
        "C#",
        "PHP",
        "TypeScript",
        "Ruby",
        "Swift",
        "Kotlin",
    ]
@pytest.fixture
def sample_sql_statements():
    # Lark grammar describing a tiny SQL SELECT subset, used for
    # grammar-constrained decoding tests.
    return """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
@pytest.fixture(scope="session")
def qwen3_lora_files():
    """Download Qwen3 LoRA files once per test session."""
    # Imported lazily so collecting tests does not require huggingface_hub.
    from huggingface_hub import snapshot_download

    repo_id = "charent/self_cognition_Alice"
    return snapshot_download(repo_id=repo_id)
@pytest.fixture(scope="session")
def opt125_lora_files() -> str:
    """Download opt-125m LoRA files once per test session."""
    # Imported lazily so collecting tests does not require huggingface_hub.
    from huggingface_hub import snapshot_download

    repo_id = "peft-internal-testing/opt-125m-dummy-lora"
    return snapshot_download(repo_id=repo_id)

View File

View File

@@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
# Models whose gsm8k accuracy is compared against EXPECTED_VALUES.
MODEL_NAMES = [
    "Qwen/Qwen3-1.7B",
    "google/gemma-3-1b-it",
]
# Subset of models additionally exercised with an fp8 KV cache.
FP8_KV_MODEL_NAMES = [
    "Qwen/Qwen3-1.7B",
]
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# NOTE(review): despite the name, used as an *absolute* tolerance below.
RTOL = 0.03
# Reference gsm8k scores per model.
EXPECTED_VALUES = {
    "Qwen/Qwen3-1.7B": 0.68,
    "google/gemma-3-1b-it": 0.25,
}
def run_test(model_name, more_args=None):
    """Run the end-to-end accuracy test for ``model_name``.

    Evaluates the model on gsm8k via lm_eval and asserts the measured
    score is within ``RTOL`` of the reference in ``EXPECTED_VALUES``.

    Args:
        model_name: HF model id; must have an entry in ``EXPECTED_VALUES``.
        more_args: optional extra ``key=value`` engine args, comma-joined
            onto the base model args.
    """
    # Fail fast *before* the expensive evaluation if there is no reference.
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}"
    )
    expected_value = EXPECTED_VALUES[model_name]

    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Was hard-coded "gsm8k"; keep in sync with the TASK constant used
        # to read the results back out below.
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    # Equivalent to the original two-sided comparison.
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} | Measured: {measured_value}"
    )
# TODO: [AlexM] Fix it with new CI/CD tests
# Extra engine arg enabling a tensor-parallel run; empty string disables it.
TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model):
    """Run with the V1 Engine."""
    more_args = None
    if current_platform.is_tpu():
        # Keep TPU compilation time in check.
        more_args = "max_model_len=2048,max_num_seqs=64"
        # Append the TP configuration when one is provided.
        if TPU_TP_TEST_STR:
            more_args = f"{more_args},{TPU_TP_TEST_STR}"
    run_test(model, more_args)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
    """Run with the V1 Engine."""
    more_args = None
    if current_platform.is_tpu():
        # Keep TPU compilation time in check; enable the fp8 KV cache.
        more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
        # Append the TP configuration when one is provided.
        if TPU_TP_TEST_STR:
            more_args = f"{more_args},{TPU_TP_TEST_STR}"
    run_test(model, more_args)

View File

@@ -0,0 +1,212 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.sampling_params import SamplingParams
from ..openai.test_vision import TEST_IMAGE_ASSETS
@pytest.fixture(scope="function")
def text_llm():
    """Small text-only Llama engine, rebuilt for every test."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.fixture(scope="function")
def llm_for_failure_test():
    """
    Fixture for testing issue #26081.
    Uses a small max_model_len to easily trigger length errors.
    """
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        seed=0,
        # Tiny context window so an over-long prompt reliably fails.
        max_model_len=128,
        disable_log_stats=True,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
def test_chat(text_llm):
    """A single conversation produces exactly one output."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain the concept of entropy."},
    ]
    outputs = text_llm.chat(conversation)
    assert len(outputs) == 1
def test_multi_chat(text_llm):
    """A batched chat call yields one output per conversation."""
    conversation1 = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain the concept of entropy."},
    ]
    conversation2 = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Explain what among us is."},
    ]
    outputs = text_llm.chat([conversation1, conversation2])
    assert len(outputs) == 2
@pytest.fixture(scope="function")
def vision_llm():
    """Multimodal Phi-3.5-vision engine allowing up to two images per prompt."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
        seed=0,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.parametrize(
    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """A multi-image chat request completes and yields one output.

    Note: the original assertion ``len(outputs) >= 0`` was vacuous (always
    true); a single conversation should produce exactly one output.
    """
    messages = [
        {
            "role": "user",
            "content": [
                *(
                    {"type": "image_url", "image_url": {"url": image_url}}
                    for image_url in image_urls
                ),
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]
    outputs = vision_llm.chat(messages)
    assert len(outputs) == 1
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens when using chat templates.
    Check we get a single BOS token for llama chat.
    """
    outputs = text_llm.chat(
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": "Hello!"},
        ]
    )
    assert len(outputs) == 1

    token_ids = outputs[0].prompt_token_ids
    assert token_ids is not None

    bos = text_llm.get_tokenizer().bos_token_id
    # The template supplies BOS itself; tokenization must not add a second one.
    assert token_ids[0] == bos
    assert token_ids[1] != bos, "Double BOS"
@pytest.fixture(scope="function")
def thinking_llm():
    """Qwen3 engine whose chat template supports the enable_thinking flag."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model="Qwen/Qwen3-0.6B",
        max_model_len=4096,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """chat_template_kwargs are forwarded to the chat template."""
    outputs = thinking_llm.chat(
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": "What is 1+1?"},
        ],
        chat_template_kwargs={"enable_thinking": enable_thinking},
    )
    assert len(outputs) == 1

    token_ids = outputs[0].prompt_token_ids
    assert token_ids is not None

    think_id = thinking_llm.get_tokenizer().get_vocab()["<think>"]
    if enable_thinking:
        assert think_id not in token_ids
    else:
        # When thinking is disabled the template inserts a dummy think block,
        # so the token does appear in the prompt.
        assert think_id in token_ids
def test_chat_batch_failure_cleanup(llm_for_failure_test):
    """
    Tests that if a batch call to llm.chat() fails mid-way
    (e.g., due to one invalid prompt), the requests that
    were already enqueued are properly aborted and do not
    pollute the queue for subsequent calls.
    (Fixes Issue #26081)
    """
    llm = llm_for_failure_test
    ok_msg = [{"role": "user", "content": "Hello"}]
    too_long_msg = [
        {"role": "user", "content": "This is a very long text to test the error " * 50}
    ]
    params = SamplingParams(temperature=0, max_tokens=10)

    # First batch: two valid conversations followed by one that exceeds
    # the (tiny) max_model_len — the whole call must raise.
    with pytest.raises(ValueError, match="longer than the maximum model length"):
        llm.chat([ok_msg, ok_msg, too_long_msg], sampling_params=params)

    # Second batch: must succeed cleanly and leave no stale requests behind.
    second_batch = [ok_msg, ok_msg]
    outputs = llm.chat(second_batch, sampling_params=params)
    assert len(outputs) == len(second_batch)
    assert llm.llm_engine.get_num_unfinished_requests() == 0

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
from ...utils import create_new_process_for_each_test
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
    """collective_rpc runs a caller-defined function on every worker."""
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    if tp_size == 1:
        if backend == "ray":
            pytest.skip("Skip duplicate test case")
        backend = None

    # Defined inside the test on purpose: verifies that locally-defined
    # callables can be serialized and shipped to the workers.
    def echo_rank(self):
        return self.rank

    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    llm = LLM(
        model="hmellor/tiny-random-LlamaForCausalLM",
        enforce_eager=True,
        load_format="dummy",
        tensor_parallel_size=tp_size,
        distributed_executor_backend=backend,
    )
    assert llm.collective_rpc(echo_rank) == list(range(tp_size))

View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# Small model used by every test in this module.
MODEL_NAME = "distilbert/distilgpt2"
# Shared text prompts for generate() calls.
PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Prompt token-id lists of increasing length (1 to 4 tokens).
TOKEN_IDS = [
    [0],
    [0, 1],
    [0, 2, 1],
    [0, 3, 1, 2],
]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped distilgpt2 engine with a small GPU-memory footprint."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )
    yield weakref.proxy(llm)
    # Teardown: drop the only strong reference, then free distributed state.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
    """Per-prompt SamplingParams: full list, mismatched list, single, None."""
    params_list = [
        SamplingParams(temperature=t, top_p=0.95) for t in (0.01, 0.3, 0.7, 0.99)
    ]

    # A params list is matched one-to-one with the prompts.
    outputs = llm.generate(PROMPTS, sampling_params=params_list)
    assert len(PROMPTS) == len(outputs)

    # A list whose size differs from the prompt count must raise.
    with pytest.raises(ValueError):
        outputs = llm.generate(PROMPTS, sampling_params=params_list[:3])

    # A single SamplingParams is broadcast to every prompt.
    outputs = llm.generate(
        PROMPTS, sampling_params=SamplingParams(temperature=0.3, top_p=0.95)
    )
    assert len(PROMPTS) == len(outputs)

    # None falls back to the default params.
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert len(PROMPTS) == len(outputs)
def test_multiple_priority(llm: LLM):
    """priority=None, a full-length list, a short list, and an empty list."""
    n = len(PROMPTS)

    # None disables priority scheduling but generation still works.
    outputs = llm.generate(PROMPTS, sampling_params=None, priority=None)
    assert n == len(outputs)

    # One priority per prompt is accepted.
    outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * n)
    assert n == len(outputs)

    # Too few priorities must raise.
    with pytest.raises(ValueError):
        outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * (n - 1))

    # An empty priority list must raise as well.
    with pytest.raises(ValueError):
        outputs = llm.generate(PROMPTS, sampling_params=None, priority=[])
def test_max_model_len():
    """Generation never exceeds max_model_len total tokens."""
    max_model_len = 20
    llm = LLM(
        model=MODEL_NAME,
        max_model_len=max_model_len,
        gpu_memory_utilization=0.10,
        enforce_eager=True,  # reduce test time
    )
    # Deliberately request more tokens than the model length allows.
    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
    for output in llm.generate(PROMPTS, sampling_params):
        total_tokens = len(output.prompt_token_ids) + len(output.outputs[0].token_ids)
        # The engine must cap generation at max_model_len; fewer tokens are
        # fine if decoding stopped early for another reason (e.g. EOS).
        assert total_tokens <= max_model_len
def test_log_stats():
    """Every output carries metrics when stats logging is enabled."""
    llm = LLM(
        model=MODEL_NAME,
        disable_log_stats=False,
        gpu_memory_utilization=0.10,
        enforce_eager=True,  # reduce test time
    )
    for output in llm.generate(PROMPTS, sampling_params=None):
        # disable_log_stats is False, so metrics must be populated.
        assert output.metrics is not None

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
def test_gpu_memory_utilization():
    """gpu_memory_utilization caps each instance, not the whole GPU."""
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Three instances at 0.3 utilization each should coexist on one device,
    # proving the limit is per-instance rather than global.
    llms = [
        LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
        for _ in range(3)
    ]
    for llm in llms:
        for output in llm.generate(prompts, sampling_params):
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import pytest
import regex as re
from vllm import LLM
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.metrics import loggers as stat_loggers
from vllm.v1.metrics.reader import Counter, Metric
from ..openai.test_vision import TEST_IMAGE_ASSETS
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
    """Build a single-turn user conversation containing one image URL."""
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_url},
    }
    return [{"role": "user", "content": [image_part]}]
def _get_counter_value(metrics: list[Metric], name: str):
    """Return the value of the Counter metric called ``name``.

    Raises StopIteration if no metric with that name exists.
    """
    matches = (m for m in metrics if m.name == name)
    metric = next(matches)
    assert isinstance(metric, Counter)
    return metric.value
def _get_mm_cache_stats(metrics: list[Metric]):
    """Return (queries, hits) counters of the multi-modal processor cache."""
    queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
    hits = _get_counter_value(metrics, "vllm:mm_cache_hits")
    return queries, hits
def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
    """Trigger a stats log and parse the MM cache hit rate (in percent)."""
    caplog_vllm.clear()
    with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
        llm.llm_engine.do_log_stats()

    # Exactly one stats record is expected per do_log_stats() call.
    assert len(caplog_vllm.records) == 1
    msg = caplog_vllm.records[0].getMessage()
    assert "MM cache hit rate" in msg

    match = re.search(r"MM cache hit rate: ([0-9.]+)%", msg)
    assert match is not None
    return float(match.group(1))
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
def test_mm_cache_stats(
    num_gpus_available,
    image_urls,
    mm_processor_cache_type,
    caplog_vllm,
):
    """MM-cache query/hit counters and logged hit rate track repeated images."""
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        mm_processor_cache_type=mm_processor_cache_type,
        disable_log_stats=False,
        limit_mm_per_prompt={"image": 2},
    )

    # First image: one query, no hit yet.
    llm.chat(_make_messages(image_urls[0]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

    # A different image: still no hit.
    llm.chat(_make_messages(image_urls[1]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

    # Repeat of the first image: first hit (1 of 3 queries -> ~33.3%).
    llm.chat(_make_messages(image_urls[0]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(33.3)

    # NOTE: This only resets hit rate stats in CachingMetrics
    # The raw queries and hits counts remain unaffected
    llm.reset_mm_cache()

    # After the reset the cache misses again, so hits stay at 1.
    llm.chat(_make_messages(image_urls[0]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

    llm.chat(_make_messages(image_urls[1]))
    assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
    assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
def test_empty_prompt():
    """An empty string is rejected as a decoder prompt."""
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
    with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
        llm.generate([""])
def test_out_of_vocab_token():
    """Token ids beyond the vocabulary size are rejected."""
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
    bad_prompt = {"prompt_token_ids": [999999]}
    with pytest.raises(ValueError, match="out of vocabulary"):
        llm.generate(bad_prompt)
def test_require_mm_embeds():
    """Passing raw multi-modal embeddings requires --enable-mm-embeds."""
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        enforce_eager=True,
        enable_mm_embeds=False,
    )
    embed_prompt = {
        "prompt": "<image>",
        "multi_modal_data": {"image": torch.empty(1, 1, 1)},
    }
    with pytest.raises(ValueError, match="--enable-mm-embeds"):
        llm.generate(embed_prompt)

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""
import dataclasses
import importlib
import sys
import pytest
import urllib3
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EngineArgs
# Engine configurations exercised in offline mode. Kept deliberately small
# (tiny context, few sequences) so several engines fit in CI memory; the
# Qwen entry also checks a tokenizer different from the model, and the
# Mistral entry checks the "mistral" tokenizer mode.
MODEL_CONFIGS = [
    {
        "model": "facebook/opt-125m",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.20,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
    },
    {
        "model": "Qwen/Qwen3-0.6B",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.50,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer": "Qwen/Qwen3-4B",
    },
    {
        "model": "mistralai/Mistral-7B-Instruct-v0.1",
        "enforce_eager": True,
        "gpu_memory_utilization": 0.95,
        "max_model_len": 64,
        "max_num_batched_tokens": 64,
        "max_num_seqs": 64,
        "tensor_parallel_size": 1,
        "tokenizer_mode": "mistral",
    },
    # TODO: re-enable once these tests are run with V1
    # {
    #     "model": "sentence-transformers/all-MiniLM-L12-v2",
    #     "enforce_eager": True,
    #     "gpu_memory_utilization": 0.20,
    #     "max_model_len": 64,
    #     "max_num_batched_tokens": 64,
    #     "max_num_seqs": 64,
    #     "tensor_parallel_size": 1,
    # },
]
@pytest.fixture(scope="module")
def cache_models():
    """Populate the local HF cache with every model in MODEL_CONFIGS."""
    # Cache model files first
    for model_config in MODEL_CONFIGS:
        LLM(**model_config)
        # Free the engine before constructing the next one.
        cleanup_dist_env_and_memory()
    yield
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
    with monkeypatch.context() as m:
        try:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            # Fail loudly on any attempted network access.
            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
                LLM(**model_config)
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
            _re_import_modules()
def _re_import_modules():
hf_hub_module_names = [k for k in sys.modules if k.startswith("huggingface_hub")]
transformers_module_names = [
k
for k in sys.modules
if k.startswith("transformers") and not k.startswith("transformers_modules")
]
reload_exception = None
for module_name in hf_hub_module_names + transformers_module_names:
try:
importlib.reload(sys.modules[module_name])
except Exception as e:
reload_exception = e
# Try to continue clean up so that other tests are less likely to
# be affected
# Error this test if reloading a module failed
if reload_exception is not None:
raise reload_exception
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
    with monkeypatch.context() as m:
        try:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            # Fail loudly on any attempted network access.
            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            # Construction via EngineArgs must also work fully offline.
            engine_args = EngineArgs(model="facebook/opt-125m")
            LLM(**dataclasses.asdict(engine_args))
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
            _re_import_modules()

View File

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.audio import AudioAsset
@pytest.fixture
def mary_had_lamb():
    """Yield an open binary handle to the 'mary_had_lamb' audio asset."""
    local_path = AudioAsset("mary_had_lamb").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
@pytest.fixture
def winning_call():
    """Yield an open binary handle to the 'winning_call' audio asset."""
    local_path = AudioAsset("winning_call").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file
@pytest.fixture
def foscolo():
    """Italian audio asset used to test it->en translation."""
    local_path = AudioAsset("azacinto_foscolo").get_local_path()
    with open(str(local_path), "rb") as audio_file:
        yield audio_file

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
from vllm.platforms import current_platform
from ....utils import RemoteOpenAIServer
# Model served for the accuracy run.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# NOTE(review): despite the name, used as an *absolute* tolerance below.
RTOL = 0.03
# Reference gsm8k score for MODEL_NAME.
EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096"]
MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
]
# None means RemoteOpenAIServer's default startup timeout.
MAX_WAIT_SECONDS = None
if current_platform.is_tpu():
    # TPU compilation is slow: trim the arg matrix and wait longer.
    MORE_ARGS_LIST = [
        [],  # Default
    ]
    MAX_WAIT_SECONDS = 600
def run_test(more_args):
    """Serve MODEL_NAME, evaluate gsm8k over the OpenAI API, and check
    the score against EXPECTED_VALUE."""
    args = [*DEFAULT_ARGS, *more_args]
    print(f"Running with: {args}")

    with RemoteOpenAIServer(MODEL_NAME, args, max_wait_seconds=MAX_WAIT_SECONDS) as server:
        url = f"{server.url_for('v1')}/completions"
        model_args = (
            f"model={MODEL_NAME},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
        )
        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )
        measured_value = results["results"][TASK][FILTER]
        # Equivalent to the original two-sided tolerance check.
        assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
            f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
        )
def test_lm_eval_accuracy_v1_engine():
    """Run with the V1 Engine."""
    if current_platform.is_tpu():
        # Limit compilation time for V1 on TPU.
        run_test(["--max-num-seqs", "64"])
    else:
        run_test([])

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Evaluate Transcription API correctness by computing Word Error Rate (WER)
on a given ASR dataset. When provided, it will also compare the WER against
a baseline.
This simulates real-world usage of the API and makes sure that the frontend and
AsyncLLMEngine are working correctly.
"""
import asyncio
import io
import time
from statistics import mean, median
import librosa
import pytest
import soundfile
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from ....utils import RemoteOpenAIServer
def to_bytes(y, sr):
    """Encode waveform ``y`` at sample rate ``sr`` as an in-memory WAV file,
    rewound and ready for reading."""
    wav_buffer = io.BytesIO()
    soundfile.write(wav_buffer, y, sr, format="WAV")
    wav_buffer.seek(0)
    return wav_buffer
async def transcribe_audio(client, tokenizer, y, sr):
    """Send one transcription request; return (latency_s, n_out_tokens, text)."""
    # Send loaded audio directly instead of loading from disk,
    # don't account for that time though
    with to_bytes(y, sr) as f:
        start_time = time.perf_counter()
        transcription = await client.audio.transcriptions.create(
            file=f,
            model=tokenizer.name_or_path,
            language="en",
            temperature=0.0,
        )
        end_time = time.perf_counter()
    # NOTE there's no streaming in transcriptions, can't measure ttft
    latency = end_time - start_time
    # Output length measured in the served model's own tokens.
    num_output_tokens = len(
        tokenizer(transcription.text, add_special_tokens=False).input_ids
    )
    return latency, num_output_tokens, transcription.text
async def bound_transcribe(sem, client, tokenizer, audio, reference):
    """Transcribe one sample under the concurrency semaphore.

    Returns (latency_s, n_out_tokens, normalized_output, normalized_reference).
    """
    async with sem:
        latency, num_tokens, text = await transcribe_audio(
            client, tokenizer, *audio
        )
        # Normalize *english* output/reference for evaluation.
        out = tokenizer.normalize(text)
        ref = tokenizer.normalize(reference)
    return latency, num_tokens, out, ref
async def process_dataset(model, client, data, concurrent_request):
    """Transcribe every sample in ``data`` with at most ``concurrent_request``
    in-flight requests; returns the gathered per-sample result tuples."""
    sem = asyncio.Semaphore(concurrent_request)

    # Load tokenizer once outside the loop
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Warmup call as the first `librosa.load` server-side is quite slow.
    audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
    _ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")

    tasks: list[asyncio.Task] = []
    for sample in data:
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        task = asyncio.create_task(
            bound_transcribe(sem, client, tokenizer, (audio, sr), sample["text"])
        )
        tasks.append(task)
    # Results come back in task-creation (dataset) order.
    return await asyncio.gather(*tasks)
def print_performance_metrics(results, total_time):
    """Print latency and throughput statistics.

    Args:
        results: per-request tuples whose first element is the latency in
            seconds and second element the number of output tokens.
        total_time: wall-clock duration of the whole run, in seconds.
    """
    latencies = [res[0] for res in results]
    # Generator expression: no throwaway intermediate list.
    total_tokens = sum(res[1] for res in results)

    total = len(results)
    print(f"Total Requests: {total}")
    print(f"Successful Requests: {len(latencies)}")
    print(f"Average Latency: {mean(latencies):.4f} seconds")
    print(f"Median Latency: {median(latencies):.4f} seconds")
    # Nearest-rank 95th percentile; clamped so a single-element list
    # indexes element 0 instead of relying on a negative index.
    perc_idx = max(int(len(latencies) * 0.95) - 1, 0)
    perc = sorted(latencies)[perc_idx]
    print(f"95th Percentile Latency: {perc:.4f} seconds")

    # Throughput
    req_throughput = len(latencies) / total_time
    print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
    throughput = total_tokens / total_time
    print(f"Estimated Throughput: {throughput:.2f} tok/s")
def add_duration(sample):
    """Attach a ``duration_ms`` field computed from the sample's audio."""
    audio = sample["audio"]
    sample["duration_ms"] = (
        librosa.get_duration(y=audio["array"], sr=audio["sampling_rate"]) * 1000
    )
    return sample
def load_hf_dataset(dataset_repo: str, split="validation", **hf_kwargs):
    """Load a HF audio dataset and drop samples longer than Whisper's
    30-second input window."""
    ## Load and filter the dataset
    dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
    if "duration_ms" not in dataset[0]:
        # compute duration to filter
        dataset = dataset.map(add_duration)

    # Whisper max supported duration
    dataset = dataset.filter(lambda example: example["duration_ms"] < 30000)
    return dataset
def run_evaluation(
    model: str,
    client,
    dataset,
    max_concurrent_reqs: int,
    n_examples: int = -1,  # -1 means use the whole dataset
    print_metrics: bool = True,
):
    """Transcribe ``dataset`` against the server and return the WER in percent."""
    if n_examples > 0:
        dataset = dataset.select(range(n_examples))

    start = time.perf_counter()
    results = asyncio.run(process_dataset(model, client, dataset, max_concurrent_reqs))
    end = time.perf_counter()
    total_time = end - start
    print(f"Total Test Time: {total_time:.4f} seconds")
    if print_metrics:
        print_performance_metrics(results, total_time)

    # Compute WER from the (normalized) prediction/reference pairs.
    predictions = [res[2] for res in results]
    references = [res[3] for res in results]
    wer = load("wer")
    wer_score = 100 * wer.compute(references=references, predictions=predictions)
    print("WER:", wer_score)
    return wer_score
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
)
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(
    model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None
):
    """End-to-end WER of the transcription API must match the HF baseline."""
    # TODO refactor to use `ASRDataset`
    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        dataset = load_hf_dataset(dataset_repo)

        if not max_concurrent_request:
            # No max concurrency
            max_concurrent_request = n_examples if n_examples > 0 else len(dataset)

        client = remote_server.get_async_client()
        wer = run_evaluation(
            model_name, client, dataset, max_concurrent_request, n_examples
        )
        if expected_wer:
            torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import random
from collections.abc import Callable
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server(): # noqa: F811
    """Spin up one shared OpenAI-compatible vLLM server for the module."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # dummy weights suffice: these tests only exercise request handling
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["completion", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (lambda x: x.completions.create, {"prompt": " ".join(["A"] * 10_000)}),
        (
            lambda x: x.chat.completions.create,
            {"messages": [{"role": "user", "content": " ".join(["A"] * 10_000)}]},
        ),
    ],
)
async def test_with_and_without_truncate(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Interleave truncating and non-truncating oversized requests; none may 500."""
    create_func = create_func_gen(client)
    base_body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    # Half of the requests truncate the oversized prompt, half leave it as is.
    num_requests = 10
    truncation_options = [1000] * (num_requests // 2)
    truncation_options += [None] * (num_requests - num_requests // 2)
    random.shuffle(truncation_options)

    async def issue_request(**kwargs):
        # Map API errors to their HTTP status so the test can assert on codes.
        try:
            await create_func(**kwargs)
        except openai.APIStatusError as e:
            return e.status_code
        return 200

    request_bodies = [
        {**base_body, "extra_body": {"truncate_prompt_tokens": option}}
        for option in truncation_options
    ]
    status_codes = await asyncio.gather(
        *(issue_request(**body) for body in request_bodies)
    )
    assert 500 not in status_codes

View File

@@ -0,0 +1,392 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
from ...utils import RemoteOpenAIServer
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
# Audio clips exercised by every test in this file.
TEST_AUDIO_URLS = [
    AudioAsset("winning_call").url,
    AudioAsset("mary_had_lamb").url,
]
# Must match the "--limit-mm-per-prompt" audio cap used when starting the server.
MAXIMUM_AUDIOS = 2
@pytest.fixture(scope="module")
def server():
    """Start a shared Ultravox server that accepts up to MAXIMUM_AUDIOS audios."""
    args = [
        "--dtype",
        "float32",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        # cap audios per prompt so the over-limit test can trigger a 400
        "--limit-mm-per-prompt",
        json.dumps({"audio": MAXIMUM_AUDIOS}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_audio() -> dict[str, str]:
    """Map each test audio URL to its base64-encoded payload (fetched once)."""
    return {
        audio_url: encode_audio_base64(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
def dummy_messages_from_audio_url(
audio_urls: str | list[str],
content_text: str = "What's happening in this audio?",
):
if isinstance(audio_urls, str):
audio_urls = [audio_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "audio_url", "audio_url": {"url": audio_url}}
for audio_url in audio_urls
),
{"type": "text", "text": content_text},
],
}
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Single-turn then multi-turn chat with an ``audio_url`` content part."""
    messages = dummy_messages_from_audio_url(audio_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )
    # (previously `message` was immediately reassigned from
    # chat_completion.choices[0].message — same object; dead store removed)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_error_on_invalid_audio_url_type(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """A bare-string audio_url (not a ``{"url": ...}`` dict) must be a 400."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio_url", "audio_url": audio_url},
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]
    # audio_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Chat with the audio supplied as a base64 data URL instead of a link."""
    messages = dummy_messages_from_audio_url(
        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )
    # (duplicate reassignment of `message` from choices[0] removed — same object)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Chat using the OpenAI-style ``input_audio`` content part."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )
    # (duplicate reassignment of `message` from choices[0] removed — same object)
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Streamed chunks must reassemble to the non-streamed completion."""
    messages = dummy_messages_from_audio_url(
        audio_url, "What's a short title for this audio?"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Streaming variant of the input_audio test; chunks must match non-streamed output."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]
)
async def test_multi_audio_input(
    client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
):
    """At the audio cap the request succeeds; above it the server returns 400."""
    messages = dummy_messages_from_audio_url(audio_urls)
    if len(audio_urls) > MAXIMUM_AUDIOS:
        with pytest.raises(openai.BadRequestError):  # test multi-audio input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )
        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0

View File

@@ -0,0 +1,250 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from http import HTTPStatus
from unittest.mock import AsyncMock, Mock
import openai
import pytest
import pytest_asyncio
import requests
from fastapi import Request
from vllm.v1.engine.exceptions import EngineDeadError
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server_args(request: pytest.FixtureRequest) -> list[str]:
    """Provide extra arguments to the server via indirect parametrization.

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
    """
    # Not parametrized indirectly -> no extra server arguments.
    if not hasattr(request, "param"):
        return []
    val = request.param
    # A lone string is treated as a single extra argument.
    if isinstance(val, str):
        return [val]
    # Fix: return the already-bound `val` instead of re-reading
    # request.param, keeping both branches consistent.
    return val
@pytest.fixture(scope="module")
def server(server_args):
    """Module-scoped server; extra CLI flags come from the server_args fixture."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        *server_args,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """/version must report the running vLLM version."""
    response = requests.get(server.url_for("version"))
    response.raise_for_status()
    assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
    """/health must return 200 while the engine is alive."""
    response = requests.get(server.url_for("health"))
    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(
            ["--max-model-len", "10100"], id="default-frontend-multiprocessing"
        ),
        pytest.param(
            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
            id="disable-frontend-multiprocessing",
        ),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_request_cancellation(server: RemoteOpenAIServer):
    """Client timeouts must cancel server-side work so later requests stay fast."""
    # clunky test: send an ungodly amount of load in with short timeouts
    # then ensure that it still responds quickly afterwards
    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client(timeout=0.5)
    tasks = []
    # Request about 2 million tokens
    for _ in range(200):
        task = asyncio.create_task(
            client.chat.completions.create(
                messages=chat_input,
                model=MODEL_NAME,
                max_tokens=10000,
                extra_body={"min_tokens": 10000},
            )
        )
        tasks.append(task)
    done, pending = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
    # Make sure all requests were sent to the server and timed out
    # (We don't want to hide other errors like 400s that would invalidate this
    # test)
    assert len(pending) == 0
    for d in done:
        with pytest.raises(openai.APITimeoutError):
            d.result()
    # If the server had not cancelled all the other requests, then it would not
    # be able to respond to this one within the timeout
    client = server.get_async_client(timeout=5)
    response = await client.chat.completions.create(
        messages=chat_input, model=MODEL_NAME, max_tokens=10
    )
    assert len(response.choices) == 1
@pytest.mark.asyncio
async def test_request_wrong_content_type(server: RemoteOpenAIServer):
    """A non-JSON Content-Type header must be rejected with an API error."""
    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client()
    with pytest.raises(openai.APIStatusError):
        await client.chat.completions.create(
            messages=chat_input,
            model=MODEL_NAME,
            max_tokens=10000,
            extra_headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
@pytest.mark.parametrize(
    "server_args",
    [pytest.param(["--enable-server-load-tracking"], id="enable-server-load-tracking")],
    indirect=True,
)
@pytest.mark.asyncio
async def test_server_load(server: RemoteOpenAIServer):
    """/load must report 0 when idle and 1 while a request is in flight."""
    # Check initial server load
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 0
    def make_long_completion_request():
        # Blocking request; executed in a worker thread via asyncio.to_thread.
        return requests.post(
            server.url_for("v1/completions"),
            headers={"Content-Type": "application/json"},
            json={
                "prompt": "Give me a long story",
                "max_tokens": 1000,
                "temperature": 0,
            },
        )
    # Start the completion request in a background thread.
    completion_future = asyncio.create_task(
        asyncio.to_thread(make_long_completion_request)
    )
    # Give a short delay to ensure the request has started.
    await asyncio.sleep(0.1)
    # Check server load while the completion request is running.
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 1
    # Wait for the completion request to finish.
    await completion_future
    await asyncio.sleep(0.1)
    # Check server load after the completion request has finished.
    response = requests.get(server.url_for("load"))
    assert response.status_code == HTTPStatus.OK
    assert response.json().get("server_load") == 0
@pytest.mark.asyncio
async def test_health_check_engine_dead_error():
    """The /health handler must answer 503 when the engine raises EngineDeadError."""
    # Import the health function directly to test it in isolation
    from vllm.entrypoints.serve.instrumentator.health import health
    # Create a mock request that simulates what FastAPI would provide
    mock_request = Mock(spec=Request)
    mock_app_state = Mock()
    mock_engine_client = AsyncMock()
    mock_engine_client.check_health.side_effect = EngineDeadError()
    mock_app_state.engine_client = mock_engine_client
    mock_request.app.state = mock_app_state
    # Test the health function directly with our mocked request
    # This simulates what would happen if the engine dies
    response = await health(mock_request)
    # Assert that it returns 503 Service Unavailable
    assert response.status_code == 503

View File

@@ -0,0 +1,798 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import json
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import regex as re
import requests
import torch
from openai import BadRequestError
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download zephyr LoRA files once per test session."""
    # Local import keeps huggingface_hub off the module import path.
    from huggingface_hub import snapshot_download
    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
@pytest.fixture(scope="module")
def server(zephyr_lora_files): # noqa: F811
    """Shared zephyr server with one LoRA adapter ("zephyr-lora") registered."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """Requesting logprobs=False must yield a choice without logprobs."""
    conversation = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    response = await client.chat.completions.create(
        model=model_name,
        messages=conversation,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=False,
    )
    first_choice = response.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """logprobs=True with top_logprobs=0 yields empty per-token top lists."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=0,
    )
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
    """top_logprobs=5 must return exactly five alternatives per token."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=5,
        temperature=0.0,
        logprobs=True,
        top_logprobs=5,
    )
    choice = chat_completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.content is not None
    assert len(choice.logprobs.content[0].top_logprobs) == 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """top_logprobs above the server max must fail without killing the server."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # Default max_logprobs is 20, so this should raise an error
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        stream = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=21,
            stream=True,
        )
        async for chunk in stream:
            ...
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=30,
            stream=False,
        )
    # the server should still work afterwards
    chat_completion = await client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=10, stream=False
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, prompt_logprobs",
    [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(
    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
    """Negative prompt_logprobs is a 400; otherwise presence tracks the flag."""
    params: dict = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"},
            {
                "role": "assistant",
                "content": "The Los Angeles Dodgers won the World Series in 2020.",
            },
            {"role": "user", "content": "Where was it played?"},
        ],
        "model": model_name,
    }
    if prompt_logprobs is not None:
        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
    if prompt_logprobs is not None and prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.chat.completions.create(**params)
    else:
        completion = await client.chat.completions.create(**params)
        if prompt_logprobs is not None:
            assert completion.prompt_logprobs is not None
            assert len(completion.prompt_logprobs) > 0
        else:
            assert completion.prompt_logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_more_than_one_prompt_logprobs_chat(
    client: openai.AsyncOpenAI, model_name: str
):
    """prompt_logprobs=k must return exactly k alternatives per prompt token."""
    params: dict = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"},
            {
                "role": "assistant",
                "content": "The Los Angeles Dodgers won the World Series in 2020.",
            },
            {"role": "user", "content": "Where was it played?"},
        ],
        "model": model_name,
        "extra_body": {"prompt_logprobs": 1},
    }
    completion_1 = await client.chat.completions.create(**params)
    params["extra_body"] = {"prompt_logprobs": 2}
    completion_2 = await client.chat.completions.create(**params)
    assert len(completion_1.prompt_logprobs[3]) == 1
    assert len(completion_2.prompt_logprobs[3]) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    """Basic single-turn then multi-turn chat round-trip with usage checks."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=37, total_tokens=47
    )
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Streamed chunks must reassemble to the non-streamed completion."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(
    client: openai.AsyncOpenAI, model_name: str
):
    """Exercise all valid and invalid combinations of stream_options."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    # Test stream=True, stream_options={"include_usage": False}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": False},
    )
    async for chunk in stream:
        assert chunk.usage is None
    # Test stream=True, stream_options={"include_usage": True,
    # "continuous_usage_stats": False}}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
        stream_options={"include_usage": True, "continuous_usage_stats": False},
    )
    async for chunk in stream:
        if chunk.choices[0].finish_reason is None:
            assert chunk.usage is None
        else:
            # The chunk carrying finish_reason still has usage=None; usage
            # arrives in one extra, choice-less chunk right after it. Fetch
            # it here, while the stream is still live — awaiting anext()
            # after the async-for would raise StopAsyncIteration because
            # the loop has already exhausted the stream.
            assert chunk.usage is None
            final_chunk = await anext(stream)
            assert final_chunk.usage is not None
            assert final_chunk.usage.prompt_tokens > 0
            assert final_chunk.usage.completion_tokens > 0
            assert final_chunk.usage.total_tokens == (
                final_chunk.usage.prompt_tokens + final_chunk.usage.completion_tokens
            )
            assert final_chunk.choices == []
    # Test stream=False, stream_options={"include_usage": None}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": None},
        )
    # Test stream=False, stream_options={"include_usage": True}
    with pytest.raises(BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
            stream=False,
            stream_options={"include_usage": True},
        )
    # Test stream=True, stream_options={"include_usage": True,
    # "continuous_usage_stats": True}
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
    )
    last_completion_tokens = 0
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        # Completion tokens grow monotonically; the trailing usage-only
        # chunk (no choices) repeats the final count.
        assert (
            last_completion_tokens == 0
            or chunk.usage.completion_tokens > last_completion_tokens
            or (
                not chunk.choices
                and chunk.usage.completion_tokens == last_completion_tokens
            )
        )
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )
        last_completion_tokens = chunk.usage.completion_tokens
    assert last_completion_tokens == 10
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat(
    client: openai.AsyncOpenAI,
    sample_structured_outputs_choices,
):
    """Choice-constrained outputs must come from the allowed set and vary."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    choice1 = chat_completion.choices[0].message.content
    assert choice1 in sample_structured_outputs_choices
    messages.append({"role": "assistant", "content": choice1})
    messages.append({"role": "user", "content": "I disagree, pick another one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.7,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    choice2 = chat_completion.choices[0].message.content
    assert choice2 in sample_structured_outputs_choices
    assert choice1 != choice2
@pytest.mark.asyncio
async def test_structured_outputs_json_chat(
    client: openai.AsyncOpenAI,
    sample_json_schema,
):
    """JSON structured outputs: responses must validate against the schema."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example JSON for an employee profile that "
            f"fits this schema: {sample_json_schema}",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        extra_body=dict(structured_outputs={"json": sample_json_schema}),
    )
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
    jsonschema.validate(instance=json1, schema=sample_json_schema)
    messages.append({"role": "assistant", "content": message.content})
    messages.append(
        {"role": "user", "content": "Give me another one with a different name and age"}
    )
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        extra_body=dict(structured_outputs={"json": sample_json_schema}),
    )
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
    jsonschema.validate(instance=json2, schema=sample_json_schema)
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
@pytest.mark.asyncio
async def test_structured_outputs_regex_chat(
    client: openai.AsyncOpenAI,
    sample_regex,
):
    """Two-turn chat constrained by a regex (an IPv4 pattern).

    Each reply must fully match the regex, and the second turn must
    produce a different string than the first.
    """
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example IP address with this regex: {sample_regex}",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
        extra_body=dict(structured_outputs={"regex": sample_regex}),
    )
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(sample_regex, ip1) is not None
    messages.append({"role": "assistant", "content": ip1})
    messages.append({"role": "user", "content": "Give me a different one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=20,
        extra_body=dict(structured_outputs={"regex": sample_regex}),
    )
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(sample_regex, ip2) is not None
    assert ip1 != ip2
@pytest.mark.asyncio
async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
    """A non-string `regex` value (here a dict) must be rejected with 400."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            extra_body=dict(structured_outputs={"regex": {1: "Python", 2: "C++"}}),
        )
@pytest.mark.asyncio
async def test_structured_outputs_choice_chat_logprobs(
    client: openai.AsyncOpenAI, sample_structured_outputs_choices
):
    """Choice-constrained chat must still return well-formed logprobs."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": "The best language for type-safe systems programming is ",
        },
    ]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(
            structured_outputs={"choice": sample_structured_outputs_choices}
        ),
    )
    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.content is not None
    top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
    # -9999.0 is the minimum logprob returned by OpenAI
    for item in top_logprobs:
        assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
@pytest.mark.asyncio
async def test_response_format_json_object(client: openai.AsyncOpenAI):
    """`response_format={"type": "json_object"}` must yield parseable JSON.

    Runs twice to reduce flakiness from sampling.
    """
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "user",
                    "content": (
                        "what is 1+1? please respond with a JSON object, "
                        'the format is {"result": 2}'
                    ),
                }
            ],
            response_format={"type": "json_object"},
        )
        content = resp.choices[0].message.content
        assert content is not None
        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
    """`json_schema` response format must force schema-conforming output.

    First establishes a negative baseline: without the schema the prompt
    does not produce the expected JSON; then verifies it does with it.
    """
    prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
        )
        content = resp.choices[0].message.content
        assert content is not None
        # Either json.loads fails or the parsed value is wrong.
        with pytest.raises((json.JSONDecodeError, AssertionError)):
            loaded = json.loads(content)
            assert loaded == {"result": 2}, loaded
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "foo_test",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "result": {"type": "integer"},
                        },
                    },
                },
            },
        )
        content = resp.choices[0].message.content
        assert content is not None
        loaded = json.loads(content)
        assert loaded == {"result": 2}, loaded
@pytest.mark.asyncio
async def test_extra_fields_allowed(client: openai.AsyncOpenAI):
    """Unknown message fields must be tolerated, not rejected."""
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": "what is 1+1?",
                "extra_field": "0",
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )
    content = resp.choices[0].message.content
    assert content is not None
@pytest.mark.asyncio
async def test_complex_message_content(client: openai.AsyncOpenAI):
    """Structured (list-of-parts) message content must be accepted."""
    content = [
        {
            "type": "text",
            "text": "what is 1+1? please provide the result without any other text.",
        }
    ]
    resp = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "user",
                "content": content,
            }
        ],
        temperature=0,
        seed=0,
    )
    # Greedy sampling (temperature=0, fixed seed) makes "2" deterministic.
    content = resp.choices[0].message.content
    assert content == "2"
@pytest.mark.asyncio
async def test_custom_role(client: openai.AsyncOpenAI):
    """String and parts-list content must render identically for custom roles."""
    # Not sure how the model handles custom roles so we just check that
    # both string and complex message content are handled in the same way
    resp1 = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "my-custom-role",
                "content": "what is 1+1?",
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )
    resp2 = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {
                "role": "my-custom-role",
                "content": [{"type": "text", "text": "what is 1+1?"}],
            }
        ],  # type: ignore
        temperature=0,
        seed=0,
    )
    content1 = resp1.choices[0].message.content
    content2 = resp2.choices[0].message.content
    assert content1 == content2
@pytest.mark.asyncio
async def test_long_seed(client: openai.AsyncOpenAI):
    """Seeds outside the int64 range must be rejected with a validation error."""
    for seed in [torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).max + 1]:
        with pytest.raises(BadRequestError) as exc_info:
            await client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant.",
                    }
                ],
                temperature=0,
                seed=seed,
            )
        # Pydantic emits greater_than_equal / less_than_equal constraint names.
        assert (
            "greater_than_equal" in exc_info.value.message
            or "less_than_equal" in exc_info.value.message
        )
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The /invocations endpoint must mirror /chat/completions output."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_completion_tokens": 5,
        "temperature": 0.0,
        "logprobs": False,
    }
    chat_completion = await client.chat.completions.create(**request_args)
    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()
    chat_output = chat_completion.model_dump()
    invocation_output = invocation_response.json()
    # Same schema and, with temperature 0, identical generated choices.
    assert chat_output.keys() == invocation_output.keys()
    assert chat_output["choices"] == invocation_output["choices"]

View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import NamedTuple
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by the model's config."""
    model_config = ModelConfig(model=model_name, seed=0, dtype="float16")
    return model_config.get_vocab_size()
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for these tests."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        "--max-logprobs",  # test prompt_logprobs equal to -1
        # 151936 matches the Qwen2 vocab size so -1 can expand to full vocab.
        "151936",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
class TestCase(NamedTuple):
    """Parameter bundle: model under test and whether to echo the prompt."""

    model_name: str
    echo: bool
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(model_name=MODEL_NAME, echo=True),
        TestCase(model_name=MODEL_NAME, echo=False),
    ],
)
async def test_chat_session_with_echo_and_continue_final_message(
    client: openai.AsyncOpenAI, test_case: TestCase
):
    """continue_final_message resumes the partial assistant turn; `echo`
    controls whether that partial prefix appears in the returned content."""
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # test echo with continue_final_message parameter
    chat_completion = await client.chat.completions.create(
        model=test_case.model_name,
        messages=[
            {"role": "user", "content": "tell me a common saying"},
            {"role": "assistant", "content": saying},
        ],
        extra_body={
            "echo": test_case.echo,
            "continue_final_message": True,
            "add_generation_prompt": False,
        },
    )
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "stop"
    message = choice.message
    if test_case.echo:
        assert message.content is not None and saying in message.content
    else:
        assert message.content is not None and saying not in message.content
    assert message.role == "assistant"
@pytest.mark.asyncio
async def test_prompt_logprobs(client: openai.AsyncOpenAI):
    """prompt_logprobs=-1 must return logprobs for the prompt tokens."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Beijing is the capital of which country?"},
    ]
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        extra_body={"prompt_logprobs": -1},
    )
    assert completion.prompt_logprobs is not None
    assert len(completion.prompt_logprobs) > 0
@pytest.mark.asyncio
async def test_top_logprobs(client: openai.AsyncOpenAI):
    """top_logprobs=-1 must expand to the full vocabulary of the model."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Beijing is the capital of which country?"},
    ]
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1,
        extra_body={
            "top_logprobs": -1,
            # NOTE(review): "true" is a string, not a bool — presumably the
            # server coerces it; confirm, or pass logprobs=True directly.
            "logprobs": "true",
        },
    )
    assert completion.choices[0].logprobs is not None
    assert completion.choices[0].logprobs.content is not None
    assert len(completion.choices[0].logprobs.content) > 0
    assert len(
        completion.choices[0].logprobs.content[0].top_logprobs
    ) == get_vocab_size(MODEL_NAME)

View File

@@ -0,0 +1,227 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]
@dataclass
class MockHFConfig:
    """Minimal stand-in for a HuggingFace model config."""

    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Minimal stand-in for vLLM's ModelConfig used by OpenAIServingChat."""

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        # Mirror ModelConfig's contract: never return None, always a dict.
        return self.diff_sampling_param or {}
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    """Build an OpenAIServingChat around *engine* with input processing mocked.

    The preprocessing hooks are replaced with AsyncMocks so tests can drive
    `create_chat_completion` without a tokenizer/chat-template round trip.
    """
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
    )

    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        # Pass the prompt through unchanged; no trace/lora handling needed.
        return dict(engine_prompt), {}

    async def _fake_preprocess_chat(*args, **kwargs):
        # return conversation, engine_prompts
        return (
            [{"role": "user", "content": "Test"}],
            [{"prompt_token_ids": [1, 2, 3]}],
        )

    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
    return serving_chat
@pytest.mark.asyncio
async def test_chat_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    serving_chat = _build_serving_chat(mock_engine)

    # A single output whose finish_reason signals an engine-side failure.
    completion_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=False,
    )
    response = await serving_chat.create_chat_completion(request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
async def test_chat_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()

    serving_chat = _build_serving_chat(mock_engine)

    # First chunk: normal token, stream still open.
    completion_output_1 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )
    request_output_1 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_1],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    # Second chunk: engine reports an error as the finish reason.
    completion_output_2 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output_2 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_2],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield request_output_1
        yield request_output_2

    mock_engine.generate = MagicMock(side_effect=mock_generate)

    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=True,
    )
    response = await serving_chat.create_chat_completion(request)
    chunks = []
    async for chunk in response:
        chunks.append(chunk)
    # The error must surface mid-stream and the SSE stream must still
    # terminate with the [DONE] sentinel.
    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"

View File

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio
from vllm.config import ModelConfig
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by the model's config."""
    model_config = ModelConfig(model=model_name, seed=0, dtype="bfloat16")
    return model_config.get_vocab_size()
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for the logit-bias tests."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_chat_logit_bias_valid(client):
    """Test that valid logit_bias values are accepted in chat completions."""
    vocab_size = get_vocab_size(MODEL_NAME)
    # Highest valid token id (ids are 0-based).
    valid_token_id = vocab_size - 1
    completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Testing valid logit bias"}],
        max_tokens=5,
        logit_bias={str(valid_token_id): 1.0},
    )
    assert completion.choices[0].message.content is not None
@pytest.mark.asyncio
async def test_chat_logit_bias_invalid(client):
    """Test that invalid logit_bias values are rejected in chat completions."""
    vocab_size = get_vocab_size(MODEL_NAME)
    # First id past the valid range.
    invalid_token_id = vocab_size + 1
    with pytest.raises(openai.BadRequestError) as excinfo:
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "Testing invalid logit bias"}],
            max_tokens=5,
            logit_bias={str(invalid_token_id): 1.0},
        )
    error = excinfo.value
    error_message = str(error)
    # The 400 error should identify the offending id and the vocab bound.
    assert error.status_code == 400
    assert str(invalid_token_id) in error_message
    assert str(vocab_size) in error_message

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATION_OUTPUT = [
(
"facebook/opt-125m",
chatml_jinja_path,
True,
False,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
""",
),
(
"facebook/opt-125m",
chatml_jinja_path,
False,
False,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""",
),
(
"facebook/opt-125m",
chatml_jinja_path,
False,
True,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of""",
),
]
TEST_MESSAGES = [
{"role": "user", "content": "Hello"},
{"role": "assistant", "content": "Hi there!"},
{"role": "user", "content": "What is the capital of"},
]
ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}
def test_load_chat_template():
    """Loading the bundled chatml template returns its exact file contents."""
    # Testing chatml template
    template_content = load_chat_template(chat_template=chatml_jinja_path)

    # Test assertions
    assert template_content is not None
    # Hard coded value for template_chatml.jinja
    assert (
        template_content
        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501
    )
def test_no_load_chat_template_filelike():
    """A path-like template string that does not exist must be rejected."""
    missing_path = "../../examples/does_not_exist"
    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=missing_path)
def test_no_load_chat_template_literallike():
    """An inline Jinja literal is returned unchanged."""
    literal = "{{ messages }}"
    assert load_chat_template(chat_template=literal) == literal
@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Rendering TEST_MESSAGES through the chatml template must produce the
    expected prompt for each add_generation_prompt / continue_final_message
    combination."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")

    model_config = ModelConfig(
        model,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        trust_remote_code=model_info.trust_remote_code,
        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
        skip_tokenizer_init=model_info.require_embed_inputs,
        enable_prompt_embeds=model_info.require_embed_inputs,
        enable_mm_embeds=model_info.require_embed_inputs,
        enforce_eager=model_info.enforce_eager,
        dtype=model_info.dtype,
    )

    # Initialize the tokenizer
    tokenizer = get_tokenizer(
        tokenizer_name=model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    template_content = load_chat_template(chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
        # continue_final_message needs a trailing assistant turn to resume.
        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
        if continue_final_message
        else TEST_MESSAGES,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    result = apply_hf_chat_template(
        tokenizer=tokenizer,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
        model_config=model_config,
        tools=None,
        add_generation_prompt=mock_request.add_generation_prompt,
        continue_final_message=mock_request.continue_final_message,
    )

    # Test assertion
    assert result == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}"
    )

View File

@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# a reasoning and tool calling model
MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Server with reasoning parsing and auto tool choice enabled."""
    args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--reasoning-parser",
        "deepseek_r1",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}
]
MESSAGES = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the temperate will be in Dallas, "
"in fahrenheit?",
},
]
FUNC_NAME = "get_current_weather"
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
def extract_reasoning_and_calls(chunks: list):
    """Fold streamed chat chunks into reasoning text and tool calls.

    Args:
        chunks: streamed chat-completion chunks; each is expected to have
            `choices[0].delta` with optional `tool_calls` / `reasoning`.

    Returns:
        Tuple of (reasoning, arguments, function_names) where
        ``arguments[i]`` is the concatenated argument string and
        ``function_names[i]`` the name of the i-th streamed tool call.
    """
    reasoning = ""
    tool_call_idx = -1
    arguments = []
    function_names = []
    for chunk in chunks:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            tool_call = delta.tool_calls[0]
            if tool_call.index != tool_call_idx:
                # A new tool call started: open fresh accumulators for it.
                tool_call_idx = tool_call.index
                arguments.append("")
                function_names.append("")
            if tool_call.function:
                if tool_call.function.name:
                    function_names[tool_call_idx] = tool_call.function.name
                if tool_call.function.arguments:
                    arguments[tool_call_idx] += tool_call.function.arguments
        # Guard against deltas where `reasoning` is missing OR None (e.g.
        # final chunks) — `reasoning += None` would raise TypeError.
        elif getattr(delta, "reasoning", None):
            reasoning += delta.reasoning
    return reasoning, arguments, function_names
# test streaming
@pytest.mark.asyncio
async def test_chat_streaming_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Streamed chunks must carry both reasoning text and the tool call."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    async for chunk in stream:
        chunks.append(chunk)

    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
    assert len(reasoning) > 0
    assert len(function_names) > 0 and function_names[0] == FUNC_NAME
    assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
# test full generate
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Non-streaming response must carry reasoning plus the parsed tool call."""
    tool_calls = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    assert len(tool_calls.choices[0].message.reasoning) > 0
    assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
    assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS

View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server():
    """Server with chunked prefill enabled for long-prompt streaming tests."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module's test server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
    client: openai.AsyncOpenAI,
):
    """Continuous usage stats must stay consistent on a chunked-prefill prompt."""
    # Test stream with long prompt
    prompt = "What is the capital of France?" * 400
    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )
    tokens_received = 0
    finished = False
    async for chunk in stream:
        # Every chunk carries usage; totals must always reconcile.
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )

        if not finished:
            tokens_received += 1
            assert chunk.choices[0].text

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
    client: openai.AsyncOpenAI,
):
    """Chat variant: usage stats and logprobs stay consistent per chunk,
    allowing at most one empty role-only chunk at stream start."""
    # Test stream with long prompt
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?" * 400},
    ]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )
    tokens_received = 0
    empty_chunks_received = 0
    finished = False
    async for chunk in stream:
        # Every chunk carries usage; totals must always reconcile.
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (
            chunk.usage.prompt_tokens + chunk.usage.completion_tokens
        )

        if not finished:
            if chunk.choices[0].delta.content == "":
                # when there is no tokens generated
                assert chunk.usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_received += 1
            else:
                tokens_received += 1

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received

    assert empty_chunks_received <= 1

View File

@@ -0,0 +1,210 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.utils.argparse_utils import FlexibleArgumentParser
from ...utils import VLLM_PATH
LORA_MODULE = {
"name": "module2",
"path": "/path/to/module2",
"base_model_name": "llama",
}
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
@pytest.fixture
def serve_parser():
    """Fresh CLI parser configured with vLLM's serve arguments."""
    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
    return make_arg_parser(parser)
### Test config parsing
def test_config_arg_parsing(serve_parser, cli_config_file):
    """Explicit CLI flags override --config values regardless of order."""
    args = serve_parser.parse_args([])
    assert args.port == 8000
    args = serve_parser.parse_args(["--config", cli_config_file])
    assert args.port == 12312
    # CLI flag wins whether it comes before or after --config.
    args = serve_parser.parse_args(
        [
            "--config",
            cli_config_file,
            "--port",
            "9000",
        ]
    )
    assert args.port == 9000
    args = serve_parser.parse_args(
        [
            "--port",
            "9000",
            "--config",
            cli_config_file,
        ]
    )
    assert args.port == 9000
### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser):
    """Legacy `name=path` syntax parses into a LoRAModulePath."""
    # Test old format: name=path
    args = serve_parser.parse_args(
        [
            "--lora-modules",
            "module1=/path/to/module1",
        ]
    )
    expected = [LoRAModulePath(name="module1", path="/path/to/module1")]
    assert args.lora_modules == expected
def test_valid_json_format(serve_parser):
    """JSON object syntax parses into a LoRAModulePath with base model."""
    # Test valid JSON format input
    args = serve_parser.parse_args(
        [
            "--lora-modules",
            json.dumps(LORA_MODULE),
        ]
    )
    expected = [
        LoRAModulePath(name="module2", path="/path/to/module2", base_model_name="llama")
    ]
    assert args.lora_modules == expected
def test_invalid_json_format(serve_parser):
    """Malformed JSON (missing closing brace) makes argparse exit."""
    # Test invalid JSON format input, missing closing brace
    with pytest.raises(SystemExit):
        serve_parser.parse_args(
            ["--lora-modules", '{"name": "module3", "path": "/path/to/module3"']
        )
def test_invalid_type_error(serve_parser):
    """A value that is neither JSON nor key=value makes argparse exit."""
    # Test type error when values are not JSON or key=value
    with pytest.raises(SystemExit):
        serve_parser.parse_args(
            [
                "--lora-modules",
                "invalid_format",  # This is not JSON or key=value format
            ]
        )
def test_invalid_json_field(serve_parser):
    """Well-formed JSON missing a required field makes argparse exit."""
    # Test valid JSON format but missing required fields
    with pytest.raises(SystemExit):
        serve_parser.parse_args(
            [
                "--lora-modules",
                '{"name": "module4"}',  # Missing required 'path' field
            ]
        )
def test_empty_values(serve_parser):
    """An empty --lora-modules value parses to an empty module list."""
    parsed = serve_parser.parse_args(["--lora-modules", ""])
    assert parsed.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
    """Old key=value and JSON syntaxes can be mixed in one invocation."""
    # Test multiple valid inputs (both old and JSON format)
    args = serve_parser.parse_args(
        [
            "--lora-modules",
            "module1=/path/to/module1",
            json.dumps(LORA_MODULE),
        ]
    )
    expected = [
        LoRAModulePath(name="module1", path="/path/to/module1"),
        LoRAModulePath(
            name="module2", path="/path/to/module2", base_model_name="llama"
        ),
    ]
    assert args.lora_modules == expected
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser"""
    # NOTE(review): the name says "passes" but the test expects a TypeError —
    # consider renaming to *_fails_* to match the docstring and assertion.
    # If we enable-auto-tool-choice, explode with no tool-call-parser
    args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(args)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser"""
    args = serve_parser.parse_args(
        args=[
            "--enable-auto-tool-choice",
            "--tool-call-parser",
            "mistral",
        ]
    )
    # Should not raise.
    validate_parsed_serve_args(args)
def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
    """Auto tool choice combined with a reasoning parser must be rejected."""
    cli = ["--enable-auto-tool-choice", "--reasoning-parser", "deepseek_r1"]
    parsed = serve_parser.parse_args(args=cli)
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_passes_with_reasoning_parser(serve_parser):
    """A reasoning parser on its own is a valid configuration."""
    parsed = serve_parser.parse_args(args=["--reasoning-parser", "deepseek_r1"])
    # Should not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """An existing chat-template file passes validation."""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])
    # Should not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """A nonexistent chat-template path must fail validation."""
    parsed = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
@pytest.mark.parametrize(
    "cli_args, expected_middleware",
    [
        (
            ["--middleware", "middleware1", "--middleware", "middleware2"],
            ["middleware1", "middleware2"],
        ),
        ([], []),
    ],
)
def test_middleware(serve_parser, cli_args, expected_middleware):
    """Repeated --middleware flags accumulate in order; none yields []."""
    parsed = serve_parser.parse_args(args=cli_args)
    assert parsed.middleware == expected_middleware

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
class TestWorkerExtension:
    """Worker-side extension exercised through the collective_rpc endpoint."""

    def get_model_name(self) -> str:
        """Return a plain string (exercises a non-pydantic return type)."""
        return MODEL_NAME

    def echo_args_kwargs(self, *args, **kwargs) -> dict[str, Any]:
        """Return positional and keyword arguments verbatim, plus a count."""
        return {
            "args": list(args),
            "kwargs": kwargs,
            "total_items": len(args) + len(kwargs),
        }

    def return_none(self, *args, **kwargs) -> None:
        """Return nothing, to exercise a None RPC result."""
        return None
@pytest.fixture(scope="module")
def server():
    """Spin up a dev-mode server exposing the test worker extension."""
    cli_args = [
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--worker-extension-cls",
        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
    ]
    env = {"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"}
    with RemoteOpenAIServer(MODEL_NAME, cli_args, env_dict=env) as remote:
        yield remote
def test_get_model_name(server):
    """The endpoint returns one result per worker holding the model name."""
    url = server.url_for("collective_rpc")
    resp = requests.post(url, json={"method": "get_model_name"})
    assert resp.status_code == 200
    body = resp.json()
    assert "results" in body
    assert body["results"] == [MODEL_NAME]
def test_return_none(server):
    """A method returning nothing surfaces as a single null result."""
    resp = requests.post(
        server.url_for("collective_rpc"), json={"method": "return_none"}
    )
    assert resp.status_code == 200
    assert resp.json()["results"] == [None]
def test_echo_args_kwargs(server):
    """args/kwargs round-trip through the RPC and come back in a dict."""
    call_args = ["arg1", "arg2"]
    call_kwargs = {"key1": "value1", "key2": "value2"}
    payload = {
        "method": "echo_args_kwargs",
        "args": call_args,
        "kwargs": call_kwargs,
    }
    resp = requests.post(server.url_for("collective_rpc"), json=payload)
    assert resp.status_code == 200
    echoed = resp.json()["results"][0]
    assert echoed["args"] == call_args
    assert echoed["kwargs"] == call_kwargs
    assert echoed["total_items"] == len(call_args) + len(call_kwargs)

View File

@@ -0,0 +1,216 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
BASE_MODEL_PATHS = [
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
]
@dataclass
class MockHFConfig:
    # Minimal stand-in for a HuggingFace config; only model_type is provided.
    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Lightweight stand-in for vLLM's ModelConfig used by these tests.

    NOTE(review): the attributes below without type annotations are plain
    class attributes shared across instances, not dataclass fields; only the
    annotated ones participate in the generated __init__.
    """

    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False

    def get_diff_sampling_param(self):
        # Returns {} when no per-request sampling overrides were configured.
        return self.diff_sampling_param or {}
def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
    """Build an OpenAIServingCompletion over *engine* with input processing stubbed.

    _process_inputs is replaced by an AsyncMock that returns the engine prompt
    unchanged (and no extra metadata), so tests can drive create_completion
    without a real tokenizer/processing pipeline.
    """
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        engine,
        models,
        request_logger=None,
    )

    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        # Pass the prompt through untouched; no extra request metadata.
        return dict(engine_prompt), {}

    serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    return serving_completion
@pytest.mark.asyncio
async def test_completion_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_completion = _build_serving_completion(mock_engine)

    # One already-finished output whose finish_reason signals a failure.
    errored_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[
            CompletionOutput(
                index=0,
                text="",
                token_ids=[],
                cumulative_logprob=None,
                logprobs=None,
                finish_reason="error",
            )
        ],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )

    async def mock_generate(*args, **kwargs):
        yield errored_output

    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=False,
    )

    response = await serving_completion.create_completion(request)

    # The errored output must surface as a 500 ErrorResponse.
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
async def test_completion_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_completion = _build_serving_completion(mock_engine)

    def _streamed_output(finish_reason, finished):
        # One streamed RequestOutput carrying a single completion delta.
        return RequestOutput(
            request_id="test-id",
            prompt="Test prompt",
            prompt_token_ids=[1, 2, 3],
            prompt_logprobs=None,
            outputs=[
                CompletionOutput(
                    index=0,
                    text="Hello",
                    token_ids=[100],
                    cumulative_logprob=None,
                    logprobs=None,
                    finish_reason=finish_reason,
                )
            ],
            finished=finished,
            metrics=None,
            lora_request=None,
            encoder_prompt=None,
            encoder_prompt_token_ids=None,
        )

    async def mock_generate(*args, **kwargs):
        # A normal chunk followed by a terminal errored chunk.
        yield _streamed_output(None, False)
        yield _streamed_output("error", True)

    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=True,
    )

    response = await serving_completion.create_completion(request)
    chunks = [chunk async for chunk in response]

    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    # The SSE stream must still be terminated properly.
    assert chunks[-1] == "data: [DONE]\n\n"

View File

@@ -0,0 +1,486 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json
import jsonschema
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
"options": {
"$ref": "#/$defs/WeatherOptions",
"description": "Optional parameters for weather query",
},
},
"required": ["country", "unit"],
"$defs": {
"WeatherOptions": {
"title": "WeatherOptions",
"type": "object",
"additionalProperties": False,
"properties": {
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
"description": "Temperature unit",
"title": "Temperature Unit",
},
"include_forecast": {
"type": "boolean",
"default": False,
"description": "Whether to include a 24-hour forecast",
"title": "Include Forecast",
},
"language": {
"type": "string",
"default": "zh-CN",
"description": "Language of the response",
"title": "Language",
"enum": ["zh-CN", "en-US", "ja-JP"],
},
},
},
},
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the forecast for, e.g. "
"'Vienna'",
"default": "Vienna",
},
"country": {
"type": "string",
"description": "The country that the city is in, e.g. "
"'Austria'",
},
"days": {
"type": "integer",
"description": "Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["country", "days", "unit"],
},
},
},
]
messages = [
{"role": "user", "content": "Hi! How are you doing today?"},
{"role": "assistant", "content": "I'm doing well! How can I help you?"},
{
"role": "user",
"content": "Can you tell me what the current weather is in Berlin and the "
"forecast for the next 5 days, in fahrenheit?",
},
]
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Qwen3 server with hermes tool parsing and xgrammar structured output."""
    cli_args = [
        # half precision keeps CI memory and latency low
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote:
        yield remote
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server."""
    async with server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize(
    "tool_choice",
    [
        "auto",
        "required",
        {"type": "function", "function": {"name": "get_current_weather"}},
    ],
)
@pytest.mark.parametrize("enable_thinking", [True, False])
async def test_function_tool_use(
    client: openai.AsyncOpenAI,
    model_name: str,
    stream: bool,
    tool_choice: str | dict,
    enable_thinking: bool,
):
    """Tool calls are produced for every tool_choice mode, streaming and not,
    and reasoning content is emitted when thinking is enabled."""
    if not stream:
        # Non-streaming test
        chat_completion = await client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
        )
        if enable_thinking:
            # Reasoning text must be present and non-empty when thinking is on.
            assert chat_completion.choices[0].message.reasoning is not None
            assert chat_completion.choices[0].message.reasoning != ""
        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
    else:
        # Streaming test
        output_stream = await client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
            extra_body={"chat_template_kwargs": {"enable_thinking": enable_thinking}},
        )
        # Accumulate tool-call and reasoning deltas across the whole stream.
        output = []
        reasoning = []
        async for chunk in output_stream:
            if chunk.choices:
                if enable_thinking and getattr(
                    chunk.choices[0].delta, "reasoning", None
                ):
                    reasoning.append(chunk.choices[0].delta.reasoning)
                if chunk.choices[0].delta.tool_calls:
                    output.extend(chunk.choices[0].delta.tool_calls)
        assert len(output) > 0
        if enable_thinking:
            assert len(reasoning) > 0
@pytest.fixture(scope="module")
def k2_server():  # noqa: F811
    """Same server as `server`, but with the HF config overridden to kimi_k2.

    kv_lora_rank is nulled out to avoid an error in the is_deepseek_mla check
    when model_type is forced to kimi_k2 (hack to test the kimi_k2 tool-id
    format on a small model).
    """
    cli_args = [
        # half precision keeps CI memory and latency low
        "--dtype",
        "half",
        "--enable-auto-tool-choice",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--tool-call-parser",
        "hermes",
        "--reasoning-parser",
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
    ]
    overrides = {"model_type": "kimi_k2", "kv_lora_rank": None}
    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, override_hf_configs=overrides
    ) as remote:
        yield remote
@pytest_asyncio.fixture
async def k2_client(k2_server):
    """Async client bound to the kimi_k2-flavoured server."""
    async with k2_server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("tool_choice", ["required"])
async def test_tool_id_kimi_k2(
    k2_client: openai.AsyncOpenAI, model_name: str, stream: bool, tool_choice: str
):
    """kimi_k2 tool-call ids follow the 'functions.<name>:<index>' format."""
    if not stream:
        # Non-streaming test
        chat_completion = await k2_client.chat.completions.create(
            messages=messages, model=model_name, tools=tools, tool_choice=tool_choice
        )
        assert chat_completion.choices[0].message.tool_calls is not None
        assert len(chat_completion.choices[0].message.tool_calls) > 0
        assert chat_completion.choices[0].message.tool_calls[0].id in [
            "functions.get_current_weather:0",
            "functions.get_forecast:1",
        ]
    else:
        # Streaming test
        output_stream = await k2_client.chat.completions.create(
            messages=messages,
            model=model_name,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
        )
        output = []
        async for chunk in output_stream:
            if chunk.choices and chunk.choices[0].delta.tool_calls:
                output.extend(chunk.choices[0].delta.tool_calls)
        for o in output:
            # Deltas may omit the id; when present it must match the format.
            assert o.id is None or o.id in [
                "functions.get_current_weather:0",
                "functions.get_forecast:1",
            ]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("arguments", ["{}", ""])
async def test_no_args_tool_call(
    client: openai.AsyncOpenAI, model_name: str, arguments: str
):
    """A parameterless tool round-trips with both '' and '{}' as arguments."""
    # Step 1: Define a tool that requires no parameters
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_time",
                "description": "Get the current date and time. No parameters needed.",
                "parameters": {
                    "type": "object",
                    "properties": {},  # No parameters
                    "required": [],  # No required fields
                },
            },
        }
    ]
    messages = [{"role": "user", "content": "What time is it now?"}]
    # Step 2: Send user message and let model decide whether to call the tool
    response = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        tools=tools,
        tool_choice="auto",  # Let model choose automatically
    )
    # Step 3: Check if model wants to call a tool
    message = response.choices[0].message
    if message.tool_calls:
        # Get the first tool call
        tool_call = message.tool_calls[0]
        tool_name = tool_call.function.name
        # Step 4: Execute the tool locally (no parameters)
        if tool_name == "get_current_time":
            # Test both empty string and "{}" for no-arg tool calls
            tool_call.function.arguments = arguments
            messages.append(message)
            current_time = datetime.datetime.now()
            result = current_time.isoformat()
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": result,
                }
            )
            # Step 5: Send tool result back to model to continue conversation
            final_response = await client.chat.completions.create(
                model=model_name,
                messages=messages,
            )
            # Output final natural language response
            assert final_response.choices[0].message.content is not None
    else:
        # No tool called — just print model's direct reply
        assert message.content is not None
@pytest.mark.asyncio
async def test_named_tool_use(
    client: openai.AsyncOpenAI,
    sample_json_schema,
):
    """Forcing a named tool returns schema-valid arguments, both non-streaming
    and streaming; a follow-up request must yield different field values."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": (
                "Give an example JSON for an employee profile using the specified tool."
            ),
        },
    ]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "dummy_function_name",
                "description": "This is a dummy function",
                "parameters": sample_json_schema,
            },
        }
    ]
    tool_choice = {"type": "function", "function": {"name": "dummy_function_name"}}
    # non-streaming
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        tools=tools,
        temperature=0.0,
        tool_choice=tool_choice,
    )
    message = chat_completion.choices[0].message
    # With a forced tool, no plain text content should be produced.
    assert len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
    json1 = json.loads(json_string)
    jsonschema.validate(instance=json1, schema=sample_json_schema)
    messages.append({"role": "assistant", "content": json_string})
    messages.append(
        {"role": "user", "content": "Give me another one with a different name and age"}
    )
    # streaming
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=1000,
        tools=tools,
        tool_choice=tool_choice,
        temperature=0.0,
        stream=True,
    )
    output = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        assert delta.content is None or len(delta.content) == 0
        if delta.tool_calls:
            output.append(delta.tool_calls[0].function.arguments)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    json2 = json.loads("".join(output))
    jsonschema.validate(instance=json2, schema=sample_json_schema)
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
@pytest.mark.asyncio
async def test_inconsistent_tool_choice_and_tools(
    client: openai.AsyncOpenAI, sample_json_schema
):
    """Mismatched tool_choice/tools combinations must be rejected with a 400."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {
            "role": "user",
            "content": f"Give an example JSON for an employee profile that "
            f"fits this schema: {sample_json_schema}",
        },
    ]
    # Case 1: tool_choice names a function but no tools are supplied.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tool_choice={
                "type": "function",
                "function": {"name": "dummy_function_name"},
            },
        )
    # Case 2: tool_choice names a function that is not among the tools.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "dummy_function_name",
                        "description": "This is a dummy function",
                        "parameters": sample_json_schema,
                    },
                }
            ],
            tool_choice={
                "type": "function",
                "function": {"name": "nondefined_function_name"},
            },
        )
    # Case 3: tool_choice is a malformed (empty) object.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_completion_tokens=1000,
            tools=[
                {
                    "type": "function",
                    "function": {
                        "name": "dummy_function_name",
                        "description": "This is a dummy function",
                        "parameters": sample_json_schema,
                    },
                }
            ],
            tool_choice={},
        )

View File

@@ -0,0 +1,307 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import json
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import torch
# downloading lora to test lora requests
from openai import BadRequestError
from transformers import AutoConfig
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
LORA_SERVING_MODEL_NAME = "opt125m-lora"
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
@pytest.fixture(scope="module", params=["use-lora"])
def default_server_args(
    request: pytest.FixtureRequest, opt125_lora_files: str
) -> list[str]:
    """Base CLI args for the prompt-embeds server; optionally adds LoRA."""
    server_args = [
        # half precision for CI speed/memory savings
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        # Prompt Embeds server args
        "--enable-prompt-embeds",
    ]
    if request.param == "use-lora":
        lora_spec = json.dumps(
            {
                "name": LORA_SERVING_MODEL_NAME,
                "path": opt125_lora_files,
                "base_model_name": MODEL_NAME,
            }
        )
        server_args += [
            "--enable-lora",
            "--lora-module",
            lora_spec,
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    return server_args
EXAMPLE_PROMPTS = [
"Hello, my name is",
"What is an LLM?",
]
def _encode_embeds(embeds: torch.Tensor):
buffer = io.BytesIO()
torch.save(embeds, buffer)
return base64.b64encode(buffer.getvalue()).decode("utf-8")
@pytest.fixture(scope="module")
def example_prompt_embeds(hf_runner):
    """Create example embeddings and return them as base64 encoded strings."""
    with hf_runner(MODEL_NAME) as hf_model:
        embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)
    return [_encode_embeds(e) for e in embeddings]
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
    """Server fixture, parametrized with/without frontend multiprocessing."""
    extra_flag = request.param
    if extra_flag:
        default_server_args.append(extra_flag)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote:
        yield remote
@pytest_asyncio.fixture
async def client_with_prompt_embeds(server_with_prompt_embeds):
    """Async client bound to the prompt-embeds server."""
    async with server_with_prompt_embeds.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_prompt_embeds(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
    model_name: str,
):
    """End-to-end /completions coverage for prompt_embeds: single input,
    batch, streaming, batch streaming, and mixed text + embeds requests.
    """
    encoded_embeds, encoded_embeds2 = example_prompt_embeds
    # Test case: Single prompt embeds input
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    assert len(completion.choices[0].text) >= 1
    assert completion.choices[0].prompt_logprobs is None
    # Test case: batch completion with prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    assert len(completion.choices) == 2
    assert len(completion.choices[0].text) >= 1
    assert len(completion.choices[1].text) >= 1
    # Test case: streaming with prompt_embeds
    single_completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    single_output = single_completion.choices[0].text
    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    chunks = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    # Streamed chunks must concatenate to the non-streamed result.
    assert "".join(chunks) == single_output
    # Test case: batch streaming with prompt_embeds
    stream = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        stream=True,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    chunks_stream_embeds: list[list[str]] = [[], []]
    finish_reason_count = 0
    async for chunk in stream:
        chunks_stream_embeds[chunk.choices[0].index].append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == 2
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert len(chunks_stream_embeds[0]) > 0
    assert len(chunks_stream_embeds[1]) > 0
    # Test case: mixed text and prompt_embeds
    completion_mixed = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    # BUG FIX: this previously asserted on the stale `completion` object from
    # the batch case above; the mixed request is the one that must yield two
    # choices (one for the embeds input, one for the text prompt).
    assert len(completion_mixed.choices) == 2
    completion_text_only = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="This is a prompt",
        max_tokens=5,
        temperature=0.0,
    )
    completion_embeds_only = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    # Embeddings responses should be handled first
    assert completion_mixed.choices[0].text == completion_embeds_only.choices[0].text
    assert completion_mixed.choices[1].text == completion_text_only.choices[0].text
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_errors_with_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str
):
    """Garbage base64 in prompt_embeds must be rejected with a 400."""
    with pytest.raises(BadRequestError):
        await client_with_prompt_embeds.completions.create(
            prompt="",
            model=model_name,
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": "invalid_base64"},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("logprobs_arg", [1, 0])
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
async def test_completions_with_logprobs_and_prompt_embeds(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
    logprobs_arg: int,
    model_name: str,
):
    """Logprobs are returned for prompt_embeds requests, single and batched."""
    encoded_embeds, encoded_embeds2 = example_prompt_embeds
    # Test case: Logprobs using prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        echo=False,
        logprobs=logprobs_arg,
        extra_body={"prompt_embeds": encoded_embeds},
    )
    logprobs = completion.choices[0].logprobs
    assert logprobs is not None
    # All per-token arrays must match the 5 generated tokens.
    assert len(logprobs.text_offset) == 5
    assert len(logprobs.token_logprobs) == 5
    assert len(logprobs.top_logprobs) == 5
    for top_logprobs in logprobs.top_logprobs[1:]:
        # The sampled token may add one entry beyond the requested top-k.
        assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
    assert len(logprobs.tokens) == 5
    # Test case: Log probs with batch completion and prompt_embeds
    completion = await client_with_prompt_embeds.completions.create(
        model=model_name,
        prompt="",  # Add empty prompt as required parameter
        max_tokens=5,
        temperature=0.0,
        echo=False,
        logprobs=logprobs_arg,
        extra_body={"prompt_embeds": [encoded_embeds, encoded_embeds2]},
    )
    assert len(completion.choices) == 2
    for choice in completion.choices:
        logprobs = choice.logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) == 5
        assert len(logprobs.token_logprobs) == 5
        assert len(logprobs.top_logprobs) == 5
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) == 5
@pytest.mark.asyncio
async def test_prompt_logprobs_raises_error(
    example_prompt_embeds,
    client_with_prompt_embeds: openai.AsyncOpenAI,
):
    """prompt_logprobs together with prompt_embeds is rejected as incompatible."""
    encoded, _ = example_prompt_embeds
    with pytest.raises(BadRequestError, match="not compatible"):
        await client_with_prompt_embeds.completions.create(
            model=MODEL_NAME,
            prompt="",
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": encoded, "prompt_logprobs": True},
        )
@pytest.mark.asyncio
async def test_empty_prompt_embeds(
    client_with_prompt_embeds: openai.AsyncOpenAI,
) -> None:
    """An empty prompt_embeds list with a text prompt must be accepted."""
    # Should not raise.
    await client_with_prompt_embeds.completions.create(
        model=MODEL_NAME,
        prompt="Hello",
        max_tokens=5,
        temperature=0.0,
        extra_body={"prompt_embeds": []},
    )

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from ...conftest import AudioTestAssets
from ...utils import RemoteOpenAIServer
# NOTE - the tests in this module are currently analogous to test_chat, but are
# separated to avoid OOM killing due to module-scoped servers, since we
# need a multimodal model for these tests.
# Contains a modality specific lora alongside the base model
MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora")
ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
@pytest.fixture(scope="module")
def multimodal_server():  # noqa: F811
    """Phi-4-multimodal server with a speech LoRA mapped as the audio default."""
    cli_args = [
        # half precision for CI speed/memory savings
        "--dtype",
        "half",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"speech={AUDIO_LORA_PATH}",
        "--max-lora-rank",
        "320",
        "--max-num-seqs",
        "2",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.8",
        # apply the speech LoRA automatically to audio inputs
        "--default-mm-loras",
        f'{{"audio": "{AUDIO_LORA_PATH}"}}',
    ]
    with RemoteOpenAIServer(
        MULTIMODAL_MODEL_NAME, cli_args, max_wait_seconds=480
    ) as remote:
        yield remote
@pytest_asyncio.fixture
async def multi_modal_client(multimodal_server):
    """Async client bound to the multimodal server."""
    async with multimodal_server.get_async_client() as c:
        yield c
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # base model with default lora should give the same response as lora model
    "model_name",
    [MULTIMODAL_MODEL_NAME, "speech"],
)
async def test_default_mm_lora_chat_completions(
    model_name: str,
    multi_modal_client: openai.AsyncOpenAI,
    audio_assets: AudioTestAssets,
):
    """The default audio LoRA is applied automatically, so both the base model
    name and the explicit 'speech' LoRA must produce the LoRA transcript.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you transcribe this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_assets[0].url},
                },
            ],
        }
    ]
    chat_completion = await multi_modal_client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0
    )
    assert len(chat_completion.choices) > 0
    message = chat_completion.choices[0].message
    # BUG FIX: `len(message.content) >= 0` was vacuously true; require a
    # non-empty transcription before comparing it to the expected output.
    assert message.content is not None and len(message.content) > 0
    assert message.content == ACTIVE_MM_LORA_RESPONSE

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request):  # noqa: F811
    """Qwen3 chat server started with --enable-force-include-usage.

    Uses a fixed port (auto_port=False), so this fixture cannot run twice
    concurrently on the same host.
    """
    cli_args = [
        # half precision for CI speed/memory savings
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "128",
        "--enforce-eager",
        "--max-num-seqs",
        "4",
        "--enable-force-include-usage",
        "--port",
        "55857",
        "--gpu-memory-utilization",
        "0.2",
    ]
    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", cli_args, auto_port=False) as remote:
        yield remote
@pytest_asyncio.fixture
async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
    """Async OpenAI client bound to the force-include-usage chat server."""
    async with chat_server_with_force_include_usage.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_chat_with_enable_force_include_usage(
    chat_client_with_force_include_usage: openai.AsyncOpenAI,
):
    """Usage is attached only to choice-less chunks when forced server-side.

    Also verifies completion_tokens grows monotonically across usage-bearing
    chunks and that total_tokens == prompt_tokens + completion_tokens.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ]
    stream = await chat_client_with_force_include_usage.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=messages,
        max_completion_tokens=10,
        extra_body=dict(min_tokens=10),
        temperature=0.0,
        stream=True,
    )
    last_completion_tokens = 0
    async for chunk in stream:
        if not len(chunk.choices):
            # Choice-less chunk: this is where the forced usage must appear.
            assert chunk.usage.prompt_tokens >= 0
            assert (
                last_completion_tokens == 0
                or chunk.usage.completion_tokens > last_completion_tokens
                or (
                    not chunk.choices
                    and chunk.usage.completion_tokens == last_completion_tokens
                )
            )
            assert chunk.usage.total_tokens == (
                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
            )
            # Track progress; without this update the monotonicity assertion
            # above was vacuous (last_completion_tokens stayed 0 forever).
            last_completion_tokens = chunk.usage.completion_tokens
        else:
            # Content chunks must not carry usage.
            assert chunk.usage is None
@pytest.fixture(scope="module")
def transcription_server_with_force_include_usage():
    """Whisper transcription server started with --enable-force-include-usage."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-num-seqs",
        "4",
        "--enforce-eager",
        "--enable-force-include-usage",
        "--gpu-memory-utilization",
        "0.2",
    ]
    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def transcription_client_with_force_include_usage(
    transcription_server_with_force_include_usage,
):
    """Async OpenAI client bound to the force-include-usage Whisper server."""
    async with (
        transcription_server_with_force_include_usage.get_async_client() as async_client
    ):
        yield async_client
@pytest.mark.asyncio
async def test_transcription_with_enable_force_include_usage(
    transcription_client_with_force_include_usage, winning_call
):
    """A streamed transcription must finish with a choice-less usage chunk;
    content-bearing chunks must not expose usage at all."""
    stream = await (
        transcription_client_with_force_include_usage.audio.transcriptions.create(
            model="openai/whisper-large-v3-turbo",
            file=winning_call,
            language="en",
            temperature=0.0,
            stream=True,
            timeout=30,
        )
    )
    async for chunk in stream:
        if chunk.choices:
            # Regular content chunk: no usage attribute expected.
            assert not hasattr(chunk, "usage")
        else:
            # Final choice-less chunk carries the aggregated usage stats.
            stats = chunk.usage
            assert isinstance(stats, dict)
            for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
                assert stats[key] > 0

View File

@@ -0,0 +1,280 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for GPT-OSS structural tags functionality (PR #25515)."""
import json
from unittest.mock import Mock
import pytest
from vllm.entrypoints.openai.protocol import (
StructuredOutputsParams,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.reasoning.gptoss_reasoning_parser import (
GptOssReasoningParser,
)
class TestGptOssStructuralTagsIntegration:
    """Integration tests for structural tags in GPT-OSS tool calls.

    Each test builds the structural-tag JSON via
    GptOssReasoningParser.prepare_structured_tag and inspects the parsed
    result; tool availability is simulated with Mock ToolServer objects.
    """
    @pytest.fixture
    def mock_tokenizer(self):
        """Create a mock tokenizer."""
        tokenizer = Mock()
        tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
        return tokenizer
    @pytest.fixture
    def gptoss_parser(self, mock_tokenizer):
        """Create a real GptOssReasoningParser instance."""
        return GptOssReasoningParser(mock_tokenizer)
    @pytest.fixture
    def tool_server_with_python(self):
        """Create a tool server with Python tool enabled."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
        return tool_server
    @pytest.fixture
    def tool_server_empty(self):
        """Create a tool server with no tools."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(return_value=False)
        return tool_server
    def test_end_to_end_no_tools(self, gptoss_parser):
        """Test end-to-end flow when no tools are available."""
        # Test the parser directly
        result = gptoss_parser.prepare_structured_tag(None, None)
        parsed_result = json.loads(result)
        # Verify basic structure
        assert parsed_result["type"] == "structural_tag"
        assert parsed_result["format"]["type"] == "triggered_tags"
        assert len(parsed_result["format"]["tags"]) == 1
        # Verify only analysis channel is allowed
        analysis_tag = parsed_result["format"]["tags"][0]
        assert analysis_tag["begin"] == "<|channel|>analysis<|message|>"
        assert analysis_tag["content"]["type"] == "any_text"
        assert analysis_tag["end"] == "<|end|>"
        # Verify triggers
        assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"]
        assert parsed_result["format"]["stop_after_first"] is False
    def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python):
        """Test end-to-end flow with Python tool enabled."""
        result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python)
        parsed_result = json.loads(result)
        # Should have analysis tag + 2 python tags
        assert len(parsed_result["format"]["tags"]) == 3
        # Verify all expected tags are present
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        expected_begins = [
            "<|channel|>analysis<|message|>",
            "<|channel|>commentary to=python",
            "<|channel|>analysis to=python",
        ]
        for expected in expected_begins:
            assert expected in tag_begins
        # Verify triggers include commentary
        assert "<|channel|>analysis" in parsed_result["format"]["triggers"]
        assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"]
    def test_structured_outputs_params_integration(
        self, gptoss_parser, tool_server_with_python
    ):
        """Test integration with StructuredOutputsParams."""
        # Generate structural tag
        structural_tag = gptoss_parser.prepare_structured_tag(
            None, tool_server_with_python
        )
        # Create StructuredOutputsParams
        params = StructuredOutputsParams(structural_tag=structural_tag)
        # Verify the tag is properly stored and accessible
        assert params.structural_tag == structural_tag
        # Verify the tag is valid JSON
        parsed_tag = json.loads(params.structural_tag)
        assert parsed_tag["type"] == "structural_tag"
    @pytest.mark.parametrize(
        "browser, python, container, expected_tags",
        [
            # No tools
            (False, False, False, 1),
            # Single tool
            (True, False, False, 3),
            # Multiple tools
            (True, True, False, 5),
            # All tools
            (True, True, True, 7),
        ],
    )
    def test_tool_server_interaction_flow(
        self, gptoss_parser, browser, python, container, expected_tags
    ):
        """Test the complete tool server interaction flow."""
        # Create a mock ToolServer
        tool_server = Mock(spec=ToolServer)
        # Simulate tool availability based on parameters
        tool_server.has_tool = Mock(
            side_effect=lambda tool: {
                "browser": browser,
                "python": python,
                "container": container,
            }.get(tool, False)
        )
        # Run the parser and verify results
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)
        # Validate number of tags: 1 analysis tag + 2 per enabled tool
        assert len(parsed_result["format"]["tags"]) == expected_tags
        # Verify tool-specific tags exist for enabled tools
        tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
        for tool, enabled in {
            "browser": browser,
            "python": python,
            "container": container,
        }.items():
            if enabled:
                assert f"<|channel|>commentary to={tool}" in tag_begins
                assert f"<|channel|>analysis to={tool}" in tag_begins
    def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python):
        """Test that original tags are preserved when provided."""
        original_tag = '{"type": "custom_tag", "data": "preserved"}'
        result = gptoss_parser.prepare_structured_tag(
            original_tag, tool_server_with_python
        )
        # Should return original tag unchanged
        assert result == original_tag
    @pytest.mark.parametrize(
        "tools",
        [
            [],
            ["browser"],
            ["python"],
            ["container"],
            ["browser", "python"],
            ["browser", "container"],
            ["python", "container"],
            ["browser", "python", "container"],
        ],
    )
    def test_json_validity_comprehensive(self, gptoss_parser, tools):
        """Test JSON validity across all possible tool combinations."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools)
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        # Should be valid JSON
        parsed_result = json.loads(result)
        # Should have correct structure
        assert parsed_result["type"] == "structural_tag"
        assert "format" in parsed_result
        assert "tags" in parsed_result["format"]
        assert "triggers" in parsed_result["format"]
        # Tag count should be: 1 (analysis) + 2 * len(tools)
        expected_tag_count = 1 + (2 * len(tools))
        assert len(parsed_result["format"]["tags"]) == expected_tag_count
    def test_error_handling_invalid_tool_server(self, gptoss_parser):
        """Test error handling with invalid tool server."""
        # Tool server that raises exceptions
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=Exception("Tool server error"))
        # The parser does not swallow tool-server failures: the exception
        # propagates to the caller (no fallback tag is produced).
        with pytest.raises(Exception, match="Tool server error"):
            gptoss_parser.prepare_structured_tag(None, tool_server)
    def test_concurrent_requests_isolation(self, gptoss_parser):
        """Test that concurrent requests don't interfere with each other."""
        # Simulate concurrent requests with different tool servers
        tool_server_1 = Mock(spec=ToolServer)
        tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python")
        tool_server_2 = Mock(spec=ToolServer)
        tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser")
        # Generate tags for both servers (sequential calls; "concurrent" here
        # means independent requests sharing a single parser instance)
        result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1)
        result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2)
        # Parse results
        parsed_1 = json.loads(result_1)
        parsed_2 = json.loads(result_2)
        # Verify they have different tool configurations
        tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]]
        tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]]
        # Result 1 should have python tags
        assert "<|channel|>commentary to=python" in tags_1
        assert "<|channel|>commentary to=browser" not in tags_1
        # Result 2 should have browser tags
        assert "<|channel|>commentary to=browser" in tags_2
        assert "<|channel|>commentary to=python" not in tags_2
    def test_tag_format_consistency(self, gptoss_parser):
        """Test that all generated tags follow consistent format."""
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(
            side_effect=lambda tool: tool in ["python", "browser"]
        )
        result = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_result = json.loads(result)
        # Verify all tags have required fields
        for tag in parsed_result["format"]["tags"]:
            assert "begin" in tag
            assert "content" in tag
            assert "end" in tag
            assert tag["content"]["type"] == "any_text"
            assert tag["end"] == "<|end|>"
            # Verify begin format
            assert tag["begin"].startswith("<|channel|>")
    def test_trigger_configuration(self, gptoss_parser):
        """Test trigger configuration for different tool setups."""
        # Test with no tools
        result_no_tools = gptoss_parser.prepare_structured_tag(None, None)
        parsed_no_tools = json.loads(result_no_tools)
        assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"]
        # Test with tools
        tool_server = Mock(spec=ToolServer)
        tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python")
        result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server)
        parsed_with_tools = json.loads(result_with_tools)
        expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="]
        assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers)

View File

@@ -0,0 +1,294 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json
import shutil
from contextlib import suppress
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Each tuple: (test name / adapter dir name, adapter_config.json override,
# substring expected in the resulting 400 error message).
BADREQUEST_CASES = [
    (
        "test_rank",
        {"r": 1024},
        "is greater than max_lora_rank",
    ),
    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    (
        "test_modules_to_save",
        {"modules_to_save": ["lm_head"]},
        "only supports modules_to_save being None",
    ),
]
@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, qwen3_lora_files):
    """vLLM server with one statically registered LoRA (JSON --lora-modules
    syntax) and runtime LoRA loading enabled via environment variable."""
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "qwen3-lora",
        "path": qwen3_lora_files,
        "base_model_name": MODEL_NAME,
    }
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        json.dumps(lora_module_1),
        "--max-lora-rank",
        "64",
        # small CPU cache so concurrent-load tests exercise LRU eviction
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]
    # Enable the /v1/load_lora_adapter endpoint
    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server_with_lora_modules_json):
    """Async OpenAI client bound to the LoRA-enabled server."""
    async with server_with_lora_modules_json.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
    """The statically configured LoRA must report correct lineage in /models."""
    listing = await client.models.list()
    base, *adapters = listing.data
    # The first entry is always the served base model.
    assert base.id == MODEL_NAME
    assert base.root == MODEL_NAME
    assert base.parent is None
    # Every remaining entry is a LoRA rooted at the adapter path.
    for adapter in adapters:
        assert adapter.root == qwen3_lora_files
        assert adapter.parent == MODEL_NAME
    assert adapters[0].id == "qwen3-lora"
@pytest.mark.asyncio
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
    """A runtime-loaded adapter must appear in /models with correct lineage."""
    load_response = await client.post(
        "load_lora_adapter",
        cast_to=str,
        body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
    )
    # Ensure adapter loads before querying /models
    assert "success" in load_response
    listing = await client.models.list()
    # The freshly loaded adapter is appended last.
    newest = listing.data[-1]
    assert newest.root == qwen3_lora_files
    assert newest.parent == MODEL_NAME
    assert newest.id == "qwen3-lora-3"
@pytest.mark.asyncio
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
    """Loading an adapter from a nonexistent path must surface a 404."""
    request_body = {"lora_name": "notfound", "lora_path": "/not/an/adapter"}
    with pytest.raises(openai.NotFoundError):
        await client.post("load_lora_adapter", cast_to=str, body=request_body)
@pytest.mark.asyncio
async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
    """A directory with a corrupt adapter_config.json must yield a 400."""
    bad_adapter_dir = tmp_path / "invalid_files"
    bad_adapter_dir.mkdir()
    (bad_adapter_dir / "adapter_config.json").write_text("this is not json")
    with pytest.raises(openai.BadRequestError):
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": "invalid-json", "lora_path": str(bad_adapter_dir)},
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("test_name,config_change,expected_error", BADREQUEST_CASES)
async def test_dynamic_lora_badrequests(
    client: openai.AsyncOpenAI,
    tmp_path,
    qwen3_lora_files,
    test_name: str,
    config_change: dict,
    expected_error: str,
):
    """Unsupported adapter configurations are rejected with a descriptive 400."""
    # Work on a private copy of the adapter so mutations don't leak.
    adapter_dir = tmp_path / test_name
    shutil.copytree(qwen3_lora_files, adapter_dir)
    # Patch the adapter config with the unsupported setting.
    config_path = adapter_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update(config_change)
    config_path.write_text(json.dumps(adapter_config))
    # The server must refuse to load the modified adapter.
    with pytest.raises(openai.BadRequestError, match=expected_error):
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": test_name, "lora_path": str(adapter_dir)},
        )
@pytest.mark.asyncio
async def test_multiple_lora_adapters(
    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
    """Validate that many loras can be dynamically registered and inferenced
    with concurrently"""
    # This test file configures the server with --max-cpu-loras=2 and this test
    # will concurrently load 10 adapters, so it should flex the LRU cache
    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter, then run a few completions against it.
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
        )
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
            )
    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}")) for i in range(10)
    ]
    # asyncio.wait() returns the Task objects themselves, so the previous
    # `isinstance(r, Exception)` check over its result set could never fail,
    # silently hiding task errors. gather(..., return_exceptions=True) yields
    # the actual results or raised exceptions, surfacing failures.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
@pytest.mark.asyncio
async def test_loading_invalid_adapters_does_not_break_others(
    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
):
    """Repeated failed adapter loads must not disturb in-flight requests that
    use an already-registered adapter, nor block later successful loads."""
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    (invalid_files / "adapter_config.json").write_text("this is not json")
    stop_good_requests_event = asyncio.Event()
    async def run_good_requests(client):
        # Run chat completions requests until event set
        results = []
        while not stop_good_requests_event.is_set():
            try:
                batch = await client.completions.create(
                    model="qwen3-lora",
                    prompt=["Hello there", "Foo bar bazz buzz"],
                    max_tokens=5,
                )
                results.append(batch)
            except Exception as e:
                # Collect failures instead of raising so the main body can
                # assert on every outcome after stopping the loop.
                results.append(e)
        return results
    # Create task to run good requests
    good_task = asyncio.create_task(run_good_requests(client))
    # Run a bunch of bad adapter loads
    for _ in range(25):
        with suppress(openai.NotFoundError):
            await client.post(
                "load_lora_adapter",
                cast_to=str,
                body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
            )
    for _ in range(25):
        with suppress(openai.BadRequestError):
            await client.post(
                "load_lora_adapter",
                cast_to=str,
                body={"lora_name": "invalid", "lora_path": str(invalid_files)},
            )
    # Ensure all the running requests with lora adapters succeeded
    stop_good_requests_event.set()
    results = await good_task
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
    # Ensure we can load another adapter and run it
    await client.post(
        "load_lora_adapter",
        cast_to=str,
        body={"lora_name": "valid", "lora_path": qwen3_lora_files},
    )
    await client.completions.create(
        model="valid",
        prompt=["Hello there", "Foo bar bazz buzz"],
        max_tokens=5,
    )
@pytest.mark.asyncio
async def test_beam_search_with_lora_adapters(
    client: openai.AsyncOpenAI,
    tmp_path,
    qwen3_lora_files,
):
    """Validate that async beam search can be used with lora."""
    async def load_and_run_adapter(adapter_name: str):
        # Register the adapter, then run a few beam-search completions on it.
        await client.post(
            "load_lora_adapter",
            cast_to=str,
            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
        )
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
                extra_body=dict(use_beam_search=True),
            )
    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}")) for i in range(3)
    ]
    # gather(..., return_exceptions=True) yields results or the raised
    # exceptions; asyncio.wait() returned Task objects, so the previous
    # `isinstance(r, Exception)` check could never fire and failures passed.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"

View File

@@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
from unittest.mock import AsyncMock, MagicMock
import pytest
from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncLLM
MODEL_NAME = "openai-community/gpt2"
# Single base model exposed by the mocked OpenAIServingModels instance.
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
# Registry key under which the mock resolver is (un)registered per test.
MOCK_RESOLVER_NAME = "mock_test_resolver"
@dataclass
class MockHFConfig:
    # Stand-in for a HuggingFace model config; only model_type is consumed here.
    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Minimal mock ModelConfig for testing."""
    model: str = MODEL_NAME
    tokenizer: str = MODEL_NAME
    trust_remote_code: bool = False
    tokenizer_mode: str = "auto"
    max_model_len: int = 100
    tokenizer_revision: str | None = None
    multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)
    logits_processors: list[str] | None = None
    logits_processor_pattern: str | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    # Plain class attribute (unannotated, so NOT a dataclass field).
    encoder_config = None
    generation_config: str = "auto"
    skip_tokenizer_init: bool = False
    def get_diff_sampling_param(self):
        # Mirror the real ModelConfig API: empty dict when no overrides set.
        return self.diff_sampling_param or {}
class MockLoRAResolver(LoRAResolver):
    """Resolver stub: knows exactly two adapter names and rejects the rest."""
    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        # Map of resolvable adapter names to their fake integer ids.
        known_adapters = {"test-lora": 1, "invalid-lora": 2}
        int_id = known_adapters.get(lora_name)
        if int_id is None:
            # Unknown adapters resolve to nothing (caller returns 404).
            return None
        return LoRARequest(
            lora_name=lora_name,
            lora_int_id=int_id,
            lora_local_path=f"/fake/path/{lora_name}",
        )
@pytest.fixture(autouse=True)
def register_mock_resolver():
    """Fixture to register and unregister the mock LoRA resolver.

    autouse=True means every test in this module runs with the resolver
    installed in the global LoRAResolverRegistry.
    """
    resolver = MockLoRAResolver()
    LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver)
    yield
    # Cleanup: remove the resolver after the test runs
    if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers:
        del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME]
@pytest.fixture
def mock_serving_setup():
    """Provides a mocked engine and serving completion instance."""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.errored = False
    tokenizer = get_tokenizer(MODEL_NAME)
    mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer)
    async def mock_add_lora_side_effect(lora_request: LoRARequest):
        """Simulate engine behavior when adding LoRAs."""
        if lora_request.lora_name == "test-lora":
            # Simulate successful addition
            return True
        if lora_request.lora_name == "invalid-lora":
            # Simulate failure during addition (e.g. invalid format)
            raise ValueError(f"Simulated failure adding LoRA: {lora_request.lora_name}")
        return True
    mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)
    async def mock_generate(*args, **kwargs):
        # Empty async generator: generation "completes" with no outputs,
        # which is enough for the call-tracking assertions in these tests.
        for _ in []:
            yield _
    mock_engine.generate = MagicMock(spec=AsyncLLM.generate, side_effect=mock_generate)
    # Reset call counters so tests observe only their own interactions.
    mock_engine.generate.reset_mock()
    mock_engine.add_lora.reset_mock()
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    models = OpenAIServingModels(
        engine_client=mock_engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        mock_engine, models, request_logger=None
    )
    # Bypass real input processing; tests only inspect engine interactions.
    serving_completion._process_inputs = AsyncMock(
        return_value=(MagicMock(name="engine_request"), {})
    )
    return mock_engine, serving_completion
@pytest.mark.asyncio
async def test_serving_completion_with_lora_resolver(mock_serving_setup, monkeypatch):
    """A resolvable adapter name is added to the engine and used to generate."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup
    lora_model_name = "test-lora"
    req_found = CompletionRequest(
        model=lora_model_name,
        prompt="Generate with LoRA",
    )
    # Suppress potential errors during the mocked generate call,
    # as we are primarily checking for add_lora and generate calls
    with suppress(Exception):
        await serving_completion.create_completion(req_found)
    # The resolved adapter must have been registered with the engine exactly once.
    mock_engine.add_lora.assert_awaited_once()
    called_lora_request = mock_engine.add_lora.call_args[0][0]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == lora_model_name
    # ...and then forwarded to generate() as the lora_request kwarg.
    mock_engine.generate.assert_called_once()
    called_lora_request = mock_engine.generate.call_args[1]["lora_request"]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == lora_model_name
@pytest.mark.asyncio
async def test_serving_completion_resolver_not_found(mock_serving_setup, monkeypatch):
    """An unresolvable adapter name yields a 404 without touching the engine."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup
    missing_model = "non-existent-lora-adapter"
    request = CompletionRequest(
        model=missing_model,
        prompt="what is 1+1?",
    )
    response = await serving_completion.create_completion(request)
    # The resolver returned None, so the engine must never be invoked.
    mock_engine.add_lora.assert_not_awaited()
    mock_engine.generate.assert_not_called()
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.NOT_FOUND.value
    assert missing_model in response.error.message
@pytest.mark.asyncio
async def test_serving_completion_resolver_add_lora_fails(
    mock_serving_setup, monkeypatch
):
    """If the engine rejects the resolved adapter, the request returns a 400."""
    monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true")
    mock_engine, serving_completion = mock_serving_setup
    # "invalid-lora" resolves, but the mocked engine raises on add_lora.
    invalid_model = "invalid-lora"
    req = CompletionRequest(
        model=invalid_model,
        prompt="what is 1+1?",
    )
    response = await serving_completion.create_completion(req)
    # Assert add_lora was called before the failure
    mock_engine.add_lora.assert_awaited_once()
    called_lora_request = mock_engine.add_lora.call_args[0][0]
    assert isinstance(called_lora_request, LoRARequest)
    assert called_lora_request.lora_name == invalid_model
    # Assert generate was *not* called due to the failure
    mock_engine.generate.assert_not_called()
    # Assert the correct error response
    assert isinstance(response, ErrorResponse)
    assert response.error.code == HTTPStatus.BAD_REQUEST.value
    assert invalid_model in response.error.message
@pytest.mark.asyncio
async def test_serving_completion_flag_not_set(mock_serving_setup):
    """Without VLLM_ALLOW_RUNTIME_LORA_UPDATING, no resolution is attempted."""
    mock_engine, serving_completion = mock_serving_setup
    request = CompletionRequest(
        model="test-lora",
        prompt="Generate with LoRA",
    )
    await serving_completion.create_completion(request)
    # Runtime LoRA updating is disabled by default, so the engine stays idle.
    mock_engine.add_lora.assert_not_called()
    mock_engine.generate.assert_not_called()

View File

@@ -0,0 +1,155 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import anthropic
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# Underlying model; it is served to clients under an Anthropic-style alias.
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server(): # noqa: F811
    """Qwen3 server exposed under the Anthropic-style model name
    claude-3-7-sonnet-latest, with hermes tool-call parsing enabled."""
    args = [
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--served-model-name",
        "claude-3-7-sonnet-latest",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async Anthropic client pointed at the vLLM server."""
    async with server.get_async_client_anthropic() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    """A plain user message completes as an assistant turn with end_turn."""
    response = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert response.stop_reason == "end_turn"
    assert response.role == "assistant"
    print(f"Anthropic response: {response.model_dump_json()}")
@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    """A system prompt plus user message still ends as a normal turn."""
    request_kwargs = {
        "model": "claude-3-7-sonnet-latest",
        "max_tokens": 1024,
        "system": "you are a helpful assistant",
        "messages": [{"role": "user", "content": "how are you!"}],
    }
    response = await client.messages.create(**request_kwargs)
    assert response.stop_reason == "end_turn"
    assert response.role == "assistant"
    print(f"Anthropic response: {response.model_dump_json()}")
@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    """Streaming starts with a message_start event that carries usage stats."""
    stream = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )
    start_event = None
    total_events = 0
    async for event in stream:
        total_events += 1
        # Remember only the first message_start event.
        if event.type == "message_start" and start_event is None:
            start_event = event
        print(event.model_dump_json())
    assert total_events > 0
    assert start_event is not None, "message_start chunk was never observed"
    assert start_event.message is not None, "first chunk should include message"
    assert start_event.message.usage is not None, (
        "first chunk should include usage stats"
    )
    # Nothing generated yet at stream start, but the prompt is tokenized.
    assert start_event.message.usage.output_tokens == 0
    assert start_event.message.usage.input_tokens > 5
@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    """A weather question with a matching tool must stop with tool_use."""
    weather_tool = {
        "name": "get_current_weather",
        "description": "Useful for querying the weather in a specified city.",
        "input_schema": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City or region, for example: "
                    "New York, London, Tokyo, etc.",
                }
            },
            "required": ["location"],
        },
    }
    response = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[weather_tool],
        stream=False,
    )
    assert response.stop_reason == "tool_use"
    assert response.role == "assistant"
    print(f"Anthropic response: {response.model_dump_json()}")
@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
    """Streaming tool-call requests should yield a non-empty event stream.

    Previously this test only printed chunks and asserted nothing, so it
    could pass even if the stream produced no events at all.
    """
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )
    chunk_count = 0
    async for chunk in resp:
        chunk_count += 1
        print(chunk.model_dump_json())
    # The stream must actually deliver events.
    assert chunk_count > 0

View File

@@ -0,0 +1,454 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import subprocess
import sys
import tempfile
import time
from http import HTTPStatus
import openai
import pytest
import pytest_asyncio
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer
from vllm import version
from ...conftest import LocalAssetServer
from ...utils import RemoteOpenAIServer
# Models exercised by the metrics tests, keyed by modality.
MODELS = {
    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}
# Previous minor vLLM version; used for --show-hidden-metrics-for-version.
PREV_MINOR_VERSION = version._prev_minor_version()
@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
    # Parametrizes the module over both the text and multimodal models.
    yield request.param
@pytest.fixture(scope="module")
def default_server_args():
    """Baseline CLI flags shared by every server parametrization.

    NOTE(review): module-scoped, so this exact list object is cached and
    reused; consumers must not mutate it in place (copy before appending).
    """
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]
@pytest.fixture(
    scope="module",
    params=[
        "",
        "--enable-chunked-prefill",
        "--disable-frontend-multiprocessing",
        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
    ],
)
def server(model_key, default_server_args, request):
    """Start a RemoteOpenAIServer for each (model, extra-flag) combination.

    Copy the arg list before appending: `default_server_args` is a
    module-scoped fixture whose list is cached, so appending to it in place
    (as the previous code did) leaked each param's extra flag into every
    subsequent server instantiation in this module.
    """
    args = list(default_server_args)
    if request.param:
        args.append(request.param)
    model_name = MODELS[model_key]
    with RemoteOpenAIServer(model_name, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the metrics server under test."""
    async with server.get_async_client() as cl:
        yield cl
# Fixed prompt reused by every request so token counts are predictable.
_PROMPT = "Hello my name is Robert and I love magic"
def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
num_prompt_tokens = len(prompt_ids)
# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", num_requests)],
"vllm:time_per_output_token_seconds": [
("_count", num_requests * (max_tokens - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", num_requests)],
"vllm:request_queue_time_seconds": [("_count", num_requests)],
"vllm:request_inference_time_seconds": [("_count", num_requests)],
"vllm:request_prefill_time_seconds": [("_count", num_requests)],
"vllm:request_decode_time_seconds": [("_count", num_requests)],
"vllm:request_prompt_tokens": [
("_sum", num_requests * num_prompt_tokens),
("_count", num_requests),
],
"vllm:request_generation_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:request_params_n": [("_count", num_requests)],
"vllm:request_params_max_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:iteration_tokens_total": [
(
"_sum",
num_requests * (num_prompt_tokens + max_tokens),
),
("_count", num_requests * max_tokens),
],
"vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
"vllm:generation_tokens": [("_total", num_requests * max_tokens)],
"vllm:request_success": [("_total", num_requests)],
}
@pytest.mark.asyncio
async def test_metrics_counts(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Check that request/token counters add up after N identical requests."""
    if model_key == "multimodal":
        pytest.skip("Unnecessary test")
    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)
    num_requests = 10
    max_tokens = 10
    for _ in range(num_requests):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=model_name,
            prompt=prompt_ids,
            max_tokens=max_tokens,
        )
    response = requests.get(server.url_for("metrics"))
    print(response.text)
    assert response.status_code == HTTPStatus.OK
    # Loop over all expected metric_families
    expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
    for metric_family, suffix_values_list in expected_values.items():
        # NOTE(review): this membership test compares bare family names (e.g.
        # "vllm:time_to_first_token_seconds") against EXPECTED_METRICS_V1 /
        # HIDDEN_DEPRECATED_METRICS, which mostly list suffixed sample names
        # ("..._sum"/"..._count"), so many families appear to be skipped here.
        # Confirm whether that filtering is intentional.
        if metric_family not in EXPECTED_METRICS_V1 or (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        ):
            continue
        found_metric = False
        # Check to see if the metric_family is found in the prom endpoint.
        for family in text_string_to_metric_families(response.text):
            if family.name == metric_family:
                found_metric = True
                # Check that each suffix is found in the prom endpoint.
                for suffix, expected_value in suffix_values_list:
                    metric_name_w_suffix = f"{metric_family}{suffix}"
                    found_suffix = False
                    for sample in family.samples:
                        if sample.name == metric_name_w_suffix:
                            found_suffix = True
                            # For each suffix, make sure the value matches
                            # what we expect.
                            assert sample.value == expected_value, (
                                f"{metric_name_w_suffix} expected value of "
                                f"{expected_value} did not match found value "
                                f"{sample.value}"
                            )
                            break
                    assert found_suffix, (
                        f"Did not find {metric_name_w_suffix} in prom endpoint"
                    )
                break
        assert found_metric, f"Did not find {metric_family} in prom endpoint"
# Metric names (including histogram sample suffixes) that /metrics is
# expected to expose for every served model.
EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:kv_cache_usage_perc",
    "vllm:prefix_cache_queries",
    "vllm:prefix_cache_hits",
    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:iteration_tokens_total",
    "vllm:cache_config_info",
    "vllm:request_success_total",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
    "vllm:request_params_n_sum",
    "vllm:request_params_n_bucket",
    "vllm:request_params_n_count",
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:inter_token_latency_seconds_sum",
    "vllm:inter_token_latency_seconds_bucket",
    "vllm:inter_token_latency_seconds_count",
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
    "vllm:request_queue_time_seconds_sum",
    "vllm:request_queue_time_seconds_bucket",
    "vllm:request_queue_time_seconds_count",
    "vllm:request_inference_time_seconds_sum",
    "vllm:request_inference_time_seconds_bucket",
    "vllm:request_inference_time_seconds_count",
    "vllm:request_prefill_time_seconds_sum",
    "vllm:request_prefill_time_seconds_bucket",
    "vllm:request_prefill_time_seconds_count",
    "vllm:request_decode_time_seconds_sum",
    "vllm:request_decode_time_seconds_bucket",
    "vllm:request_decode_time_seconds_count",
]
# Extra metrics expected only when serving a multimodal model.
EXPECTED_METRICS_MM = [
    "vllm:mm_cache_queries",
    "vllm:mm_cache_hits",
]
# Deprecated metrics that are hidden unless the server was started with
# --show-hidden-metrics-for-version.
HIDDEN_DEPRECATED_METRICS: list[str] = [
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
]
@pytest.mark.asyncio
async def test_metrics_exist(
    local_asset_server: LocalAssetServer,
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Smoke-check that every expected metric name appears in /metrics."""
    model_name = MODELS[model_key]
    # sending a request triggers the metrics to be logged.
    if model_key == "text":
        await client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=5,
            temperature=0.0,
        )
    else:
        # Multimodal path: send one image (served locally) plus text.
        # https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
        await client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": local_asset_server.url_for(
                                    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                                ),
                            },
                        },
                        {"type": "text", "text": "What's in this image?"},
                    ],
                }
            ],
            max_tokens=5,
            temperature=0.0,
        )
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
    expected_metrics = EXPECTED_METRICS_V1
    if model_key == "multimodal":
        # NOTE: Don't use in-place assignment
        expected_metrics = expected_metrics + EXPECTED_METRICS_MM
    for metric in expected_metrics:
        # Hidden deprecated metrics are only present when explicitly enabled.
        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
            continue
        assert metric in response.text
@pytest.mark.asyncio
async def test_abort_metrics_reset(
    server: RemoteOpenAIServer,
    client: openai.AsyncClient,
    model_key: str,
):
    """Aborted requests must leave the gauges (running/waiting/KV usage) at zero."""
    model_name = MODELS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    prompt_ids = tokenizer.encode(_PROMPT)
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
        server,
    )
    # Expect no running requests or kvcache usage
    assert running_requests == 0
    assert waiting_requests == 0
    assert kv_cache_usage == 0.0
    # Start some long-running requests that we can abort
    tasks = []
    for _ in range(3):
        task = asyncio.create_task(
            client.completions.create(
                model=model_name,
                prompt=prompt_ids,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0,
            )
        )
        tasks.append(task)
    # Wait a bit for requests to start processing
    await asyncio.sleep(0.5)
    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
        server,
    )
    # Expect running requests and kvcache usage
    # NOTE(review): timing-sensitive — assumes 0.5s is enough for requests to
    # be scheduled; may be flaky on a slow machine.
    assert running_requests > 0
    assert kv_cache_usage > 0
    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()
    # Wait for cancellations to be processed
    await asyncio.sleep(1.0)
    # Check that metrics have reset to zero
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server)
    )
    assert running_requests_after == 0, (
        f"Expected 0 running requests after abort, got {running_requests_after}"
    )
    assert waiting_requests_after == 0, (
        f"Expected 0 waiting requests after abort, got {waiting_requests_after}"
    )
    assert kv_cache_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got {kv_cache_usage_after}"
    )
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Scrape /metrics and return (running_count, waiting_count, kv_cache_usage)."""
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK
    # Collect the single sample matching each wanted family name.
    wanted = {
        "vllm:num_requests_running": None,
        "vllm:num_requests_waiting": None,
        "vllm:kv_cache_usage_perc": None,
    }
    for family in text_string_to_metric_families(response.text):
        if family.name not in wanted:
            continue
        for sample in family.samples:
            if sample.name == family.name:
                wanted[family.name] = sample.value
                break
    running_requests = wanted["vllm:num_requests_running"]
    waiting_requests = wanted["vllm:num_requests_waiting"]
    kv_cache_usage = wanted["vllm:kv_cache_usage_perc"]
    # All three gauges must be present in the scrape.
    assert running_requests is not None
    assert waiting_requests is not None
    assert kv_cache_usage is not None
    return running_requests, waiting_requests, kv_cache_usage
def test_metrics_exist_run_batch():
    """Verify the run_batch entrypoint exposes a /metrics endpoint.

    Starts `vllm.entrypoints.openai.run_batch` as a subprocess with
    --enable-metrics, polls until its HTTP server answers, then checks
    /metrics returns 200.

    Fix: the original polled `is_server_up` in an unbounded loop and never
    reaped the child on failure — if the batch server crashed at startup the
    test hung forever and leaked the subprocess. We now bound the wait with a
    deadline, fail fast if the child exits early, and terminate the child on
    any error path.
    """
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
    base_url = "0.0.0.0"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    def is_server_up(url):
        # Poll helper: True once the metrics HTTP server answers with 200.
        try:
            response = requests.get(url)
            return response.status_code == 200
        except requests.ConnectionError:
            return False

    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.openai.run_batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "intfloat/multilingual-e5-small",
                "--enable-metrics",
                "--url",
                base_url,
                "--port",
                port,
            ],
        )
        try:
            # Bounded startup wait: fail instead of hanging CI forever.
            deadline = time.monotonic() + 300
            while not is_server_up(server_url):
                assert proc.poll() is None, (
                    "run_batch exited before its metrics server came up"
                )
                assert time.monotonic() < deadline, (
                    "Timed out waiting for run_batch metrics server"
                )
                time.sleep(1)
            response = requests.get(server_url + "/metrics")
            assert response.status_code == HTTPStatus.OK
            proc.wait()
        except BaseException:
            # Don't leak the child process on any failure path.
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
            raise

View File

@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
# NOTE(review): the LoRA adapter fixture (qwen3_lora_files) presumably targets
# this base model, but we're not testing generation quality here — only that
# the adapter is listed by the /models endpoint.
@pytest.fixture(scope="module")
def server(qwen3_lora_files):
    """Server with one LoRA adapter registered under the name qwen3-lora."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"qwen3-lora={qwen3_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async OpenAI client bound to the spawned server; closed on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
    """The base model is listed first; LoRA adapters follow with their root
    pointing at the adapter files."""
    model_list = (await client.models.list()).data
    base_model, *adapters = model_list
    assert base_model.id == MODEL_NAME
    assert base_model.root == MODEL_NAME
    for adapter in adapters:
        assert adapter.root == qwen3_lora_files
    assert adapters[0].id == "qwen3-lora"

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from ...utils import VLLM_PATH, RemoteOpenAIServer
# Chat template shipped with the repo; fail fast at import time if missing.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
def run_and_test_dummy_opt_api_server(model, tp=1):
    """Serve the plugin-registered dummy OPT model and check that a chat
    completion produces only the first token (empty once "<s>" is stripped)."""
    # the model is registered through the plugin
    launch_args = [
        "--gpu-memory-utilization",
        "0.10",
        "--dtype",
        "float32",
        "--chat-template",
        str(chatml_jinja_path),
        "--load-format",
        "dummy",
        "-tp",
        f"{tp}",
    ]
    with RemoteOpenAIServer(model, launch_args) as api_server:
        chat_client = api_server.get_client()
        chat_response = chat_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello!"},
            ],
            temperature=0,
        )
        text = chat_response.choices[0].message.content
        assert text is not None
        # make sure only the first token is generated
        assert text.replace("<s>", "") == ""
def test_oot_registration_for_api_server(dummy_opt_path: str):
    # dummy_opt_path fixture provides the out-of-tree registered model path.
    run_and_test_dummy_opt_api_server(dummy_opt_path)

View File

@@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import Final
import pytest
import schemathesis
from hypothesis import settings
from schemathesis import GenerationConfig
from ...utils import RemoteOpenAIServer
# Enable draft OpenAPI 3.1 support in schemathesis.
schemathesis.experimental.OPEN_API_3_1.enable()
MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
# Cap on images per prompt, mirrored in --limit-mm-per-prompt below.
MAXIMUM_IMAGES = 2
# Per-request timeouts (seconds) for schemathesis-generated calls.
DEFAULT_TIMEOUT_SECONDS: Final[int] = 10
LONG_TIMEOUT_SECONDS: Final[int] = 60
@pytest.fixture(scope="module")
def server():
    """Small multimodal server used as the target of fuzzed OpenAPI calls."""
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def get_schema(server):
    """Fetch the live OpenAPI schema from the running server."""
    # avoid generating null (\x00) bytes in strings during test case generation
    return schemathesis.openapi.from_uri(
        f"{server.url_root}/openapi.json",
        generation_config=GenerationConfig(allow_x00=False),
    )
# Pytest-integrated schema object driving @schema.parametrize below.
schema = schemathesis.from_pytest_fixture("get_schema")
@schemathesis.hook
def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
    """Attach a filter that drops generated cases known to be invalid.

    Fix: the content scan previously called `item.get("type")` on every list
    item, which raised AttributeError when schemathesis generated a non-dict
    item (e.g. a bare string) inside the content list; items are now guarded
    with isinstance before inspection.
    """
    op = context.operation
    assert op is not None

    def no_invalid_types(case: schemathesis.models.Case):
        """
        This filter skips test cases with invalid data that schemathesis
        incorrectly generates due to permissive schema configurations.
        1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
        message content, which isn't implemented.
        2. Skips tool_calls with `"type": "custom"` which schemathesis
        incorrectly generates instead of the valid `"type": "function"`.
        Example test cases that are skipped:
        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize
        curl -X POST -H 'Content-Type: application/json' \
            -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
            http://localhost:8000/v1/chat/completions
        """  # noqa: E501
        if hasattr(case, "body") and isinstance(case.body, dict):
            messages = case.body.get("messages")
            if isinstance(messages, list) and len(messages) > 0:
                for message in messages:
                    if not isinstance(message, dict):
                        continue
                    # Check for invalid file type in tokenize endpoint
                    if op.method.lower() == "post" and op.path == "/tokenize":
                        content = message.get("content", [])
                        # Guard against non-dict items: schemathesis may also
                        # generate plain strings inside the content list.
                        if isinstance(content, list) and any(
                            isinstance(item, dict) and item.get("type") == "file"
                            for item in content
                        ):
                            return False
                    # Check for invalid tool_calls with non-function types
                    tool_calls = message.get("tool_calls", [])
                    if isinstance(tool_calls, list):
                        for tool_call in tool_calls:
                            if isinstance(tool_call, dict):
                                if tool_call.get("type") != "function":
                                    return False
                                if "custom" in tool_call:
                                    return False
            # Sometimes structured_outputs.grammar is generated to be empty
            # Causing a server error in EBNF grammar parsing
            # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
            structured_outputs = case.body.get("structured_outputs", {})
            grammar = (
                structured_outputs.get("grammar")
                if isinstance(structured_outputs, dict)
                else None
            )
            if grammar == "":
                # Allow None (will be handled as no grammar)
                # But skip empty strings
                return False
        return True

    return strategy.filter(no_invalid_types)
@schema.parametrize()
@schema.override(headers={"Content-Type": "application/json"})
@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
def test_openapi_stateless(case: schemathesis.Case):
    """Fuzz every stateless endpoint and validate responses against the schema."""
    key = (
        case.operation.method.upper(),
        case.operation.path,
    )
    if case.operation.path.startswith("/v1/responses"):
        # Skip responses API as it is meant to be stateful.
        return
    timeout = {
        # requires a longer timeout
        ("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
    }.get(key, DEFAULT_TIMEOUT_SECONDS)
    # No need to verify SSL certificate for localhost
    case.call_and_validate(verify=False, timeout=timeout)

View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for middleware that's off by default and can be toggled through
server arguments, mainly --api-key and --enable-request-id-headers.
"""
from http import HTTPStatus
import pytest
import requests
from ...utils import RemoteOpenAIServer
# Use a small embeddings model for faster startup and smaller memory footprint.
# Since we are not testing any chat functionality,
# using a chat capable model is overkill.
# Served model under test for the middleware toggles below.
MODEL_NAME = "intfloat/multilingual-e5-small"
@pytest.fixture(scope="module")
def server(request: pytest.FixtureRequest):
    """Start the server, appending any extra CLI params passed indirectly
    via @pytest.mark.parametrize(..., indirect=True)."""
    extra = getattr(request, "param", [])
    # A bare string param is treated as a single extra argument.
    if isinstance(extra, str):
        extra = [extra]
    launch_args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "512",
        "--enforce-eager",
        "--max-num-seqs",
        "2",
        *extra,
    ]
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_no_api_token(server: RemoteOpenAIServer):
    # Without --api-key, /v1 endpoints are open.
    response = requests.get(server.url_for("v1/models"))
    assert response.status_code == HTTPStatus.OK
@pytest.mark.asyncio
async def test_no_request_id_header(server: RemoteOpenAIServer):
    # Request-id header middleware is off by default.
    response = requests.get(server.url_for("health"))
    assert "X-Request-Id" not in response.headers
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_missing_api_token(server: RemoteOpenAIServer):
    # With --api-key set, requests without credentials are rejected.
    response = requests.get(server.url_for("v1/models"))
    assert response.status_code == HTTPStatus.UNAUTHORIZED
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_passed_api_token(server: RemoteOpenAIServer):
    # Supplying the configured key as a Bearer token grants access.
    response = requests.get(
        server.url_for("v1/models"), headers={"Authorization": "Bearer test"}
    )
    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_not_v1_api_token(server: RemoteOpenAIServer):
    # Authorization check is skipped for any paths that
    # don't start with /v1 (e.g. /health, as exercised here).
    response = requests.get(server.url_for("health"))
    assert response.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server",
    ["--enable-request-id-headers"],
    indirect=True,
)
@pytest.mark.asyncio
async def test_enable_request_id_header(server: RemoteOpenAIServer):
    # With the flag enabled, responses carry a generated request id.
    response = requests.get(server.url_for("health"))
    assert "X-Request-Id" in response.headers
    # 32 hex chars — presumably a uuid4().hex; confirm against middleware impl.
    assert len(response.headers.get("X-Request-Id", "")) == 32
@pytest.mark.parametrize(
    "server",
    ["--enable-request-id-headers"],
    indirect=True,
)
@pytest.mark.asyncio
async def test_custom_request_id_header(server: RemoteOpenAIServer):
    # A caller-supplied X-Request-Id must be echoed back unchanged.
    response = requests.get(
        server.url_for("health"), headers={"X-Request-Id": "Custom"}
    )
    assert "X-Request-Id" in response.headers
    assert response.headers.get("X-Request-Id") == "Custom"

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import openai
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def monkeypatch_module():
    # Module-scoped monkeypatch (the builtin fixture is function-scoped).
    from _pytest.monkeypatch import MonkeyPatch
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()
@pytest.fixture(scope="module", params=[True])
def server(request, monkeypatch_module):
    """Server under test for the ORCA endpoint-load-metrics headers."""
    # NOTE(review): both `request.param` and `monkeypatch_module` are unused
    # here — possibly leftovers; confirm whether they can be dropped.
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async OpenAI client bound to the spawned server; closed on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_chat_completion_with_orca_header(server: RemoteOpenAIServer):
    """Chat completions should return the endpoint-load-metrics header when
    the request asks for the TEXT load-report format."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # NOTE(review): sync client in an async test, and the format header is set
    # both as a default header and per-request below — presumably redundant.
    client = openai.OpenAI(
        api_key="EMPTY",
        base_url=f"http://localhost:{server.port}/v1",
        default_headers={"endpoint-load-metrics-format": "TEXT"},
    )
    # 1. Use raw client to get response headers.
    raw_client = client.with_raw_response
    # 2. Make the API call using the raw_client
    response_with_raw = raw_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        extra_headers={"endpoint-load-metrics-format": "TEXT"},
    )
    # 3. Access the raw httpx.Response object
    raw_http_response = response_with_raw.http_response
    # 4. Get the headers from the httpx.Response object
    response_headers = raw_http_response.headers
    assert "endpoint-load-metrics" in response_headers
@pytest.mark.asyncio
async def test_completion_with_orca_header(client: openai.AsyncOpenAI):
    """Completions should return the endpoint-load-metrics header when the
    request asks for the JSON load-report format."""
    # 1. Use raw client to get response headers.
    raw_client = client.with_raw_response
    # 2. Make the API call using the raw_client
    completion = await raw_client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        extra_headers={"endpoint-load-metrics-format": "JSON"},
    )
    # 3. Access the raw httpx.Response object
    raw_http_response = completion.http_response
    # 4. Get the headers from the httpx.Response object
    response_headers = raw_http_response.headers
    assert "endpoint-load-metrics" in response_headers
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    """Basic completion sanity check: the ORCA header request must not break
    normal completion behavior (finish reason, usage accounting)."""
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        extra_headers={"endpoint-load-metrics-format": "JSON"},
        temperature=0.0,
    )
    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
    choice = completion.choices[0]
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    # When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374]
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=5, total_tokens=10
    )
    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 1
    assert completion.choices[0].prompt_logprobs is None

View File

@@ -0,0 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
from unittest.mock import Mock
# imports for structured outputs tests
import openai
import pybase64
import pytest
import regex as re
import torch
from vllm.config import ModelConfig
from vllm.entrypoints.renderer import CompletionRenderer
from ...utils import RemoteOpenAIServer
@pytest.mark.asyncio
async def test_empty_prompt():
    """An empty prompt with empty prompt_embeds must be rejected with a 400."""
    model_name = "gpt2"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        with pytest.raises(
            openai.BadRequestError,
            match="Either prompt or prompt_embeds must be provided and non-empty.",
        ):
            await client.completions.create(
                model=model_name,
                prompt="",
                max_tokens=5,
                temperature=0.0,
                extra_body={"prompt_embeds": []},
            )
@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """Prompt token IDs outside the model vocabulary must be rejected with 400.

    Fix: `re.compile(".*out of vocabulary.*").pattern` was an identity no-op
    (compile-then-.pattern returns the input string). `pytest.raises(match=...)`
    already performs a regex search, so the bare substring is equivalent.
    """
    model_name = "gpt2"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
            await client.completions.create(
                model=model_name, prompt=[999999], max_tokens=5, temperature=0.0
            )
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
    "layout", [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]
)
@pytest.mark.parametrize("seq_len", [2, 10])
@pytest.mark.parametrize("hidden_size", [2, 10])
def test_load_prompt_embeds(
    dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
):
    """Round-trip prompt embeddings through base64 serialization and verify
    the renderer always yields a dense (strided) CPU tensor with equal values.

    We cover sparse layouts because users may send sparse tensors to shrink
    the payload; the renderer must densify them before they reach the engine.
    CPU-only on purpose: avoids initializing CUDA in a forking test suite and
    keeps the test independent of available accelerators.
    """
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = True
    renderer = CompletionRenderer(model_config, tokenizer=None)
    source = torch.randn((seq_len, hidden_size), dtype=dtype)
    # Convert the dense sample into the parametrized layout.
    to_layout = {
        torch.strided: lambda t: t.contiguous(),
        torch.sparse_coo: lambda t: t.to_sparse_coo(),
        torch.sparse_csc: lambda t: t.to_sparse_csc(),
        torch.sparse_csr: lambda t: t.to_sparse_csr(),
    }
    convert = to_layout.get(layout)
    if convert is not None:
        source = convert(source)
    buffer = io.BytesIO()
    torch.save(source, buffer)
    buffer.seek(0)
    encoded_tensor = pybase64.b64encode(buffer.getvalue())
    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)
    assert len(loaded_prompt_embeds) == 1
    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
    assert loaded_tensor.device.type == "cpu"
    assert loaded_tensor.layout == torch.strided
    torch.testing.assert_close(
        loaded_tensor, source.to("cpu").to_dense(), equal_nan=True
    )
@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
    """Loading prompt embeds must fail when the feature flag is disabled."""
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = False
    renderer = CompletionRenderer(model_config, tokenizer=None)
    payload = io.BytesIO()
    torch.save(torch.randn((seq_len, hidden_size), dtype=dtype), payload)
    payload.seek(0)
    encoded_tensor = pybase64.b64encode(payload.getvalue())
    # The error message should point the user at the enabling flag.
    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
        renderer.load_prompt_embeds(encoded_tensor)

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai_harmony import (
Message,
)
from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
def test_serialize_message() -> None:
    """Plain dicts pass through untouched; Harmony Messages serialize back to
    the dict they were built from."""
    plain = {"a": 1, "b": "2"}
    assert serialize_message(plain) == plain
    expected = {
        "role": "assistant",
        "name": None,
        "content": [{"type": "text", "text": "Test 1"}],
        "channel": "analysis",
    }
    assert serialize_message(Message.from_dict(expected)) == expected
def test_serialize_messages() -> None:
    """None/empty input collapses to None; mixed lists serialize item-wise."""
    assert serialize_messages(None) is None
    assert serialize_messages([]) is None
    plain = {"a": 3, "b": "4"}
    expected = {
        "role": "assistant",
        "name": None,
        "content": [{"type": "text", "text": "Test 2"}],
        "channel": "analysis",
    }
    harmony_msg = Message.from_dict(expected)
    assert serialize_messages([harmony_msg, plain]) == [expected, plain]

View File

@@ -0,0 +1,261 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from openai_harmony import ToolDescription, ToolNamespaceConfig
from vllm.entrypoints.tool_server import MCPToolServer
from ...utils import RemoteOpenAIServer
# Model with built-in tool support, used for the MCP tool-server tests below.
MODEL_NAME = "openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def monkeypatch_module():
    # Module-scoped monkeypatch (the builtin fixture is function-scoped).
    from _pytest.monkeypatch import MonkeyPatch
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()
@pytest.fixture(scope="module")
def mcp_disabled_server(monkeypatch_module: pytest.MonkeyPatch):
    """Server with the demo tool server but NO MCP labels enabled."""
    args = ["--enforce-eager", "--tool-server", "demo"]
    with monkeypatch_module.context() as m:
        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
        m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
        # Helps the model follow instructions better
        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
@pytest.fixture(scope="function")
def mcp_enabled_server(monkeypatch_module: pytest.MonkeyPatch):
    """Server with MCP labels enabled for the code_interpreter/container tools."""
    args = ["--enforce-eager", "--tool-server", "demo"]
    with monkeypatch_module.context() as m:
        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
        m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv")
        m.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,container")
        # Helps the model follow instructions better
        m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1")
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
@pytest_asyncio.fixture
async def mcp_disabled_client(mcp_disabled_server):
    # Async client against the server without MCP labels.
    async with mcp_disabled_server.get_async_client() as async_client:
        yield async_client
@pytest_asyncio.fixture
async def mcp_enabled_client(mcp_enabled_server):
    # Async client against the server with MCP labels enabled.
    async with mcp_enabled_server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_env_flag_enabled(mcp_enabled_client: OpenAI, model_name: str):
    """With MCP labels enabled, an mcp tool request should produce real Python
    tool calls and responses on the analysis channel."""
    response = await mcp_enabled_client.responses.create(
        model=model_name,
        input=(
            "Execute the following code: "
            "import random; print(random.randint(1, 1000000))"
        ),
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        tools=[
            {
                "type": "mcp",
                "server_label": "code_interpreter",
                # URL unused for DemoToolServer
                "server_url": "http://localhost:8888",
            }
        ],
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Verify output messages: Tool calls and responses on analysis channel
    tool_call_found = False
    tool_response_found = False
    for message in response.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True
            assert message.get("channel") == "analysis", (
                "Tool call should be on analysis channel"
            )
        author = message.get("author", {})
        if (
            author.get("role") == "tool"
            and author.get("name")
            and author.get("name").startswith("python")
        ):
            tool_response_found = True
            assert message.get("channel") == "analysis", (
                "Tool response should be on analysis channel"
            )
    assert tool_call_found, "Should have found at least one Python tool call"
    assert tool_response_found, "Should have found at least one Python tool response"
    # With a valid mcp tool, no fallback developer message should be injected.
    for message in response.input_messages:
        assert message.get("author").get("role") != "developer", (
            "No developer messages should be present with valid mcp tool"
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_with_allowed_tools_star(
    mcp_enabled_client: OpenAI, model_name: str
):
    """Test MCP tool with allowed_tools=['*'] to select all available tools.
    This E2E test verifies that the "*" wildcard works end-to-end.
    See test_serving_responses.py for detailed unit tests of "*" normalization.
    """
    response = await mcp_enabled_client.responses.create(
        model=model_name,
        input=(
            "Execute the following code: "
            "import random; print(random.randint(1, 1000000))"
        ),
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        tools=[
            {
                "type": "mcp",
                "server_label": "code_interpreter",
                "server_url": "http://localhost:8888",
                # Using "*" to allow all tools from this MCP server
                "allowed_tools": ["*"],
            }
        ],
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Verify tool calls work with allowed_tools=["*"]
    tool_call_found = False
    for message in response.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True
            break
    assert tool_call_found, "Should have found at least one Python tool call with '*'"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_name: str):
    """With MCP disabled, no python tool call or tool response may appear in
    the output, and no developer message in the input transcript."""
    response = await mcp_disabled_client.responses.create(
        model=model_name,
        input=(
            "Execute the following code if the tool is present: "
            "import random; print(random.randint(1, 1000000))"
        ),
        tools=[
            {
                "type": "mcp",
                "server_label": "code_interpreter",
                # URL unused for DemoToolServer
                "server_url": "http://localhost:8888",
            }
        ],
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Verify output messages: No tool calls and responses
    tool_call_found = False
    tool_response_found = False
    for message in response.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True
            assert message.get("channel") == "analysis", (
                "Tool call should be on analysis channel"
            )
        author = message.get("author", {})
        if (
            author.get("role") == "tool"
            and author.get("name")
            and author.get("name").startswith("python")
        ):
            tool_response_found = True
            assert message.get("channel") == "analysis", (
                "Tool response should be on analysis channel"
            )
    assert not tool_call_found, "Should not have a python call"
    assert not tool_response_found, "Should not have a tool response"
    for message in response.input_messages:
        assert message.get("author").get("role") != "developer", (
            "No developer messages should be present without a valid tool"
        )
def test_get_tool_description():
    """Test MCPToolServer.get_tool_description filtering logic.
    Note: The wildcard "*" is normalized to None by
    _extract_allowed_tools_from_mcp_requests before reaching this layer,
    so we only test None and specific tool filtering here.
    See test_serving_responses.py for "*" normalization tests.
    """
    pytest.importorskip("mcp")
    # Build a server with three registered tools under a single namespace.
    server = MCPToolServer()
    tool1 = ToolDescription.new(
        name="tool1", description="First", parameters={"type": "object"}
    )
    tool2 = ToolDescription.new(
        name="tool2", description="Second", parameters={"type": "object"}
    )
    tool3 = ToolDescription.new(
        name="tool3", description="Third", parameters={"type": "object"}
    )
    server.harmony_tool_descriptions = {
        "test_server": ToolNamespaceConfig(
            name="test_server", description="test", tools=[tool1, tool2, tool3]
        )
    }
    # Nonexistent server
    assert server.get_tool_description("nonexistent") is None
    # None (no filter) - returns all tools
    result = server.get_tool_description("test_server", allowed_tools=None)
    assert len(result.tools) == 3
    # Filter to specific tools - namespace order is preserved
    result = server.get_tool_description(
        "test_server", allowed_tools=["tool1", "tool3"]
    )
    assert len(result.tools) == 2
    assert result.tools[0].name == "tool1"
    assert result.tools[1].name == "tool3"
    # Single tool
    result = server.get_tool_description("test_server", allowed_tools=["tool2"])
    assert len(result.tools) == 1
    assert result.tools[0].name == "tool2"
    # No matching tools - returns None
    result = server.get_tool_description("test_server", allowed_tools=["nonexistent"])
    assert result is None
    # Empty list - returns None
    assert server.get_tool_description("test_server", allowed_tools=[]) is None

View File

@@ -0,0 +1,180 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-8B"
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server: Qwen3 reasoning/tool parsing plus the demo
    tool server and the experimental parser context."""
    assert importlib.util.find_spec("gpt_oss") is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )
    args = [
        "--reasoning-parser",
        "qwen3",
        "--max_model_len",
        "5000",
        "--structured-outputs-config.backend",
        "xgrammar",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--tool-server",
        "demo",
    ]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1",
        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a plain text prompt completes successfully."""
    resp = await client.responses.create(model=model_name, input="What is 13 * 24?")
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_and_function_items(client: OpenAI, model_name: str):
    """Replaying reasoning, function_call and function_call_output items as
    input is accepted and still yields reasoning + message output."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
            {
                "arguments": '{"location": "Paris", "unit": "celsius"}',
                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
                "name": "get_weather",
                "type": "function_call",
                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
                "status": "completed",
            },
            {
                "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab",
                "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78",
                "output": "The weather in Paris is 20 Celsius",
                "status": "completed",
                "type": "function_call_output",
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    assert type(response.output[1].content[0].text) is str
def get_horoscope(sign):
    """Return a canned horoscope line for *sign* (local function-call stub)."""
    return "{}: Next Tuesday you will befriend a baby otter.".format(sign)
def call_function(name, args):
    """Dispatch a model-issued function call to the matching local stub."""
    if name != "get_horoscope":
        raise ValueError(f"Unknown function: {name}")
    return get_horoscope(**args)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_first_turn(client: OpenAI, model_name: str):
    """The first turn of a tool-augmented request produces reasoning followed
    by a well-formed function_call item with JSON arguments."""
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sign": {"type": "string"},
                },
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    response = await client.responses.create(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"
    function_call = response.output[1]
    assert function_call.name == "get_horoscope"
    assert function_call.call_id is not None
    # Arguments must be valid JSON containing the required parameter.
    args = json.loads(function_call.arguments)
    assert "sign" in args
# the multi turn function call is tested above in
# test_reasoning_and_function_items
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_call(client: OpenAI, model_name: str):
    """code_interpreter surfaces as an mcp_call item with string
    arguments/output, followed by more reasoning and the final message."""
    response = await client.responses.create(
        model=model_name,
        input="What is 13 * 24? Use python to calculate the result.",
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "mcp_call"
    assert type(response.output[1].arguments) is str
    assert type(response.output[1].output) is str
    assert response.output[2].type == "reasoning"
    # make sure the correct math is in the final output
    assert response.output[3].type == "message"
    assert "312" in response.output[3].content[0].text

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-8B"
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server with the Qwen3 reasoning parser (no tools)."""
    args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        # uncomment for tool calling
        # PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a plain text prompt completes successfully."""
    resp = await client.responses.create(model=model_name, input="What is 13 * 24?")
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_enable_response_messages(client: OpenAI, model_name: str):
    """enable_response_messages exposes raw token-level input/output messages
    (text plus integer token ids) on the response."""
    response = await client.responses.create(
        model=model_name,
        input="Hello?",
        extra_body={"enable_response_messages": True},
    )
    assert response.status == "completed"
    assert response.input_messages[0]["type"] == "raw_message_tokens"
    assert type(response.input_messages[0]["message"]) is str
    assert len(response.input_messages[0]["message"]) > 10
    assert type(response.input_messages[0]["tokens"][0]) is int
    assert type(response.output_messages[0]["message"]) is str
    assert len(response.output_messages[0]["message"]) > 10
    assert type(response.output_messages[0]["tokens"][0]) is int
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A replayed reasoning input item is accepted, and the response still
    contains reasoning followed by a text message."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
    # make sure we get a reasoning and text output
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "message"
    assert type(response.output[1].content[0].text) is str

View File

@@ -0,0 +1,988 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import time
import pytest
import pytest_asyncio
import requests
from openai import BadRequestError, NotFoundError, OpenAI
from openai_harmony import (
Message,
)
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/gpt-oss-20b"
# Strict function-calling tool schema shared by the function-calling tests.
GET_WEATHER_SCHEMA = {
    "type": "function",
    "name": "get_weather",
    "description": "Get current temperature for provided coordinates in celsius.",  # noqa
    "parameters": {
        "type": "object",
        "properties": {
            "latitude": {"type": "number"},
            "longitude": {"type": "number"},
        },
        "required": ["latitude", "longitude"],
        "additionalProperties": False,
    },
    "strict": True,
}
@pytest.fixture(scope="module")
def server():
    """Module-scoped gpt-oss server with the demo tool server enabled."""
    assert importlib.util.find_spec("gpt_oss") is not None, (
        "Harmony tests require gpt_oss package to be installed"
    )
    args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
    )
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client for the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
    """Smoke test: a plain text prompt completes successfully."""
    resp = await client.responses.create(model=model_name, input="What is 13 * 24?")
    assert resp is not None
    print("response: ", resp)
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_instructions(client: OpenAI, model_name: str):
    """The `instructions` field is accepted and the request completes."""
    resp = await client.responses.create(
        model=model_name, input="What is 13 * 24?", instructions="Respond in Korean."
    )
    assert resp is not None
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
    """A reasoning-effort hint is accepted and the request completes."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        reasoning={"effort": "low"},
    )
    assert resp is not None
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A tiny max_output_tokens budget yields an incomplete response."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert resp is not None
    # Truncation surfaces via status + incomplete_details, not an error.
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):
    """Chat-style multi-message input (system/user/assistant) completes."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": "Respond in Korean."},
            {"role": "user", "content": "Hello!"},
            {"role": "assistant", "content": "Hello! How can I help you today?"},
            {"role": "user", "content": "What is 13 * 24? Explain your answer."},
        ],
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_with_input_type(client: OpenAI, model_name: str):
    """Structured content parts (input_text) are accepted as input."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {
                "role": "user",
                "content": [{"type": "input_text", "text": "What is 13*24?"}],
            },
        ],
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output(client: OpenAI, model_name: str):
    """A strict json_schema output format is accepted and completes."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": "Extract the event information."},
            {
                "role": "user",
                "content": "Alice and Bob are going to a science fair on Friday.",
            },
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "calendar_event",
                "schema": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "date": {"type": "string"},
                        "participants": {"type": "array", "items": {"type": "string"}},
                    },
                    "required": ["name", "date", "participants"],
                    "additionalProperties": False,
                },
                "description": "A calendar event.",
                "strict": True,
            }
        },
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_structured_output_with_parse(client: OpenAI, model_name: str):
    """responses.parse accepts a pydantic model as text_format."""
    from pydantic import BaseModel
    class CalendarEvent(BaseModel):
        # Fields mirror the json_schema used in test_structured_output.
        name: str
        date: str
        participants: list[str]
    response = await client.responses.parse(
        model=model_name,
        input="Alice and Bob are going to a science fair on Friday",
        instructions="Extract the event information",
        text_format=CalendarEvent,
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_store(client: OpenAI, model_name: str):
    """A response is retrievable by id iff it was created with store=True."""
    for store in [True, False]:
        resp = await client.responses.create(
            model=model_name,
            input="What is 13 * 24?",
            store=store,
        )
        assert resp is not None
        found = True
        try:
            await client.responses.retrieve(resp.id)
        except NotFoundError:
            found = False
        # store=True -> retrievable; store=False -> 404.
        assert found == store
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background(client: OpenAI, model_name: str):
    """A background request eventually reaches the "completed" status.

    Polls the response by id for up to ``max_retries`` seconds. Uses
    ``asyncio.sleep`` instead of the original ``time.sleep``: a blocking
    sleep inside a coroutine stalls the whole event loop for a second per
    retry, which also delays the server-polling client machinery.
    """
    import asyncio

    response = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
        background=True,
    )
    assert response is not None
    max_retries = 30
    for _ in range(max_retries):
        response = await client.responses.retrieve(response.id)
        if response.status == "completed":
            break
        # Yield to the event loop between polls.
        await asyncio.sleep(1)
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_background_cancel(client: OpenAI, model_name: str):
    """A background request can be cancelled after it has started.

    Uses ``asyncio.sleep`` instead of the original blocking ``time.sleep``
    so the wait does not stall the event loop while the background
    generation is running.
    """
    import asyncio

    response = await client.responses.create(
        model=model_name,
        input="Write a long story about a cat.",
        background=True,
    )
    assert response is not None
    # Give the background task a moment to start before cancelling.
    await asyncio.sleep(1)
    cancelled_response = await client.responses.cancel(response.id)
    assert cancelled_response is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
    """Three turns chained via previous_response_id all complete."""
    response1 = await client.responses.create(
        model=model_name,
        input="What is 13 * 24?",
    )
    assert response1 is not None
    assert response1.status == "completed"
    response2 = await client.responses.create(
        model=model_name,
        input="What if I increase both numbers by 1?",
        previous_response_id=response1.id,
    )
    assert response2 is not None
    assert response2.status == "completed"
    response3 = await client.responses.create(
        model=model_name,
        input="Divide the result by 2.",
        previous_response_id=response2.id,
    )
    assert response3 is not None
    assert response3.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_types(client: OpenAI, model_name: str):
    """Every terminal stream event must close a matching opener.

    Maintains a stack of "start" event types; consecutive deltas of the
    same type are collapsed, and each done/completed event must match the
    type on top of the stack. The stack must be empty at end of stream.
    """
    prompts = [
        "tell me a story about a cat in 20 words",
    ]
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.web_search_call.done": "response.web_search_call.added",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
    }
    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[],
            stream=True,
            background=False,
        )
        stack_of_event_types = []
        async for event in response:
            if event.type == "response.created":
                stack_of_event_types.append(event.type)
            elif event.type == "response.completed":
                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                stack_of_event_types.pop()
            if event.type.endswith("added"):
                stack_of_event_types.append(event.type)
            elif event.type.endswith("delta"):
                # Collapse runs of identical delta events into one stack entry.
                if stack_of_event_types[-1] == event.type:
                    continue
                stack_of_event_types.append(event.type)
            elif event.type.endswith("done"):
                assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
                stack_of_event_types.pop()
        assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str):
    """Same open/close event pairing check as test_streaming_types, driven
    through a function-calling request so the function_call_arguments
    delta/done pair is exercised too."""
    # this links the "done" type with the "start" type
    # so every "done" type should have a corresponding "start" type
    # and every open block should be closed by the end of the stream
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.function_call_arguments.done": "response.function_call_arguments.delta",  # noqa
    }
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    stack_of_event_types = []
    async for event in stream_response:
        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
        if event.type.endswith("added"):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            # Collapse runs of identical delta events into one stack entry.
            if stack_of_event_types[-1] == event.type:
                continue
            stack_of_event_types.append(event.type)
        elif event.type.endswith("done"):
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
    assert len(stack_of_event_types) == 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("background", [True, False])
async def test_streaming(client: OpenAI, model_name: str, background: bool):
    """Stream responses (optionally in background mode) and validate events.

    Checks item-id/content-index bookkeeping across delta events, that the
    vLLM extension fields ``input_messages``/``output_messages`` round-trip
    through harmony ``Message.from_dict``, and — in background mode — that
    the stream can be resumed with ``starting_after`` and replays events
    identical to the ones already received.
    """
    # TODO: Add back when web search and code interpreter are available in CI
    prompts = [
        "tell me a story about a cat in 20 words",
        "What is 13 * 24? Use python to calculate the result.",
        # "When did Jensen found NVIDIA? Search it and answer the year only.",
    ]
    for prompt in prompts:
        response = await client.responses.create(
            model=model_name,
            input=prompt,
            reasoning={"effort": "low"},
            tools=[
                # {
                #     "type": "web_search_preview"
                # },
                {"type": "code_interpreter", "container": {"type": "auto"}},
            ],
            stream=True,
            background=background,
            extra_body={"enable_response_messages": True},
        )
        current_item_id = ""
        current_content_index = -1
        events = []
        current_event_mode = None
        resp_id = None
        checked_response_completed = False
        async for event in response:
            if event.type == "response.created":
                resp_id = event.response.id
            # test vllm custom types are in the response
            if event.type in [
                "response.completed",
                "response.in_progress",
                "response.created",
            ]:
                assert "input_messages" in event.response.model_extra
                assert "output_messages" in event.response.model_extra
                if event.type == "response.completed":
                    # make sure the serialization of content works
                    for msg in event.response.model_extra["output_messages"]:
                        # make sure we can convert the messages back into harmony
                        Message.from_dict(msg)
                    for msg in event.response.model_extra["input_messages"]:
                        # make sure we can convert the messages back into harmony
                        Message.from_dict(msg)
                    checked_response_completed = True
            if current_event_mode != event.type:
                current_event_mode = event.type
                print(f"\n[{event.type}] ", end="", flush=True)
            # verify current_item_id is correct
            if event.type == "response.output_item.added":
                # Every new output item must carry a fresh id.
                assert event.item.id != current_item_id
                current_item_id = event.item.id
            elif event.type in [
                "response.output_text.delta",
                "response.reasoning_text.delta",
            ]:
                # Deltas must reference the most recently added item.
                assert event.item_id == current_item_id
            # verify content_index_id is correct
            if event.type in [
                "response.content_part.added",
                "response.reasoning_part.added",
            ]:
                assert event.content_index != current_content_index
                current_content_index = event.content_index
            elif event.type in [
                "response.output_text.delta",
                "response.reasoning_text.delta",
            ]:
                assert event.content_index == current_content_index
            if "text.delta" in event.type:
                print(event.delta, end="", flush=True)
            elif "reasoning_text.delta" in event.type:
                print(f"{event.delta}", end="", flush=True)
            elif "response.code_interpreter_call_code.done" in event.type:
                print(f"Code: {event.code}", end="", flush=True)
            elif (
                "response.output_item.added" in event.type
                and event.item.type == "web_search_call"
            ):
                print(f"Web search: {event.item.action}", end="", flush=True)
            events.append(event)
        assert len(events) > 0
        response_completed_event = events[-1]
        assert len(response_completed_event.response.output) > 0
        assert checked_response_completed
        if background:
            # Resume the background stream part-way through and verify the
            # replayed events match what we already collected.
            starting_after = 5
            async with await client.responses.retrieve(
                response_id=resp_id, stream=True, starting_after=starting_after
            ) as stream:
                counter = starting_after
                async for event in stream:
                    counter += 1
                    assert event == events[counter]
                assert counter == len(events) - 1
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str):
    """Web-search tool round trip (skipped until CI provides the tool)."""
    resp = await client.responses.create(
        model=model_name,
        input="Who is the president of South Korea as of now?",
        tools=[{"type": "web_search_preview"}],
    )
    assert resp is not None
    assert resp.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
    """The code interpreter runs python and its result reaches the answer."""
    response = await client.responses.create(
        model=model_name,
        # TODO: Ideally should be able to set max tool calls
        # to prevent multi-turn, but it is not currently supported
        # would speed up the test
        input=(
            "What's the first 4 digits after the decimal point of "
            "cube root of `19910212 * 20250910`? "
            "Show only the digits. The python interpreter is not stateful "
            "and you must print to see the output."
        ),
        tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
        temperature=0.0,  # More deterministic output in response
    )
    assert response is not None
    assert response.status == "completed"
    # Nonzero tool-output tokens prove the interpreter actually ran.
    assert response.usage.output_tokens_details.tool_output_tokens > 0
    for item in response.output:
        if item.type == "message":
            output_string = item.content[0].text
            print("output_string: ", output_string, flush=True)
            assert "5846" in output_string
def get_weather(latitude, longitude):
    """Fetch the current temperature (celsius) for the given coordinates
    from the open-meteo API (local stub used to answer function calls)."""
    response = requests.get(
        f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"  # noqa
    )
    data = response.json()
    return data["current"]["temperature_2m"]
def get_place_to_travel():
    """Return a fixed destination (local stub used to answer function calls)."""
    return "Paris"
def get_horoscope(sign):
    """Return a canned horoscope line for *sign* (local function-call stub)."""
    return "{}: Next Tuesday you will befriend a baby otter.".format(sign)
def call_function(name, args):
    """Dispatch a model-issued function call to the matching local stub."""
    handlers = {
        "get_weather": lambda: get_weather(**args),
        "get_place_to_travel": lambda: get_place_to_travel(),
        "get_horoscope": lambda: get_horoscope(**args),
    }
    if name not in handlers:
        raise ValueError(f"Unknown function: {name}")
    return handlers[name]()
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
    """A replayed reasoning input item is accepted and the request completes."""
    response = await client.responses.create(
        model=model_name,
        input=[
            {"type": "message", "content": "Hello.", "role": "user"},
            {
                "type": "reasoning",
                "id": "lol",
                "content": [
                    {
                        "type": "reasoning_text",
                        "text": "We need to respond: greeting.",
                    }
                ],
                "summary": [],
            },
        ],
        temperature=0.0,
    )
    assert response is not None
    assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling(client: OpenAI, model_name: str):
    """Three-turn function-calling flow chained via previous_response_id."""
    tools = [GET_WEATHER_SCHEMA]
    response = await client.responses.create(
        model=model_name,
        input="What's the weather like in Paris today?",
        tools=tools,
        temperature=0.0,
        extra_body={"request_id": "test_function_calling_non_resp"},
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"
    tool_call = response.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    # Execute the requested function locally and feed the result back.
    result = call_function(name, args)
    response_2 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response.id,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None
    # NOTE: chain-of-thought should be removed.
    response_3 = await client.responses.create(
        model=model_name,
        input="What's the weather like in Paris today?",
        tools=tools,
        previous_response_id=response_2.id,
    )
    assert response_3 is not None
    assert response_3.status == "completed"
    assert response_3.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=5)
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
    """Two chained tool calls (pick a place, then its weather) followed by a
    final natural-language answer."""
    tools = [
        {
            "type": "function",
            "name": "get_place_to_travel",
            "description": "Get a random place to travel",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
                "additionalProperties": False,
            },
            "strict": True,
        },
        GET_WEATHER_SCHEMA,
    ]
    response = await client.responses.create(
        model=model_name,
        input="Help me plan a trip to a random place. And tell me the weather there.",
        tools=tools,
    )
    assert response is not None
    assert response.status == "completed"
    assert len(response.output) == 2
    assert response.output[0].type == "reasoning"
    assert response.output[1].type == "function_call"
    tool_call = response.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    # First tool round trip.
    result = call_function(name, args)
    response_2 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response.id,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert len(response_2.output) == 2
    assert response_2.output[0].type == "reasoning"
    assert response_2.output[1].type == "function_call"
    tool_call = response_2.output[1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    # Second tool round trip.
    result = call_function(name, args)
    response_3 = await client.responses.create(
        model=model_name,
        input=[
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        previous_response_id=response_2.id,
    )
    assert response_3 is not None
    assert response_3.status == "completed"
    assert response_3.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_required(client: OpenAI, model_name: str):
    """tool_choice='required' is rejected with a 400 BadRequestError."""
    with pytest.raises(BadRequestError):
        await client.responses.create(
            model=model_name,
            input="What's the weather like in Paris today?",
            tools=[GET_WEATHER_SCHEMA],
            tool_choice="required",
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_system_message_with_tools(client: OpenAI, model_name: str):
    """get_system_message only offers the commentary channel when custom
    tools are enabled."""
    from vllm.entrypoints.openai.parser.harmony_utils import get_system_message
    # Test with custom tools enabled - commentary channel should be available
    sys_msg = get_system_message(with_custom_tools=True)
    valid_channels = sys_msg.content[0].channel_config.valid_channels
    assert "commentary" in valid_channels
    # Test with custom tools disabled - commentary channel should be removed
    sys_msg = get_system_message(with_custom_tools=False)
    valid_channels = sys_msg.content[0].channel_config.valid_channels
    assert "commentary" not in valid_channels
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_full_history(client: OpenAI, model_name: str):
    """Function calling where the full transcript (including the model's own
    function_call items) is resent instead of using previous_response_id."""
    tools = [GET_WEATHER_SCHEMA]
    input_messages = [
        {"role": "user", "content": "What's the weather like in Paris today?"}
    ]
    response = await client.responses.create(
        model=model_name,
        input=input_messages,
        tools=tools,
    )
    assert response is not None
    assert response.status == "completed"
    tool_call = response.output[-1]
    name = tool_call.name
    args = json.loads(tool_call.arguments)
    result = call_function(name, args)
    input_messages.extend(response.output)  # append model's function call message
    input_messages.append(
        {  # append result message
            "type": "function_call_output",
            "call_id": tool_call.call_id,
            "output": str(result),
        }
    )
    response_2 = await client.responses.create(
        model=model_name,
        input=input_messages,
        tools=tools,
    )
    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_with_stream(client: OpenAI, model_name: str):
    """Streamed function calling: reassemble the tool call from stream
    events, execute it, then stream the follow-up turn and verify no further
    function-call events appear."""
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )
    assert stream_response is not None
    # Reassembled tool calls, keyed by output index (for argument deltas)
    # and by name (for cross-checking the ".done" event).
    final_tool_calls = {}
    final_tool_calls_named = {}
    async for event in stream_response:
        if event.type == "response.output_item.added":
            if event.item.type != "function_call":
                continue
            final_tool_calls[event.output_index] = event.item
            final_tool_calls_named[event.item.name] = event.item
        elif event.type == "response.function_call_arguments.delta":
            index = event.output_index
            tool_call = final_tool_calls[index]
            if tool_call:
                tool_call.arguments += event.delta
                final_tool_calls_named[tool_call.name] = tool_call
        elif event.type == "response.function_call_arguments.done":
            # The final arguments must equal what the deltas accumulated.
            assert event.arguments == final_tool_calls_named[event.name].arguments
    # Locate and execute the expected get_weather call. Initialize both to
    # None so a missing call fails the assertions below with a clear message
    # instead of raising NameError on an unbound local.
    tool_call = None
    result = None
    for candidate in final_tool_calls.values():
        if (
            candidate
            and candidate.type == "function_call"
            and candidate.name == "get_weather"
        ):
            tool_call = candidate
            args = json.loads(candidate.arguments)
            result = call_function(candidate.name, args)
            input_list += [candidate]
            break
    assert tool_call is not None
    assert result is not None
    response = await client.responses.create(
        model=model_name,
        input=input_list
        + [
            {
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result),
            }
        ],
        tools=tools,
        stream=True,
    )
    assert response is not None
    async for event in response:
        # check that no function call events in the stream
        assert event.type != "response.function_call_arguments.delta"
        assert event.type != "response.function_call_arguments.done"
        # check that the response contains output text
        if event.type == "response.completed":
            assert len(event.response.output) > 0
            assert event.response.output_text is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
    """enable_response_messages populates the raw input/output message lists."""
    response = await client.responses.create(
        model=model_name,
        input="What is the capital of South Korea?",
        extra_body={"enable_response_messages": True},
    )
    assert response is not None
    assert response.status == "completed"
    # Both raw message lists must be non-empty when the flag is set.
    assert len(response.input_messages) > 0
    assert len(response.output_messages) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_call_with_previous_input_messages(
    client: OpenAI, model_name: str
):
    """Test function calling using previous_input_messages
    for multi-turn conversation with a function call"""
    # Define the get_horoscope tool
    tools = [
        {
            "type": "function",
            "name": "get_horoscope",
            "description": "Get today's horoscope for an astrological sign.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sign": {"type": "string"},
                },
                "required": ["sign"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    ]
    # Step 1: First call with the function tool
    stream_response = await client.responses.create(
        model=model_name,
        input="What is the horoscope for Aquarius today?",
        tools=tools,
        extra_body={"enable_response_messages": True},
        stream=True,
    )
    response = None
    async for event in stream_response:
        if event.type == "response.completed":
            response = event.response
    assert response is not None
    assert response.status == "completed"
    # Step 2: Parse the first output to find the function_call type
    function_call = None
    for item in response.output:
        if item.type == "function_call":
            function_call = item
            break
    assert function_call is not None, "Expected a function_call in the output"
    assert function_call.name == "get_horoscope"
    assert function_call.call_id is not None
    # Verify the format matches expectations
    args = json.loads(function_call.arguments)
    assert "sign" in args
    # Step 3: Call the get_horoscope function
    result = call_function(function_call.name, args)
    assert "Aquarius" in result
    assert "baby otter" in result
    # Get the input_messages and output_messages from the first response
    first_input_messages = response.input_messages
    first_output_messages = response.output_messages
    # Construct the full conversation history using previous_input_messages
    previous_messages = (
        first_input_messages
        + first_output_messages
        + [
            {
                "role": "tool",
                "name": "functions.get_horoscope",
                "content": [{"type": "text", "text": str(result)}],
            }
        ]
    )
    # Step 4: Make another responses.create() call with previous_input_messages
    stream_response_2 = await client.responses.create(
        model=model_name,
        tools=tools,
        input="",
        extra_body={
            "previous_input_messages": previous_messages,
            "enable_response_messages": True,
        },
        stream=True,
    )
    # Initialize before the loop (mirrors step 1) so a stream that never
    # emits "response.completed" fails the assertion below with a clear
    # message instead of raising NameError on an unbound local.
    response_2 = None
    async for event in stream_response_2:
        if event.type == "response.completed":
            response_2 = event.response
    assert response_2 is not None
    assert response_2.status == "completed"
    assert response_2.output_text is not None
    # verify only one system message / developer message
    num_system_messages_input = 0
    num_developer_messages_input = 0
    num_function_call_input = 0
    for message_dict in response_2.input_messages:
        message = Message.from_dict(message_dict)
        if message.author.role == "system":
            num_system_messages_input += 1
        elif message.author.role == "developer":
            num_developer_messages_input += 1
        elif message.author.role == "tool":
            num_function_call_input += 1
    assert num_system_messages_input == 1
    assert num_developer_messages_input == 1
    assert num_function_call_input == 1
    # Verify the output makes sense - should contain information about the horoscope
    output_text = response_2.output_text.lower()
    assert (
        "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
    )

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from unittest.mock import MagicMock
import pytest
from vllm.entrypoints.openai.protocol import ErrorResponse
from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
@pytest.mark.asyncio
async def test_raise_if_error_raises_generation_error():
    """_raise_if_error raises GenerationError only for the "error" reason."""
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100
    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    # An "error" finish reason must surface as an internal-server error.
    with pytest.raises(GenerationError) as exc_info:
        serving._raise_if_error("error", "test-request-id")
    err = exc_info.value
    assert str(err) == "Internal server error"
    assert err.status_code == HTTPStatus.INTERNAL_SERVER_ERROR

    # Normal finish reasons (and None) pass through without raising.
    for finish_reason in ("stop", "length", None):
        serving._raise_if_error(finish_reason, "test-request-id")
@pytest.mark.asyncio
async def test_convert_generation_error_to_response():
    """_convert_generation_error_to_response builds a proper ErrorResponse."""
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100
    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    converted = serving._convert_generation_error_to_response(
        GenerationError("Internal server error")
    )

    assert isinstance(converted, ErrorResponse)
    assert converted.error.type == "InternalServerError"
    assert converted.error.message == "Internal server error"
    assert converted.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
async def test_convert_generation_error_to_streaming_response():
    """The streaming conversion yields a JSON string describing the error."""
    engine = MagicMock()
    engine.model_config = MagicMock()
    engine.model_config.max_model_len = 100
    serving = OpenAIServing(
        engine_client=engine,
        models=MagicMock(),
        request_logger=None,
    )

    payload = serving._convert_generation_error_to_streaming_response(
        GenerationError("Internal server error")
    )

    assert isinstance(payload, str)
    # Both the message text and the error type must appear in the payload.
    for fragment in ("Internal server error", "InternalServerError"):
        assert fragment in payload

View File

@@ -0,0 +1,330 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test function call parsing in ResponsesRequest."""
import json
import pytest
from openai.types.responses import ResponseFunctionToolCall
from vllm.entrypoints.openai.protocol import ResponsesRequest
def test_function_call_dict_converted_to_object():
    """A function_call given as a plain dict is parsed into a
    ResponseFunctionToolCall instance."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            {
                "type": "function_call",
                "call_id": "fc_123",
                "name": "get_weather",
                "arguments": '{"location": "Boston", "unit": "celsius"}',
            }
        ],
    )

    # Exactly one item, now a typed object with every field preserved.
    assert len(request.input) == 1
    item = request.input[0]
    assert isinstance(item, ResponseFunctionToolCall)
    assert item.call_id == "fc_123"
    assert item.name == "get_weather"
    assert item.arguments == '{"location": "Boston", "unit": "celsius"}'
def test_direct_function_call_object_preservation():
    """A ResponseFunctionToolCall passed directly survives validation as-is."""
    call = ResponseFunctionToolCall(
        type="function_call",
        call_id="fc_456",
        name="get_stock_price",
        arguments='{"symbol": "AAPL"}',
    )

    request = ResponsesRequest(model="gpt-oss", input=[call])

    # The identical object instance comes back out of validation.
    assert len(request.input) == 1
    assert request.input[0] is call
def test_mixed_input_types_with_function_calls():
    """Message dicts stay dicts while function_call dicts become typed."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            # Valid Message type
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "What's the weather?"}],
            },
            # Two function calls that should both be parsed
            {
                "type": "function_call",
                "call_id": "fc_789",
                "name": "check_weather",
                "arguments": '{"location": "NYC"}',
            },
            {
                "type": "function_call",
                "call_id": "fc_790",
                "name": "get_time",
                "arguments": "{}",
            },
        ],
    )

    assert len(request.input) == 3
    message, first_call, second_call = request.input
    # The message is validated but not converted to an object.
    assert message["type"] == "message"
    # Both function_call entries are parsed into typed objects.
    assert isinstance(first_call, ResponseFunctionToolCall)
    assert (first_call.call_id, first_call.name) == ("fc_789", "check_weather")
    assert isinstance(second_call, ResponseFunctionToolCall)
    assert (second_call.call_id, second_call.name) == ("fc_790", "get_time")
def test_function_call_with_complex_arguments():
    """Deeply nested JSON arguments survive the round trip untouched."""
    nested_args = {
        "query": "weather forecast",
        "filters": {
            "location": {"city": "San Francisco", "state": "CA"},
            "timeRange": {"start": "2024-01-01", "end": "2024-01-07"},
            "metrics": ["temperature", "humidity", "precipitation"],
        },
        "options": {"format": "detailed", "includeAlerts": True},
    }

    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            {
                "type": "function_call",
                "call_id": "fc_complex",
                "name": "advanced_weather_query",
                "arguments": json.dumps(nested_args),
            }
        ],
    )

    assert len(request.input) == 1
    call = request.input[0]
    assert isinstance(call, ResponseFunctionToolCall)
    assert call.call_id == "fc_complex"
    assert call.name == "advanced_weather_query"
    # Round-trip the arguments to prove nothing was lost or mangled.
    assert json.loads(call.arguments) == nested_args
def test_invalid_function_call_fallback():
    """A function_call dict missing call_id is rejected by Pydantic."""
    # The validator keeps the malformed dict as-is and lets Pydantic's own
    # validation raise, so model construction must fail here.
    with pytest.raises(ValueError):
        ResponsesRequest(
            model="gpt-oss",
            input=[
                {
                    "type": "function_call",
                    "name": "incomplete_function",
                    "arguments": "{}",
                }
            ],
        )
def test_string_input_not_affected():
    """Plain string input passes through the validator untouched."""
    prompt = "This is a simple string input"
    request = ResponsesRequest(model="gpt-oss", input=prompt)
    assert request.input == prompt
def test_empty_list_input():
    """An empty input list is preserved as an empty list."""
    request = ResponsesRequest(model="gpt-oss", input=[])
    assert request.input == []
def test_function_call_output_not_affected():
    """function_call_output items are left as plain dicts by the parser."""
    output_item = {
        "type": "function_call_output",
        "call_id": "fc_output_123",
        "output": "The weather in Boston is 72°F and sunny.",
    }

    request = ResponsesRequest(model="gpt-oss", input=[output_item])

    # The item must still be a dict, not a converted object.
    assert len(request.input) == 1
    item = request.input[0]
    assert isinstance(item, dict)
    assert item["type"] == "function_call_output"
    assert item["call_id"] == "fc_output_123"
    assert item["output"] == "The weather in Boston is 72°F and sunny."
def test_mixed_function_call_and_output():
    """function_call is converted while its matching output stays a dict."""
    request = ResponsesRequest(
        model="gpt-oss",
        input=[
            # The request half of the pair should become a typed object...
            {
                "type": "function_call",
                "call_id": "fc_call_456",
                "name": "get_weather",
                "arguments": '{"location": "NYC"}',
            },
            # ...while the result half should remain an untouched dict.
            {
                "type": "function_call_output",
                "call_id": "fc_call_456",
                "output": "NYC weather is 68°F with light rain",
            },
        ],
    )

    assert len(request.input) == 2
    call, call_output = request.input
    assert isinstance(call, ResponseFunctionToolCall)
    assert call.call_id == "fc_call_456"
    assert call.name == "get_weather"
    assert isinstance(call_output, dict)
    assert call_output["type"] == "function_call_output"
    assert call_output["call_id"] == "fc_call_456"
    assert call_output["output"] == "NYC weather is 68°F with light rain"
def test_function_call_validation_failure_logs_debug():
    """Parse failures for function_call dicts are logged at DEBUG level.

    The input is missing the required ``call_id`` field, so the validator's
    attempt to build a ``ResponseFunctionToolCall`` fails; that failure must
    be reported via ``logger.debug`` before Pydantic rejects the request.
    (The previous version requested the ``caplog`` fixture but never used
    it — the module logger is patched instead — so the fixture is removed.)
    """
    from unittest.mock import patch

    request_data = {
        "model": "gpt-oss",
        "input": [
            {
                "type": "function_call",
                "name": "incomplete_function",
                "arguments": "{}",  # Missing call_id
            }
        ],
    }
    # Patch the module logger so the debug call can be observed directly.
    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
        with pytest.raises(ValueError):
            ResponsesRequest(**request_data)
        # Exactly one debug record, mentioning the parse failure.
        mock_logger.debug.assert_called_once()
        call_args = mock_logger.debug.call_args[0][0]
        assert "Failed to parse function_call" in call_args
def test_validator_handles_iterator_input():
    """The input validator accepts an iterator (Pydantic ValidatorIterator).

    Pydantic may hand the validator a lazy iterator rather than a list for
    complex nested structures (e.g. reasoning + function_call items); the
    validator must materialize and process it without error. The previous
    version wrapped the assertions in ``try/except Exception`` ending in
    ``pytest.fail`` — that also swallowed AssertionError from the asserts
    themselves, replacing a precise failure with a generic message, so the
    wrapper is removed and exceptions propagate naturally.
    """
    test_input_items = [
        {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "Test"}],
        },
        {
            "type": "reasoning",
            "id": "rs_1",
            "summary": [{"type": "summary_text", "text": "Test reasoning"}],
            "content": [{"type": "reasoning_text", "text": "Test content"}],
        },
        {
            "type": "function_call",
            "call_id": "call_1",
            "name": "test_function",
            "arguments": '{"test": "value"}',
            "id": "fc_1",
        },
    ]

    # Pass an iterator instead of a list to simulate Pydantic's behavior.
    request = ResponsesRequest(
        model="test-model",
        input=iter(test_input_items),
    )

    # All three items must have been materialized.
    assert len(request.input) == 3
    # The function_call dict must have been converted to a typed object.
    function_call_item = next(
        (
            item
            for item in request.input
            if isinstance(item, ResponseFunctionToolCall)
        ),
        None,
    )
    assert function_call_item is not None
    assert function_call_item.call_id == "call_1"
    assert function_call_item.name == "test_function"
def test_validator_handles_empty_iterator():
    """An empty iterator input normalizes to an empty list."""
    request = ResponsesRequest(model="test-model", input=iter(()))
    assert request.input == []

View File

@@ -0,0 +1,369 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enable-auto-tool-choice",
"--tool-call-parser",
"hermes",
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("return_token_ids", [True, False, None])
async def test_basic_completion_with_emoji(server, return_token_ids: bool | None):
    """Test basic completion with emoji to verify token_ids field.

    Parametrized over return_token_ids True/False/None: when falsy (False,
    or None meaning the flag is omitted entirely), neither token_ids nor
    prompt_token_ids may be populated; when True, both must be lists whose
    decoded text matches the prompt/completion.
    """
    extra_body = None
    if return_token_ids is not None:
        extra_body = {"return_token_ids": return_token_ids}
    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Complete this sentence with emojis: I love coding 🚀",
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body=extra_body,
        )
        # Check the raw response to see the structure
        completion_dict = completion.model_dump()
        # Verify prompt_token_ids field is present in the completion response
        # (the key exists on the choice even when its value is None)
        assert "prompt_token_ids" in completion_dict["choices"][0]
        if not return_token_ids:
            # If return_token_ids is False (or omitted via None), token_ids
            # should not be present
            assert completion_dict["choices"][0].get("token_ids") is None
            assert completion_dict["choices"][0].get("prompt_token_ids") is None
            # Skip further checks
            return
        assert isinstance(completion.choices[0].prompt_token_ids, list)
        # Check against the expected prompt token IDs
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        encoded_tokens = tokenizer.encode(
            "Complete this sentence with emojis: I love coding 🚀"
        )
        # Check that encoded_tokens is a subsequence of prompt_token_ids
        # (the server may wrap the raw prompt with extra special tokens)
        assert any(
            completion.choices[0].prompt_token_ids[i : i + len(encoded_tokens)]
            == encoded_tokens
            for i in range(
                len(completion.choices[0].prompt_token_ids) - len(encoded_tokens) + 1
            )
        )
        # Verify token_ids field is present in the choice
        assert completion.choices[0].token_ids is not None
        assert isinstance(completion.choices[0].token_ids, list)
        assert len(completion.choices[0].token_ids) > 0
        # Verify decoding works correctly
        decoded_text = tokenizer.decode(completion.choices[0].token_ids)
        # The decoded text should contain a <|im_end|> at the end, so only
        # a prefix match against the returned text is asserted
        assert decoded_text.startswith(completion.choices[0].text)
        # Test without return_token_ids (should be None)
        completion_without = await client.completions.create(
            model=MODEL_NAME,
            prompt="Complete this sentence with emojis: I love coding 🚀",
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": False},
        )
        completion_without_dict = completion_without.model_dump()
        assert completion_without_dict["choices"][0].get("token_ids") is None
        assert completion_without_dict.get("prompt_token_ids") is None
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
    """Test chat completion with tool use (get_weather function).

    With return_token_ids enabled, both prompt_token_ids and per-choice
    token_ids must be returned and decode back to the chat-template text;
    with it disabled, both fields must be None.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "The unit of temperature",
                        },
                    },
                    "required": ["location"],
                },
            },
        }
    ]
    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What's the weather like in Paris?"},
            ],
            tools=tools,
            tool_choice="auto",
            max_tokens=100,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        # Verify token_ids field is present in choices
        assert response.choices[0].token_ids is not None
        assert isinstance(response.choices[0].token_ids, list)
        # Verify prompt_token_ids field is present
        assert response.prompt_token_ids is not None
        assert isinstance(response.prompt_token_ids, list)
        # Verify the prompt texts and response texts: the decoded prompt must
        # reflect the chat template's <|im_start|>/<|im_end|> framing
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        prompt_text = tokenizer.decode(response.prompt_token_ids)
        assert prompt_text.startswith(
            "<|im_start|>system\nYou are a helpful assistant."
        )
        assert prompt_text.endswith(
            "What's the weather like in Paris?<|im_end|>\n<|im_start|>assistant\n"
        )
        # With temperature=0 the model is expected to emit a <tool_call> block
        response_text = tokenizer.decode(response.choices[0].token_ids)
        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
        assert response_text.endswith("</tool_call><|im_end|>")
        # If tool call was made, verify the response structure
        if response.choices[0].message.tool_calls:
            assert len(response.choices[0].message.tool_calls) > 0
            tool_call = response.choices[0].message.tool_calls[0]
            assert tool_call.function.name == "get_weather"
        # Test without return_token_ids
        response_without = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What's the weather like in Paris?"},
            ],
            tools=tools,
            tool_choice="auto",
            max_tokens=100,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": False},
        )
        assert response_without.choices[0].token_ids is None
        assert response_without.prompt_token_ids is None
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
    """
    Test that token_ids align with prompt_logprobs and
    logprobs when return_tokens_as_token_ids is enabled.

    Covers both the non-streaming path (with echo=True, so logprobs span
    prompt + completion) and the streaming path (prompt_token_ids only on
    the first chunk).
    """
    async with server.get_async_client() as client:
        # Test with both return_token_ids and return_tokens_as_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world! How are you today?",
            max_tokens=20,
            temperature=0,
            echo=True,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True,
                "prompt_logprobs": 1,
            },
        )
        # Verify all fields are present
        assert completion.choices[0].token_ids is not None
        assert completion.choices[0].prompt_token_ids is not None
        assert completion.choices[0].prompt_logprobs is not None
        assert completion.choices[0].logprobs is not None
        # Extract token IDs from logprobs
        # (when return_tokens_as_token_ids is True)
        logprobs_token_ids = []
        for token_str in completion.choices[0].logprobs.tokens:
            # Token format is "token_id:12345" when
            # return_tokens_as_token_ids is True
            if token_str.startswith("token_id:"):
                token_id = int(token_str.removeprefix("token_id:"))
                logprobs_token_ids.append(token_id)
        # When echo=True, the logprobs include both prompt and response tokens
        # The token_ids field should match the suffix of response portion
        # The prompt_token_ids should match the prompt portion
        assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(completion.choices[0].token_ids)
        assert (
            logprobs_token_ids[-response_token_ids_length:]
            == completion.choices[0].token_ids
        )
        # Verify tokenizer consistency
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            # (extra special tokens may surround it, hence substring check)
            assert "Hello, world" in prompt_text
        # Decode response tokens
        if completion.choices[0].token_ids:
            response_text = tokenizer.decode(completion.choices[0].token_ids)
            assert completion.choices[0].text.endswith(response_text)
        # Test streaming mode
        stream = await client.completions.create(
            model=MODEL_NAME,
            prompt="Tell me a short fact about Python:",
            max_tokens=30,
            temperature=0,
            stream=True,
            echo=False,
            logprobs=1,
            extra_body={"return_token_ids": True, "return_tokens_as_token_ids": True},
        )
        # Collect streamed tokens; prompt_token_ids only arrives on the
        # first chunk, while token_ids accumulate across all chunks
        streamed_prompt_token_ids = []
        streamed_token_ids = []
        streamed_logprob_token_ids = []
        first_chunk = True
        async for chunk in stream:
            for token_str in chunk.choices[0].logprobs.tokens:
                # Token format is "token_id:12345" when
                # return_tokens_as_token_ids is True
                if token_str.startswith("token_id:"):
                    token_id = int(token_str.removeprefix("token_id:"))
                    streamed_logprob_token_ids.append(token_id)
            if first_chunk:
                streamed_prompt_token_ids = chunk.choices[0].prompt_token_ids
                first_chunk = False
            streamed_token_ids += chunk.choices[0].token_ids
        # Verify we collected some tokens and first chunk had prompt_token_ids
        assert len(streamed_prompt_token_ids) > 0
        assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
    """Test chat completion with emojis to verify token_ids handling.

    Exercises both the non-streaming path (token_ids must decode to the
    message content plus the <|im_end|> terminator) and the streaming path
    (prompt_token_ids only on the first chunk, token_ids accumulated
    across chunks).
    """
    chat_messages = [
        {"role": "system", "content": "You like to use emojis in your responses."},
        {"role": "user", "content": "Repeat after me: I love cats 🐱"},
    ]
    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        # Verify token_ids are present
        response_dict = response.model_dump()
        assert response.choices[0].token_ids is not None
        assert "prompt_token_ids" in response_dict
        # Verify the response contains the expected fields
        assert response.choices[0].message.content is not None
        # Decode token_ids and verify consistency with the chat template's
        # <|im_start|>/<|im_end|> framing
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
        assert decoded_prompt.startswith(
            "<|im_start|>system\nYou like to use emojis in your responses."
        )
        assert decoded_prompt.endswith(
            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n"
        )
        decoded_response = tokenizer.decode(response.choices[0].token_ids)
        # The content should match the response text
        # except the ending <|im_end|>
        assert decoded_response == response.choices[0].message.content + "<|im_end|>"
        # Test with streaming
        stream = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            stream=True,
            extra_body={"return_token_ids": True},
        )
        collected_content = ""
        collected_token_ids = []
        first_chunk = True
        async for chunk in stream:
            if first_chunk:
                # Only the first chunk carries prompt_token_ids, and it must
                # decode identically to the non-streaming prompt
                assert chunk.prompt_token_ids is not None
                assert isinstance(chunk.prompt_token_ids, list)
                # Check the prompt_token_ids match the initial prompt
                decoded_prompt_stream = tokenizer.decode(chunk.prompt_token_ids)
                assert decoded_prompt_stream == decoded_prompt
                first_chunk = False
            else:
                chunk_dump = chunk.model_dump()
                assert "prompt_token_ids" not in chunk_dump, (
                    "Subsequent chunks should not have prompt_token_ids"
                )
            if chunk.choices:
                if chunk.choices[0].delta.content:
                    collected_content += chunk.choices[0].delta.content
                # token_ids may not present in all chunks
                choice_dump = chunk.choices[0].model_dump()
                if "token_ids" in choice_dump:
                    collected_token_ids.extend(chunk.choices[0].token_ids)
        # Verify we got response and token_ids
        assert len(collected_content) > 0
        assert len(collected_token_ids) > 0
        # Verify token_ids decode properly
        decoded_response = tokenizer.decode(collected_token_ids)
        assert decoded_response == collected_content + "<|im_end|>"

View File

@@ -0,0 +1,123 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Separate these tests out from test_completion and test_chat, because they
# require launching a second server with a different flag. Running both servers
# at the same time on a single node will OOM.
import pytest
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def default_server_args(qwen3_lora_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--enforce-eager",
# lora config
"--enable-lora",
"--lora-modules",
f"qwen3-lora={qwen3_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
]
@pytest.fixture(scope="module")
def server_fixture(request, default_server_args): # noqa: F811
use_server_flag = request.param
if use_server_flag:
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield (remote_server, True)
else:
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield (remote_server, False)
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_completion_return_tokens_as_token_ids_completion(server_fixture):
    """Completion tokens are reported as "token_id:<int>" strings whether the
    feature is enabled via the server flag or the per-request option."""
    server, use_server_flag = server_fixture
    request_args = {}
    if not use_server_flag:
        request_args["return_tokens_as_token_ids"] = True
    async with server.get_async_client() as client:
        completion = await client.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
            # character across multiple tokens: 🎉 is [28705, 31862] for the
            # Zephyr tokenizer (NOTE(review): MODEL_NAME here is Qwen3 —
            # the specific ids look stale, but multi-token emoji splitting
            # is still the behavior under test; confirm against tokenizer)
            prompt="Say 'Hello, world! 🎉'",
            echo=True,
            temperature=0,
            max_tokens=10,
            logprobs=1,
            extra_body=request_args,
        )
        text = completion.choices[0].text
        token_strs = completion.choices[0].logprobs.tokens
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Check that the token representations are consistent between raw
        # tokens and top_logprobs
        # Slice off the first one, because there's no scoring associated
        # with BOS
        top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
        top_logprob_keys = [
            next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
        ]
        assert token_strs[1:] == top_logprob_keys
        # Check that decoding the tokens gives the expected text
        tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
        assert text == tokenizer.decode(tokens, skip_special_tokens=True)
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
    """Chat responses should expose tokens in ``token_id:<int>`` form,
    whether enabled via the server flag or the per-request option."""
    server, use_server_flag = server_fixture
    extra_body = {} if use_server_flag else {"return_tokens_as_token_ids": True}
    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
            # character across multiple tokens: 🎉 is [28705, 31862] for the
            # Zephyr tokenizer
            messages=[
                {
                    "role": "system",
                    "content": "You like to respond in only emojis, like 🎉",
                },
                {"role": "user", "content": "Please write some emojis: 🐱🐶🎉"},
            ],
            temperature=0,
            max_tokens=8,
            logprobs=True,
            extra_body=extra_body,
        )
        text = response.choices[0].message.content
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        # Decoding the "token_id:"-prefixed tokens must reproduce the text.
        token_ids = [
            int(entry.token.removeprefix("token_id:"))
            for entry in response.choices[0].logprobs.content
        ]
        assert tokenizer.decode(token_ids, skip_special_tokens=True) == text

View File

@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
from typing import Any, NamedTuple
import openai # use the official client for correctness check
import pytest
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
API_KEY = "abc-123"
ERROR_API_KEY = "abc"
ROOT_PATH = "llm"
@pytest.fixture(scope="module")
def server():
    """Launch a server behind a custom root path with API-key auth enabled."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        "--root-path",  # use --root-path=/llm for testing
        "/" + ROOT_PATH,
    ]
    env = dict(os.environ)
    env["VLLM_API_KEY"] = API_KEY
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env) as remote_server:
        yield remote_server
class TestCase(NamedTuple):
    """One auth/root-path scenario for the chat-session test below."""

    model_name: str
    # URL path segments appended to the server root, e.g. ["llm", "v1"].
    base_url: list[str]
    api_key: str
    # Exception type the client is expected to raise, or None for success.
    expected_error: Any
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        # Wrong API key at the default root -> auth error.
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        # Wrong API key behind the custom root path -> auth error.
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        # Correct API key at the default root -> success.
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=API_KEY,
            expected_error=None,
        ),
        # Correct API key behind the custom root path -> success.
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=API_KEY,
            expected_error=None,
        ),
    ],
)
async def test_chat_session_root_path_with_api_key(
    server: RemoteOpenAIServer, test_case: TestCase
):
    """Chat completions must succeed (or fail with an auth error) both at the
    default root and behind --root-path, depending on the API key supplied."""
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # Expect no exception by default; swap in pytest.raises for error cases.
    ctx = contextlib.nullcontext()
    if test_case.expected_error is not None:
        ctx = pytest.raises(test_case.expected_error)
    with ctx:
        client = openai.AsyncOpenAI(
            api_key=test_case.api_key,
            base_url=server.url_for(*test_case.base_url),
            max_retries=0,
        )
        chat_completion = await client.chat.completions.create(
            model=test_case.model_name,
            messages=[
                {"role": "user", "content": "tell me a common saying"},
                {"role": "assistant", "content": saying},
            ],
            # Continue the assistant's partial message rather than starting a
            # fresh generation turn.
            extra_body={"continue_final_message": True, "add_generation_prompt": False},
        )
        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "stop"
        message = choice.message
        assert len(message.content) > 0
        assert message.role == "assistant"

View File

@@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import subprocess
import tempfile
import pytest
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
# ruff: noqa: E501
INPUT_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INVALID_INPUT_BATCH = (
'{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INPUT_EMBEDDING_BATCH = (
'{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
'{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
'{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
)
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
def test_empty_file():
    """An empty input batch should exit cleanly and produce empty output."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write("")
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/multilingual-e5-small",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        assert output_file.read().strip() == ""
def test_completions():
    """A chat-completion batch runs end-to-end and yields schema-valid output."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            MODEL_NAME,
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        # Every output line must conform to the OpenAI batch schema;
        # validation raises if the schema is wrong.
        for line in output_file.read().strip().split("\n"):
            BatchRequestOutput.model_validate_json(line)
def test_completions_invalid_input():
    """
    Ensure that we fail when the input doesn't conform to the openai api.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INVALID_INPUT_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            MODEL_NAME,
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        # Malformed request lines must abort the run with a non-zero exit.
        assert proc.returncode != 0, f"{proc=}"
def test_embeddings():
    """An embeddings batch runs end-to-end and yields schema-valid output."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_EMBEDDING_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/multilingual-e5-small",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        # Every output line must conform to the OpenAI batch schema;
        # validation raises if the schema is wrong.
        for line in output_file.read().strip().split("\n"):
            BatchRequestOutput.model_validate_json(line)
@pytest.mark.parametrize("input_batch", [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
def test_score(input_batch):
    """Score/rerank batches complete without per-request errors."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(input_batch)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "BAAI/bge-reranker-v2-m3",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        for line in output_file.read().strip().split("\n"):
            # Each output line must conform to the OpenAI batch schema;
            # validation raises if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)
            # ...and each individual response must have succeeded.
            parsed = json.loads(line)
            assert isinstance(parsed, dict)
            assert parsed["error"] is None
def test_reasoning_parser():
    """
    Test that reasoning_parser parameter works correctly in run_batch.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_REASONING_BATCH)
        input_file.flush()
        cmd = [
            "vllm",
            "run-batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "Qwen/Qwen3-0.6B",
            "--reasoning-parser",
            "qwen3",
        ]
        proc = subprocess.Popen(cmd)
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"
        for line in output_file.read().strip().split("\n"):
            # Each output line must conform to the OpenAI batch schema;
            # validation raises if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)
            # ...and each individual response must have succeeded.
            parsed = json.loads(line)
            assert isinstance(parsed, dict)
            assert parsed["error"] is None
            # The parser must have populated a non-empty reasoning field.
            reasoning = parsed["response"]["body"]["choices"][0]["message"][
                "reasoning"
            ]
            assert reasoning is not None
            assert len(reasoning) > 0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers.mistral import MistralTokenizer
@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing."""
    model_config = Mock(spec=ModelConfig)
    model_config.max_model_len = 32768
    models = Mock(spec=OpenAIServingModels)
    models.model_config = model_config
    models.input_processor = Mock()
    models.io_processor = Mock()
    return OpenAIServing(
        engine_client=Mock(),
        models=models,
        request_logger=None,
    )
@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
    serving: OpenAIServing,
):
    """The async Mistral chat-template wrapper must off-load the blocking
    tokenizer call so the asyncio event loop stays responsive meanwhile."""
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template
    task = serving._apply_mistral_chat_template_async(
        tokenizer=mock_tokenizer, messages=[], chat_template=None, tools=[]
    )
    # Ensure the event loop is not blocked: while the 2s tokenization runs,
    # a bare `await asyncio.sleep(0)` should return almost instantly.
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start
        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1
        await asyncio.sleep(0.1)
    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, "Event loop blocked during tokenization"

View File

@@ -0,0 +1,129 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from unittest.mock import MagicMock
import pytest
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (
ErrorResponse,
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
)
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
LORA_UNLOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' removed successfully."
)
async def _async_serving_models_init() -> OpenAIServingModels:
    """Build an OpenAIServingModels instance backed by mocked engine/config."""
    mock_model_config = MagicMock(spec=ModelConfig)
    # Set max_model_len explicitly so attribute access does not fail.
    mock_model_config.max_model_len = 2048
    mock_engine_client = MagicMock(spec=EngineClient)
    mock_engine_client.model_config = mock_model_config
    mock_engine_client.input_processor = MagicMock()
    mock_engine_client.io_processor = MagicMock()
    serving_models = OpenAIServingModels(
        engine_client=mock_engine_client,
        base_model_paths=BASE_MODEL_PATHS,
        lora_modules=None,
    )
    await serving_models.init_static_loras()
    return serving_models
@pytest.mark.asyncio
async def test_serving_model_name():
    """model_name() returns the base model, or the LoRA adapter's name."""
    serving_models = await _async_serving_models_init()
    assert serving_models.model_name(None) == MODEL_NAME
    lora = LoRARequest(
        lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1
    )
    assert serving_models.model_name(lora) == lora.lora_name
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
    """Loading a new adapter succeeds and registers it by name."""
    serving_models = await _async_serving_models_init()
    load_request = LoadLoRAAdapterRequest(
        lora_name="adapter", lora_path="/path/to/adapter2"
    )
    response = await serving_models.load_lora_adapter(load_request)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter")
    assert len(serving_models.lora_requests) == 1
    assert "adapter" in serving_models.lora_requests
    assert serving_models.lora_requests["adapter"].lora_name == "adapter"
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
    """Empty adapter name/path must be rejected as invalid user input."""
    serving_models = await _async_serving_models_init()
    response = await serving_models.load_lora_adapter(
        LoadLoRAAdapterRequest(lora_name="", lora_path="")
    )
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
    """Loading the same adapter name twice must fail the second time."""
    serving_models = await _async_serving_models_init()
    first = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    response = await serving_models.load_lora_adapter(first)
    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
    assert len(serving_models.lora_requests) == 1

    duplicate = LoadLoRAAdapterRequest(
        lora_name="adapter1", lora_path="/path/to/adapter1"
    )
    response = await serving_models.load_lora_adapter(duplicate)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
    # The registry must be unchanged after the rejected duplicate.
    assert len(serving_models.lora_requests) == 1
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
    """An adapter that was loaded can be unloaded again."""
    serving_models = await _async_serving_models_init()
    await serving_models.load_lora_adapter(
        LoadLoRAAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1")
    )
    assert len(serving_models.lora_requests) == 1
    response = await serving_models.unload_lora_adapter(
        UnloadLoRAAdapterRequest(lora_name="adapter1")
    )
    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(lora_name="adapter1")
    assert len(serving_models.lora_requests) == 0
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
    """Unloading with an empty name must be rejected as invalid input."""
    serving_models = await _async_serving_models_init()
    response = await serving_models.unload_lora_adapter(
        UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
    )
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InvalidUserInput"
    assert response.error.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
    """Unloading an unknown adapter must yield a NotFound error."""
    serving_models = await _async_serving_models_init()
    response = await serving_models.unload_lora_adapter(
        UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
    )
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "NotFoundError"
    assert response.error.code == HTTPStatus.NOT_FOUND

View File

@@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import AsyncExitStack
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
from openai.types.responses.tool import (
CodeInterpreterContainerCodeInterpreterToolAuto,
LocalShell,
Mcp,
Tool,
)
from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
from vllm.entrypoints.openai.serving_responses import (
OpenAIServingResponses,
_extract_allowed_tools_from_mcp_requests,
extract_tool_types,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.inputs.data import TokensPrompt
class MockConversationContext(ConversationContext):
    """Mock conversation context for testing.

    Records whether/how ``init_tool_sessions`` was invoked; every other
    ConversationContext hook is a no-op stub.
    """

    def __init__(self):
        # Set by init_tool_sessions(); inspected by the tests.
        self.init_tool_sessions_called = False
        self.init_tool_sessions_args = None
        self.init_tool_sessions_kwargs = None

    def append_output(self, output) -> None:
        pass

    def append_tool_output(self, output) -> None:
        pass

    async def call_tool(self):
        return []

    def need_builtin_tool_call(self) -> bool:
        return False

    def render_for_completion(self):
        return []

    async def init_tool_sessions(self, tool_server, exit_stack, request_id, mcp_tools):
        # Record the call and its arguments for later assertions.
        self.init_tool_sessions_called = True
        self.init_tool_sessions_args = (tool_server, exit_stack, request_id, mcp_tools)

    async def cleanup_session(self) -> None:
        pass
@pytest.fixture
def mock_serving_responses():
    """Create a mock OpenAIServingResponses instance"""
    responses_mock = MagicMock(spec=OpenAIServingResponses)
    responses_mock.tool_server = MagicMock(spec=ToolServer)
    return responses_mock
@pytest.fixture
def mock_context():
    """Create a mock conversation context (fresh instance per test)."""
    return MockConversationContext()
@pytest.fixture
def mock_exit_stack():
    """Create a mock async exit stack (no real contexts are entered)."""
    return MagicMock(spec=AsyncExitStack)
def test_extract_tool_types(monkeypatch: pytest.MonkeyPatch) -> None:
    """extract_tool_types collects tool `type` strings; MCP tools count only
    when their server_label is allow-listed via the
    VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS environment variable."""
    tools: list[Tool] = []
    assert extract_tool_types(tools) == set()
    tools.append(LocalShell(type="local_shell"))
    assert extract_tool_types(tools) == {"local_shell"}
    tools.append(CodeInterpreterContainerCodeInterpreterToolAuto(type="auto"))
    assert extract_tool_types(tools) == {"local_shell", "auto"}
    tools.extend(
        [
            Mcp(type="mcp", server_label="random", server_url=""),
            Mcp(type="mcp", server_label="container", server_url=""),
            Mcp(type="mcp", server_label="code_interpreter", server_url=""),
            Mcp(type="mcp", server_label="web_search_preview", server_url=""),
        ]
    )
    # When envs.VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS is not set,
    # mcp tool types are all ignored.
    assert extract_tool_types(tools) == {"local_shell", "auto"}
    # container is allowed, it would be extracted
    monkeypatch.setenv("VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "container")
    assert extract_tool_types(tools) == {"local_shell", "auto", "container"}
    # code_interpreter and web_search_preview are allowed,
    # they would be extracted
    monkeypatch.setenv(
        "VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS", "code_interpreter,web_search_preview"
    )
    assert extract_tool_types(tools) == {
        "local_shell",
        "auto",
        "code_interpreter",
        "web_search_preview",
    }
class TestInitializeToolSessions:
    """Test class for _initialize_tool_sessions method"""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Create a real OpenAIServingResponses instance for testing"""
        # Create minimal mocks for required dependencies
        engine_client = MagicMock()
        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}
        engine_client.model_config = model_config
        engine_client.input_processor = MagicMock()
        engine_client.io_processor = MagicMock()
        models = MagicMock()
        tool_server = MagicMock(spec=ToolServer)
        # Create the actual instance
        instance = OpenAIServingResponses(
            engine_client=engine_client,
            models=models,
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
            tool_server=tool_server,
        )
        return instance

    @pytest.mark.asyncio
    async def test_initialize_tool_sessions(
        self, serving_responses_instance, mock_context, mock_exit_stack
    ):
        """Tool sessions are initialized only when the request carries tools."""
        # With no tools, init_tool_sessions must NOT be called.
        request = ResponsesRequest(input="test input", tools=[])
        await serving_responses_instance._initialize_tool_sessions(
            request, mock_context, mock_exit_stack
        )
        assert mock_context.init_tool_sessions_called is False
        # Create only MCP tools
        tools = [
            {"type": "web_search_preview"},
            {"type": "code_interpreter", "container": {"type": "auto"}},
        ]
        request = ResponsesRequest(input="test input", tools=tools)
        # Call the method
        await serving_responses_instance._initialize_tool_sessions(
            request, mock_context, mock_exit_stack
        )
        # Verify that init_tool_sessions was called
        assert mock_context.init_tool_sessions_called

    def test_validate_create_responses_input(
        self, serving_responses_instance, mock_context, mock_exit_stack
    ):
        """Supplying both previous_input_messages and previous_response_id is
        rejected as an invalid request."""
        request = ResponsesRequest(
            input="test input",
            previous_input_messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "What is my horoscope? I am an Aquarius.",
                        }
                    ],
                }
            ],
            previous_response_id="lol",
        )
        error = serving_responses_instance._validate_create_responses_input(request)
        assert error is not None
        assert error.error.type == "invalid_request_error"
class TestValidateGeneratorInput:
    """Test class for _validate_generator_input method"""

    @pytest_asyncio.fixture
    async def serving_responses_instance(self):
        """Create a real OpenAIServingResponses instance for testing"""
        # Create minimal mocks for required dependencies
        engine_client = MagicMock()
        model_config = MagicMock()
        model_config.hf_config.model_type = "test"
        model_config.get_diff_sampling_param.return_value = {}
        engine_client.model_config = model_config
        engine_client.input_processor = MagicMock()
        engine_client.io_processor = MagicMock()
        models = MagicMock()
        # Create the actual instance
        instance = OpenAIServingResponses(
            engine_client=engine_client,
            models=models,
            request_logger=None,
            chat_template=None,
            chat_template_content_format="auto",
        )
        # Set max_model_len for testing
        instance.max_model_len = 100
        return instance

    def test_validate_generator_input(self, serving_responses_instance):
        """Prompts within max_model_len pass; longer prompts are rejected."""
        # Create an engine prompt with valid length (less than max_model_len)
        valid_prompt_token_ids = list(range(5))  # 5 tokens < 100 max_model_len
        engine_prompt = TokensPrompt(prompt_token_ids=valid_prompt_token_ids)
        # Call the method
        result = serving_responses_instance._validate_generator_input(engine_prompt)
        # Should return None for valid input
        assert result is None
        # create an invalid engine prompt
        invalid_prompt_token_ids = list(range(200))  # 200 tokens > 100 max_model_len
        engine_prompt = TokensPrompt(prompt_token_ids=invalid_prompt_token_ids)
        # Call the method
        result = serving_responses_instance._validate_generator_input(engine_prompt)
        # Should return an ErrorResponse
        assert result is not None
        assert isinstance(result, ErrorResponse)
class TestExtractAllowedToolsFromMcpRequests:
    """Test class for _extract_allowed_tools_from_mcp_requests function"""

    def test_extract_allowed_tools_basic_formats(self):
        """Test extraction with list format, object format, and None."""
        from openai.types.responses.tool import McpAllowedToolsMcpToolFilter

        tools = [
            # List format
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["tool1", "tool2"],
            ),
            # Object format
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=McpAllowedToolsMcpToolFilter(
                    tool_names=["tool3", "tool4"]
                ),
            ),
            # None (no filter)
            Mcp(
                type="mcp",
                server_label="server3",
                allowed_tools=None,
            ),
        ]
        result = _extract_allowed_tools_from_mcp_requests(tools)
        # Each server label maps to its (normalized) allow-list.
        assert result == {
            "server1": ["tool1", "tool2"],
            "server2": ["tool3", "tool4"],
            "server3": None,
        }

    def test_extract_allowed_tools_star_normalization(self):
        """Test that '*' wildcard is normalized to None (select all tools).

        This is the key test requested by reviewers to explicitly demonstrate
        that the "*" select-all scenario is handled correctly.
        """
        from openai.types.responses.tool import McpAllowedToolsMcpToolFilter

        tools = [
            # Star in list format
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["*"],
            ),
            # Star mixed with other tools in list
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=["tool1", "*"],
            ),
            # Star in object format
            Mcp(
                type="mcp",
                server_label="server3",
                allowed_tools=McpAllowedToolsMcpToolFilter(tool_names=["*"]),
            ),
        ]
        result = _extract_allowed_tools_from_mcp_requests(tools)
        # All should be normalized to None (allows all tools)
        assert result == {
            "server1": None,
            "server2": None,
            "server3": None,
        }

    def test_extract_allowed_tools_filters_non_mcp(self):
        """Test that non-MCP tools are ignored during extraction."""
        tools = [
            Mcp(
                type="mcp",
                server_label="server1",
                allowed_tools=["tool1"],
            ),
            LocalShell(type="local_shell"),  # Non-MCP tool should be ignored
            Mcp(
                type="mcp",
                server_label="server2",
                allowed_tools=["tool2"],
            ),
        ]
        result = _extract_allowed_tools_from_mcp_requests(tools)
        # Non-MCP tools should be ignored
        assert result == {
            "server1": ["tool1"],
            "server2": ["tool2"],
        }

View File

@@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import httpx
import pytest
import pytest_asyncio
from transformers import AutoTokenizer
from vllm.config import ModelConfig
from vllm.v1.engine.detokenizer import check_stop_strings
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
GEN_ENDPOINT = "/inference/v1/generate"
def get_vocab_size(model_name):
    """Return the vocabulary size reported by the model's config."""
    return ModelConfig(
        model=model_name,
        seed=0,
        dtype="bfloat16",
    ).get_vocab_size()
@pytest.fixture(scope="module")
def tokenizer():
    """HF tokenizer matching the model served in this module."""
    return AutoTokenizer.from_pretrained(MODEL_NAME)
@pytest.fixture(scope="module")
def messages():
    """A simple chat conversation reused across the tests."""
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How many countries are in the EU?"},
    ]
@pytest.fixture(scope="module")
def server(request):
    """Start a server; indirect parametrization may append extra CLI args."""
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]
    extra_args = getattr(request, "param", None)
    if extra_args is not None:
        if isinstance(extra_args, (list, tuple)):
            args = args + list(extra_args)
        else:
            args = args + [str(extra_args)]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Async HTTP client pointed at the server (UDS-aware)."""
    transport = None
    if server.uds:
        transport = httpx.AsyncHTTPTransport(uds=server.uds)
    async with httpx.AsyncClient(
        transport=transport,
        base_url=server.url_root,
        timeout=600,
        headers={"Authorization": f"Bearer {server.DUMMY_API_KEY}"},
    ) as c:
        yield c
@pytest.mark.asyncio
async def test_generate_endpoint(client):
    """The generate endpoint accepts raw token ids and returns choices."""
    resp = await client.post(
        GEN_ENDPOINT,
        json={
            "model": MODEL_NAME,
            "token_ids": [1, 2, 3],
            "sampling_params": {"max_tokens": 5},
            "stream": False,
        },
    )
    resp.raise_for_status()
    assert "choices" in resp.json()
@pytest.mark.asyncio
async def test_same_response_as_chat_completions(client, tokenizer, messages):
    """Generating from pre-tokenized input must produce the same text as
    /v1/chat/completions for the same conversation and sampling settings."""
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    for ignore_eos in [True, False]:
        payload = {
            "model": MODEL_NAME,
            "token_ids": token_ids,
            "sampling_params": {
                "max_tokens": 24,
                "temperature": 0.0,
                # NOTE coordinator will set this to skip detokenization
                "detokenize": False,
                "ignore_eos": ignore_eos,
            },
            "stream": False,
        }
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_data = generate_resp.json()
        # Decode client-side, since the server skipped detokenization.
        generate_res = tokenizer.decode(
            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
        )
        payload = {
            "model": MODEL_NAME,
            "messages": messages,
            "max_tokens": 24,
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
            "chat_template_kwargs": dict(enable_thinking=False),
        }
        completions_resp = await client.post("/v1/chat/completions", json=payload)
        completions_data = completions_resp.json()
        completions_res = completions_data["choices"][0]["message"]["content"]
        # Greedy decoding on both paths must yield identical text.
        assert generate_res == completions_res
@pytest.mark.asyncio
async def test_stop_string_workflow(client, tokenizer, messages):
    """Stop strings are rejected in detokenize=False mode; applying them
    client-side must match what /v1/chat/completions returns with `stop`."""
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
            # stop strings are only supported when detokenize is True.
            "stop": ["27 member"],
        },
        # TODO stream test is much more interesting
        "stream": False,
    }
    # Server must reject stop strings when detokenization is disabled.
    with pytest.raises(httpx.HTTPStatusError):
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_resp.raise_for_status()
    payload["sampling_params"]["stop"] = None
    generate_resp = await client.post(
        GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
    )
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )
    # NOTE This is under the responsibility of the coordinator
    # stop_checker = StopChecker(
    #     max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
    # )
    stop_str, truncate_to = check_stop_strings(
        generate_res, len(generate_res), ["27 member"], False
    )
    assert stop_str == "27 member"
    # abort request that hit stop string (requires tokens-only mode)
    # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]}) # noqa: E501
    # res.raise_for_status()
    # Apply the stop-string truncation client-side.
    generate_res = generate_res[:truncate_to]
    # Get stop_str response from chat completions
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "stop": ["27 member"],
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=payload)
    completions_data = completions_resp.json()
    completions_res = completions_data["choices"][0]["message"]["content"]
    # Client-side truncation must match the server-side stop handling.
    assert generate_res == completions_res
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "server",
    [
        [
            "--enable-lora",
            "--lora-modules",
            "Alice=charent/self_cognition_Alice",
            "Bob=charent/self_cognition_Bob",
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    ],
    indirect=True,
)
async def test_generate_with_lora_adapter(client, tokenizer, messages):
    """Generate through a LoRA adapter and cross-check with chat completions."""
    # The adapters configured above must show up in the model list.
    models_resp = await client.get("/v1/models")
    models_resp.raise_for_status()
    listed_ids = {entry["id"] for entry in models_resp.json().get("data", [])}
    assert {"Alice", "Bob"} <= listed_ids

    # Smoke test: address the adapter by name with raw token ids.
    smoke_request = {
        "model": "Alice",
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    smoke_resp = await client.post(GEN_ENDPOINT, json=smoke_request)
    smoke_resp.raise_for_status()
    assert "choices" in smoke_resp.json()

    prompt_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    gen_request = {
        "model": "Alice",
        "token_ids": prompt_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
        },
        "stream": False,
    }
    gen_resp = await client.post(GEN_ENDPOINT, json=gen_request)
    gen_text = tokenizer.decode(
        gen_resp.json()["choices"][0]["token_ids"], skip_special_tokens=True
    )

    chat_request = {
        "model": "Alice",
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    chat_resp = await client.post("/v1/chat/completions", json=chat_request)
    chat_text = chat_resp.json()["choices"][0]["message"]["content"]
    assert gen_text == chat_text

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import signal
import subprocess
import sys
import time
import openai
import pytest
from vllm.utils.network_utils import get_open_port
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
@pytest.mark.asyncio
async def test_shutdown_on_engine_failure():
    """Verify that API returns connection error when server process is killed.
    Starts a vLLM server, kills it to simulate a crash, then verifies that
    subsequent API calls fail appropriately.
    """
    port = get_open_port()
    proc = subprocess.Popen(
        [
            # dtype, max-len etc set so that this can run in CI
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.api_server",
            "--model",
            MODEL_NAME,
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "128",
            "--enforce-eager",
            "--port",
            str(port),
            "--gpu-memory-utilization",
            "0.05",
            "--max-num-seqs",
            "2",
            "--disable-frontend-multiprocessing",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Shield the child from any Ctrl-C delivered to the test process.
        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
    )
    # Wait for server startup
    start_time = time.time()
    client = openai.AsyncOpenAI(
        base_url=f"http://localhost:{port}/v1",
        api_key="dummy",
        max_retries=0,  # fail fast so the pytest.raises check below is deterministic
        timeout=10,
    )
    # Poll until server is ready
    while time.time() - start_time < 30:
        try:
            await client.completions.create(
                model=MODEL_NAME, prompt="Hello", max_tokens=1
            )
            break
        except Exception:
            # NOTE(review): blocking sleep inside an async test; tolerable
            # here since nothing else runs on this event loop.
            time.sleep(0.5)
            if proc.poll() is not None:
                # Child already exited during startup: surface its output.
                stdout, stderr = proc.communicate(timeout=1)
                pytest.fail(
                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
                )
    else:
        # while-else: the loop timed out without reaching `break`.
        proc.terminate()
        proc.wait(timeout=5)
        pytest.fail("Server failed to start in 30 seconds")
    # Kill server to simulate crash
    proc.terminate()
    time.sleep(1)
    # Verify API calls now fail
    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
        await client.completions.create(
            model=MODEL_NAME, prompt="This should fail", max_tokens=1
        )
    return_code = proc.wait(timeout=5)
    assert return_code is not None

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import requests
from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B"
def test_sleep_mode():
    """Exercise the sleep/wake_up dev endpoints and their Prometheus gauge."""
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enable-sleep-mode",
    ]
    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
        env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"},
    ) as remote_server:
        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert response.status_code == 200
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is True
        # check sleep metrics: level-1 sleep reports weights as offloaded
        response = requests.get(remote_server.url_for("metrics"))
        assert response.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
        assert awake == 0
        assert weights_offloaded == 1
        assert discard_all == 0
        response = requests.post(remote_server.url_for("wake_up"))
        assert response.status_code == 200
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is False
        # check sleep metrics
        response = requests.get(remote_server.url_for("metrics"))
        assert response.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
        assert awake == 1
        assert weights_offloaded == 0
        assert discard_all == 0
        # test wake up with tags
        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert response.status_code == 200
        response = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["weights"]}
        )
        assert response.status_code == 200
        # Waking only the weights leaves the engine partially asleep, so
        # is_sleeping stays True until every component has been woken.
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is True
        response = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["kv_cache"]}
        )
        assert response.status_code == 200
        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is False
        # check sleep metrics
        response = requests.get(remote_server.url_for("metrics"))
        assert response.status_code == 200
        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
        assert awake == 1
        assert weights_offloaded == 0
        assert discard_all == 0
def _get_sleep_metrics_from_api(response: requests.Response):
    """Return (awake, weights_offloaded, discard_all)"""
    # Collect the gauge value for each sleep-state label; all three must appear.
    gauges = {"awake": None, "weights_offloaded": None, "discard_all": None}
    for family in text_string_to_metric_families(response.text):
        if family.name != "vllm:engine_sleep_state":
            continue
        for sample in family.samples:
            if sample.name != "vllm:engine_sleep_state":
                continue
            for label_value in sample.labels.values():
                if label_value in gauges:
                    gauges[label_value] = sample.value
    assert gauges["awake"] is not None
    assert gauges["weights_offloaded"] is not None
    assert gauges["discard_all"] is not None
    return gauges["awake"], gauges["weights_offloaded"], gauges["discard_all"]

View File

@@ -0,0 +1,342 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Sparse tensor validation in embedding APIs.
Tests verify that malicious sparse tensors are rejected before they can trigger
out-of-bounds memory writes during to_dense() operations.
"""
import base64
import io
import pytest
import torch
from vllm.entrypoints.renderer import CompletionRenderer
from vllm.multimodal.audio import AudioEmbeddingMediaIO
from vllm.multimodal.image import ImageEmbeddingMediaIO
def _encode_tensor(tensor: torch.Tensor) -> bytes:
"""Helper to encode a tensor as base64 bytes."""
buffer = io.BytesIO()
torch.save(tensor, buffer)
buffer.seek(0)
return base64.b64encode(buffer.read())
def _create_malicious_sparse_tensor() -> torch.Tensor:
"""
Create a malicious sparse COO tensor with out-of-bounds indices.
This tensor has indices that point beyond the declared shape, which would
cause an out-of-bounds write when converted to dense format without
validation.
"""
# Create a 3x3 sparse tensor but with indices pointing to (10, 10)
indices = torch.tensor([[10], [10]]) # Out of bounds for 3x3 shape
values = torch.tensor([1.0])
shape = (3, 3)
# Create sparse tensor (this will be invalid)
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
return sparse_tensor
def _create_valid_sparse_tensor() -> torch.Tensor:
"""Create a valid sparse COO tensor for baseline testing."""
indices = torch.tensor([[0, 1, 2], [0, 1, 2]])
values = torch.tensor([1.0, 2.0, 3.0])
shape = (3, 3)
sparse_tensor = torch.sparse_coo_tensor(indices, values, shape, dtype=torch.float32)
return sparse_tensor
def _create_valid_dense_tensor() -> torch.Tensor:
"""Create a valid dense tensor for baseline testing."""
return torch.randn(10, 768, dtype=torch.float32) # (seq_len, hidden_size)
class TestPromptEmbedsValidation:
    """Test sparse tensor validation in prompt embeddings (Completions API)."""

    def test_valid_dense_tensor_accepted(self, model_config):
        """Baseline: Valid dense tensors should work normally."""
        renderer = CompletionRenderer(model_config)
        dense = _create_valid_dense_tensor()
        loaded = renderer.load_prompt_embeds(_encode_tensor(dense))
        # Exactly one prompt comes back, with the embedding shape intact.
        assert len(loaded) == 1
        assert loaded[0]["prompt_embeds"].shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should load successfully."""
        media_io = ImageEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()
        # Sparse tensors are loaded as-is (remain sparse).
        loaded = media_io.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self, model_config):
        """Security: Malicious sparse tensors should be rejected."""
        renderer = CompletionRenderer(model_config)
        hostile = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            renderer.load_prompt_embeds(hostile)
        # The failure must come from sparse-tensor validation.
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_extremely_large_indices_rejected(self, model_config):
        """Security: Sparse tensors with extremely large indices should be rejected."""
        renderer = CompletionRenderer(model_config)
        huge = torch.sparse_coo_tensor(
            torch.tensor([[999999], [999999]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(huge))

    def test_negative_indices_rejected(self, model_config):
        """Security: Sparse tensors with negative indices should be rejected."""
        renderer = CompletionRenderer(model_config)
        negative = torch.sparse_coo_tensor(
            torch.tensor([[-1], [-1]]),
            torch.tensor([1.0]),
            (10, 10),
            dtype=torch.float32,
        )
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(_encode_tensor(negative))
class TestImageEmbedsValidation:
    """Test sparse tensor validation in image embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: Valid dense tensors should work normally."""
        media_io = ImageEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()
        loaded = media_io.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should load successfully."""
        # NOTE(review): this case goes through the audio loader — looks
        # copy-pasted from the audio class; confirm intent.
        media_io = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()
        loaded = media_io.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        assert loaded.shape == sparse.shape

    def test_malicious_sparse_tensor_rejected(self):
        """Security: Malicious sparse tensors should be rejected."""
        media_io = ImageEmbeddingMediaIO()
        hostile = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            media_io.load_base64("", hostile.decode("utf-8"))
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_load_bytes_validates(self):
        """Security: Validation should also work for load_bytes method."""
        media_io = ImageEmbeddingMediaIO()
        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)
        with pytest.raises((RuntimeError, ValueError)):
            media_io.load_bytes(raw.getvalue())
class TestAudioEmbedsValidation:
    """Test sparse tensor validation in audio embeddings (Chat API)."""

    def test_valid_dense_tensor_accepted(self):
        """Baseline: Valid dense tensors should work normally."""
        media_io = AudioEmbeddingMediaIO()
        dense = _create_valid_dense_tensor()
        loaded = media_io.load_base64("", _encode_tensor(dense).decode("utf-8"))
        assert loaded.shape == dense.shape

    def test_valid_sparse_tensor_accepted(self):
        """Baseline: Valid sparse tensors should be converted successfully."""
        media_io = AudioEmbeddingMediaIO()
        sparse = _create_valid_sparse_tensor()
        loaded = media_io.load_base64("", _encode_tensor(sparse).decode("utf-8"))
        # The audio loader densifies valid sparse input.
        assert loaded.is_sparse is False

    def test_malicious_sparse_tensor_rejected(self):
        """Security: Malicious sparse tensors should be rejected."""
        media_io = AudioEmbeddingMediaIO()
        hostile = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)) as exc_info:
            media_io.load_base64("", hostile.decode("utf-8"))
        message = str(exc_info.value).lower()
        assert "sparse" in message or "index" in message or "bounds" in message

    def test_load_bytes_validates(self):
        """Security: Validation should also work for load_bytes method."""
        media_io = AudioEmbeddingMediaIO()
        raw = io.BytesIO()
        torch.save(_create_malicious_sparse_tensor(), raw)
        with pytest.raises((RuntimeError, ValueError)):
            media_io.load_bytes(raw.getvalue())
class TestSparseTensorValidationIntegration:
    """
    These tests verify the complete attack chain is blocked at all entry points.
    """

    def test_attack_scenario_completions_api(self, model_config):
        """
        Simulate a complete attack through the Completions API.
        Attack scenario:
        1. Attacker crafts malicious sparse tensor
        2. Encodes it as base64
        3. Sends to /v1/completions with prompt_embeds parameter
        4. Server should reject before memory corruption occurs
        """
        renderer = CompletionRenderer(model_config)
        hostile_payload = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(hostile_payload)

    def test_attack_scenario_chat_api_image(self):
        """
        Simulate attack through Chat API with image_embeds.
        Verifies the image embeddings path is protected.
        """
        hostile_payload = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)):
            ImageEmbeddingMediaIO().load_base64("", hostile_payload.decode("utf-8"))

    def test_attack_scenario_chat_api_audio(self):
        """
        Simulate attack through Chat API with audio_embeds.
        Verifies the audio embeddings path is protected.
        """
        hostile_payload = _encode_tensor(_create_malicious_sparse_tensor())
        with pytest.raises((RuntimeError, ValueError)):
            AudioEmbeddingMediaIO().load_base64("", hostile_payload.decode("utf-8"))

    def test_multiple_valid_embeddings_in_batch(self, model_config):
        """
        Regression test: Multiple valid embeddings should still work.
        Ensures the fix doesn't break legitimate batch processing.
        """
        renderer = CompletionRenderer(model_config)
        batch = [_encode_tensor(_create_valid_dense_tensor()) for _ in range(3)]
        assert len(renderer.load_prompt_embeds(batch)) == 3

    def test_mixed_valid_and_malicious_rejected(self, model_config):
        """
        Security: Batch with one malicious tensor should be rejected.
        Even if most tensors are valid, a single malicious one should
        cause rejection of the entire batch.
        """
        renderer = CompletionRenderer(model_config)
        tainted_batch = [
            _encode_tensor(_create_valid_dense_tensor()),
            _encode_tensor(_create_malicious_sparse_tensor()),  # Malicious
            _encode_tensor(_create_valid_dense_tensor()),
        ]
        with pytest.raises((RuntimeError, ValueError)):
            renderer.load_prompt_embeds(tainted_batch)
# Pytest fixtures
@pytest.fixture
def model_config():
    """Build a minimal ModelConfig suitable for the prompt-embeds tests."""
    from vllm.config import ModelConfig

    config_kwargs = {
        "model": "facebook/opt-125m",
        "tokenizer": "facebook/opt-125m",
        "tokenizer_mode": "auto",
        "trust_remote_code": False,
        "dtype": "float32",
        "seed": 0,
        # Required for prompt embeds tests
        "enable_prompt_embeds": True,
    }
    return ModelConfig(**config_kwargs)

View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import os
import tempfile
import openai
import pytest
import pytest_asyncio
import torch.cuda
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig,
tensorize_lora_adapter,
tensorize_vllm_model,
)
from ...utils import RemoteOpenAIServer
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
LORA_PATH = "davzoku/finqa_adapter_1b"
def _cleanup():
    """Force a GC pass and release cached CUDA memory between tests."""
    gc.collect()
    torch.cuda.empty_cache()
@pytest.fixture(autouse=True)
def cleanup():
    # Runs before every test in this module to keep GPU memory headroom.
    _cleanup()
@pytest.fixture(scope="module")
def tmp_dir():
    """Module-scoped scratch directory, removed when the module's tests finish."""
    with tempfile.TemporaryDirectory() as path:
        yield path
@pytest.fixture(scope="module")
def model_uri(tmp_dir):
    """Path of the serialized model tensors inside the scratch directory."""
    yield f"{tmp_dir}/model.tensors"
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the model and its LoRA adapter once for the whole module."""
    tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
    args = EngineArgs(model=MODEL_NAME)
    tensorize_lora_adapter(LORA_PATH, tensorizer_config)
    tensorize_vllm_model(args, tensorizer_config)
    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()
    yield
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    """Serve the tensorized model (with LoRA enabled) over the OpenAI API."""
    # In this case, model_uri is a directory with a model.tensors
    # file and all necessary model artifacts, particularly a
    # HF `config.json` file. In this case, Tensorizer can infer the
    # `TensorizerConfig` so --model-loader-extra-config can be completely
    # omitted.
    ## Start OpenAI API server
    args = [
        "--load-format",
        "tensorizer",
        "--served-model-name",
        MODEL_NAME,
        "--enable-lora",
    ]
    model_dir = os.path.dirname(model_uri)
    with RemoteOpenAIServer(model_dir, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI-compatible client for the tensorizer-loaded server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """End-to-end completion against the tensorizer-loaded model.

    Checks the basic response contract: exactly one choice, at least five
    characters of generated text, a "length" finish reason, and exact
    token-usage accounting.
    """
    _cleanup()
    completion = await client.completions.create(
        model=model_name, prompt="Hello, my name is", max_tokens=5, temperature=0.0
    )
    assert completion.id is not None
    # Exactly one choice (previously asserted twice; once is enough).
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.model == MODEL_NAME
    assert len(completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11
    )

View File

@@ -0,0 +1,74 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
@pytest.fixture(scope="module")
def server():
    """Download config-only model artifacts and launch a tokens-only server.

    NOTE(review): rebinds the module-level MODEL_PATH to the directory
    returned by download_weights_from_hf; the test below relies on that
    rebound value as the served model name.
    """
    global MODEL_PATH
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        # Tokenizer/vocab/weights are skipped: the server runs with
        # --skip-tokenizer-init and --load-format dummy below.
        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"],
    )
    args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        "--skip-tokenizer-init",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Token-in-token-out round trip: send raw token ids as the prompt with
    return_token_ids enabled, and verify that the echoed prompt token ids
    decode back to the original text.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)
    async with server.get_async_client() as client:
        # Request with return_token_ids enabled; echo=True returns the prompt too.
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=20,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )
        # Verify all fields are present
        assert (
            completion.choices[0].token_ids is not None
            and 0 < len(completion.choices[0].token_ids) <= 20
        )
        assert completion.choices[0].prompt_token_ids is not None
        # Decode prompt tokens
        if completion.choices[0].prompt_token_ids:
            prompt_text = tokenizer.decode(completion.choices[0].prompt_token_ids)
            # The decoded prompt should match or close to original prompt
            assert prompt_text == text

View File

@@ -0,0 +1,355 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
import requests
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
    """Spin up one shared vLLM OpenAI server for this test module."""
    server_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        "--enable-tokenizer-info-endpoint",
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote:
        yield remote
@pytest.fixture(scope="module")
def tokenizer_name(model_name: str):
    # NOTE(review): with indirect=["tokenizer_name"] the value would normally
    # arrive via request.param; here it is taken from the parametrized
    # model_name instead (the two are always equal in this module) — confirm.
    return model_name
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI-compatible client for the module-scoped server."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """The /tokenize endpoint must agree with the HF tokenizer for plain prompts."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "vllm1 This is a test prompt."
    for add_special in (False, True):
        expected_ids = tokenizer.encode(prompt, add_special_tokens=add_special)
        response = requests.post(
            server.url_for("tokenize"),
            json={
                "add_special_tokens": add_special,
                "model": model_name,
                "prompt": prompt,
            },
        )
        response.raise_for_status()
        body = response.json()
        assert body["tokens"] == expected_ids
        assert body["count"] == len(expected_ids)
        assert body["max_model_len"] == 8192
        assert body["token_strs"] is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """/tokenize on chat messages must match apply_chat_template + encode
    across every combination of the generation-prompt, special-token and
    continue-final-message flags."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    for add_generation in [False, True]:
        for add_special in [False, True]:
            conversation = [
                {"role": "user", "content": "Hi there!"},
                {"role": "assistant", "content": "Nice to meet you!"},
                {"role": "user", "content": "Can I ask a question? vllm1"},
            ]
            for continue_final in [False, True]:
                if add_generation and continue_final:
                    # Mutually exclusive options; skip the invalid combination.
                    continue
                if continue_final:
                    # NOTE: mutates `conversation` in place; safe only because
                    # True is the final value of the innermost loop.
                    conversation.append({"role": "assistant", "content": "Sure,"})
                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                    },
                )
                response.raise_for_status()
                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                assert result["max_model_len"] == 8192
                assert result["token_strs"] is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat_with_tools(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """/tokenize with tool definitions must match
    apply_chat_template(tools=...) + encode for every flag combination."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    for add_generation in [False, True]:
        for add_special in [False, True]:
            conversation = [
                {
                    "role": "user",
                    "content": "What's the weather like in Paris today?",
                }
            ]
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "get_weather",
                        "parameters": {
                            "type": "object",
                            "properties": {"location": {"type": "string"}},
                        },
                    },
                }
            ]
            for continue_final in [False, True]:
                if add_generation and continue_final:
                    # Mutually exclusive options; skip the invalid combination.
                    continue
                if continue_final:
                    # NOTE: in-place append; safe only because True is the
                    # final value of the innermost loop.
                    conversation.append({"role": "assistant", "content": "Sure,"})
                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tools=tools,
                    tokenize=False,
                )
                tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
                response = requests.post(
                    server.url_for("tokenize"),
                    json={
                        "add_generation_prompt": add_generation,
                        "continue_final_message": continue_final,
                        "add_special_tokens": add_special,
                        "messages": conversation,
                        "model": model_name,
                        "tools": tools,
                    },
                )
                response.raise_for_status()
                result = response.json()
                assert result["tokens"] == tokens
                assert result["count"] == len(tokens)
                assert result["max_model_len"] == 8192
                assert result["token_strs"] is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenize_with_return_token_strs(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """return_token_strs=True must include per-token strings in the response."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    prompt = "This is a token_strs test prompt! vllm1"
    response = requests.post(
        server.url_for("tokenize"),
        json={"prompt": prompt, "model": model_name, "return_token_strs": True},
    )
    response.raise_for_status()
    expected_ids = tokenizer.encode(prompt, add_special_tokens=True)
    expected_strs = tokenizer.convert_ids_to_tokens(expected_ids)
    body = response.json()
    assert body["tokens"] == expected_ids
    assert body["count"] == len(expected_ids)
    assert body["max_model_len"] == 8192
    assert body["token_strs"] == expected_strs
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_detokenize(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """/detokenize must reconstruct the exact original prompt text."""
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
    original_prompt = "This is a test prompt. vllm1"
    prompt_ids = tokenizer.encode(original_prompt, add_special_tokens=False)
    response = requests.post(
        server.url_for("detokenize"), json={"model": model_name, "tokens": prompt_ids}
    )
    response.raise_for_status()
    assert response.json() == {"prompt": original_prompt}
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
async def test_tokenizer_info_basic(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
    """Test basic tokenizer info endpoint functionality."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    info = response.json()
    # tokenizer_class must be present, a string, and non-empty.
    assert "tokenizer_class" in info
    assert isinstance(info["tokenizer_class"], str)
    assert info["tokenizer_class"]
@pytest.mark.asyncio
async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
    """Test that the response matches expected schema types."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    info = response.json()
    expected_types = {
        "add_bos_token": bool,
        "add_prefix_space": bool,
        "clean_up_tokenization_spaces": bool,
        "split_special_tokens": bool,
        "bos_token": str,
        "eos_token": str,
        "pad_token": str,
        "unk_token": str,
        "chat_template": str,
        "errors": str,
        "model_max_length": int,
        "additional_special_tokens": list,
        "added_tokens_decoder": dict,
    }
    for field, expected_type in expected_types.items():
        value = info.get(field)
        if value is None:
            # Fields are optional; only type-check the ones present and non-null.
            continue
        assert isinstance(value, expected_type), (
            f"{field} should be {expected_type.__name__}"
        )
@pytest.mark.asyncio
async def test_tokenizer_info_added_tokens_structure(
    server: RemoteOpenAIServer,
):
    """Test added_tokens_decoder structure if present."""
    response = requests.get(server.url_for("tokenizer_info"))
    response.raise_for_status()
    decoder = response.json().get("added_tokens_decoder")
    if not decoder:
        # Absent or empty map is acceptable; nothing further to check.
        return
    for token_id, token_info in decoder.items():
        assert isinstance(token_id, str), "Token IDs should be strings"
        assert isinstance(token_info, dict), "Token info should be a dict"
        assert "content" in token_info, "Token info should have content"
        assert "special" in token_info, "Token info should have special flag"
        assert isinstance(token_info["special"], bool), (
            "Special flag should be boolean"
        )
@pytest.mark.asyncio
async def test_tokenizer_info_consistency_with_tokenize(
    server: RemoteOpenAIServer,
):
    """The advertised model_max_length must cover the tokenize endpoint's max."""
    info_resp = requests.get(server.url_for("tokenizer_info"))
    info_resp.raise_for_status()
    tokenize_resp = requests.post(
        server.url_for("tokenize"),
        json={"model": MODEL_NAME, "prompt": "Hello world!"},
    )
    tokenize_resp.raise_for_status()
    info_max_len = info_resp.json().get("model_max_length")
    tokenize_max_len = tokenize_resp.json().get("max_model_len")
    # Only comparable when both endpoints report a limit.
    if info_max_len and tokenize_max_len:
        assert info_max_len >= tokenize_max_len, (
            "Info max length should be >= tokenize max length"
        )
@pytest.mark.asyncio
async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
    """If a chat template is reported, it must be a non-blank string."""
    resp = requests.get(server.url_for("tokenizer_info"))
    resp.raise_for_status()
    template = resp.json().get("chat_template")
    if template:
        assert isinstance(template, str), "Chat template should be a string"
        assert template.strip(), "Chat template should not be empty"

View File

@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import json
import pytest
from ...utils import RemoteOpenAIServer
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
    """Transcribe a short clip and check both the text and usage accounting."""
    server_args = ["--enforce-eager"]
    # Mistral-family checkpoints need the mistral tokenizer/config/load formats.
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
    result = json.loads(transcription)
    assert "Mary had a little lamb," in result["text"]
    usage = result["usage"]
    assert usage["seconds"] == 16, usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
    model_name = "ibm-granite/granite-speech-3.3-2b"
    # Served adapter name; requests use this instead of the base model name.
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "1",
    ]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        # Requesting the adapter name exercises the LoRA path end-to-end.
        transcription = await client.audio.transcriptions.create(
            model=lora_model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "mary had a little lamb" in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
    """Transcribe an Italian clip with Gemma and check a key phrase appears."""
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    model_name = "google/gemma-3n-E2B-it"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=foscolo,
            language="it",
            response_format="text",
            temperature=0.0,
        )
    text = json.loads(transcription)["text"]
    assert "da cui vergine nacque Venere" in text

View File

@@ -0,0 +1,246 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import asyncio
import io
import json
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture(scope="module")
def server():
    # Module-scoped Whisper server shared by every test in this file.
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def whisper_client(server):
    # Fresh async OpenAI client per test; closed automatically on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_basic_audio(whisper_client, mary_had_lamb):
    """Short-clip transcription returns the expected text and usage seconds."""
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    result = json.loads(transcription)
    assert "Mary had a little lamb," in result["text"]
    seconds = result["usage"]["seconds"]
    assert seconds == 16, seconds
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
    """Two concurrent transcription requests are both served correctly."""

    def make_request(audio_file):
        # Returns an un-awaited coroutine; both are gathered below so the
        # server handles the requests concurrently.
        return whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=audio_file,
            language="en",
            response_format="text",
            temperature=0.0,
        )

    results = await asyncio.gather(
        make_request(mary_had_lamb), make_request(winning_call)
    )
    first_text, second_text = (json.loads(r)["text"] for r in results)
    assert "Mary had a little lamb," in first_text
    assert "Edgar Martinez" in second_text
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
    """An unknown language code must be rejected with a 400."""
    with pytest.raises(openai.BadRequestError):
        # "hh" is not a valid language code.
        await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
        )
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
    """A clip tiled 10x must be transcribed 10x with matching usage seconds."""
    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
    # Append a short silence so the audio-splitting points are repeatable.
    padded = np.pad(audio, (0, 1600))
    buffer = io.BytesIO()
    sf.write(buffer, np.tile(padded, 10), sr, format="WAV")
    buffer.seek(0)
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=buffer,
        language="en",
        response_format="text",
        temperature=0.0,
    )
    result = json.loads(transcription)
    occurrences = result["text"].count("Mary had a little lamb")
    assert occurrences == 10, occurrences
    seconds = result["usage"]["seconds"]
    assert seconds == 161, seconds
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
    """Chat and text completion APIs are rejected for an ASR-only model."""
    chat_res = await whisper_client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "system", "content": "You are a helpful assistant."}],
    )
    chat_err = chat_res.error
    assert chat_err["code"] == 400
    assert chat_err["message"] == "The model does not support Chat Completions API"
    text_res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
    text_err = text_res.error
    assert text_err["code"] == 400
    assert text_err["message"] == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
    """Streamed transcription, reassembled, must match the non-streamed text."""
    transcription = ""
    # Reference result: identical request without streaming.
    res_no_stream = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        response_format="json",
        language="en",
        temperature=0.0,
    )
    res = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        timeout=30,
    )
    # Reconstruct from chunks and validate
    async for chunk in res:
        # NOTE(review): dict-style indexing here, unlike the attribute access
        # used elsewhere — presumably the stream yields raw dict deltas; verify.
        text = chunk.choices[0]["delta"]["content"]
        transcription += text
    assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
    """Streaming usage options: per-chunk usage plus a final usage-only chunk."""
    res = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=winning_call,
        language="en",
        temperature=0.0,
        stream=True,
        extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
        timeout=30,
    )
    saw_final_usage = False
    every_chunk_has_usage = True
    async for chunk in res:
        if len(chunk.choices) == 0:
            # An empty choices list marks the final usage-only message.
            saw_final_usage = True
        else:
            every_chunk_has_usage = every_chunk_has_usage and hasattr(chunk, "usage")
    assert saw_final_usage and every_chunk_has_usage
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
    """
    Compare sampling with params and greedy sampling to assert results
    are different when extreme sampling parameters values are picked.
    """
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.8,
        # Deliberately extreme values so the output diverges from greedy.
        extra_body=dict(
            seed=42,
            repetition_penalty=1.9,
            top_k=12,
            top_p=0.4,
            min_p=0.5,
            frequency_penalty=1.8,
            presence_penalty=2.0,
        ),
    )
    # Greedy baseline with the same seed.
    greedy_transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        temperature=0.0,
        extra_body=dict(seed=42),
    )
    assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
    """A conditioning prompt must not drop the original transcribed content."""
    prompt = "This is a speech, recorded in a phonograph."
    # This prefix from the source audio must survive with and without a prompt.
    prefix = "The first words I spoke in the original phonograph"

    async def transcribe(**extra_kwargs):
        # One call per configuration; all other parameters held constant.
        result = await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
            **extra_kwargs,
        )
        return json.loads(result)["text"]

    assert prefix in await transcribe()
    assert prefix in await transcribe(prompt=prompt)
@pytest.mark.asyncio
async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
    """verbose_json responses must carry at least one timestamped segment."""
    transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="verbose_json",
        temperature=0.0,
    )
    segments = transcription.segments
    assert segments is not None
    assert len(segments) > 0

View File

@@ -0,0 +1,229 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
# imports for structured outputs tests
import json
import httpx
import librosa
import numpy as np
import pytest
import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture(
    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request):
    # Parametrized over model name; yields (server, model_name) so each test
    # knows which model is currently being served.
    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
        yield remote_server, request.param
@pytest_asyncio.fixture
async def client_and_model(server):
    # Unpack the (server, model_name) tuple from the parametrized fixture
    # and hand tests an async client together with the live model's name.
    server, model_name = server
    async with server.get_async_client() as async_client:
        yield async_client, model_name
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """Translation requests against a text-only model return a 400 error."""
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name, file=foscolo, temperature=0.0
        )
    error = res.error
    assert error["code"] == 400
    assert not res.text
    assert error["message"] == "The model does not support Translations API"
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
    """Ensure STT (translate) requests can pass LoRA through to generate."""
    # NOTE - careful to call this test before the module scoped server
    # fixture, otherwise it'll OOMkill the CI
    model_name = "ibm-granite/granite-speech-3.3-2b"
    # Served adapter name; requests use this instead of the base model name.
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "1",
    ]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        # English -> Spanish translation through the LoRA adapter.
        translation = await client.audio.translations.create(
            model=lora_model_name,
            file=mary_had_lamb,
            extra_body=dict(language="en", to_language="es"),
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(translation)["text"].strip().lower()
        assert "pequeño" in out.split(" ")
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client_and_model):
    """Italian audio is translated into English text."""
    client, model_name = client_and_model
    translation = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="text",
        # TODO remove `language="it"` once language detection is implemented
        extra_body=dict(language="it", to_language="en"),
        temperature=0.0,
    )
    text = json.loads(translation)["text"].strip().lower()
    assert "greek sea" in text
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client_and_model):
    """Conditioning prompt text must not leak into the translation output."""
    client, model_name = client_and_model
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    transcription = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        prompt=prompt,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0,
    )
    text = json.loads(transcription)["text"]
    assert "Nor will I ever touch the sacred" not in text
    assert prompt not in text
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client_and_model, server):
    """Streamed translation must be ~equal (>=90% token match) to non-streamed."""
    client, model_name = client_and_model
    translation = ""
    res_no_stream = await client.audio.translations.create(
        model=model_name,
        file=foscolo,
        response_format="json",
        extra_body=dict(language="it", to_language="en", seed=42),
        temperature=0.0,
    )
    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "temperature": 0.0,
        "seed": 42,
    }
    foscolo.seek(0)
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream(
            "POST", url, headers=headers, data=data, files=files
        ) as response:
            # Parse the SSE stream by hand: strip the "data: " framing,
            # stop at the [DONE] sentinel, and accumulate delta content.
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: ") :]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                text = chunk["choices"][0].get("delta", {}).get("content")
                translation += text or ""
    res_stream = translation.split()
    # NOTE There's a small non-deterministic issue here, likely in the attn
    # computation, which will cause a few tokens to be different, while still
    # being very close semantically.
    assert (
        sum([x == y for x, y in zip(res_stream, res_no_stream.text.split())])
        >= len(res_stream) * 0.9
    )
@pytest.mark.asyncio
async def test_stream_options(foscolo, server):
    """Streaming usage stats: every chunk carries usage, plus a final summary."""
    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
        "model": model_name,
        "language": "it",
        "to_language": "en",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
        "temperature": 0.0,
    }
    foscolo.seek(0)
    final = False
    continuous = True
    async with httpx.AsyncClient() as http_client:
        files = {"file": foscolo}
        async with http_client.stream(
            "POST", url, headers=headers, data=data, files=files
        ) as response:
            # Parse the SSE stream by hand (same framing as the OpenAI API).
            async for line in response.aiter_lines():
                if not line:
                    continue
                if line.startswith("data: "):
                    line = line[len("data: ") :]
                if line.strip() == "[DONE]":
                    break
                chunk = json.loads(line)
                choices = chunk.get("choices", [])
                if not choices:
                    # final usage sent
                    final = True
                else:
                    continuous = continuous and ("usage" in chunk)
    assert final and continuous
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client_and_model):
    """A clip tiled twice should produce the key phrase twice."""
    client, model_name = client_and_model
    if model_name == "google/gemma-3n-E2B-it":
        pytest.skip("Gemma3n does not support long audio requests")
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    # Write the doubled audio into an in-memory WAV file.
    buffer = io.BytesIO()
    sf.write(buffer, np.tile(audio, 2), sr, format="WAV")
    buffer.seek(0)
    translation = await client.audio.translations.create(
        model=model_name,
        file=buffer,
        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0,
    )
    text = json.loads(translation)["text"].strip().lower()
    assert text.count("greek sea") == 2

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from tempfile import TemporaryDirectory
import httpx
import pytest
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def server():
    # The server listens on a Unix domain socket inside a temp dir, so tests
    # can exercise UDS transport; the socket is cleaned up with the directory.
    with TemporaryDirectory() as tmpdir:
        args = [
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "8192",
            "--enforce-eager",
            "--max-num-seqs",
            "128",
            "--uds",
            f"{tmpdir}/vllm.sock",
        ]
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    """Query /version over the server's Unix domain socket."""
    transport = httpx.HTTPTransport(uds=server.uds)
    # Use the client as a context manager so the UDS connection is closed
    # instead of leaked when the test finishes.
    with httpx.Client(transport=transport) as client:
        response = client.get(server.url_for("version"))
        response.raise_for_status()
        assert response.json() == {"version": VLLM_VERSION}

View File

@@ -0,0 +1,325 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video
from ...utils import RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MAXIMUM_VIDEOS = 4
TEST_VIDEO_URLS = [
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4",
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4",
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
]
@pytest.fixture(scope="module")
def server():
    # Small limits (2 concurrent seqs, eager mode) keep this within CI memory;
    # --limit-mm-per-prompt caps videos per request at MAXIMUM_VIDEOS.
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "2",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async client bound to the module-scoped server; closed after each test.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]:
    # Fetch and base64-encode every test video once per session (network I/O).
    return {
        video_url: encode_video_base64(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }
def dummy_messages_from_video_url(
video_urls: str | list[str],
content_text: str = "What's in this video?",
):
if isinstance(video_urls, str):
video_urls = [video_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "video_url", "video_url": {"url": video_url}}
for video_url in video_urls
),
{"type": "text", "text": content_text},
],
}
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Single-turn video chat: finish reason, usage, and multi-turn follow-up."""
    messages = dummy_messages_from_video_url(video_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Prompt token count is fixed by this model's video placeholder expansion.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )
    # (fix) removed a redundant duplicate assignment of `message` here.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_error_on_invalid_video_url_type(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """A bare string video_url (instead of {"url": ...}) must be rejected."""
    bad_messages = [
        {
            "role": "user",
            "content": [
                # Deliberately malformed: the API expects a dict payload here.
                {"type": "video_url", "video_url": video_url},
                {"type": "text", "text": "What's in this video?"},
            ],
        }
    ]
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=bad_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Beam search with n=2 must yield two distinct completions."""
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_video_url(video_url),
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    base64_encoded_video: dict[str, str],
):
    """Same as the URL test, but the video is sent inline as a base64 data URI."""
    messages = dummy_messages_from_video_url(
        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Prompt token count is fixed by this model's video placeholder expansion.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )
    # (fix) removed a redundant duplicate assignment of `message` here.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    base64_encoded_video: dict[str, str],
):
    """Beam search over a base64 data-URI video must yield two distinct outputs."""
    data_uri = f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_video_url(data_uri),
        n=2,
        max_completion_tokens=10,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Streamed video chat must reassemble to the non-streamed completion."""
    messages = dummy_messages_from_video_url(video_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `chunk`/`delta` intentionally refer to the final iteration of the loop.
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls",
    # Include the full list and an over-limit case. The previous
    # range(2, len(TEST_VIDEO_URLS)) never exceeded MAXIMUM_VIDEOS, which
    # left the BadRequestError branch below unreachable.
    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS) + 1)]
    + [TEST_VIDEO_URLS * 2],
)
async def test_multi_video_input(
    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
):
    """Multi-video prompts up to the limit succeed; over-limit requests 400."""
    messages = dummy_messages_from_video_url(video_urls)
    if len(video_urls) > MAXIMUM_VIDEOS:
        with pytest.raises(openai.BadRequestError):  # test multi-video input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )
        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0

View File

@@ -0,0 +1,567 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import openai
import pytest
import pytest_asyncio
from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import RemoteOpenAIServer
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS = [
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
"Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
"1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
EXPECTED_MM_BEAM_SEARCH_RES = [
[
"The image shows a wooden boardwalk leading through a",
"The image shows a wooden boardwalk extending into a",
],
[
"The image shows two parrots perched on",
"The image shows two birds perched on a cur",
],
[
"The image shows a Venn diagram with three over",
"The image shows a colorful Venn diagram with",
],
[
"This image displays a gradient of colors ranging from",
"This image displays a gradient of colors forming a spectrum",
],
]
@pytest.fixture(scope="module")
def server():
    # Small limits (5 concurrent seqs, eager mode) keep this within CI memory;
    # --limit-mm-per-prompt caps images per request at MAXIMUM_IMAGES.
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async client bound to the module-scoped server; closed after each test.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
    # Base64-encode each local test image once per session.
    return {
        image_asset: encode_image_base64(
            local_asset_server.get_image_asset(image_asset)
        )
        for image_asset in TEST_IMAGE_ASSETS
    }
def dummy_messages_from_image_url(
image_urls: str | list[str],
content_text: str = "What's in this image?",
):
if isinstance(image_urls, str):
image_urls = [image_urls]
return [
{
"role": "user",
"content": [
*(
{"type": "image_url", "image_url": {"url": image_url}}
for image_url in image_urls
),
{"type": "text", "text": content_text},
],
}
]
def get_hf_prompt_tokens(model_name, content, image_url):
    """Compute the HF-reference prompt token count for one image + text prompt.

    Used to cross-check the server's reported prompt_tokens usage against what
    the HuggingFace processor produces for the same input.
    """
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
    )
    # Phi-3.5-vision image placeholder token for the first image.
    placeholder = "<|image_1|>\n"
    messages = [
        {
            "role": "user",
            "content": f"{placeholder}{content}",
        }
    ]
    image = fetch_image(image_url)
    # Unwrap MediaWithBytes if present
    if isinstance(image, MediaWithBytes):
        image = image.media
    images = [image]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(prompt, images, return_tensors="pt")
    # Sequence length of the tokenized prompt (batch dim is 1).
    return inputs.input_ids.shape[1]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Single-turn image chat: finish reason, HF-verified usage, follow-up turn."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(image_url, content_text)
    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Cross-check usage accounting against the HuggingFace processor.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    # (fix) removed a redundant duplicate assignment of `message` here.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_error_on_invalid_image_url_type(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """A bare string image_url (instead of {"url": ...}) must be rejected."""
    content_text = "What's in this image?"
    bad_messages = [
        {
            "role": "user",
            "content": [
                # Deliberately malformed: the API expects a dict payload here.
                {"type": "image_url", "image_url": image_url},
                {"type": "text", "text": content_text},
            ],
        }
    ]
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=bad_messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Beam search with n=2 must yield two distinct completions."""
    content_text = "What's in this image?"
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_image_url(image_url, content_text),
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Single-turn and multi-turn chat with a base64-encoded image payload."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(
        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
        content_text,
    )
    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    # Fix: the original assigned `message` twice in a row with equivalent
    # expressions; `choice` already is `chat_completion.choices[0]`.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
    base64_encoded_image: dict[str, str],
):
    # NOTE: This test also validates that we pass MM data through beam search
    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
    data_url = f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=dummy_messages_from_image_url(data_url),
        n=2,
        max_completion_tokens=10,
        temperature=0.0,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    # Each beam candidate must match its recorded expected output.
    for choice, expected_str in zip(chat_completion.choices, expected_res):
        assert choice.message.content == expected_str
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_chat_streaming_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Streamed image-chat output must exactly match the non-streamed output."""
    messages = dummy_messages_from_image_url(image_url)
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    # Reference output and finish reason from the non-streaming request;
    # temperature=0.0 keeps both runs deterministic.
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason
    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    # NOTE: `chunk`/`delta` deliberately refer to the last iteration's values.
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Over-limit image counts are rejected, and the server stays healthy."""
    messages = dummy_messages_from_image_url(image_urls)
    if len(image_urls) <= MAXIMUM_IMAGES:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
    else:
        # Too many images: the request must be rejected up front.
        with pytest.raises(openai.BadRequestError):
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )
        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion_text = completion.choices[0].text
        assert completion_text is not None and len(completion_text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Each image URL should produce a non-empty chat completion."""
    system_msg = {"role": "system", "content": "You are a helpful assistant."}
    for image_url in image_urls:
        user_msg = {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        chat_completion = await client.chat.completions.create(
            messages=[system_msg, user_msg],
            model=model_name,
        )
        content = chat_completion.choices[0].message.content
        assert content is not None
        assert isinstance(content, str)
        assert len(content) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_uuid(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Registering an image under a `uuid` lets a later request reference the
    cached entry by uuid alone (empty `image_url` dict)."""
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            },
                            # The url doubles as the uuid for this test.
                            "uuid": image_url,
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0
        # Second request, with empty image but the same uuid.
        chat_completion_with_empty_image = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {"type": "image_url", "image_url": {}, "uuid": image_url},
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion_with_empty_image.choices[0].message.content is not None
        assert isinstance(
            chat_completion_with_empty_image.choices[0].message.content, str
        )
        assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
    client: openai.AsyncOpenAI,
    model_name: str,
):
    """An empty image with a never-seen uuid cannot be resolved and must 400."""
    request_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image.",
                },
                {
                    "type": "image_url",
                    "image_url": {},
                    "uuid": "uuid_not_previously_seen",
                },
            ],
        },
    ]
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            messages=request_messages,
            model=model_name,
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_incorrect_uuid_format(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """uuid-like values under unrecognized keys must not break the request;
    the server should serve it as a normal image request."""
    for image_url in image_urls:
        chat_completion = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                                # Deliberately misplaced: a real uuid belongs on
                                # the content part under the "uuid" key.
                                "incorrect_uuid_key": image_url,
                            },
                            "also_incorrect_uuid_key": image_url,
                        },
                    ],
                },
            ],
            model=model_name,
        )
        assert chat_completion.choices[0].message.content is not None
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0

View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import numpy as np
import pytest
import requests
import torch
from vllm.utils.serial_utils import tensor2base64
from ...utils import RemoteOpenAIServer
def _terratorch_dummy_messages():
    """Build a single user turn carrying dummy Terratorch tensor embeddings."""
    pixel_values = torch.ones((6, 512, 512), dtype=torch.float16)
    location_coords = torch.ones((1, 2), dtype=torch.float16)
    embeds_part = {
        "type": "image_embeds",
        "image_embeds": {
            "pixel_values": tensor2base64(pixel_values),
            "location_coords": tensor2base64(location_coords),
        },
    }
    return [{"role": "user", "content": [embeds_part]}]
@pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_request(model_name: str):
    """End-to-end pooling request against a Terratorch geospatial model."""
    args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--trust-remote-code",
        "--max-num-seqs",
        "32",
        "--model-impl",
        "terratorch",
        "--skip-tokenizer-init",
        "--enable-mm-embeds",
    ]
    with RemoteOpenAIServer(model_name, args) as server:
        response = requests.post(
            server.url_for("pooling"),
            json={
                "model": model_name,
                "messages": _terratorch_dummy_messages(),
                "encoding_format": "base64",
            },
        )
        response.raise_for_status()
        # Pooling output is base64-encoded; decode into a float32 vector.
        output = response.json()["data"][0]["data"]
        np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
        # 524288 == 2 * 512 * 512 — presumably per-pixel model output for the
        # 512x512 dummy input; TODO(review): confirm expected output shape.
        assert len(np_response) == 524288

View File

@@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from vllm.tokenizers import TokenizerLike
@pytest.fixture(scope="function")
def default_tokenizer() -> TokenizerLike:
return AutoTokenizer.from_pretrained("gpt2")

View File

@@ -0,0 +1,176 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# Fixture payloads for the "gigachat3" tool parser tests below.
# Each *_OUTPUT string mimics raw model output: the literal "function call"
# marker followed by the serialized function JSON.
SIMPLE_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
}
SIMPLE_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": SIMPLE_ARGS_DICT,
    },
    ensure_ascii=False,
)
SIMPLE_FUNCTION_OUTPUT = "function call" + SIMPLE_FUNCTION_JSON
# Expected parser result for the simple case.
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(SIMPLE_ARGS_DICT, ensure_ascii=False),
)
# Tool call with an empty arguments object.
PARAMETERLESS_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": {},
    },
    ensure_ascii=False,
)
PARAMETERLESS_FUNCTION_OUTPUT = "function call" + PARAMETERLESS_FUNCTION_JSON
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps({}, ensure_ascii=False),
)
# Tool call with a nested arguments object.
COMPLEX_ARGS_DICT = {
    "action": "create",
    "id": "preferences",
    "content": {
        "short_answers": True,
        "hate_emojis": True,
        "english_ui": False,
        "russian_math_explanations": True,
    },
}
COMPLEX_FUNCTION_JSON = json.dumps(
    {
        "name": "manage_user_memory",
        "arguments": COMPLEX_ARGS_DICT,
    },
    ensure_ascii=False,
)
COMPLEX_FUNCTION_OUTPUT = "function call" + COMPLEX_FUNCTION_JSON
COMPLEX_FUNCTION_CALL = FunctionCall(
    name="manage_user_memory",
    arguments=json.dumps(COMPLEX_ARGS_DICT, ensure_ascii=False),
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
default_tokenizer
)
model_output = "How can I help you today?"
content, tool_calls = run_tool_extraction(
tool_parser, model_output, streaming=streaming
)
assert content == model_output
assert len(tool_calls) == 0
# (streaming, model_output, expected_tool_calls, expected_content) tuples
# covering simple, parameterless, and nested-argument tool calls in both
# streaming and non-streaming modes.
TEST_CASES = [
    pytest.param(
        True,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        None,
        id="simple_streaming",
    ),
    pytest.param(
        False,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        None,
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        None,
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        COMPLEX_FUNCTION_OUTPUT,
        [COMPLEX_FUNCTION_CALL],
        None,
        id="complex_streaming",
    ),
    pytest.param(
        False,
        COMPLEX_FUNCTION_OUTPUT,
        [COMPLEX_FUNCTION_CALL],
        None,
        id="complex_nonstreaming",
    ),
]
@pytest.mark.parametrize(
    "streaming, model_output, expected_tool_calls, expected_content", TEST_CASES
)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    expected_content: str | None,
    default_tokenizer: TokenizerLike,
):
    """Extracted tool calls must match the expected name/arguments pairs."""
    parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
        default_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    assert content == expected_content
    assert len(tool_calls) == len(expected_tool_calls)
    for got, want in zip(tool_calls, expected_tool_calls):
        assert got.type == "function"
        assert got.function.name == want.name
        # Compare parsed JSON so formatting differences do not matter.
        assert json.loads(got.function.arguments) == json.loads(want.arguments)
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """A tool call split across a few large deltas must still reassemble."""
    parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")(
        default_tokenizer
    )
    deltas = [
        "function call",
        COMPLEX_FUNCTION_JSON[:40],
        COMPLEX_FUNCTION_JSON[40:],
    ]
    reconstructor = run_tool_extraction_streaming(
        parser,
        deltas,
        assert_one_tool_per_delta=False,
    )
    assert len(reconstructor.tool_calls) == 1
    tool_call = reconstructor.tool_calls[0]
    assert tool_call.type == "function"
    assert tool_call.function.name == "manage_user_memory"
    assert json.loads(tool_call.function.arguments) == COMPLEX_ARGS_DICT

View File

@@ -0,0 +1,460 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
from ....utils import RemoteOpenAIServer
# Base model served by vLLM; the LoRA adapter below is what the tests query.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
LORA_MODEL = "minpeter/LoRA-Llama-3.2-1B-tool-vllm-ci"
# Server flags: hermes tool parsing with the LoRA adapter registered and its
# tokenizer used for chat templating.
SERVER_ARGS = [
    "--enforce-eager",
    "--enable-auto-tool-choice",
    "--tool-call-parser",
    "hermes",
    "--enable-lora",
    "--lora-modules",
    f"{LORA_MODEL}={LORA_MODEL}",
    "--tokenizer",
    f"{LORA_MODEL}",
]
# Single weather tool used by the basic tool-choice tests.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]
# Tool with integer and boolean parameters, used to verify type fidelity.
PRODUCT_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_product_info",
            "description": "Get detailed information of a product based on its "
            "product ID.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inserted": {
                        "type": "boolean",
                        "description": "inserted.",
                    },
                    "product_id": {
                        "type": "integer",
                        "description": "The product ID of the product.",
                    },
                },
                "required": ["product_id", "inserted"],
            },
        },
    }
]
# Prompts crafted to elicit calls to the corresponding tools above.
MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
PRODUCT_MESSAGES = [
    {
        "role": "user",
        "content": "Hi! Do you have any detailed information about the product id "
        "7355608 and inserted true?",
    }
]
@pytest.mark.asyncio
async def test_non_streaming_tool_call():
    """Test tool call in non-streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        response = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=MESSAGES,
            tools=TOOLS,
            tool_choice="auto",
            temperature=0.0,
        )
        assert response.choices
        first_choice = response.choices[0]
        assert first_choice.finish_reason == "tool_calls"
        assert first_choice.message.tool_calls is not None
        tool_call = first_choice.message.tool_calls[0]
        assert tool_call.type == "function"
        assert tool_call.function.name == "get_current_weather"
        arguments = json.loads(tool_call.function.arguments)
        assert "location" in arguments
        assert "Boston" in arguments["location"]
        print("\n[Non-Streaming Test Passed]")
        print(f"Tool Call: {tool_call.function.name}")
        print(f"Arguments: {arguments}")
@pytest.mark.asyncio
async def test_streaming_tool_call():
    """Test tool call in streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        stream = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=MESSAGES,
            tools=TOOLS,
            tool_choice="auto",
            temperature=0.0,
            stream=True,
        )
        # Accumulates per-index {"name", "arguments"} fragments from the stream.
        tool_call_chunks = {}
        async for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if not delta or not delta.tool_calls:
                continue
            for tool_chunk in delta.tool_calls:
                index = tool_chunk.index
                if index not in tool_call_chunks:
                    tool_call_chunks[index] = {"name": "", "arguments": ""}
                if tool_chunk.function.name:
                    tool_call_chunks[index]["name"] += tool_chunk.function.name
                if tool_chunk.function.arguments:
                    tool_call_chunks[index]["arguments"] += (
                        tool_chunk.function.arguments
                    )
        # Exactly one tool call should have been streamed.
        assert len(tool_call_chunks) == 1
        reconstructed_tool_call = tool_call_chunks[0]
        assert reconstructed_tool_call["name"] == "get_current_weather"
        arguments = json.loads(reconstructed_tool_call["arguments"])
        assert "location" in arguments
        assert "Boston" in arguments["location"]
        print("\n[Streaming Test Passed]")
        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
        print(f"Reconstructed Arguments: {arguments}")
@pytest.mark.asyncio
async def test_non_streaming_product_tool_call():
    """Test tool call integer and boolean parameters in non-streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        response = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=PRODUCT_MESSAGES,
            tools=PRODUCT_TOOLS,
            tool_choice="auto",
            temperature=0.66,
        )
        assert response.choices
        first_choice = response.choices[0]
        assert first_choice.finish_reason == "tool_calls"
        assert first_choice.message.tool_calls is not None
        tool_call = first_choice.message.tool_calls[0]
        assert tool_call.type == "function"
        assert tool_call.function.name == "get_product_info"
        arguments = json.loads(tool_call.function.arguments)
        assert "product_id" in arguments
        assert "inserted" in arguments
        # Parameter types must survive parsing: int stays int, bool stays bool.
        product_id = arguments.get("product_id")
        inserted = arguments.get("inserted")
        assert isinstance(product_id, int)
        assert product_id == 7355608
        assert isinstance(inserted, bool)
        assert inserted is True
        print("\n[Non-Streaming Product Test Passed]")
        print(f"Tool Call: {tool_call.function.name}")
        print(f"Arguments: {arguments}")
@pytest.mark.asyncio
async def test_streaming_product_tool_call():
    """Test tool call integer and boolean parameters in streaming mode."""
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
        client = server.get_async_client()
        stream = await client.chat.completions.create(
            model=LORA_MODEL,
            messages=PRODUCT_MESSAGES,
            tools=PRODUCT_TOOLS,
            tool_choice="auto",
            temperature=0.66,
            stream=True,
        )
        # Accumulates per-index {"name", "arguments"} fragments from the stream.
        tool_call_chunks = {}
        async for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if not delta or not delta.tool_calls:
                continue
            for tool_chunk in delta.tool_calls:
                index = tool_chunk.index
                if index not in tool_call_chunks:
                    tool_call_chunks[index] = {"name": "", "arguments": ""}
                if tool_chunk.function.name:
                    tool_call_chunks[index]["name"] += tool_chunk.function.name
                if tool_chunk.function.arguments:
                    tool_call_chunks[index]["arguments"] += (
                        tool_chunk.function.arguments
                    )
        assert len(tool_call_chunks) == 1
        reconstructed_tool_call = tool_call_chunks[0]
        assert reconstructed_tool_call["name"] == "get_product_info"
        arguments = json.loads(reconstructed_tool_call["arguments"])
        assert "product_id" in arguments
        assert "inserted" in arguments
        # Handle type coercion for streaming test as well
        product_id = arguments.get("product_id")
        inserted = arguments.get("inserted")
        assert isinstance(product_id, int)
        assert product_id == 7355608
        assert isinstance(inserted, bool)
        assert inserted is True
        print("\n[Streaming Product Test Passed]")
        print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
        print(f"Reconstructed Arguments: {arguments}")
@pytest.fixture
def qwen_tokenizer() -> TokenizerLike:
    """Load the Qwen3-32B tokenizer used throughout the Hermes parser tests."""
    from vllm.tokenizers import get_tokenizer

    tokenizer = get_tokenizer("Qwen/Qwen3-32B")
    return tokenizer
@pytest.fixture
def hermes_parser(qwen_tokenizer: TokenizerLike) -> Hermes2ProToolParser:
    """Hermes 2 Pro tool parser wired to the Qwen tokenizer fixture."""
    tool_parser = Hermes2ProToolParser(qwen_tokenizer)
    return tool_parser
@pytest.fixture
def any_chat_request() -> ChatCompletionRequest:
    """Minimal deterministic request object passed to the parser hooks."""
    request = ChatCompletionRequest(
        model="Qwen/Qwen3-32B",
        messages=[],
        seed=42,
    )
    return request
def test_hermes_parser_streaming_just_forward_text(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Token-by-token streaming of plain text must be forwarded verbatim."""
    text = """This is some prior text that has nothing to do with tool calling."""
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    # Feed the parser one decoded token at a time, mimicking the server's
    # incremental streaming path.
    for token in tokens:
        delta_text = qwen_tokenizer.decode([token])
        current_text = previous_text + delta_text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        delta_messages.append(delta)
    for delta in delta_messages:
        assert delta is not None
        assert not delta.tool_calls
    print(delta_messages)
    # Concatenated deltas must reproduce the original text exactly.
    assert "".join([delta.content for delta in delta_messages]) == text
def test_hermes_parser_streaming_failure_case_bug_19056(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Regression test for bug #19056: stream a tool call one token at a time."""
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    token_ids = qwen_tokenizer.encode(model_output)
    previous_text = ""
    delta_messages = []
    for token_id in token_ids:
        delta_text = qwen_tokenizer.decode([token_id])
        current_text = previous_text + delta_text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=delta_text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)
    assert delta_messages[0].tool_calls[0].function.name == "final_answer"
    reassembled_args = "".join(
        d.tool_calls[0].function.arguments or "" for d in delta_messages
    )
    assert reassembled_args == '{"trigger": true}'
def test_hermes_parser_streaming(
    qwen_tokenizer: TokenizerLike,
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Token-by-token streaming of a complete tagged tool call."""
    text = '<tool_call>\
{"name": "get_current_temperature",\
"arguments": {"location":\
"San Francisco, California, United States", "unit": "celsius"}}\
</tool_call>'
    tokens = qwen_tokenizer.encode(text)
    previous_text = ""
    delta_messages = []
    # NOTE: `text` is rebound to the per-token delta inside this loop; the
    # original model output is no longer needed after `tokens` is computed.
    for token in tokens:
        text = qwen_tokenizer.decode([token])
        current_text = previous_text + text
        delta = hermes_parser.extract_tool_calls_streaming(
            previous_text=previous_text,
            current_text=current_text,
            delta_text=text,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=any_chat_request,
        )
        previous_text = current_text
        if delta is not None:
            delta_messages.append(delta)
    print(delta_messages)
    assert delta_messages[0].tool_calls[0].function.name == "get_current_temperature"
    tool_call_args = "".join(
        delta.tool_calls[0].function.arguments or "" for delta in delta_messages
    )
    assert tool_call_args == (
        '{"location":"San Francisco, California, United States", "unit": "celsius"}'
    )
def test_hermes_parser_non_streaming_no_tool_call(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Plain text must be reported as containing no tool calls."""
    model_output = """This is not a tool call."""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert not extraction.tools_called
def test_hermes_parser_non_streaming_tool_call_between_tags(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """A tool call wrapped in opening and closing tags is extracted whole."""
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}
</tool_call>"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert extraction.tools_called
    extracted_function = extraction.tool_calls[0].function
    assert extracted_function.name == "final_answer"
    assert extracted_function.arguments == '{"trigger": true}'
def test_hermes_parser_non_streaming_tool_call_until_eos(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """A tool call whose closing tag was cut off at EOS is still recovered."""
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}}"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert extraction.tools_called
    extracted_function = extraction.tool_calls[0].function
    assert extracted_function.name == "final_answer"
    assert extracted_function.arguments == '{"trigger": true}'
def test_hermes_parser_non_streaming_tool_call_invalid_json(
    hermes_parser: Hermes2ProToolParser,
    any_chat_request: ChatCompletionRequest,
) -> None:
    """Malformed JSON inside the tag must not raise; no tools are reported."""
    # Missing closing brace to trigger exception
    model_output = """<tool_call>
{"name": "final_answer", "arguments": {"trigger": true}"""
    extraction = hermes_parser.extract_tool_calls(
        model_output=model_output,
        request=any_chat_request,
    )
    assert extraction is not None
    assert not extraction.tools_called

View File

@@ -0,0 +1,179 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import json
from unittest.mock import MagicMock
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
from vllm.tool_parsers import ToolParser, ToolParserManager
def make_tool_call(name, arguments):
    """Build a function-type ToolCall with JSON-serialized arguments."""
    serialized_args = json.dumps(arguments)
    function = FunctionCall(name=name, arguments=serialized_args)
    return ToolCall(type="function", function=function)
# TODO: add reason prefix and suffix.
@pytest.mark.parametrize(
    "model_output,expected_tool_calls,expected_content",
    [
        # No tool call
        ("How can I help you today?", [], "How can I help you today?"),
        # Single tool call, no content
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                )
            ],
            None,
        ),
        # Multiple tool calls
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "San Francisco", "metric": "celsius"}}, {"name": "register_user", "arguments": {"name": "John Doe", "age": 37, "address": {"city": "San Francisco", "state": "CA"}, "role": null, "passed_test": true, "aliases": ["John", "Johnny"]}}]</tool_calls>',  # noqa: E501
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                ),
                make_tool_call(
                    "register_user",
                    {
                        "name": "John Doe",
                        "age": 37,
                        "address": {"city": "San Francisco", "state": "CA"},
                        "role": None,
                        "passed_test": True,
                        "aliases": ["John", "Johnny"],
                    },
                ),
            ],
            None,
        ),
        # Content before tool call
        (
            'I will call the tool now. <tool_calls>[{"name": "get_weather", "arguments": {"city": "Boston"}}]</tool_calls>',  # noqa: E501
            [make_tool_call("get_weather", {"city": "Boston"})],
            "I will call the tool now. ",
        ),
        # Content after tool call (should be stripped)
        (
            '<tool_calls>[{"name": "get_weather", "arguments": {"city": "Seattle"}}]</tool_calls>\nThank you!',  # noqa: E501
            [make_tool_call("get_weather", {"city": "Seattle"})],
            None,
        ),
        # Deeply nested argument object
        (
            '<tool_calls>[{"name": "complex_tool", "arguments": {"level1": {"level2": {"level3": {"value": 123}}}}}]</tool_calls>',
            [
                make_tool_call(
                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
                )
            ],
            None,
        ),
    ],
)
def test_hunyuan_a13b_tool_parser_extract(
    model_output, expected_tool_calls, expected_content
):
    """Non-streaming tool-call extraction for the hunyuan_a13b parser."""
    mock_tokenizer = MagicMock()
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
        mock_tokenizer
    )
    content, tool_calls = run_tool_extraction(
        tool_parser, model_output, streaming=False
    )
    # align the random id.
    for idx in range(len(tool_calls)):
        tool_calls[idx].id = expected_tool_calls[idx].id
    assert tool_calls == expected_tool_calls
    assert content == expected_content
# Streaming test: simulate incremental output
@pytest.mark.parametrize(
    "model_deltas,expected_tool_calls",
    [
        # Tool call split across a few argument-level deltas
        (
            [
                '<tool_calls>[{"name": "get_weather", ',
                '"arguments": {"city": "San Francisco", ',
                '"metric": "celsius"}}]',
                "</tool_calls>",
            ],
            [
                make_tool_call(
                    "get_weather", {"city": "San Francisco", "metric": "celsius"}
                )
            ],
        ),
        # Finer-grained split
        (
            [
                '<tool_calls>[{"name":',
                ' "get_weather",',
                ' "arguments":',
                ' {"city": "Boston"}',
                "}]",
                "</tool_calls>",
            ],
            [make_tool_call("get_weather", {"city": "Boston"})],
        ),
        # Leading empty delta plus trailing text after the closing tag
        (
            [
                "",
                '<tool_calls>[{"name":',
                ' "get_weather",',
                ' "arguments":',
                ' {"city": "Boston"}',
                "}]",
                "</tool_calls>",
                "\n</answer>",
            ],
            [make_tool_call("get_weather", {"city": "Boston"})],
        ),
        pytest.param(
            [
                '<tool_calls>[{"name": "complex_tool",',
                ' "arguments": ',
                ' {"level1": {"level2": ',
                '{"level3": {"value": 123}}}}}',
                "]</tool_calls>",
            ],
            [
                make_tool_call(
                    "complex_tool", {"level1": {"level2": {"level3": {"value": 123}}}}
                )
            ],
            marks=pytest.mark.xfail(
                reason="stream parsing not support nested json yet."
            ),
        ),
    ],
)
def test_hunyuan_a13b_tool_parser_streaming(model_deltas, expected_tool_calls):
    """Streaming extraction: deltas must reassemble into the expected calls."""
    mock_tokenizer = MagicMock()
    tool_parser: ToolParser = ToolParserManager.get_tool_parser("hunyuan_a13b")(
        mock_tokenizer
    )
    reconstructor = run_tool_extraction_streaming(
        tool_parser, model_deltas, assert_one_tool_per_delta=False
    )
    # align the random id.
    for idx in range(len(reconstructor.tool_calls)):
        reconstructor.tool_calls[idx].id = expected_tool_calls[idx].id
    assert reconstructor.tool_calls == expected_tool_calls

View File

@@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser
@pytest.fixture
def parser(default_tokenizer: TokenizerLike):
    """Llama3 JSON tool parser built on the shared default tokenizer."""
    tool_parser = Llama3JsonToolParser(default_tokenizer)
    return tool_parser
def test_extract_tool_calls_simple(parser):
    """A single tool-call JSON embedded in surrounding text is extracted."""
    text = (
        'Here is the result: {"name": "getOpenIncidentsTool", '
        '"parameters": {}} Would you like to know more?'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert isinstance(extraction, ExtractedToolCallInformation)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 1
    call = extraction.tool_calls[0]
    assert call.type == "function"
    assert call.function.name == "getOpenIncidentsTool"
    assert call.function.arguments == "{}"
    # All surrounding prose is consumed when a tool call is found.
    assert extraction.content is None
def test_extract_tool_calls_with_arguments(parser):
    """Tool-call argument values survive extraction intact."""
    text = (
        '{"name": "searchTool", "parameters": {"query": "test query", "limit": 10}}'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 1
    function = extraction.tool_calls[0].function
    assert function.name == "searchTool"
    for fragment in ('"query": "test query"', '"limit": 10'):
        assert fragment in function.arguments
def test_extract_tool_calls_no_json(parser):
    """Plain text with no JSON yields no tool calls; content passes through."""
    text = "This is just some text without any tool calls"
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_extract_tool_calls_invalid_json(parser):
    """Malformed JSON is treated as plain content, not a tool call."""
    text = '{"name": "invalidTool", "parameters": {invalid json}'
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_extract_tool_calls_with_arguments_key(parser):
    """The "arguments" key is accepted as an alias for "parameters"."""
    extraction = parser.extract_tool_calls(
        '{"name": "searchTool", "arguments": {"query": "test"}}', None
    )
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 1
    function = extraction.tool_calls[0].function
    assert function.name == "searchTool"
    assert '"query": "test"' in function.arguments
def test_extract_tool_calls_multiple_json(parser):
    """Semicolon-separated JSON objects become separate tool calls, in order."""
    text = (
        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
        '{"name": "searchTool", "parameters": {"query": "test2"}}'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 3
    first, second, third = extraction.tool_calls
    assert first.function.name == "searchTool"
    assert '"query": "test1"' in first.function.arguments
    assert second.function.name == "getOpenIncidentsTool"
    assert second.function.arguments == "{}"
    assert third.function.name == "searchTool"
    assert '"query": "test2"' in third.function.arguments
def test_extract_tool_calls_multiple_json_with_whitespace(parser):
    """Extra whitespace around the semicolon separators is tolerated."""
    text = (
        '{"name": "searchTool", "parameters": {"query": "test1"}} ; '
        '{"name": "getOpenIncidentsTool", "parameters": {}} ; '
        '{"name": "searchTool", "parameters": {"query": "test2"}}'
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 3
    extracted_names = [call.function.name for call in extraction.tool_calls]
    assert extracted_names == ["searchTool", "getOpenIncidentsTool", "searchTool"]
def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
    """Multiple calls are still found when prose surrounds the JSON list."""
    text = (
        "Here are the results: "
        '{"name": "searchTool", "parameters": {"query": "test1"}}; '
        '{"name": "getOpenIncidentsTool", "parameters": {}}; '
        '{"name": "searchTool", "parameters": {"query": "test2"}} '
        "Would you like to know more?"
    )
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is True
    assert len(extraction.tool_calls) == 3
    extracted_names = [call.function.name for call in extraction.tool_calls]
    assert extracted_names == ["searchTool", "getOpenIncidentsTool", "searchTool"]
def test_extract_tool_calls_deeply_nested_json(parser):
    """Deeply nested (5-level) parameters are preserved verbatim.

    Uses the module-level ``json`` import instead of the previous
    function-local ``import json``, which was repeated in several tests.
    """
    model_output = (
        '{"name": "complexTool", '
        '"parameters": {'
        '"level1": {'
        '"level2": {'
        '"level3": {'
        '"level4": {'
        '"value": "deep"'
        "}}}}}}"
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "complexTool"
    # Round-trip through json to verify the nested structure is intact.
    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
def test_extract_tool_calls_multiple_with_deep_nesting(parser):
    """Multiple calls are split correctly even when one is deeply nested.

    Uses the module-level ``json`` import instead of a function-local one.
    """
    model_output = (
        '{"name": "simpleTool", "parameters": {"value": "test"}}; '
        '{"name": "complexTool", "parameters": '
        '{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 2
    # Check first tool call
    assert result.tool_calls[0].function.name == "simpleTool"
    args0 = json.loads(result.tool_calls[0].function.arguments)
    assert args0["value"] == "test"
    # Check second tool call with deep nesting
    assert result.tool_calls[1].function.name == "complexTool"
    args1 = json.loads(result.tool_calls[1].function.arguments)
    assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
    """Braces and brackets inside quoted string values do not confuse the
    brace-matching extraction.

    Uses the module-level ``json`` import instead of a function-local one.
    """
    model_output = (
        '{"name": "searchTool", '
        '"parameters": {'
        '"query": "test {value} [complex]",'
        '"nested": {"inner": "more {brackets}"}'
        "}}"
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "searchTool"
    # Verify the string values are preserved including brackets and quotes
    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["query"] == "test {value} [complex]"
    assert args["nested"]["inner"] == "more {brackets}"
def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
    """Escaped quotes inside string values survive extraction.

    Uses the module-level ``json`` import instead of a function-local one.
    """
    model_output = (
        '{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
    )
    result = parser.extract_tool_calls(model_output, None)
    assert result.tools_called is True
    assert len(result.tool_calls) == 1
    assert result.tool_calls[0].function.name == "parserTool"
    # Verify escaped quotes are preserved
    args = json.loads(result.tool_calls[0].function.arguments)
    assert args["text"] == 'He said "Hello {world}"'
def test_extract_tool_calls_missing_name_key(parser):
    """JSON lacking a "name" key is not a tool call; content passes through."""
    text = '{"parameters": {}}'
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
    """JSON with neither "parameters" nor "arguments" is not a tool call."""
    text = '{"name": "toolWithoutParams"}'
    extraction = parser.extract_tool_calls(text, None)
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    assert extraction.content == text
def test_regex_timeout_handling(parser):
    """A regex TimeoutError downgrades the output to plain content."""
    problematic = "{hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stub regex whose finditer always raises, simulating a timeout.
    timing_out_regex = MagicMock()
    timing_out_regex.finditer.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "tool_call_start_regex", timing_out_regex):
        extraction = parser.extract_tool_calls(problematic, None)
    # should treat as regular text when regex times out
    assert extraction.content == problematic
    assert extraction.tools_called is False
    assert not extraction.tool_calls
    timing_out_regex.finditer.assert_called_once()

View File

@@ -0,0 +1,269 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# Test cases similar to pythonic parser but with Llama4 specific format
# Each *_FUNCTION_OUTPUT is raw model text (pythonic call syntax); the paired
# *_FUNCTION_CALL is the parse the tool parser is expected to produce from it.
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "LA", "metric": "C"}',
)
# Covers str, int, dict, None, bool, and list argument types.
MORE_TYPES_FUNCTION_OUTPUT = (
    "[register_user(name='Doe', "
    "age=9, "
    "address={'city': 'LA', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])]"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "Doe", '
    '"age": 9, '
    '"address": {"city": "LA", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside pythonic string literals.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
# Llama4 wraps pythonic calls in <|python_start|>/<|python_end|> tags.
PYTHON_TAG_FUNCTION_OUTPUT = (
    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>"
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain conversational text passes through without any tool calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"
    extracted_content, extracted_calls = run_tool_extraction(
        parser, plain_text, streaming=streaming
    )
    assert extracted_content == plain_text
    assert not extracted_calls
# Streamed python-tag parallel-call output, built up separately so the
# parametrize table below stays readable.
test_str = "<|python_start|>"
test_str += "[get_weather(city='LA', metric='C'),"
test_str += "register_user(name='Doe', age=9)]"
# (streaming, model_output, expected_tool_calls) triples for test_tool_call.
# Fixes over the previous revision:
#  * "simple_streaming" now uses the simple fixtures (it previously duplicated
#    the escaped-string case, leaving the simple streaming path untested),
#  * the trailing python-tag parallel-call cases have unique ids instead of
#    reusing "parallel_calls_streaming"/"parallel_calls_nonstreaming".
TEST_CASES = [
    pytest.param(
        True,
        SIMPLE_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False, SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], id="simple_nonstreaming"
    ),
    pytest.param(
        True,
        MORE_TYPES_FUNCTION_OUTPUT,
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        MORE_TYPES_FUNCTION_OUTPUT,
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        PARAMETERLESS_FUNCTION_OUTPUT,
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        EMPTY_DICT_FUNCTION_OUTPUT,
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        EMPTY_DICT_FUNCTION_OUTPUT,
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        EMPTY_LIST_FUNCTION_OUTPUT,
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        EMPTY_LIST_FUNCTION_OUTPUT,
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        ESCAPED_STRING_FUNCTION_OUTPUT,
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        ESCAPED_STRING_FUNCTION_OUTPUT,
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    pytest.param(
        True,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="parallel_calls_nonstreaming",
    ),
    pytest.param(
        True,
        PYTHON_TAG_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="python_tag_streaming",
    ),
    pytest.param(
        False,
        PYTHON_TAG_FUNCTION_OUTPUT,
        [SIMPLE_FUNCTION_CALL],
        id="python_tag_nonstreaming",
    ),
    pytest.param(
        True,
        test_str,
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="python_tag_parallel_calls_streaming",
    ),
    pytest.param(
        False,
        "<|python_start|>[get_weather(city='LA', metric='C'), "
        + "register_user(name='Doe', age=9)]",
        [
            SIMPLE_FUNCTION_CALL,
            FunctionCall(name="register_user", arguments='{"name": "Doe", "age": 9}'),
        ],
        id="python_tag_parallel_calls_nonstreaming",
    ),
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Each TEST_CASES entry parses to exactly the expected function calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    _, extracted_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    assert len(extracted_calls) == len(expected_tool_calls)
    for actual_call, expected_call in zip(extracted_calls, expected_tool_calls):
        assert actual_call.type == "function"
        assert actual_call.function == expected_call
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """One huge delta containing several calls still yields all of them."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    deltas = [
        "<|python_start|>[get_weather(city='LA', metric='C'), "
        "get_weather(), "
        "do_something_cool(steps=[])]<|python_end|>",
    ]
    reconstructor = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )
    assert reconstructor.other_content == ""
    expected_functions = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    assert [call.function for call in reconstructor.tool_calls] == expected_functions
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A regex TimeoutError downgrades the output to plain content."""
    parser: ToolParser = ToolParserManager.get_tool_parser("llama4_pythonic")(
        default_tokenizer
    )
    problematic = "hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stub regex whose match always raises, simulating a timeout.
    timing_out_regex = MagicMock()
    timing_out_regex.match.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "TOOL_CALL_REGEX", timing_out_regex):
        extracted_content, extracted_calls = run_tool_extraction(
            parser, problematic, streaming=streaming
        )
    # should treat as regular text when regex times out
    assert extracted_content == problematic
    assert not extracted_calls
    timing_out_regex.match.assert_called_once()

View File

@@ -0,0 +1,251 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
# Each *_FUNCTION_OUTPUT is raw model text (pythonic call syntax); the paired
# *_FUNCTION_CALL is the parse the olmo3 tool parser is expected to produce.
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "San Francisco", "metric": "celsius"}',
)
# Covers str, int, dict, None, bool, and list argument types.
MORE_TYPES_FUNCTION_OUTPUT = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=None, "
    "passed_test=True, "
    "aliases=['John', 'Johnny'])"
)
# Same call, but with JSON-style null/true literals instead of Python's.
MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
    "register_user(name='John Doe', "
    "age=37, "
    "address={'city': 'San Francisco', 'state': 'CA'}, "
    "role=null, "
    "passed_test=true, "
    "aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
    name="register_user",
    arguments='{"name": "John Doe", '
    '"age": 37, '
    '"address": {"city": "San Francisco", "state": "CA"}, '
    '"role": null, '
    '"passed_test": true, '
    '"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
    name="do_something_cool",
    arguments='{"steps": []}',
)
# Escaped quotes inside pythonic string literals.
ESCAPED_STRING_FUNCTION_OUTPUT = (
    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
    name="get_weather",
    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain conversational text passes through without any tool calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"
    extracted_content, extracted_calls = run_tool_extraction(
        parser, plain_text, streaming=streaming
    )
    assert extracted_content == plain_text
    assert not extracted_calls
# (streaming, model_output, expected_tool_calls) triples consumed by
# test_tool_call below.  Every fixture is exercised both streaming and
# non-streaming; olmo3 wraps calls in <function_calls> tags.
TEST_CASES = [
    pytest.param(
        True,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL],
        id="simple_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL],
        id="simple_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_streaming_json_literals",
    ),
    pytest.param(
        False,
        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
        [MORE_TYPES_FUNCTION_CALL],
        id="more_types_nonstreaming_json_literals",
    ),
    pytest.param(
        True,
        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
        [PARAMETERLESS_FUNCTION_CALL],
        id="parameterless_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_DICT_FUNCTION_CALL],
        id="empty_dict_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
        [EMPTY_LIST_FUNCTION_CALL],
        id="empty_list_nonstreaming",
    ),
    pytest.param(
        True,
        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
        [ESCAPED_STRING_FUNCTION_CALL],
        id="escaped_string_nonstreaming",
    ),
    # Parallel calls are newline-separated inside a single tag pair.
    pytest.param(
        True,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_streaming",
    ),
    pytest.param(
        False,
        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
        id="parallel_calls_nonstreaming",
    ),
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Each TEST_CASES entry parses to exactly the expected function calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    extracted_content, extracted_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    # Tool-only outputs leave no residual content.
    assert extracted_content is None
    assert len(extracted_calls) == len(expected_tool_calls)
    for actual_call, expected_call in zip(extracted_calls, expected_tool_calls):
        assert actual_call.type == "function"
        assert actual_call.function == expected_call
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Large deltas containing several calls still yield all of them."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    deltas = [
        "<function_calls>get_weather(city='San",
        " Francisco', metric='celsius')\n"
        f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
        f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
    ]
    reconstructor = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )
    assert reconstructor.other_content == ""
    expected_functions = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    assert [call.function for call in reconstructor.tool_calls] == expected_functions
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A regex TimeoutError downgrades the output to plain content."""
    parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(
        default_tokenizer
    )
    problematic = "hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stub regex whose match always raises, simulating a timeout.
    timing_out_regex = MagicMock()
    timing_out_regex.match.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "TOOL_CALL_REGEX", timing_out_regex):
        extracted_content, extracted_calls = run_tool_extraction(
            parser, problematic, streaming=streaming
        )
    # should treat as regular text when regex times out
    assert extracted_content == problematic
    assert not extracted_calls
    timing_out_regex.match.assert_called_once()

View File

@@ -0,0 +1,359 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import jsonschema
import openai
import pytest
import pytest_asyncio
from rapidfuzz import fuzz
from ....utils import RemoteOpenAIServer
# Model under test; served by the RemoteOpenAIServer fixture below.
MODEL_NAME = "openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def server():
    """Launch a vLLM OpenAI-compatible server with auto tool choice enabled."""
    launch_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "openai",
    ]
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the running vLLM server."""
    async with server.get_async_client() as api_client:
        yield api_client
# ==========================================================
# Tool Definitions
# ==========================================================
# OpenAI-format tool schemas advertised to the model in every test request.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "calculator",
            "description": "Performs basic arithmetic calculations.",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": (
                            "Arithmetic expression to evaluate, e.g. '123 + 456'."
                        ),
                    }
                },
                "required": ["expression"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_time",
            "description": "Retrieves the current local time for a given city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {
                        "type": "string",
                        "description": "City name, e.g. 'New York'.",
                    }
                },
                "required": ["city"],
            },
        },
    },
]
# ==========================================================
# Message Examples
# ==========================================================
# Prompt expected to trigger a single calculator call.
MESSAGES_CALC = [
    {"role": "user", "content": "Calculate 123 + 456 using the calculator."}
]
# Prompt expected to trigger a single get_time call.
MESSAGES_GET_TIME = [
    {"role": "user", "content": "What is the current time in New York?"}
]
# Prompt expected to trigger both tools in one response.
MESSAGES_MULTIPLE_CALLS = [
    {
        "role": "system",
        "content": (
            "You can call multiple tools. "
            "When using more than one, return single JSON object with tool_calls array"
            "containing each tool call with its function name and arguments. "
            "Do not output multiple JSON objects separately."
        ),
    },
    {
        "role": "user",
        "content": "First, calculate 7 * 8 using the calculator. "
        "Then, use get_time to tell me the current time in New York.",
    },
]
# Prompt that should NOT trigger any tool call.
MESSAGES_INVALID_CALL = [
    {
        "role": "user",
        "content": "Can you help with something, "
        "but dont actually perform any calculation?",
    }
]
# Expected outputs
FUNC_CALC = "calculator"
FUNC_ARGS_CALC = '{"expression":"123 + 456"}'
FUNC_TIME = "get_time"
FUNC_ARGS_TIME = '{"city": "New York"}'
# ==========================================================
# Utility to extract reasoning and tool calls
# ==========================================================
def extract_reasoning_and_calls(chunks: list) -> tuple[str, list[str], list[str]]:
    """Accumulate reasoning text and per-call fragments from streaming chunks.

    Returns ``(reasoning_text, arguments_per_call, names_per_call)``, with
    tool calls ordered by their streamed index.
    """
    reasoning_parts: list[str] = []
    calls_by_index: dict[int, dict[str, str]] = {}
    for chunk in chunks:
        delta = getattr(chunk.choices[0], "delta", None)
        if not delta:
            continue
        if getattr(delta, "reasoning_content", None):
            reasoning_parts.append(delta.reasoning_content)
        for tool_call in getattr(delta, "tool_calls", []) or []:
            slot = calls_by_index.setdefault(
                getattr(tool_call, "index", 0), {"name": "", "arguments": ""}
            )
            function = getattr(tool_call, "function", None)
            if function:
                # The name arrives once; arguments arrive as partial fragments.
                if getattr(function, "name", None):
                    slot["name"] = function.name
                if getattr(function, "arguments", None):
                    slot["arguments"] += function.arguments
    ordered = [calls_by_index[index] for index in sorted(calls_by_index)]
    reasoning_text = "".join(reasoning_parts)
    return (
        reasoning_text,
        [entry["arguments"] for entry in ordered],
        [entry["name"] for entry in ordered],
    )
# ==========================================================
# Test Scenarios
# ==========================================================
@pytest.mark.asyncio
async def test_calculator_tool_call_and_argument_accuracy(client: openai.AsyncOpenAI):
    """Verify calculator tool call is made and arguments are accurate."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    message = response.choices[0].message
    calls = getattr(message, "tool_calls", [])
    assert calls, "No tool calls detected"
    calculator_call = next((c for c in calls if c.function.name == FUNC_CALC), None)
    assert calculator_call, "Calculator function not called"
    raw_args = calculator_call.function.arguments
    assert raw_args, "Calculator arguments missing"
    # Both operands must survive into the raw argument payload.
    assert "123" in raw_args and "456" in raw_args, (
        f"Expected values not in raw arguments: {raw_args}"
    )
    try:
        parsed_args = json.loads(raw_args)
    except json.JSONDecodeError:
        pytest.fail(f"Invalid JSON in calculator arguments: {raw_args}")
    expected_expr = "123 + 456"
    actual_expr = parsed_args.get("expression", "")
    # Fuzzy match tolerates cosmetic whitespace differences only.
    similarity = fuzz.ratio(actual_expr, expected_expr)
    assert similarity > 90, (
        f"Expression mismatch: expected '{expected_expr}' "
        f"got '{actual_expr}' (similarity={similarity}%)"
    )
@pytest.mark.asyncio
async def test_streaming_tool_call_get_time_with_reasoning(client: openai.AsyncOpenAI):
    """Verify streamed reasoning and tool call behavior for get_time."""
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_GET_TIME,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )
    collected = [chunk async for chunk in stream]
    reasoning, arguments, function_names = extract_reasoning_and_calls(collected)
    assert FUNC_TIME in function_names, "get_time function not called"
    assert any("New York" in arg for arg in arguments), (
        f"Expected get_time arguments for New York not found in {arguments}"
    )
    assert len(reasoning) > 0, "Expected reasoning content missing"
    relevant_keywords = ("New York", "time", "current")
    assert any(keyword in reasoning for keyword in relevant_keywords), (
        f"Reasoning is not relevant to the request: {reasoning}"
    )
@pytest.mark.asyncio
async def test_streaming_multiple_tools(client: openai.AsyncOpenAI):
    """Test streamed multi-tool response with reasoning.

    The previous version wrapped its assertions in ``try/except
    AssertionError`` and only printed the failure, so this test could never
    actually fail; assertion errors now propagate to pytest.
    """
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
        stream=True,
    )
    chunks = [chunk async for chunk in stream]
    reasoning, arguments, function_names = extract_reasoning_and_calls(chunks)
    assert FUNC_CALC in function_names, (
        f"Calculator tool missing — found {function_names}"
    )
    assert FUNC_TIME in function_names, f"Time tool missing — found {function_names}"
    assert len(reasoning) > 0, "Expected reasoning content in streamed response"
@pytest.mark.asyncio
async def test_invalid_tool_call(client: openai.AsyncOpenAI):
    """Ambiguous instructions that should not trigger a tool produce none."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_INVALID_CALL,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    message = response.choices[0].message
    assert message is not None, "Expected message in response"
    assert hasattr(message, "content"), "Expected 'content' field in message"
    tool_calls = getattr(message, "tool_calls", [])
    assert not tool_calls, (
        f"Model unexpectedly attempted a tool call on invalid input: {tool_calls}"
    )
@pytest.mark.asyncio
async def test_tool_call_with_temperature(client: openai.AsyncOpenAI):
    """Model produces valid tool or text output under non-zero temperature."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_CALC,
        tools=TOOLS,
        temperature=0.7,
        stream=False,
    )
    message = response.choices[0].message
    assert message is not None, "Expected non-empty message in response"
    produced_output = bool(message.tool_calls) or bool(message.content)
    assert produced_output, "Response missing both text and tool calls"
    print(f"\nTool calls: {message.tool_calls}")
    print(f"Text: {message.content}")
@pytest.mark.asyncio
async def test_tool_response_schema_accuracy(client: openai.AsyncOpenAI):
    """Validate that tool call arguments adhere to their declared JSON schema."""
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES_MULTIPLE_CALLS,
        tools=TOOLS,
        temperature=0.0,
    )
    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls, "No tool calls produced"
    # Index the declared parameter schemas by function name once up front.
    schemas_by_name: dict[str, object] = {
        entry["function"].get("name"): entry["function"].get("parameters")
        for entry in TOOLS
        if isinstance(entry.get("function"), dict)
    }
    for tool_call in tool_calls:
        func_name = tool_call.function.name
        parsed_args = json.loads(tool_call.function.arguments)
        schema = schemas_by_name.get(func_name)
        assert schema is not None, f"No matching tool schema found for {func_name}"
        jsonschema.validate(instance=parsed_args, schema=schema)
@pytest.mark.asyncio
async def test_semantic_consistency_with_temperature(client: openai.AsyncOpenAI):
    """Test that temperature variation doesn't cause contradictory reasoning."""
    texts: list[str] = []
    for temperature in (0.0, 0.5, 1.0):
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=MESSAGES_CALC,
            tools=TOOLS,
            temperature=temperature,
        )
        texts.append((response.choices[0].message.content or "").strip())
    # Compare fuzzy similarity between low- and mid-temperature outputs
    low_mid_sim = fuzz.ratio(texts[0], texts[1])
    assert low_mid_sim > 60, (
        f"Semantic drift too large between T=0.0 and T=0.5 ({low_mid_sim}%)"
    )

View File

@@ -0,0 +1,231 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction,
run_tool_extraction_streaming,
)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser, ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "San Francisco", "metric": "celsius"}',
)
MORE_TYPES_FUNCTION_OUTPUT = (
"register_user(name='John Doe', "
"age=37, "
"address={'city': 'San Francisco', 'state': 'CA'}, "
"role=None, "
"passed_test=True, "
"aliases=['John', 'Johnny'])"
)
MORE_TYPES_FUNCTION_CALL = FunctionCall(
name="register_user",
arguments='{"name": "John Doe", '
'"age": 37, '
'"address": {"city": "San Francisco", "state": "CA"}, '
'"role": null, '
'"passed_test": true, '
'"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments="{}",
)
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"steps": []}',
)
ESCAPED_STRING_FUNCTION_OUTPUT = (
r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
)
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike):
    """Plain text containing no tool syntax must pass through as content."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    plain_text = "How can I help you today?"
    content, calls = run_tool_extraction(parser, plain_text, streaming=streaming)
    assert content == plain_text
    assert len(calls) == 0
# (name, model_output, expected_calls) specs; each is expanded below into a
# streaming and a non-streaming pytest.param with the original ids preserved
# (`<name>_streaming` / `<name>_nonstreaming`, streaming first).
_CASE_SPECS = [
    ("simple", f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL]),
    ("more_types", f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL]),
    (
        "parameterless",
        f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
        [PARAMETERLESS_FUNCTION_CALL],
    ),
    ("empty_dict", f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL]),
    ("empty_list", f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL]),
    (
        "escaped_string",
        f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
        [ESCAPED_STRING_FUNCTION_CALL],
    ),
    (
        "parallel_calls",
        f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
    ),
]
TEST_CASES = [
    pytest.param(
        streaming,
        model_output,
        expected_calls,
        id=f"{case_name}_{'streaming' if streaming else 'nonstreaming'}",
    )
    for case_name, model_output, expected_calls in _CASE_SPECS
    for streaming in (True, False)
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
def test_tool_call(
    streaming: bool,
    model_output: str,
    expected_tool_calls: list[FunctionCall],
    default_tokenizer: TokenizerLike,
):
    """Extracted tool calls must exactly match the expected parses."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    content, parsed_calls = run_tool_extraction(
        parser, model_output, streaming=streaming
    )
    # A pure tool-call output carries no text content.
    assert content is None
    assert len(parsed_calls) == len(expected_tool_calls)
    for parsed, expected in zip(parsed_calls, expected_tool_calls):
        assert parsed.type == "function"
        assert parsed.function == expected
def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike):
    """Parsing must still work when one delta completes several whole calls."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    # Two large deltas: the second finishes the first call and carries two
    # more complete calls in a single chunk.
    deltas = [
        "[get_weather(city='San",
        (
            " Francisco', metric='celsius'), "
            + f"{PARAMETERLESS_FUNCTION_OUTPUT}, "
            + f"{EMPTY_LIST_FUNCTION_OUTPUT}]"
        ),
    ]
    reconstructor = run_tool_extraction_streaming(
        parser, deltas, assert_one_tool_per_delta=False
    )
    assert reconstructor.other_content == ""
    assert len(reconstructor.tool_calls) == 3
    expected = [
        SIMPLE_FUNCTION_CALL,
        PARAMETERLESS_FUNCTION_CALL,
        EMPTY_LIST_FUNCTION_CALL,
    ]
    for reconstructed, expected_call in zip(reconstructor.tool_calls, expected):
        assert reconstructed.function == expected_call
@pytest.mark.parametrize("streaming", [False])
def test_regex_timeout_handling(streaming: bool, default_tokenizer: TokenizerLike):
    """A regex TimeoutError must degrade to plain-text output, not crash."""
    parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
        default_tokenizer
    )
    pathological_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
    # Stand-in regex object whose match() always times out.
    timeout_regex = MagicMock()
    timeout_regex.match.side_effect = TimeoutError("Regex timeout")
    with patch.object(parser, "TOOL_CALL_REGEX", timeout_regex):
        content, extracted_calls = run_tool_extraction(
            parser, pathological_input, streaming=streaming
        )
    # On timeout the whole input is treated as ordinary text.
    assert content == pathological_input
    assert len(extracted_calls) == 0
    timeout_regex.match.assert_called_once()

View File

@@ -0,0 +1,167 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaMessage,
ExtractedToolCallInformation,
FunctionCall,
ToolCall,
)
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser
class StreamingToolReconstructor:
    """Accumulates streamed DeltaMessages back into complete tool calls.

    While accumulating, it asserts the streaming protocol invariants: the
    function name and id are emitted exactly once per call, indices grow
    monotonically, and (optionally) each delta carries at most one tool call.
    """

    def __init__(self, assert_one_tool_per_delta: bool = True):
        # Reconstructed tool calls, positioned by their stream `index`.
        self.tool_calls: list[ToolCall] = []
        # Plain-text content accumulated across all deltas.
        self.other_content: str = ""
        self._assert_one_tool_per_delta = assert_one_tool_per_delta

    def append_delta(self, delta: DeltaMessage) -> None:
        """Fold one streamed delta into the reconstruction state."""
        if delta.content is not None:
            self.other_content += delta.content
        else:
            # A delta with no content must carry at least one tool call.
            assert delta.tool_calls, (
                "Streaming results should have either content or tool calls (or both)"
            )
        if self._assert_one_tool_per_delta:
            # Note: This isn't strictly required by the API and may not be
            # possible to adhere to depending on the token space and number of
            # tokens per streamed response from the model, but it is required
            # by tool_use tests, so we enforce it here by default also.
            assert len(delta.tool_calls) < 2, (
                "Streaming should include only one tool call per update."
            )
        for call_delta in delta.tool_calls:
            assert call_delta.type is None or call_delta.type == "function", (
                "Streaming tool calls should only emit function calls. Got "
                f"{call_delta.type}"
            )
            # Either this delta extends the call already at its index, or it
            # starts a new call appended at the end of the list.
            current_tool_call = (
                self.tool_calls[call_delta.index]
                if call_delta.index < len(self.tool_calls)
                else None
            )
            if current_tool_call:
                # Continuation: only argument text may be appended.
                assert not call_delta.function.name, (
                    "Streaming tool calls should emit the full function name "
                    f"exactly once. Got {call_delta.function.name}"
                )
                assert not call_delta.id, (
                    "Streaming tool calls must emit function id only once. Got "
                    f"{call_delta.id}"
                )
                assert call_delta.index == len(self.tool_calls) - 1, (
                    f"Incorrect index for tool delta. Got {call_delta.index}, "
                    f"expected {len(self.tool_calls) - 1}"
                )
                current_tool_call.function.arguments += call_delta.function.arguments
            else:
                # First appearance: id and name are mandatory.
                assert call_delta.id is not None, (
                    "Streaming tool calls must have an id on first appearance"
                )
                assert call_delta.function.name is not None, (
                    "Streaming tool calls must have a function name on first appearance"
                )
                assert call_delta.index == len(self.tool_calls), (
                    f"Incorrect index for tool delta. Got {call_delta.index}, "
                    f"expected {len(self.tool_calls)}"
                )
                self.tool_calls.append(
                    ToolCall(
                        id=call_delta.id,
                        function=FunctionCall(
                            name=call_delta.function.name,
                            arguments=call_delta.function.arguments or "",
                        ),
                    )
                )
def run_tool_extraction(
    tool_parser: ToolParser,
    model_output: str,
    request: ChatCompletionRequest | None = None,
    streaming: bool = False,
    assert_one_tool_per_delta: bool = True,
) -> tuple[str | None, list[ToolCall]]:
    """Extract tool calls via the streaming or one-shot parser path.

    Returns a ``(content, tool_calls)`` pair; empty content maps to None.
    """
    if not streaming:
        extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, request)
        # tools_called must agree with whether any calls were extracted.
        assert extracted.tools_called == bool(extracted.tool_calls)
        return extracted.content, extracted.tool_calls
    reconstructor = run_tool_extraction_streaming(
        tool_parser,
        model_output,
        request,
        assert_one_tool_per_delta=assert_one_tool_per_delta,
    )
    return reconstructor.other_content or None, reconstructor.tool_calls
def run_tool_extraction_nonstreaming(
    tool_parser: ToolParser,
    model_output: str,
    request: ChatCompletionRequest | None = None,
) -> ExtractedToolCallInformation:
    """Run one-shot tool extraction, defaulting to a minimal chat request."""
    if request is None:
        request = ChatCompletionRequest(messages=[], model="test-model")
    return tool_parser.extract_tool_calls(model_output, request)
def split_string_into_token_deltas(tokenizer: TokenizerLike, text: str) -> list[str]:
# Split a string into a series of deltas using the provided tokenizer. Each
# delta will be the string equivalent of a single token.
token_ids = tokenizer.encode(text, add_special_tokens=False)
previously_decoded_text = ""
deltas = []
for i in range(1, len(token_ids) + 1):
current_tokens = token_ids[:i]
current_text = tokenizer.decode(current_tokens)
new_text = current_text[len(previously_decoded_text) :]
previously_decoded_text = current_text
deltas.append(new_text)
return deltas
def run_tool_extraction_streaming(
    tool_parser: ToolParser,
    model_deltas: Iterable[str],
    request: ChatCompletionRequest | None = None,
    assert_one_tool_per_delta: bool = True,
) -> StreamingToolReconstructor:
    """Feed text deltas through the parser's streaming API and reconstruct.

    A bare string is first split into per-token deltas so the parser sees a
    realistic token-by-token stream. Returns the populated reconstructor.
    """
    if isinstance(model_deltas, str):
        model_deltas = split_string_into_token_deltas(
            tool_parser.model_tokenizer, model_deltas
        )
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    reconstructor = StreamingToolReconstructor(
        assert_one_tool_per_delta=assert_one_tool_per_delta
    )
    previous_text = ""
    previous_tokens: list[int] = []
    for delta in model_deltas:
        # Map the text delta back to token ids; tokens absent from the
        # parser's vocab are silently dropped.
        token_delta = [
            tool_parser.vocab.get(token)
            for token in tool_parser.model_tokenizer.tokenize(delta)
            if token in tool_parser.vocab
        ]
        current_text = previous_text + delta
        current_tokens = previous_tokens + token_delta
        delta_message = tool_parser.extract_tool_calls_streaming(
            previous_text,
            current_text,
            delta,
            previous_tokens,
            current_tokens,
            token_delta,
            request,
        )
        # The parser may emit nothing for a delta (e.g. mid-call buffering).
        if delta_message is not None:
            reconstructor.append_delta(delta_message)
        previous_text = current_text
        previous_tokens = current_tokens
    return reconstructor

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import AsyncGenerator
from typing import Any
from vllm.entrypoints.openai.protocol import (
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatCompletionStreamResponse,
ChatMessage,
UsageInfo,
)
async def accumulate_streaming_response(
    stream_generator: AsyncGenerator[str, None],
) -> ChatCompletionResponse:
    """
    Accumulate streaming SSE chunks into a complete ChatCompletionResponse.

    This helper parses the SSE format and builds up the complete response
    by combining all the delta chunks. Malformed JSON chunks are skipped;
    usage is filled with dummy zero values for test purposes.
    """
    accumulated_content = ""
    accumulated_reasoning = None
    accumulated_tool_calls: list[dict[str, Any]] = []
    role = None
    finish_reason = None
    response_id = None
    created = None
    model = None
    index = 0
    async for chunk_str in stream_generator:
        # Skip empty lines and [DONE] marker
        if not chunk_str.strip() or chunk_str.strip() == "data: [DONE]":
            continue
        # Parse SSE format: "data: {json}\n\n"
        if chunk_str.startswith("data: "):
            json_str = chunk_str[6:].strip()
            try:
                chunk_data = json.loads(json_str)
                # print(f"DEBUG: Parsed chunk_data: {chunk_data}")
                chunk = ChatCompletionStreamResponse(**chunk_data)
                # Store metadata from first chunk
                if response_id is None:
                    response_id = chunk.id
                    created = chunk.created
                    model = chunk.model
                # Process each choice in the chunk
                for choice in chunk.choices:
                    if choice.delta.role:
                        role = choice.delta.role
                    if choice.delta.content:
                        accumulated_content += choice.delta.content
                    if choice.delta.reasoning:
                        # Lazily initialize so a response with no reasoning
                        # keeps it as None rather than "".
                        if accumulated_reasoning is None:
                            accumulated_reasoning = ""
                        accumulated_reasoning += choice.delta.reasoning
                    if choice.delta.tool_calls:
                        # Accumulate tool calls
                        for tool_call_delta in choice.delta.tool_calls:
                            # Find or create the tool call at this index
                            while len(accumulated_tool_calls) <= tool_call_delta.index:
                                accumulated_tool_calls.append(
                                    {
                                        "id": None,
                                        "type": "function",
                                        "function": {"name": "", "arguments": ""},
                                    }
                                )
                            if tool_call_delta.id:
                                accumulated_tool_calls[tool_call_delta.index]["id"] = (
                                    tool_call_delta.id
                                )
                            if tool_call_delta.function:
                                # Name and arguments arrive incrementally;
                                # concatenate each piece in order.
                                if tool_call_delta.function.name:
                                    accumulated_tool_calls[tool_call_delta.index][
                                        "function"
                                    ]["name"] += tool_call_delta.function.name
                                if tool_call_delta.function.arguments:
                                    accumulated_tool_calls[tool_call_delta.index][
                                        "function"
                                    ]["arguments"] += tool_call_delta.function.arguments
                    if choice.finish_reason:
                        finish_reason = choice.finish_reason
                    if choice.index is not None:
                        index = choice.index
            except json.JSONDecodeError:
                # Ignore malformed chunks rather than failing the whole
                # accumulation.
                continue
    # Build the final message
    message_kwargs = {
        "role": role or "assistant",
        "content": accumulated_content if accumulated_content else None,
        "reasoning": accumulated_reasoning,
    }
    # Only include tool_calls if there are any
    if accumulated_tool_calls:
        message_kwargs["tool_calls"] = [
            {"id": tc["id"], "type": tc["type"], "function": tc["function"]}
            for tc in accumulated_tool_calls
        ]
    message = ChatMessage(**message_kwargs)
    # Build the final response
    choice = ChatCompletionResponseChoice(
        index=index,
        message=message,
        finish_reason=finish_reason or "stop",
    )
    # Create usage info (with dummy values for tests)
    usage = UsageInfo(
        prompt_tokens=0,
        completion_tokens=0,
        total_tokens=0,
    )
    response = ChatCompletionResponse(
        id=response_id or "chatcmpl-test",
        object="chat.completion",
        created=created or 0,
        model=model or "test-model",
        choices=[choice],
        usage=usage,
    )
    return response
# Field name in an expectation dict -> accessor extracting the comparable
# value from a harmony message. Checked in this declaration order.
_HARMONY_FIELD_ACCESSORS = {
    "role": lambda m: m.author.role,
    "author_name": lambda m: m.author.name,
    "channel": lambda m: m.channel,
    "recipient": lambda m: m.recipient,
    "content": lambda m: m.content[0].text,
    "content_type": lambda m: m.content_type,
    # Tool definitions compare by the list of declared tool names.
    "tool_definitions": lambda m: [
        t.name for t in m.content[0].tools["functions"].tools
    ],
}


def verify_harmony_messages(
    messages: list[Any], expected_messages: list[dict[str, Any]]
):
    """Assert each message matches whichever fields its expectation dict sets."""
    assert len(messages) == len(expected_messages)
    for message, expectation in zip(messages, expected_messages):
        for field, accessor in _HARMONY_FIELD_ACCESSORS.items():
            if field in expectation:
                assert accessor(message) == expectation[field]
def verify_chat_response(
response: ChatCompletionResponse,
content: str | None = None,
reasoning: str | None = None,
tool_calls: list[tuple[str, str]] | None = None,
):
assert len(response.choices) == 1
message = response.choices[0].message
if content is not None:
assert message.content == content
else:
assert not message.content
if reasoning is not None:
assert message.reasoning == reasoning
else:
assert not message.reasoning
if tool_calls:
assert message.tool_calls is not None
assert len(message.tool_calls) == len(tool_calls)
for tc, (expected_name, expected_args) in zip(message.tool_calls, tool_calls):
assert tc.function.name == expected_name
assert tc.function.arguments == expected_args
else:
assert not message.tool_calls

View File

View File

@@ -0,0 +1,90 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
# Skip the whole module on ROCm: per the message below, encoder
# self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
# Small multilingual embedding model used for all tests in this module.
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Token-id prompts mirroring PROMPTS in count.
TOKEN_IDS = [
    # Using ID={0, 1, 2, 3} results in NaN values,
    # so we add this offset of 1000
    [1000],
    [1000, 1001],
    [1000, 1002, 1001],
    [1000, 1003, 1001, 1002],
]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped LLM instance, proxied so it can be freed at teardown."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(llm)
    # Drop the only strong reference, then tear down the distributed env.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
    """encode() accepts per-prompt params, a single shared param, or None."""
    per_prompt_params = [PoolingParams() for _ in PROMPTS]
    # Multiple PoolingParams should be matched with each prompt
    outputs = llm.encode(
        PROMPTS, pooling_params=per_prompt_params, pooling_task="embed"
    )
    assert len(PROMPTS) == len(outputs)
    # A params list shorter than the prompt list must be rejected.
    with pytest.raises(ValueError):
        llm.encode(PROMPTS, pooling_params=per_prompt_params[:3], pooling_task="embed")
    # One PoolingParams instance is broadcast across all prompts.
    outputs = llm.encode(
        PROMPTS, pooling_params=PoolingParams(), pooling_task="embed"
    )
    assert len(PROMPTS) == len(outputs)
    # Omitting params entirely falls back to defaults.
    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
    assert len(PROMPTS) == len(outputs)
def test_right_side_truncation(llm: LLM):
    """Embedding models must truncate from the right (end) of the prompt."""
    assert llm.get_tokenizer().truncation_side == "right"

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
# Skip the whole module on ROCm: per the message below, encoder
# self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
# Server-side context limit; the truncation tests compare against this.
max_model_len = 128
# Long prompt shared by all truncation tests.
# NOTE(review): the name shadows the `input` builtin; kept as-is because the
# tests below reference it by this name.
input = """Immerse yourself in the enchanting chronicle of calculus, a
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics,most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""
@pytest.fixture(scope="module")
def server():
    """Launch a pooling-runner vLLM server capped at max_model_len tokens."""
    args = [
        "--runner",
        "pooling",
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        str(max_model_len),
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped server fixture."""
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
    """Truncation below max_model_len caps usage at the requested size."""
    truncation_size = 10
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }
    response = await client.post(path="embeddings", cast_to=object, body=body)
    assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
    """truncate_prompt_tokens=0 reports zero prompt tokens in usage."""
    truncation_size = 0
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size,
    }
    response = await client.post(path="embeddings", cast_to=object, body=body)
    assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
    """Truncation beyond max_model_len must be rejected with HTTP 400."""
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": max_model_len + 1,
    }
    with pytest.raises(openai.BadRequestError) as err:
        await client.post(path="embeddings", cast_to=object, body=body)
    assert err.value.status_code == 400
    error_details = err.value.response.json()["error"]
    assert error_details["type"] == "BadRequestError"
    # Exact server-side message for an over-sized truncation request.
    expected_message = (
        "truncate_prompt_tokens value is "
        "greater than max_model_len."
        " Please, select a smaller truncation size."
    )
    assert error_details["message"] == expected_message
@pytest.mark.asyncio
async def test_max_truncation_size(client: openai.AsyncOpenAI):
    """truncate_prompt_tokens=-1 truncates to the full max_model_len."""
    body: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": -1,
    }
    response = await client.post(path="embeddings", cast_to=object, body=body)
    assert response["usage"]["prompt_tokens"] == max_model_len

View File

@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
# Sequence-classification model; the score/rerank rejection tests below rely
# on it not having num_labels == 1.
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped classification LLM, proxied so it can be freed at teardown."""
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(llm)
    # Drop the only strong reference, then tear down the distributed env.
    del llm
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
    """Activation defaults on; off must differ, and softmax must recover it."""

    def classify_probs(use_activation):
        # One classify pass with the given activation setting.
        results = llm.classify(
            prompts,
            pooling_params=PoolingParams(use_activation=use_activation),
            use_tqdm=False,
        )
        return torch.tensor([result.outputs.probs for result in results])

    default = classify_probs(None)
    w_activation = classify_probs(True)
    wo_activation = classify_probs(False)
    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
    """The token_classify pooling task must run without raising."""
    llm.encode(prompts, use_tqdm=False, pooling_task="token_classify")
def test_score_api(llm: LLM):
    """score() is rejected for this model (it requires num_labels == 1)."""
    expected_error = "Score API is only enabled for num_labels == 1."
    with pytest.raises(ValueError, match=expected_error):
        llm.score("ping", "pong", use_tqdm=False)

View File

@@ -0,0 +1,293 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
import torch
import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
# Sequence-classification model served for the endpoint tests below.
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
DTYPE = "float32"  # Use float32 to avoid NaN issue
@pytest.fixture(scope="module")
def server():
    """Launch a vLLM server for the classification model (512-token limit)."""
    args = [
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--dtype",
        DTYPE,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_single_input_classification(server: RemoteOpenAIServer, model_name: str):
    """A single string input yields exactly one labeled result."""
    payload = {
        "model": model_name,
        "input": "This product was excellent and exceeded my expectations",
    }
    raw = requests.post(server.url_for("classify"), json=payload)
    raw.raise_for_status()
    parsed = ClassificationResponse.model_validate(raw.json())
    assert parsed.object == "list"
    assert parsed.model == MODEL_NAME
    assert len(parsed.data) == 1
    assert hasattr(parsed.data[0], "label")
    assert hasattr(parsed.data[0], "probs")
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_add_special_tokens_false(server: RemoteOpenAIServer, model_name: str):
    """Classification must accept add_special_tokens=False and respond validly."""
    payload = {"model": model_name, "input": "hello", "add_special_tokens": False}
    raw = requests.post(server.url_for("classify"), json=payload)
    raw.raise_for_status()
    ClassificationResponse.model_validate(raw.json())
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_multiple_inputs_classification(server: RemoteOpenAIServer, model_name: str):
    """A batch of inputs yields one indexed, labeled result per input."""
    batch = [
        "The product arrived on time and works perfectly",
        "I'm very satisfied with my purchase, would buy again",
        "The customer service was helpful and resolved my issue quickly",
        "This product broke after one week, terrible quality",
        "I'm very disappointed with this purchase, complete waste of money",
        "The customer service was rude and unhelpful",
    ]
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": batch},
    )
    parsed = ClassificationResponse.model_validate(raw.json())
    assert len(parsed.data) == len(batch)
    for position, item in enumerate(parsed.data):
        assert item.index == position
        assert hasattr(item, "label")
        assert hasattr(item, "probs")
        assert len(item.probs) == item.num_classes
        assert item.label in ["Default", "Spoiled"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_truncate_prompt_tokens(server: RemoteOpenAIServer, model_name: str):
    """truncate_prompt_tokens caps usage at the requested token count."""
    oversized_text = "hello " * 600
    raw = requests.post(
        server.url_for("classify"),
        json={
            "model": model_name,
            "input": oversized_text,
            "truncate_prompt_tokens": 5,
        },
    )
    raw.raise_for_status()
    parsed = ClassificationResponse.model_validate(raw.json())
    assert len(parsed.data) == 1
    assert parsed.data[0].index == 0
    assert hasattr(parsed.data[0], "probs")
    assert parsed.usage.prompt_tokens == 5
    assert parsed.usage.total_tokens == 5
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_invalid_truncate_prompt_tokens_error(
    server: RemoteOpenAIServer, model_name: str
):
    """A truncation size beyond the server's max-model-len yields HTTP 400."""
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": "test", "truncate_prompt_tokens": 513},
    )
    assert raw.status_code == 400
    assert "truncate_prompt_tokens" in raw.json()["error"]["message"]
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_empty_input_error(server: RemoteOpenAIServer, model_name: str):
    """An empty string input is rejected with a 400 error payload."""
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": ""},
    )
    assert raw.status_code == 400
    assert "error" in raw.json()
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_batch_classification_empty_list(server: RemoteOpenAIServer, model_name: str):
    """An empty input list succeeds and returns an empty data list."""
    raw = requests.post(
        server.url_for("classify"),
        json={"model": model_name, "input": []},
    )
    raw.raise_for_status()
    parsed = ClassificationResponse.model_validate(raw.json())
    assert parsed.object == "list"
    assert isinstance(parsed.data, list)
    assert len(parsed.data) == 0
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer):
    """/invocations must mirror /classify output for the same request."""
    request_args = {
        "model": MODEL_NAME,
        "input": "This product was excellent and exceeded my expectations",
    }
    outputs = {}
    for endpoint in ("classify", "invocations"):
        resp = requests.post(server.url_for(endpoint), json=request_args)
        resp.raise_for_status()
        outputs[endpoint] = resp.json()
    classification_output = outputs["classify"]
    invocation_output = outputs["invocations"]
    assert classification_output.keys() == invocation_output.keys()
    # Probabilities may differ slightly between runs; compare approximately.
    for classify_item, invocation_item in zip(
        classification_output["data"], invocation_output["data"]
    ):
        assert classify_item.keys() == invocation_item.keys()
        assert classify_item["probs"] == pytest.approx(
            invocation_item["probs"], rel=0.01
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
    """Activation defaults on; off must differ, and softmax must recover it."""
    texts = ["This product was excellent and exceeded my expectations"]

    async def classify_probs(use_activation):
        # One classify request with the given activation setting.
        raw = requests.post(
            server.url_for("classify"),
            json={
                "model": model_name,
                "input": texts,
                "use_activation": use_activation,
            },
        )
        payload = raw.json()
        return torch.tensor([item["probs"] for item in payload["data"]])

    default = await classify_probs(None)
    w_activation = await classify_probs(True)
    wo_activation = await classify_probs(False)
    assert torch.allclose(default, w_activation, atol=1e-2), (
        "Default should use activation."
    )
    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
        "wo_activation should not use activation."
    )
    assert torch.allclose(F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_score(server: RemoteOpenAIServer, model_name: str):
    # score api is only enabled for num_labels == 1.
    payload = {"model": model_name, "text_1": "ping", "text_2": "pong"}
    response = requests.post(server.url_for("score"), json=payload)
    assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_rerank(server: RemoteOpenAIServer, model_name: str):
    # rerank api is only enabled for num_labels == 1.
    payload = {"model": model_name, "query": "ping", "documents": ["pong"]}
    response = requests.post(server.url_for("rerank"), json=payload)
    assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str):
    """The pooling endpoint's classify task returns one 2-class vector."""
    payload = {
        "model": model_name,
        "input": "This product was excellent and exceeded my expectations",
        "encoding_format": "float",
        "task": "classify",
    }
    response = requests.post(server.url_for("pooling"), json=payload)
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str):
    """token_classify returns one per-token vector of class scores."""
    payload = {
        "model": model_name,
        "input": ["This product was excellent and exceeded my expectations"],
        "encoding_format": "float",
        "task": "token_classify",
    }
    response = requests.post(server.url_for("pooling"), json=payload)
    poolings = PoolingResponse.model_validate(response.json())
    assert len(poolings.data) == 1
    # This prompt tokenizes to 8 tokens, each scored over 2 classes.
    assert len(poolings.data[0].data) == 8
    assert len(poolings.data[0].data[0]) == 2
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Embedding-style pooling tasks are rejected by this classification model."""
    resp = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": "test",
            "encoding_format": "float",
            "task": task,
        },
    )
    error = resp.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")

View File

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
import requests
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
# Video-classification checkpoint; its text config is overridden below so the
# model exposes a sequence-classification head.
VLM_MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
# At most one video attachment per prompt.
MAXIMUM_VIDEOS = 1
# Small public sample clip used by the video-URL test.
TEST_VIDEO_URL = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
# HF config override forcing the classification architecture onto the text tower.
HF_OVERRIDES = {
    "text_config": {
        "architectures": ["Qwen2_5_VLForSequenceClassification"],
    },
}
@pytest.fixture(scope="module")
def server_vlm_classify():
    """Module-scoped vLLM server running the video-classification model."""
    cli_args = [
        "--runner",
        "pooling",
        "--max-model-len",
        "5000",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]
    server = RemoteOpenAIServer(
        VLM_MODEL_NAME, cli_args, override_hf_configs=HF_OVERRIDES
    )
    with server as running_server:
        yield running_server
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_text_only(
    server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
    """The classify endpoint accepts chat-formatted text-only messages."""
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please classify this text request."},
            ],
        }
    ]
    resp = requests.post(
        server_vlm_classify.url_for("classify"),
        json={"model": model_name, "messages": chat},
    )
    resp.raise_for_status()

    parsed = ClassificationResponse.model_validate(resp.json())
    assert parsed.object == "list"
    assert parsed.model == model_name
    assert len(parsed.data) == 1
    assert len(parsed.data[0].probs) == 2
    # Prompt length is fixed by the model's chat template; update if it changes.
    assert parsed.usage.prompt_tokens == 22
@pytest.mark.parametrize("model_name", [VLM_MODEL_NAME])
def test_classify_accepts_chat_video_url(
    server_vlm_classify: RemoteOpenAIServer, model_name: str
) -> None:
    """The classify endpoint accepts a chat message with a video_url part."""
    chat = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Please classify this video."},
                {"type": "video_url", "video_url": {"url": TEST_VIDEO_URL}},
            ],
        }
    ]
    resp = requests.post(
        server_vlm_classify.url_for("classify"),
        json={"model": model_name, "messages": chat},
    )
    resp.raise_for_status()

    parsed = ClassificationResponse.model_validate(resp.json())
    assert parsed.object == "list"
    assert parsed.model == model_name
    assert len(parsed.data) == 1
    assert len(parsed.data[0].probs) == 2
    # Video frames dominate the token count; tied to TEST_VIDEO_URL's content.
    assert parsed.usage.prompt_tokens == 4807

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import (
MTEB_EMBED_TASKS,
MTEB_EMBED_TOL,
OpenAIClientMtebEncoder,
run_mteb_embed_task,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
# Skip the whole module on ROCm: encoder self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

# Quiet server logs so MTEB progress output stays readable.
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "intfloat/e5-small"
# Reference SentenceTransformers main score for the MTEB embed task set.
MAIN_SCORE = 0.7422994752439667
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM server for the MTEB embedding benchmark."""
    cli_args = [
        "--runner",
        "pooling",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
def test_mteb_embed(server):
    """Compare vLLM's MTEB embed score against the SentenceTransformers baseline."""
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, server.get_client())
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
import torch
import torch.nn.functional as F
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
# Skip the whole module on ROCm: encoder self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"
# Single shared prompt used by the offline-inference tests below.
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
    """Module-scoped offline LLM engine.

    Yields a weakref proxy so pytest's fixture cache does not keep the
    engine alive; the real reference is dropped and cleaned up at teardown.
    """
    engine = LLM(
        model=MODEL_NAME,
        max_num_batched_tokens=32768,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
    )
    yield weakref.proxy(engine)
    del engine
    cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_token_embed(llm: LLM):
    """token_embed returns one 384-d vector per input token (11 tokens here)."""
    (result,) = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
    assert result.outputs.data.shape == (11, 384)
def test_pooling_params(llm: LLM):
    """PoolingParams.normalize: None defaults to normalized; False disables it."""

    def embed_with(normalize):
        # Run the shared prompts through llm.embed with the given normalize flag.
        params = PoolingParams(normalize=normalize)
        results = llm.embed(prompts, pooling_params=params, use_tqdm=False)
        return torch.tensor([r.outputs.embedding for r in results])

    default = embed_with(normalize=None)
    w_normal = embed_with(normalize=True)
    wo_normal = embed_with(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    # Unnormalized output, L2-normalized after the fact, must match.
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )

View File

@@ -0,0 +1,680 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import numpy as np
import openai
import pytest
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F
from tests.models.language.pooling.embed_utils import run_embedding_correctness_test
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.platforms import current_platform
from vllm.tokenizers import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
ENDIANNESS,
MetadataItem,
binary2tensor,
build_metadata_items,
decode_pooling_output,
)
# Skip the whole module on ROCm: encoder self-attention is not implemented there.
if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )

MODEL_NAME = "intfloat/multilingual-e5-small"
# Minimal chat template so chat-based embedding requests render deterministically.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
# Half precision for speed and memory savings in CI.
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
    """Module-scoped embedding server (bf16, eager, short context, dummy template)."""
    cli_args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as srv:
        yield srv
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module server."""
    async with server.get_async_client() as c:
        yield c
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """SentenceTransformers reference model used for correctness checks."""
    with hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True) as model:
        yield model
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Single-input embedding: dimension, usage accounting, and HF parity."""
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    # test single embedding
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    # e5-small embedding dimension.
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 11
    assert embeddings.usage.total_tokens == 11

    # Values must match the SentenceTransformers reference model.
    vllm_outputs = [d.embedding for d in embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # Token-id input is counted verbatim: 5 ids -> 5 prompt tokens.
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str):
    """Batch embedding for both list[str] and list[list[int]] inputs."""
    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 33
    assert embeddings.usage.total_tokens == 33

    # Values must match the SentenceTransformers reference model.
    vllm_outputs = [d.embedding for d in embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # 5 + 3 + 5 + 4 = 17 ids across the batch.
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Embedding chat messages must equal embedding the locally rendered template."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # Chat route: the server applies the (dummy) chat template itself.
    chat_response = requests.post(
        server.url_for("v1/embeddings"),
        json={
            "model": model_name,
            "messages": messages,
            "encoding_format": "float",
        },
    )
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Render the same template locally and embed the resulting plain prompt.
    tokenizer = get_tokenizer(tokenizer_name=model_name)
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created
    # Responses must be identical apart from request id and creation timestamp.
    assert chat_embeddings.model_dump(exclude={"id", "created"}) == (
        completion_embeddings.model_dump(exclude={"id", "created"})
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """float, base64, and default encodings must all match the HF reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    # base64 payloads decode to little-endian float32 buffers.
    responses_base64 = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    base64_data = []
    for data in responses_base64.data:
        base64_data.append(
            np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()
        )
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_data = [d.embedding for d in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_base64_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """base64 output decodes correctly for every embed_dtype x endianness combo."""
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    # Float response is the reference for all encoded variants.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_base64 = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "base64",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )

            # Decode with the same dtype/endianness the server was asked for.
            base64_data = []
            for data in responses_base64.json()["data"]:
                binary = base64.b64decode(data["embedding"])
                tensor = binary2tensor(binary, (-1,), embed_dtype, endianness)
                base64_data.append(tensor.to(torch.float32).tolist())

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=base64_data,
                name_0="float_data",
                name_1="base64_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """encoding_format="bytes": decode via the metadata header, compare to float.

    Exercises every embed_dtype x endianness combination and checks the
    decoded tensors against the plain float response.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ]

    # Float response is the reference for all encoded variants.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]

    # Iterate the dict directly (dropped the redundant `list(...keys())`),
    # matching the base64 variant of this test.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )

            # Per-item tensor layout is described by the "metadata" header;
            # the raw tensors are concatenated in the response body.
            metadata = json.loads(responses_bytes.headers["metadata"])
            body = responses_bytes.content
            items = [MetadataItem(**x) for x in metadata["data"]]
            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_bytes_only_embed_dtype_and_endianness(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """encoding_format="bytes_only": no metadata header; caller rebuilds it.

    The client must know dtype, endianness, embedding size, and request count
    up front to decode the body.
    """
    input_texts = [
        "The best thing about vLLM is that it supports many different models",
    ] * 2

    # Float response is the reference for all encoded variants.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [d.embedding for d in responses_float.data]
    embedding_size = len(float_data[0])

    # Iterate the dict directly (dropped the redundant `list(...keys())`),
    # matching the base64 variant of this test.
    for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE:
        for endianness in ENDIANNESS:
            responses_bytes = requests.post(
                server.url_for("/v1/embeddings"),
                json={
                    "model": model_name,
                    "input": input_texts,
                    "encoding_format": "bytes_only",
                    "embed_dtype": embed_dtype,
                    "endianness": endianness,
                },
            )
            # bytes_only mode must NOT send the metadata header.
            assert "metadata" not in responses_bytes.headers

            # Rebuild the metadata client-side from known request parameters.
            body = responses_bytes.content
            items = build_metadata_items(
                embed_dtype=embed_dtype,
                endianness=endianness,
                shape=(embedding_size,),
                n_request=len(input_texts),
            )
            bytes_data = decode_pooling_output(items=items, body=body)
            bytes_data = [x.to(torch.float32).tolist() for x in bytes_data]

            check_embeddings_close(
                embeddings_0_lst=float_data,
                embeddings_1_lst=bytes_data,
                name_0="float_data",
                name_1="bytes_data",
                tol=1e-2,
            )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"])
async def test_params_not_supported(
    server: RemoteOpenAIServer, model_name: str, param_name: str
):
    """Invalid literal values for enum-like params yield a 400 with details."""
    bad_value = f"bad_{param_name}"
    resp = requests.post(
        server.url_for("/v1/embeddings"),
        json={
            "model": model_name,
            "input": [
                "The best thing about vLLM is that it supports many different models",
            ],
            "encoding_format": "base64",
            # Overrides encoding_format when param_name == "encoding_format".
            param_name: bad_value,
        },
    )
    assert resp.status_code == 400
    message = resp.json()["error"]["message"]
    assert "literal_error" in message
    assert bad_value in message
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI, model_name: str):
    """truncate_prompt_tokens caps usage at 10 for both text and token-id input."""
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # test single embedding
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    # Prompt is longer than 10 tokens, so usage reflects the truncated length.
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10

    # 21 pre-tokenized ids; truncation must apply to token-id input as well.
    input_tokens = [
        1,
        24428,
        289,
        18341,
        26165,
        285,
        19323,
        283,
        289,
        26789,
        3871,
        28728,
        9901,
        340,
        2229,
        385,
        340,
        315,
        28741,
        28804,
        2,
    ]
    embedding_response = await client.embeddings.create(
        model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}
    )
    embeddings = EmbeddingResponse.model_validate(
        embedding_response.model_dump(mode="json")
    )

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """Requesting truncation beyond max_model_len must raise BadRequestError."""
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # The original asserts lived inside the `with pytest.raises(...)` block,
    # *after* the raising call, so they were unreachable dead code. Capture
    # the exception and assert on it instead.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )
    assert (
        "truncate_prompt_tokens value is greater than max_model_len. "
        "Please, select a smaller truncation size." in str(exc_info.value)
    )
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The generic /invocations route must mirror /v1/embeddings output."""
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    request_args = {
        "model": MODEL_NAME,
        "input": input_texts,
        "encoding_format": "float",
    }

    # Same payload through both routes.
    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
    for completion_data, invocation_data in zip(
        completion_output["data"], invocation_output["data"]
    ):
        assert completion_data.keys() == invocation_data.keys()
        # Embedding values must agree within tolerance across routes.
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """/invocations must mirror /v1/embeddings for chat-message payloads too."""
    messages = [
        {
            "role": "user",
            "content": "The cat sat on the mat.",
        },
        {
            "role": "assistant",
            "content": "A feline was resting on a rug.",
        },
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    request_args = {
        "model": MODEL_NAME,
        "messages": messages,
        "encoding_format": "float",
    }

    # Same payload through both routes.
    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
    for chat_data, invocation_data in zip(
        chat_output["data"], invocation_output["data"]
    ):
        assert chat_data.keys() == invocation_data.keys()
        # Embedding values must agree within tolerance across routes.
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """`normalize` request param: default == normalized; False disables it."""
    input_text = ["The chef prepared a delicious meal."]

    # `requests.post` is blocking, so the original `async def` wrapper added
    # nothing (there was no await point inside); a plain function is clearer
    # and behaves identically.
    def get_outputs(normalize):
        request_args = {
            "model": MODEL_NAME,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }
        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        outputs = response.json()
        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = get_outputs(normalize=None)
    w_normal = get_outputs(normalize=True)
    wo_normal = get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
        "wo_normal should not use normal."
    )
    # Unnormalized output, L2-normalized after the fact, must match.
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str):
    """Pooling with task=embed returns one 384-d vector per input."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "embed",
    }
    resp = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(resp.json())
    assert len(poolings.data) == 1
    assert len(poolings.data[0].data) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str):
    """Pooling with task=token_embed returns a 384-d vector per input token."""
    payload = {
        "model": model_name,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
        "task": "token_embed",
    }
    resp = requests.post(server.url_for("pooling"), json=payload)

    poolings = PoolingResponse.model_validate(resp.json())
    assert len(poolings.data) == 1
    # One row per input token (11 tokens) ...
    assert len(poolings.data[0].data) == 11
    # ... each a 384-d embedding.
    assert len(poolings.data[0].data[0]) == 384
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"])
async def test_pooling_not_supported(
    server: RemoteOpenAIServer, model_name: str, task: str
):
    """Classification-style pooling tasks are rejected by this embedding model."""
    resp = requests.post(
        server.url_for("pooling"),
        json={
            "model": model_name,
            "input": "test",
            "encoding_format": "float",
            "task": task,
        },
    )
    error = resp.json()["error"]
    assert error["type"] == "BadRequestError"
    assert error["message"].startswith(f"Task {task} is not supported")

Some files were not shown because too many files have changed in this diff Show More