Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/models/language/generation/__init__.py
+++ b/tests/models/language/generation/__init__.py
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -0,0 +1,185 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+from ....utils import large_gpu_mark
+from ...registry import HF_EXAMPLE_MODELS
+from ...utils import check_logprobs_close
+
+# Models that currently use AITER kernels.
+# `test_models` skips its AITER-enabled variant for every model that is
+# not listed here. When more AITER kernels are added, this list will no
+# longer be needed, as all models will be calling AITER kernels
+# in parts of their operators.
+AITER_MODEL_LIST = [
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "openbmb/MiniCPM3-4B",
+    "Qwen/Qwen-7B-Chat",
+    "Qwen/Qwen2.5-0.5B-Instruct",
+    "TitanML/tiny-mixtral",
+    "Qwen/Qwen3-8B",
+]
+
+
+# @maybe_test_rocm_aiter
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "bigscience/bloom-560m",  # bloom - testing alibi slopes
+            marks=[
+                pytest.mark.core_model,
+                pytest.mark.slow_test,
+                pytest.mark.cpu_model,
+            ],
+        ),
+        pytest.param(
+            "openai-community/gpt2",  # gpt2
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+        pytest.param("Milos/slovak-gpt-j-405M"),  # gptj
+        pytest.param("bigcode/tiny_starcoder_py"),  # gpt_bigcode
+        pytest.param("EleutherAI/pythia-70m"),  # gpt_neox
+        pytest.param(
+            "google/gemma-1.1-2b-it",  # gemma
+            marks=[
+                pytest.mark.core_model,
+                pytest.mark.cpu_model,
+                pytest.mark.slow_test,
+            ],
+        ),
+        pytest.param(
+            "google/gemma-2-2b-it",  # test hybrid attention
+            marks=[pytest.mark.cpu_model],
+        ),
+        pytest.param(
+            "zai-org/chatglm3-6b",  # chatglm (text-only)
+        ),
+        pytest.param(
+            "meta-llama/Llama-3.2-1B-Instruct",  # llama
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+        pytest.param(
+            "openbmb/MiniCPM3-4B",
+            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
+        ),
+        pytest.param(
+            "facebook/opt-125m",  # opt
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+        pytest.param(
+            "microsoft/phi-2",  # phi
+            marks=[pytest.mark.core_model, pytest.mark.slow_test],
+        ),
+        pytest.param(
+            "Qwen/Qwen-7B-Chat",  # qwen (text-only)
+        ),
+        pytest.param(
+            "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+            marks=[
+                pytest.mark.core_model,
+                pytest.mark.cpu_model,
+                pytest.mark.slow_test,
+            ],
+        ),
+        pytest.param(
+            "Qwen/Qwen3-8B",  # qwen (text-only)
+        ),
+        pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
+        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
+        pytest.param(
+            "TitanML/tiny-mixtral",  # mixtral
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
+        pytest.param("swiss-ai/Apertus-8B-Instruct-2509"),  # apertus
+    ],
+)
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
+)
+@pytest.mark.parametrize("use_prompt_embeds", [True, False])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+    use_rocm_aiter: bool,
+    use_prompt_embeds: bool,
+    monkeypatch,
+) -> None:
+    """Check that vLLM's greedy logprobs stay close to the HF reference.
+
+    The example prompts are generated with the HF runner and with vLLM, and
+    the two results are compared with `check_logprobs_close`. When
+    `use_prompt_embeds` is set, vLLM is additionally driven with prompt
+    embeddings computed by the HF model, and that output is compared with
+    vLLM's own text-prompt output.
+
+    When `use_rocm_aiter` is set, models outside `AITER_MODEL_LIST` are
+    skipped and `VLLM_ROCM_USE_AITER=1` is exported for the others.
+    """
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
+    if use_rocm_aiter:
+        if model not in AITER_MODEL_LIST:
+            # Skip models that are not using AITER kernels.
+            # When more AITER kernels are added, this list will not be
+            # needed as all the models will be calling AITER kernels
+            # in parts of the operators
+            pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
+
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+        prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
+
+        if prompt_embeds is not None:
+            # Tokenize and embed only when the embeddings are actually used;
+            # the embedding layer and device are looked up once.
+            embed_tokens = hf_model.model.get_input_embeddings()
+            device = hf_model.model.device
+            for prompt in example_prompts:
+                token_ids = hf_model.tokenizer(
+                    prompt, return_tensors="pt"
+                ).input_ids.to(device)
+                prompt_embeds.append(embed_tokens(token_ids).squeeze(0))
+
+    with vllm_runner(
+        model,
+        tokenizer_name=model_info.tokenizer or model,
+        tokenizer_mode=model_info.tokenizer_mode,
+        trust_remote_code=model_info.trust_remote_code,
+        max_num_seqs=2,
+        enable_prompt_embeds=use_prompt_embeds,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+        if prompt_embeds is not None:
+            vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
+                prompt_embeds, max_tokens, num_logprobs
+            )
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+    if prompt_embeds is not None:
+        check_logprobs_close(
+            outputs_0_lst=vllm_outputs,
+            outputs_1_lst=vllm_outputs_from_embeds,
+            name_0="vllm",
+            name_1="vllm_from_embeds",
+        )
+
+    if use_rocm_aiter:
+        # this is to ensure that vllm engine
+        # has deallocated the memory before running the next
+        # unit tests. On ROCm, when using AITER
+        # the memory might not be deallocated completely
+        # before running the next test case
+        torch.cuda.synchronize()
--- a/tests/models/language/generation/test_gemma.py
+++ b/tests/models/language/generation/test_gemma.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import numpy as np
+import pytest
+
+# Gemma checkpoints that parametrize test_dummy_loader.
+MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
+    """Check the normalizer of a Gemma model started with dummy weights.
+
+    The model is created with `load_format="dummy"`; the normalizer values
+    returned by `collective_rpc` must be close to `hidden_size ** 0.5`
+    (relative tolerance 2e-3).
+    """
+    with monkeypatch.context() as m:
+        # NOTE(review): presumably needed so the lambdas below can be sent
+        # through collective_rpc -- confirm.
+        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        with vllm_runner(
+            model,
+            load_format="dummy",
+        ) as llm:
+            if model == "google/gemma-3-4b-it":
+                # For this checkpoint the normalizer is read from under
+                # `language_model` and the size from `hf_config.text_config`.
+                normalizers = llm.llm.collective_rpc(
+                    lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item()  # noqa: E501
+                )
+                config = llm.llm.llm_engine.model_config.hf_config.text_config
+            else:
+                normalizers = llm.llm.collective_rpc(
+                    lambda self: self.model_runner.model.model.normalizer.cpu().item()
+                )
+                config = llm.llm.llm_engine.model_config.hf_config
+            assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from ...utils import check_logprobs_close
+
+# Checkpoints that parametrize test_models in this module.
+MODELS = [
+    # TODO(sang): Sliding window should be tested separately.
+    "ibm/PowerLM-3b",
+    "ibm/PowerMoE-3b",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check vLLM greedy logprobs against the HF reference for `model`.
+
+    Both runners are created with the same `dtype` and generate from the
+    same prompts; the results are compared with `check_logprobs_close`.
+    """
+    generation_args = (example_prompts, max_tokens, num_logprobs)
+
+    # Reference run with the HF implementation
+    with hf_runner(model, dtype=dtype) as reference:
+        reference_outputs = reference.generate_greedy_logprobs_limit(*generation_args)
+
+    # Same prompts through vLLM
+    with vllm_runner(model, dtype=dtype) as engine:
+        engine_outputs = engine.generate_greedy_logprobs(*generation_args)
+
+    check_logprobs_close(
+        outputs_0_lst=reference_outputs,
+        outputs_1_lst=engine_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -0,0 +1,758 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Callable
+
+import pytest
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+from tests.utils import multi_gpu_test
+from vllm.engine.arg_utils import EngineArgs
+from vllm.sampling_params import SamplingParams
+
+from ...utils import check_logprobs_close, check_outputs_equal
+
+# Mark all tests as hybrid
+pytestmark = pytest.mark.hybrid_model
+
+# NOTE: The first model in each list is taken as the primary model,
+# meaning that it will be used in all tests in this file
+# The rest of the models will only be tested by test_models
+
+# Number of times a prompt is repeated to build the long prompts used by
+# the test_apc_* tests.
+APC_MULTIPLY_BY = 300
+
+# Tests below pick entries by index (SSM_MODELS[0]), so order matters.
+SSM_MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "tiiuae/falcon-mamba-tiny-dev",
+    # mamba2-codestral in transformers is broken pending:
+    # https://github.com/huggingface/transformers/pull/40861
+    # "yujiepan/mamba2-codestral-v0.1-tiny-random",
+]
+
+# Tests below pick entries by index (HYBRID_MODELS[0] everywhere, and
+# HYBRID_MODELS[3] in the test_apc_* tests), so order matters.
+HYBRID_MODELS = [
+    "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
+    "Zyphra/Zamba2-1.2B-instruct",
+    "hmellor/tiny-random-BambaForCausalLM",
+    "ibm-granite/granite-4.0-tiny-preview",
+    "tiiuae/Falcon-H1-0.5B-Base",
+    "LiquidAI/LFM2-1.2B",
+    "tiny-random/qwen3-next-moe",
+]
+
+# Models that parametrize test_full_cuda_graph.
+FULL_CUDA_GRAPH_MODELS = [
+    "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
+    "Zyphra/Zamba2-1.2B-instruct",
+]
+
+# Models that parametrize test_fp32_cache_state.
+FP32_STATE_MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "Zyphra/Zamba2-1.2B-instruct",
+]
+
+# Avoid OOM
+MAX_NUM_SEQS = 4
+
+
+@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check vLLM greedy logprobs against the HF reference for `model`.
+
+    vLLM runs with `max_num_seqs=MAX_NUM_SEQS`; the two results are compared
+    with `check_logprobs_close`.
+    """
+    try:
+        registry_entry = HF_EXAMPLE_MODELS.find_hf_info(model)
+        registry_entry.check_available_online(on_fail="skip")
+        registry_entry.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    generation_args = (example_prompts, max_tokens, num_logprobs)
+
+    # Reference run with the HF implementation
+    with hf_runner(model) as reference:
+        reference_outputs = reference.generate_greedy_logprobs_limit(*generation_args)
+
+    # Same prompts through vLLM
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as engine:
+        engine_outputs = engine.generate_greedy_logprobs(*generation_args)
+
+    check_logprobs_close(
+        outputs_0_lst=reference_outputs,
+        outputs_1_lst=engine_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_batching(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check that prompts give close logprobs one at a time and batched.
+
+    The same vLLM engine first generates each prompt in its own call and
+    then all prompts in a single call; both results are compared with
+    `check_logprobs_close`.
+    """
+    try:
+        registry_entry = HF_EXAMPLE_MODELS.find_hf_info(model)
+        registry_entry.check_available_online(on_fail="skip")
+        registry_entry.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as engine:
+        one_by_one = []
+        for prompt in example_prompts:
+            # A single-prompt call must yield exactly one output
+            (output,) = engine.generate_greedy_logprobs(
+                [prompt], max_tokens, num_logprobs
+            )
+            one_by_one.append(output)
+
+        all_at_once = engine.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=one_by_one,
+        outputs_1_lst=all_at_once,
+        name_0="for_loop_vllm",
+        name_1="batched_vllm",
+    )
+
+
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_chunked_prefill_with_parallel_sampling(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+) -> None:
+    """
+    Smoke test for chunked prefill combined with n > 1 sampling.
+
+    Here prefill batches also carry decoding tokens, and the test only
+    checks that generation completes without raising.
+
+    A failure may indicate that the cache is not allocated correctly for
+    n > 1 decoding steps inside a chunked prefill forward pass
+    (where prefill and decode run together).
+    """
+    params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
+    runner_kwargs = dict(
+        enable_chunked_prefill=True,
+        # forces prefill chunks with decoding
+        max_num_batched_tokens=MAX_NUM_SEQS * 3,
+        max_num_seqs=MAX_NUM_SEQS,
+    )
+    with vllm_runner(model, **runner_kwargs) as engine:
+        engine.generate(example_prompts, params)
+
+
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [20])
+def test_mamba_cache_cg_padding(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+) -> None:
+    """
+    This test is for verifying that mamba cache is padded to CG captured
+    batch size. If it's not, a torch RuntimeError will be raised because
+    tensor dimensions aren't compatible.
+
+    The prompt list is grown until its length differs from
+    `pad_for_cudagraph(len)`, i.e. until the batch needs padding.
+    """
+    vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
+    # Work on a copy so the `example_prompts` fixture list is not mutated.
+    prompts = list(example_prompts)
+    while len(prompts) == vllm_config.pad_for_cudagraph(len(prompts)):
+        prompts.append(prompts[0])
+
+    try:
+        with vllm_runner(model) as vllm_model:
+            vllm_model.generate_greedy(prompts, max_tokens)
+    except RuntimeError:
+        pytest.fail(
+            "Couldn't run batch size which is not equal to a Cuda Graph "
+            "captured batch size. "
+            "Could be related to mamba cache not padded correctly"
+        )
+
+
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
+    vllm_runner,
+    example_prompts,
+    model: str,
+) -> None:
+    """
+    This test is for verifying that the hybrid inner state management doesn't
+    collapse in case where the number of incoming requests and
+    finished_requests_ids is larger than the maximum mamba block capacity.
+
+    This could generally happen due to the fact that hybrid does support
+    statelessness mechanism where it can clean up new incoming requests in
+    a single step.
+
+    The first example prompt is submitted 100 times in one call with
+    `max_num_seqs=MAX_NUM_SEQS`; a ValueError is reported as a failure.
+    """
+    try:
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
+    except ValueError:
+        # The fragments are concatenated, so each one needs its own
+        # separating space.
+        pytest.fail(
+            "Hybrid inner state wasn't cleaned up properly between "
+            "steps, finished requests registered unnecessarily"
+        )
+
+
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+def test_state_cleanup(
+    vllm_runner,
+    example_prompts,
+    model: str,
+) -> None:
+    """
+    Verify that the Hybrid state is cleaned up between steps.
+
+    One engine serves ten rounds of 100 copies of the first example prompt,
+    one generated token each. A ValueError is reported as a test failure.
+    """
+    first_prompt = example_prompts[0]
+    try:
+        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as engine:
+            for _round in range(10):
+                engine.generate_greedy([first_prompt] * 100, 1)
+    except ValueError:
+        pytest.fail(
+            "Hybrid inner state wasn't cleaned up between states, "
+            "could be related to finished_requests_ids"
+        )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_distributed_correctness(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check that tensor_parallel_size 1 and 2 give close greedy logprobs."""
+    outputs_by_tp = {}
+    # Run tensor_parallel_size=1 first, then 2, each in its own engine
+    for tp_size in (1, 2):
+        with vllm_runner(
+            model, tensor_parallel_size=tp_size, max_num_seqs=MAX_NUM_SEQS
+        ) as engine:
+            outputs_by_tp[tp_size] = engine.generate_greedy_logprobs(
+                example_prompts, max_tokens, num_logprobs
+            )
+
+    check_logprobs_close(
+        outputs_0_lst=outputs_by_tp[1],
+        outputs_1_lst=outputs_by_tp[2],
+        name_0="vllm_tp_1",
+        name_1="vllm_tp_2",
+    )
+
+
+@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS)
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_full_cuda_graph(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check vLLM greedy logprobs against HF for FULL_CUDA_GRAPH_MODELS.
+
+    NOTE(review): this block passes no CUDA-graph-specific option to
+    `vllm_runner`; presumably the mode under test comes from the engine
+    defaults -- confirm.
+    """
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", FP32_STATE_MODELS)
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize(
+    "cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
+)
+def test_fp32_cache_state(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    num_logprobs: int,
+    cache_dtype_param: str,
+) -> None:
+    """Check vLLM against HF when a mamba cache dtype is set to float32.
+
+    `cache_dtype_param` names the `vllm_runner` keyword argument that
+    receives the value "float32".
+    """
+    try:
+        registry_entry = HF_EXAMPLE_MODELS.find_hf_info(model)
+        registry_entry.check_available_online(on_fail="skip")
+        registry_entry.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    generation_args = (example_prompts, max_tokens, num_logprobs)
+
+    with hf_runner(model) as reference:
+        reference_outputs = reference.generate_greedy_logprobs_limit(*generation_args)
+
+    # The parametrized keyword selects which cache dtype option is set
+    cache_kwargs = {cache_dtype_param: "float32"}
+    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, **cache_kwargs) as engine:
+        engine_outputs = engine.generate_greedy_logprobs(*generation_args)
+
+    check_logprobs_close(
+        outputs_0_lst=reference_outputs,
+        outputs_1_lst=engine_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+# Helper functions for the APC tests
+def _get_vllm_runner_params(
+    model: str,
+    max_model_len: int,
+    tensor_parallel_size: int = 1,
+):
+    """Build the base `vllm_runner` keyword arguments for the APC tests.
+
+    Chunked prefill is enabled, prefix caching is disabled and
+    `gpu_memory_utilization` is 0.4. A new dict is returned on every call,
+    so callers can modify it freely.
+    """
+    runner_params = dict(
+        model_name=model,
+        enable_chunked_prefill=True,
+        enable_prefix_caching=False,
+        max_model_len=max_model_len,
+        tensor_parallel_size=tensor_parallel_size,
+        gpu_memory_utilization=0.4,
+    )
+    return runner_params
+
+
+def _get_vLLM_output(
+    vllm_runner,
+    kwargs,
+    prompts,
+    max_tokens,
+    num_logprobs,
+    num_repetitions=1,
+    vllm_model=None,
+):
+    """Generate from `prompts` `num_repetitions` times on one vLLM model.
+
+    If `vllm_model` is None, a model is created with `vllm_runner(**kwargs)`;
+    this helper does not close it. A negative `num_logprobs` selects
+    `generate_greedy`, otherwise `generate_greedy_logprobs` is used.
+
+    Returns a tuple of (list with one output per repetition, model used).
+    """
+    if vllm_model is None:
+        vllm_model = vllm_runner(**kwargs)
+
+    def run_once():
+        # Negative num_logprobs means plain greedy generation
+        if num_logprobs < 0:
+            return vllm_model.generate_greedy(prompts, max_tokens)
+        return vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs)
+
+    return [run_once() for _ in range(num_repetitions)], vllm_model
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_single_prompt(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    """Compare outputs with and without prefix caching for one long prompt.
+
+    The first example prompt, repeated APC_MULTIPLY_BY times, is generated
+    once with `enable_prefix_caching=False` and then `n_repetitions` times
+    on a second runner with it enabled. Every cached repetition is compared
+    against the uncached output.
+    """
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    # NOTE(review): the helper `_get_vLLM_output` switches to plain greedy
+    # generation on `num_logprobs < 0`, while this selection uses `> 0`;
+    # the two disagree for num_logprobs == 0 -- confirm that is intended.
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts.
+    generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]]
+
+    # The limit is derived from len(prompt), not from a token count
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+    # Baseline: prefix caching is still disabled in the kwargs here
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    vllm_outputs_cache_rep, _ = _get_vLLM_output(
+        vllm_runner,
+        vllm_runner_kwargs,
+        generated_prompts,
+        max_tokens,
+        num_logprobs,
+        n_repetitions,
+    )
+
+    for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+        # In the first repetition, the caches are filled
+        # In the second repetition, these caches are reused
+
+        compare_operator(
+            outputs_0_lst=vllm_outputs_no_cache[0],
+            outputs_1_lst=vllm_outputs_cache_itn,
+            name_0="vllm_no_cache",
+            name_1=f"vllm_cache_it_{r_idx + 1}",
+        )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_single_prompt_block_align_alignment(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    """Compare cached and uncached outputs for several batch-token limits.
+
+    A single long prompt is generated once without prefix caching. Then,
+    for each of several `max_num_batched_tokens` values placed around
+    10 * mamba_block_size, it is generated `n_repetitions` times with
+    prefix caching enabled and compared against the uncached output.
+    """
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts. This custom prompt is used, as it causes the most issues
+    generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY]
+
+    # The limit is derived from the character length of the prompt
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    # Baseline: prefix caching is still disabled in the kwargs here
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    # This runner is opened only to read the configured block size
+    with vllm_runner(**vllm_runner_kwargs) as vllm_model:
+        # Retrieve the default mamba state block size
+        mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
+
+    # In case the hybrid model does not have the
+    # "mamba_block_size" assume a fixed constant
+    if mamba_block_size is None:
+        mamba_block_size = 512
+
+    mamba_block_size_multiplier = 10
+    # Each offset shifts max_num_batched_tokens away from an exact multiple
+    # of the block size; a new runner is created per offset.
+    for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
+        vllm_runner_kwargs["max_num_batched_tokens"] = (
+            mamba_block_size_multiplier * mamba_block_size - offsets
+        )
+        vllm_outputs_cache_rep, _ = _get_vLLM_output(
+            vllm_runner,
+            vllm_runner_kwargs,
+            generated_prompts,
+            max_tokens,
+            num_logprobs,
+            n_repetitions,
+        )
+
+        # Check alignment of the output logits when using APC
+        for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+            # In the first repetition, the caches are filled
+            # In the second repetition, these caches are reused
+
+            compare_operator(
+                outputs_0_lst=vllm_outputs_no_cache[0],
+                outputs_1_lst=vllm_outputs_cache_itn,
+                name_0="vllm_no_cache",
+                name_1=f"vllm_cache_it_{r_idx + 1}",
+            )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_multiple_prompts_all_cached_outputs(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    """Compare outputs with and without prefix caching for all prompts.
+
+    Every example prompt, repeated APC_MULTIPLY_BY times, is generated once
+    with `enable_prefix_caching=False` and then `n_repetitions` times on a
+    second runner with it enabled. Every cached repetition is compared
+    against the uncached output.
+    """
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts.
+    generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
+
+    # The limit is derived from len(prompt), not from a token count
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    # Baseline: prefix caching is still disabled in the kwargs here
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    vllm_outputs_cache_rep, _ = _get_vLLM_output(
+        vllm_runner,
+        vllm_runner_kwargs,
+        generated_prompts,
+        max_tokens,
+        num_logprobs,
+        n_repetitions,
+    )
+
+    for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+        # In the first repetition, the caches are filled
+        # In the second repetition, these caches are reused
+
+        compare_operator(
+            outputs_0_lst=vllm_outputs_no_cache[0],
+            outputs_1_lst=vllm_outputs_cache_itn,
+            name_0="vllm_no_cache",
+            name_1=f"vllm_cache_it_{r_idx + 1}",
+        )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_multiple_prompts_block_align_alignment(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    """Compare cached and uncached outputs for several batch-token limits.
+
+    Eight long prompts, each built from a different suffix of one sentence,
+    are generated once without prefix caching. Then, for each of several
+    `max_num_batched_tokens` values placed around 10 * mamba_block_size,
+    they are generated `n_repetitions` times with prefix caching enabled
+    and compared against the uncached output.
+    """
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts. This custom prompt is used, as it causes the most issues
+    prompt_text = "The president of the United States is "
+    prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
+    generated_prompts = [
+        prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets
+    ]
+
+    # The limit is derived from the character length of the prompts
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    # Passed by keyword, like in the other APC tests of this module
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    # Baseline: prefix caching is still disabled in the kwargs here
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    # This runner is opened only to read the configured block size
+    with vllm_runner(**vllm_runner_kwargs) as vllm_model:
+        # Retrieve the default mamba state block size
+        mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
+
+    # In case the hybrid model does not have the
+    # "mamba_block_size" assume a fixed constant
+    if mamba_block_size is None:
+        mamba_block_size = 512
+
+    mamba_block_size_multiplier = 10
+    # Each offset shifts max_num_batched_tokens away from an exact multiple
+    # of the block size; a new runner is created per offset.
+    for offset in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
+        vllm_runner_kwargs["max_num_batched_tokens"] = (
+            mamba_block_size_multiplier * mamba_block_size - offset
+        )
+        vllm_outputs_cache_rep, _ = _get_vLLM_output(
+            vllm_runner,
+            vllm_runner_kwargs,
+            generated_prompts,
+            max_tokens,
+            num_logprobs,
+            n_repetitions,
+        )
+
+        # Check alignment of the output logits when using APC
+        for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+            # In the first repetition, the caches are filled
+            # In the second repetition, these caches are reused
+
+            compare_operator(
+                outputs_0_lst=vllm_outputs_no_cache[0],
+                outputs_1_lst=vllm_outputs_cache_itn,
+                name_0="vllm_no_cache",
+                name_1=f"vllm_cache_it_{r_idx + 1}",
+            )
+
+
+@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("n_repetitions", [2])
+# If num_logprobs is set to -1, then the stringent version
+# of the test is executed using `check_outputs_equal`
+# instead of `check_logprobs_close`
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+def test_apc_multiple_prompts_partial_cached_outputs(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    monkeypatch,
+    model: str,
+    max_tokens: int,
+    n_repetitions: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+) -> None:
+    """Compare outputs when only some prompts were generated before.
+
+    All long prompts are generated once without prefix caching. A second
+    runner with prefix caching enabled first generates only the first three
+    prompts, then the same runner generates the full set `n_repetitions`
+    times. Each result is compared against the uncached output.
+    """
+    try:
+        model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+        model_info.check_available_online(on_fail="skip")
+        model_info.check_transformers_version(on_fail="skip")
+    except ValueError:
+        # Best effort: a ValueError from the registry lookup or checks is
+        # ignored and the test runs anyway.
+        pass
+
+    compare_operator: Callable = (
+        check_logprobs_close if num_logprobs > 0 else check_outputs_equal  # type: ignore
+    )
+
+    # Sample prompts.
+    generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
+
+    # The limit is derived from len(prompt), not from a token count
+    max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
+    vllm_runner_kwargs = _get_vllm_runner_params(
+        model, max_model_len, tensor_parallel_size=tensor_parallel_size
+    )
+    vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+
+    # Baseline: prefix caching is still disabled in the kwargs here
+    vllm_outputs_no_cache, _ = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
+    )
+
+    # Cache only part of all the prompts
+    vllm_runner_kwargs["enable_prefix_caching"] = True
+    vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
+        vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
+    )
+
+    compare_operator(
+        outputs_0_lst=vllm_outputs_no_cache[0][:3],
+        outputs_1_lst=vllm_outputs_partial_cache[0],
+        name_0="vllm_no_cache",
+        name_1="vllm_partial_cache",
+    )
+
+    # Reuse the same model so the full run follows the partial run
+    vllm_outputs_cache_rep, _ = _get_vLLM_output(
+        vllm_runner,
+        vllm_runner_kwargs,
+        generated_prompts,
+        max_tokens,
+        num_logprobs,
+        n_repetitions,
+        vllm_model=vllm_model,
+    )
+
+    for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
+        # In the first repetition, the caches are filled
+        # In the second repetition, these caches are reused
+
+        compare_operator(
+            outputs_0_lst=vllm_outputs_no_cache[0],
+            outputs_1_lst=vllm_outputs_cache_itn,
+            name_0="vllm_no_cache",
+            name_1=f"vllm_cache_it_{r_idx + 1}",
+        )
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -0,0 +1,352 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+import json
+
+import pytest
+
+from vllm.sampling_params import SamplingParams
+from vllm.tokenizers.mistral import MistralTokenizer
+from vllm.tool_parsers.mistral_tool_parser import (
+    MistralToolCall,
+    MistralToolParser,
+)
+
+from ...utils import check_logprobs_close
+
+# Checkpoints used by `test_models` below (HF reference vs. vLLM comparison).
+MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
+# Checkpoints used by the tests below that load with the "mistral"
+# tokenizer/config/load formats.
+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+    # uses the v3-Tekken tokenizer
+    "mistralai/Ministral-8B-Instruct-2410",
+    # Mistral-Nemo is too big for CI, but passes locally
+    # "mistralai/Mistral-Nemo-Instruct-2407"
+]
+
+# Shared sampling settings for the chat-based tests in this file:
+# temperature 0.0, up to 512 new tokens, 5 logprobs per token.
+SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
+# Prompts in non-Latin scripts plus an emoji-heavy prompt; each one is sent
+# as a single user message in `test_mistral_symbolic_languages`.
+SYMBOLIC_LANG_PROMPTS = [
+    "勇敢な船乗りについての詩を書く",  # japanese
+    "寫一首關於勇敢的水手的詩",  # chinese
+    "ပုံပြင်လေးပြောပြပါ်:\n",  # burmese
+    "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n",  # see https://github.com/vllm-project/vllm/pull/9625
+]
+
+# Tool definitions for function calling; passed as `tools=` to `llm.chat`
+# in `test_mistral_function_calling`. Two functions are declared:
+# `get_current_weather` (city, state and unit all required) and `rewrite`
+# (a single optional `text` property).
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string",
+                        "description": "The city to find the weather for, e.g. "
+                        "'San Francisco'",
+                    },
+                    "state": {
+                        "type": "string",
+                        "description": "the two-letter abbreviation for the state that "
+                        "the city is in, e.g. 'CA' which would mean 'California'",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "rewrite",
+            "description": "Rewrites text",
+            "parameters": {
+                "type": "object",
+                "required": [],
+                "properties": {
+                    "text": {
+                        "type": "string",
+                        "description": "The input text to rewrite.",
+                    }
+                },
+            },
+        },
+    },
+]
+# Multi-turn conversation used as the chat input in
+# `test_mistral_function_calling` (deep-copied there before use). It contains
+# a completed `rewrite` tool call (assistant tool_calls + tool result) and
+# ends with a user question about the weather in Dallas.
+# NOTE(review): the misspellings inside the message strings ("improvving",
+# "temperate") are part of the model input; they look deliberate for the
+# rewrite scenario -- do not "fix" them without re-validating the test.
+MSGS = [
+    {"role": "system", "content": "You are an assistant."},
+    {
+        "role": "user",
+        "content": "Could you please rewrite the below article? \n\n My English needs "
+        "improvving, maybe I make errors.",
+    },
+    {
+        "role": "assistant",
+        "content": "",
+        "tool_calls": [
+            {
+                "id": "bbc5b7ede",
+                "type": "function",
+                "function": {
+                    "name": "rewrite",
+                    "arguments": '{"text":"My English needs improvving, maybe '
+                    'I make errors."}',
+                },
+            }
+        ],
+    },
+    {
+        "role": "tool",
+        "content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
+        'I make errors."}',
+        # Same id as the assistant tool call above.
+        "tool_call_id": "bbc5b7ede",
+        "name": "rewrite",
+    },
+    {
+        "role": "assistant",
+        "content": "---\n\nMy English needs improving, maybe I make errors",
+    },
+    {
+        "role": "user",
+        "content": (
+            "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+        ),
+    },
+]
+
+# JSON schema describing a person record (name, age, skills, work_history).
+# NOTE(review): nothing else in this file references this constant;
+# presumably kept for structured-output tests or external importers -- confirm
+# before removing.
+SAMPLE_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer"},
+        "skills": {
+            "type": "array",
+            "items": {"type": "string", "maxLength": 10},
+            "minItems": 3,
+        },
+        "work_history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {"type": "string"},
+                    "duration": {"type": "number"},
+                    "position": {"type": "string"},
+                },
+                "required": ["company", "position"],
+            },
+        },
+    },
+    "required": ["name", "age", "skills", "work_history"],
+}
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    # TODO(sang): Sliding window should be tested separately.
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_mistral_format(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        tokenizer_mode="mistral",
+        load_format="mistral",
+        config_format="mistral",
+    ) as mistral_format_model:
+        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        tokenizer_mode="hf",
+        load_format="safetensors",
+        config_format="hf",
+    ) as hf_format_model:
+        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    check_logprobs_close(
+        outputs_0_lst=hf_format_outputs,
+        outputs_1_lst=mistral_format_outputs,
+        name_0="hf",
+        name_1="mistral",
+    )
+
+
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        max_model_len=8192,
+        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
+    ) as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
+            assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
+
+
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        tokenizer_mode="mistral",
+        config_format="mistral",
+        load_format="mistral",
+    ) as vllm_model:
+        msgs = copy.deepcopy(MSGS)
+        outputs = vllm_model.llm.chat(
+            msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
+        )
+
+        tokenizer = vllm_model.llm.get_tokenizer()
+        tool_parser = MistralToolParser(tokenizer)
+
+        model_output = outputs[0].outputs[0].text.strip()
+        assert model_output.startswith(tool_parser.bot_token), model_output
+        parsed_message = tool_parser.extract_tool_calls(model_output, None)
+
+        assert parsed_message.tools_called
+
+        assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
+        assert parsed_message.tool_calls[0].function.name == "get_current_weather"
+        assert (
+            parsed_message.tool_calls[0].function.arguments
+            == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
+        )  # noqa
+        assert parsed_message.content is None
+
+
+def test_mistral_function_call_nested_json():
+    """Ensure that the function-name regex captures the entire outermost
+    JSON block, including nested braces."""
+
+    # Create a minimal stub tokenizer that provides the few attributes the
+    # parser accesses (`version` and `get_vocab`).
+    class _StubMistralTokenizer(MistralTokenizer):
+        version = 11  # Satisfy the version check
+
+        def __init__(self):
+            pass
+
+        @staticmethod
+        def get_vocab():
+            # Provide the special TOOL_CALLS token expected by the parser.
+            return {"[TOOL_CALLS]": 0}
+
+    tokenizer = _StubMistralTokenizer()
+    parser = MistralToolParser(tokenizer)
+
+    # Craft a model output featuring nested JSON inside the arguments.
+    args_dict = {
+        "city": "Dallas",
+        "state": "TX",
+        "unit": "fahrenheit",
+        "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
+    }
+
+    model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
+
+    parsed = parser.extract_tool_calls(model_output, None)
+
+    # Assertions: the tool call is detected and the full nested JSON is parsed
+    # without truncation.
+    assert parsed.tools_called
+
+    assert MistralToolCall.is_valid_id(parsed.tool_calls[0].id)
+    assert parsed.tool_calls[0].function.name == "get_current_weather"
+    assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
+    # No additional content outside the tool call should be returned.
+    assert parsed.content is None
+
+    # multiple calls
+    multiple_args_dict = [
+        {
+            "city": "Dallas",
+            "state": "TX",
+            "unit": "fahrenheit",
+            "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
+        },
+        {},
+        {"a": 0},
+        {"a": 1, "b": "c"},
+    ]
+    names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
+
+    model_output = "".join(
+        [
+            f"{parser.bot_token}{name}{json.dumps(args)}"
+            for name, args in zip(names, multiple_args_dict)
+        ]
+    )
+
+    parsed = parser.extract_tool_calls(model_output, None)
+
+    # Assertions: the tool call is detected and the full nested JSON is parsed
+    # without truncation.
+    assert parsed.tools_called
+    assert len(parsed.tool_calls) == len(multiple_args_dict)
+
+    for i, tool_call in enumerate(parsed.tool_calls):
+        assert MistralToolCall.is_valid_id(tool_call.id)
+        assert tool_call.function.name == names[i]
+        assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
+        # No additional content outside the tool call should be returned.
+        assert parsed.content is None
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+from ....utils import large_gpu_test
+from ...utils import check_logprobs_close
+
+# Checkpoints used by `test_models` below (HF reference vs. vLLM comparison).
+MODELS = [
+    "microsoft/Phi-3.5-MoE-instruct",
+]
+
+
+def test_phimoe_routing_function():
+    from vllm.model_executor.models.phimoe import phimoe_routing_function
+
+    test_case = {
+        0: {
+            "hidden_states": torch.tensor(
+                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
+            ).view(4, 2),
+            "gating_output": torch.tensor(
+                [0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
+            ),
+            "topk": 2,
+            "renormalize": False,
+        },
+        1: {
+            "hidden_states": torch.tensor(
+                [1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
+            ).view(4, 2),
+            "gating_output": torch.tensor(
+                [0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
+            ),
+            "topk": 2,
+            "renormalize": False,
+        },
+    }
+
+    ground_truth = {
+        0: {
+            "topk_weights": torch.tensor(
+                [1.0, 1.0], dtype=torch.float32, requires_grad=False
+            ),
+            "topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
+        },
+        1: {
+            "topk_weights": torch.tensor(
+                [0.5, 1.0], dtype=torch.float32, requires_grad=False
+            ),
+            "topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
+        },
+    }
+
+    for test_id in test_case:
+        topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
+        assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
+        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
+
+
+@pytest.mark.skipif(
+    condition=current_platform.is_cpu(),
+    reason="This test takes a lot time to run on CPU, "
+    "and vllm CI's disk space is not enough for this model.",
+)
+@large_gpu_test(min_gb=80)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs
+        )
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs
+        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )