Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.platforms import current_platform
from ....utils import large_gpu_mark
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
# This list contains the model that are using AITER kernel.
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
AITER_MODEL_LIST = [
"meta-llama/Llama-3.2-1B-Instruct",
"openbmb/MiniCPM3-4B",
"Qwen/Qwen-7B-Chat",
"Qwen/Qwen2.5-0.5B-Instruct",
"TitanML/tiny-mixtral",
"Qwen/Qwen3-8B",
]
# @maybe_test_rocm_aiter
@pytest.mark.parametrize(
"model",
[
pytest.param(
"bigscience/bloom-560m", # bloom - testing alibi slopes
marks=[
pytest.mark.core_model,
pytest.mark.slow_test,
pytest.mark.cpu_model,
],
),
pytest.param(
"openai-community/gpt2", # gpt2
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param("Milos/slovak-gpt-j-405M"), # gptj
pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
pytest.param(
"google/gemma-1.1-2b-it", # gemma
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
"google/gemma-2-2b-it", # test hybrid attention
marks=[pytest.mark.cpu_model],
),
pytest.param(
"zai-org/chatglm3-6b", # chatglm (text-only)
),
pytest.param(
"meta-llama/Llama-3.2-1B-Instruct", # llama
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"openbmb/MiniCPM3-4B",
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
),
pytest.param(
"facebook/opt-125m", # opt
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"microsoft/phi-2", # phi
marks=[pytest.mark.core_model, pytest.mark.slow_test],
),
pytest.param(
"Qwen/Qwen-7B-Chat", # qwen (text-only)
),
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[
pytest.mark.core_model,
pytest.mark.cpu_model,
pytest.mark.slow_test,
],
),
pytest.param(
"Qwen/Qwen3-8B", # qwen (text-only)
),
pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm
pytest.param("bigcode/starcoder2-3b"), # starcoder2
pytest.param(
"TitanML/tiny-mixtral", # mixtral
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param("swiss-ai/Apertus-8B-Instruct-2509"), # apertus
],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
use_rocm_aiter: bool,
use_prompt_embeds: bool,
monkeypatch,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
if use_rocm_aiter and (model in AITER_MODEL_LIST):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
with vllm_runner(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
max_num_seqs=2,
enable_prompt_embeds=use_prompt_embeds,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
if prompt_embeds is not None:
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
prompt_embeds, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
if prompt_embeds is not None:
check_logprobs_close(
outputs_0_lst=vllm_outputs,
outputs_1_lst=vllm_outputs_from_embeds,
name_0="vllm",
name_1="vllm_from_embeds",
)
if use_rocm_aiter:
# this is to ensure that vllm engine
# has deallocated the memory before running the next
# unit tests. On ROCm, when using AITER
# the memory might not be deallocated completely
# before running the next test case
torch.cuda.synchronize()

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import pytest
MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]
@pytest.mark.parametrize("model", MODELS)
def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(
model,
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
)
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu().item()
)
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)

View File

@@ -0,0 +1,41 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import check_logprobs_close
MODELS = [
# TODO(sang): Sliding window should be tested separately.
"ibm/PowerLM-3b",
"ibm/PowerMoE-3b",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

View File

@@ -0,0 +1,758 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
import pytest
from tests.models.registry import HF_EXAMPLE_MODELS
from tests.utils import multi_gpu_test
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from ...utils import check_logprobs_close, check_outputs_equal
# Mark all tests as hybrid
pytestmark = pytest.mark.hybrid_model
# NOTE: The first model in each list is taken as the primary model,
# meaning that it will be used in all tests in this file
# The rest of the models will only be tested by test_models
APC_MULTIPLY_BY = 300
SSM_MODELS = [
"state-spaces/mamba-130m-hf",
"tiiuae/falcon-mamba-tiny-dev",
# mamba2-codestral in transformers is broken pending:
# https://github.com/huggingface/transformers/pull/40861
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]
HYBRID_MODELS = [
"ai21labs/Jamba-tiny-dev",
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
"hmellor/tiny-random-BambaForCausalLM",
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
"LiquidAI/LFM2-1.2B",
"tiny-random/qwen3-next-moe",
]
FULL_CUDA_GRAPH_MODELS = [
"ai21labs/Jamba-tiny-dev",
"pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct",
]
FP32_STATE_MODELS = [
"state-spaces/mamba-130m-hf",
"Zyphra/Zamba2-1.2B-instruct",
]
# Avoid OOM
MAX_NUM_SEQS = 4
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_batching(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
for_loop_outputs = []
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for prompt in example_prompts:
(single_output,) = vllm_model.generate_greedy_logprobs(
[prompt], max_tokens, num_logprobs
)
for_loop_outputs.append(single_output)
batched_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=for_loop_outputs,
outputs_1_lst=batched_outputs,
name_0="for_loop_vllm",
name_1="batched_vllm",
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
) -> None:
"""
Tests chunked prefill in conjunction with n > 1.
In this case, prefill is populated with decoding tokens and
we test that it doesn't fail.
This test might fail if cache is not allocated correctly for n > 1
decoding steps inside a chunked prefill forward pass
(where we have both prefill and decode together)
"""
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
with vllm_runner(
model,
enable_chunked_prefill=True,
# forces prefill chunks with decoding
max_num_batched_tokens=MAX_NUM_SEQS * 3,
max_num_seqs=MAX_NUM_SEQS,
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
) -> None:
"""
This test is for verifying that mamba cache is padded to CG captured
batch size. If it's not, a torch RuntimeError will be raised because
tensor dimensions aren't compatible.
"""
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
with vllm_runner(model) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
except RuntimeError:
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly"
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
vllm_runner,
example_prompts,
model: str,
) -> None:
"""
This test is for verifying that the hybrid inner state management doesn't
collapse in case where the number of incoming requests and
finished_requests_ids is larger than the maximum mamba block capacity.
This could generally happen due to the fact that hybrid does support
statelessness mechanism where it can clean up new incoming requests in
a single step.
"""
try:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail(
"Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
def test_state_cleanup(
vllm_runner,
example_prompts,
model: str,
) -> None:
"""
This test is for verifying that the Hybrid state is cleaned up between
steps.
If it's not cleaned, an error would be expected.
"""
try:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail(
"Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_distributed_correctness(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1,
outputs_1_lst=vllm_outputs_tp_2,
name_0="vllm_tp_1",
name_1="vllm_tp_2",
)
@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_full_cuda_graph(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
num_logprobs: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize(
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
)
def test_fp32_cache_state(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
num_logprobs: int,
cache_dtype_param: str,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
# Helper functions for the APC tests
def _get_vllm_runner_params(
model: str,
max_model_len: int,
tensor_parallel_size: int = 1,
):
return {
"model_name": model,
"enable_chunked_prefill": True,
"enable_prefix_caching": False,
"max_model_len": max_model_len,
"tensor_parallel_size": tensor_parallel_size,
"gpu_memory_utilization": 0.4,
}
def _get_vLLM_output(
vllm_runner,
kwargs,
prompts,
max_tokens,
num_logprobs,
num_repetitions=1,
vllm_model=None,
):
outs = []
if vllm_model is None:
vllm_model = vllm_runner(**kwargs)
for _ in range(num_repetitions):
if num_logprobs < 0:
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
else:
vllm_output = vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
outs.append(vllm_output)
return outs, vllm_model
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
# of the test is executed using `check_outputs_equal`
# instead of `check_logprobs_close`
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_apc_single_prompt(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
n_repetitions: int,
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
# Sample prompts.
generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]]
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
# In the second repetition, these caches are reused
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0],
outputs_1_lst=vllm_outputs_cache_itn,
name_0="vllm_no_cache",
name_1=f"vllm_cache_it_{r_idx + 1}",
)
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
# of the test is executed using `check_outputs_equal`
# instead of `check_logprobs_close`
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_apc_single_prompt_block_align_alignment(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
n_repetitions: int,
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
# Sample prompts. This custom prompt is used, as it causes the most issues
generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY]
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
if mamba_block_size is None:
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
# In the second repetition, these caches are reused
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0],
outputs_1_lst=vllm_outputs_cache_itn,
name_0="vllm_no_cache",
name_1=f"vllm_cache_it_{r_idx + 1}",
)
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
# of the test is executed using `check_outputs_equal`
# instead of `check_logprobs_close`
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_apc_multiple_prompts_all_cached_outputs(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
n_repetitions: int,
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
# Sample prompts.
generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
# In the second repetition, these caches are reused
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0],
outputs_1_lst=vllm_outputs_cache_itn,
name_0="vllm_no_cache",
name_1=f"vllm_cache_it_{r_idx + 1}",
)
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
# of the test is executed using `check_outputs_equal`
# instead of `check_logprobs_close`
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_apc_multiple_prompts_block_align_alignment(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
n_repetitions: int,
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
# Sample prompts. This custom prompt is used, as it causes the most issues
prompt_text = "The president of the United States is "
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
generated_prompts = [
prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets
]
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
vllm_runner_kwargs["enable_prefix_caching"] = True
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
# Retrieve the default mamba state block size
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
# In case the hybrid model does not have the
# "mamba_block_size" assume a fixed constant
if mamba_block_size is None:
mamba_block_size = 512
mamba_block_size_multiplier = 10
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
vllm_runner_kwargs["max_num_batched_tokens"] = (
mamba_block_size_multiplier * mamba_block_size - offsets
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
)
# Check alignment of the output logits when using APC
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
# In the second repetition, these caches are reused
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0],
outputs_1_lst=vllm_outputs_cache_itn,
name_0="vllm_no_cache",
name_1=f"vllm_cache_it_{r_idx + 1}",
)
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
# of the test is executed using `check_outputs_equal`
# instead of `check_logprobs_close`
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_apc_multiple_prompts_partial_cached_outputs(
hf_runner,
vllm_runner,
example_prompts,
monkeypatch,
model: str,
max_tokens: int,
n_repetitions: int,
num_logprobs: int,
tensor_parallel_size: int,
) -> None:
try:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
except ValueError:
pass
compare_operator: Callable = (
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
)
# Sample prompts.
generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
vllm_runner_kwargs = _get_vllm_runner_params(
model, max_model_len, tensor_parallel_size=tensor_parallel_size
)
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
vllm_outputs_no_cache, _ = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
)
# Cache only part of all the prompts
vllm_runner_kwargs["enable_prefix_caching"] = True
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
)
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0][:3],
outputs_1_lst=vllm_outputs_partial_cache[0],
name_0="vllm_no_cache",
name_1="vllm_partial_cache",
)
vllm_outputs_cache_rep, _ = _get_vLLM_output(
vllm_runner,
vllm_runner_kwargs,
generated_prompts,
max_tokens,
num_logprobs,
n_repetitions,
vllm_model=vllm_model,
)
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
# In the first repetition, the caches are filled
# In the second repetition, these caches are reused
compare_operator(
outputs_0_lst=vllm_outputs_no_cache[0],
outputs_1_lst=vllm_outputs_cache_itn,
name_0="vllm_no_cache",
name_1=f"vllm_cache_it_{r_idx + 1}",
)

View File

@@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import json
import pytest
from vllm.sampling_params import SamplingParams
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.mistral_tool_parser import (
MistralToolCall,
MistralToolParser,
)
from ...utils import check_logprobs_close
MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3",
]
MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3",
# uses the v3-Tekken tokenizer
"mistralai/Ministral-8B-Instruct-2410",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [
"勇敢な船乗りについての詩を書く", # japanese
"寫一首關於勇敢的水手的詩", # chinese
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
"Repeat the phrase 'URGENCY🌶':\nURGENCY🌶\nURGENCY🌶\n", # see https://github.com/vllm-project/vllm/pull/9625
]
# for function calling
TOOLS = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to find the weather for, e.g. "
"'San Francisco'",
},
"state": {
"type": "string",
"description": "the two-letter abbreviation for the state that "
"the city is in, e.g. 'CA' which would mean 'California'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "rewrite",
"description": "Rewrites text",
"parameters": {
"type": "object",
"required": [],
"properties": {
"text": {
"type": "string",
"description": "The input text to rewrite.",
}
},
},
},
},
]
MSGS = [
{"role": "system", "content": "You are an assistant."},
{
"role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs "
"improvving, maybe I make errors.",
},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "bbc5b7ede",
"type": "function",
"function": {
"name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe '
'I make errors."}',
},
}
],
},
{
"role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
'I make errors."}',
"tool_call_id": "bbc5b7ede",
"name": "rewrite",
},
{
"role": "assistant",
"content": "---\n\nMy English needs improving, maybe I make errors",
},
{
"role": "user",
"content": (
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
),
},
]
SAMPLE_JSON_SCHEMA = {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"},
"skills": {
"type": "array",
"items": {"type": "string", "maxLength": 10},
"minItems": 3,
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {"type": "string"},
"duration": {"type": "number"},
"position": {"type": "string"},
},
"required": ["company", "position"],
},
},
},
"required": ["name", "age", "skills", "work_history"],
}
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_mistral_format(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
) as mistral_format_model:
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="hf",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_format_outputs,
outputs_1_lst=mistral_format_outputs,
name_0="hf",
name_1="mistral",
)
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(
model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral",
) as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.llm.chat(
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
)
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
model_output = outputs[0].outputs[0].text.strip()
assert model_output.startswith(tool_parser.bot_token), model_output
parsed_message = tool_parser.extract_tool_calls(model_output, None)
assert parsed_message.tools_called
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
assert (
parsed_message.tool_calls[0].function.arguments
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
) # noqa
assert parsed_message.content is None
def test_mistral_function_call_nested_json():
"""Ensure that the function-name regex captures the entire outermost
JSON block, including nested braces."""
# Create a minimal stub tokenizer that provides the few attributes the
# parser accesses (`version` and `get_vocab`).
class _StubMistralTokenizer(MistralTokenizer):
version = 11 # Satisfy the version check
def __init__(self):
pass
@staticmethod
def get_vocab():
# Provide the special TOOL_CALLS token expected by the parser.
return {"[TOOL_CALLS]": 0}
tokenizer = _StubMistralTokenizer()
parser = MistralToolParser(tokenizer)
# Craft a model output featuring nested JSON inside the arguments.
args_dict = {
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
}
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
parsed = parser.extract_tool_calls(model_output, None)
# Assertions: the tool call is detected and the full nested JSON is parsed
# without truncation.
assert parsed.tools_called
assert MistralToolCall.is_valid_id(parsed.tool_calls[0].id)
assert parsed.tool_calls[0].function.name == "get_current_weather"
assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
# No additional content outside the tool call should be returned.
assert parsed.content is None
# multiple calls
multiple_args_dict = [
{
"city": "Dallas",
"state": "TX",
"unit": "fahrenheit",
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
},
{},
{"a": 0},
{"a": 1, "b": "c"},
]
names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
model_output = "".join(
[
f"{parser.bot_token}{name}{json.dumps(args)}"
for name, args in zip(names, multiple_args_dict)
]
)
parsed = parser.extract_tool_calls(model_output, None)
# Assertions: the tool call is detected and the full nested JSON is parsed
# without truncation.
assert parsed.tools_called
assert len(parsed.tool_calls) == len(multiple_args_dict)
for i, tool_call in enumerate(parsed.tool_calls):
assert MistralToolCall.is_valid_id(tool_call.id)
assert tool_call.function.name == names[i]
assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
# No additional content outside the tool call should be returned.
assert parsed.content is None

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.platforms import current_platform
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
MODELS = [
"microsoft/Phi-3.5-MoE-instruct",
]
def test_phimoe_routing_function():
from vllm.model_executor.models.phimoe import phimoe_routing_function
test_case = {
0: {
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
1: {
"hidden_states": torch.tensor(
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
).view(4, 2),
"gating_output": torch.tensor(
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
),
"topk": 2,
"renormalize": False,
},
}
ground_truth = {
0: {
"topk_weights": torch.tensor(
[1.0, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
},
1: {
"topk_weights": torch.tensor(
[0.5, 1.0], dtype=torch.float32, requires_grad=False
),
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
},
}
for test_id in test_case:
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.",
)
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)