Sync from v0.13
This commit is contained in:
0
tests/models/language/generation/__init__.py
Normal file
0
tests/models/language/generation/__init__.py
Normal file
185
tests/models/language/generation/test_common.py
Normal file
185
tests/models/language/generation/test_common.py
Normal file
@@ -0,0 +1,185 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import large_gpu_mark
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
AITER_MODEL_LIST = [
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"openbmb/MiniCPM3-4B",
|
||||
"Qwen/Qwen-7B-Chat",
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
"TitanML/tiny-mixtral",
|
||||
"Qwen/Qwen3-8B",
|
||||
]
|
||||
|
||||
|
||||
# @maybe_test_rocm_aiter
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
pytest.param(
|
||||
"bigscience/bloom-560m", # bloom - testing alibi slopes
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.slow_test,
|
||||
pytest.mark.cpu_model,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
"openai-community/gpt2", # gpt2
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param("Milos/slovak-gpt-j-405M"), # gptj
|
||||
pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode
|
||||
pytest.param("EleutherAI/pythia-70m"), # gpt_neox
|
||||
pytest.param(
|
||||
"google/gemma-1.1-2b-it", # gemma
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
"google/gemma-2-2b-it", # test hybrid attention
|
||||
marks=[pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"zai-org/chatglm3-6b", # chatglm (text-only)
|
||||
),
|
||||
pytest.param(
|
||||
"meta-llama/Llama-3.2-1B-Instruct", # llama
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"openbmb/MiniCPM3-4B",
|
||||
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
pytest.param(
|
||||
"facebook/opt-125m", # opt
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param(
|
||||
"microsoft/phi-2", # phi
|
||||
marks=[pytest.mark.core_model, pytest.mark.slow_test],
|
||||
),
|
||||
pytest.param(
|
||||
"Qwen/Qwen-7B-Chat", # qwen (text-only)
|
||||
),
|
||||
pytest.param(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
pytest.mark.cpu_model,
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
),
|
||||
pytest.param(
|
||||
"Qwen/Qwen3-8B", # qwen (text-only)
|
||||
),
|
||||
pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm
|
||||
pytest.param("bigcode/starcoder2-3b"), # starcoder2
|
||||
pytest.param(
|
||||
"TitanML/tiny-mixtral", # mixtral
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
pytest.param("swiss-ai/Apertus-8B-Instruct-2509"), # apertus
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||
)
|
||||
@pytest.mark.parametrize("use_prompt_embeds", [True, False])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
use_rocm_aiter: bool,
|
||||
use_prompt_embeds: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
|
||||
|
||||
prompt_token_ids = []
|
||||
for prompt in example_prompts:
|
||||
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
|
||||
hf_model.model.device
|
||||
)
|
||||
prompt_token_ids.append(token_ids)
|
||||
if prompt_embeds is not None:
|
||||
prompt_embeds.append(
|
||||
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
tokenizer_name=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
max_num_seqs=2,
|
||||
enable_prompt_embeds=use_prompt_embeds,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
if prompt_embeds is not None:
|
||||
vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs(
|
||||
prompt_embeds, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
if prompt_embeds is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=vllm_outputs,
|
||||
outputs_1_lst=vllm_outputs_from_embeds,
|
||||
name_0="vllm",
|
||||
name_1="vllm_from_embeds",
|
||||
)
|
||||
|
||||
if use_rocm_aiter:
|
||||
# this is to ensure that vllm engine
|
||||
# has deallocated the memory before running the next
|
||||
# unit tests. On ROCm, when using AITER
|
||||
# the memory might not be deallocated completely
|
||||
# before running the next test case
|
||||
torch.cuda.synchronize()
|
||||
27
tests/models/language/generation/test_gemma.py
Normal file
27
tests/models/language/generation/test_gemma.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(
|
||||
model,
|
||||
load_format="dummy",
|
||||
) as llm:
|
||||
if model == "google/gemma-3-4b-it":
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config.text_config
|
||||
else:
|
||||
normalizers = llm.llm.collective_rpc(
|
||||
lambda self: self.model_runner.model.model.normalizer.cpu().item()
|
||||
)
|
||||
config = llm.llm.llm_engine.model_config.hf_config
|
||||
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
|
||||
41
tests/models/language/generation/test_granite.py
Normal file
41
tests/models/language/generation/test_granite.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
"ibm/PowerLM-3b",
|
||||
"ibm/PowerMoE-3b",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
758
tests/models/language/generation/test_hybrid.py
Normal file
758
tests/models/language/generation/test_hybrid.py
Normal file
@@ -0,0 +1,758 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ...utils import check_logprobs_close, check_outputs_equal
|
||||
|
||||
# Mark all tests as hybrid
|
||||
pytestmark = pytest.mark.hybrid_model
|
||||
|
||||
# NOTE: The first model in each list is taken as the primary model,
|
||||
# meaning that it will be used in all tests in this file
|
||||
# The rest of the models will only be tested by test_models
|
||||
|
||||
APC_MULTIPLY_BY = 300
|
||||
|
||||
SSM_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"tiiuae/falcon-mamba-tiny-dev",
|
||||
# mamba2-codestral in transformers is broken pending:
|
||||
# https://github.com/huggingface/transformers/pull/40861
|
||||
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
]
|
||||
|
||||
HYBRID_MODELS = [
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
"pfnet/plamo-2-1b",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
"hmellor/tiny-random-BambaForCausalLM",
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"tiiuae/Falcon-H1-0.5B-Base",
|
||||
"LiquidAI/LFM2-1.2B",
|
||||
"tiny-random/qwen3-next-moe",
|
||||
]
|
||||
|
||||
FULL_CUDA_GRAPH_MODELS = [
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
"pfnet/plamo-2-1b",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
]
|
||||
|
||||
FP32_STATE_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
]
|
||||
|
||||
# Avoid OOM
|
||||
MAX_NUM_SEQS = 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_batching(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for_loop_outputs = []
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
for prompt in example_prompts:
|
||||
(single_output,) = vllm_model.generate_greedy_logprobs(
|
||||
[prompt], max_tokens, num_logprobs
|
||||
)
|
||||
for_loop_outputs.append(single_output)
|
||||
|
||||
batched_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=for_loop_outputs,
|
||||
outputs_1_lst=batched_outputs,
|
||||
name_0="for_loop_vllm",
|
||||
name_1="batched_vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
def test_chunked_prefill_with_parallel_sampling(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""
|
||||
Tests chunked prefill in conjunction with n > 1.
|
||||
|
||||
In this case, prefill is populated with decoding tokens and
|
||||
we test that it doesn't fail.
|
||||
|
||||
This test might fail if cache is not allocated correctly for n > 1
|
||||
decoding steps inside a chunked prefill forward pass
|
||||
(where we have both prefill and decode together)
|
||||
"""
|
||||
sampling_params = SamplingParams(n=3, temperature=1, seed=0, max_tokens=max_tokens)
|
||||
with vllm_runner(
|
||||
model,
|
||||
enable_chunked_prefill=True,
|
||||
# forces prefill chunks with decoding
|
||||
max_num_batched_tokens=MAX_NUM_SEQS * 3,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [20])
|
||||
def test_mamba_cache_cg_padding(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""
|
||||
This test is for verifying that mamba cache is padded to CG captured
|
||||
batch size. If it's not, a torch RuntimeError will be raised because
|
||||
tensor dimensions aren't compatible.
|
||||
"""
|
||||
vllm_config = EngineArgs(model=model, trust_remote_code=True).create_engine_config()
|
||||
while len(example_prompts) == vllm_config.pad_for_cudagraph(len(example_prompts)):
|
||||
example_prompts.append(example_prompts[0])
|
||||
|
||||
try:
|
||||
with vllm_runner(model) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
except RuntimeError:
|
||||
pytest.fail(
|
||||
"Couldn't run batch size which is not equal to a Cuda Graph "
|
||||
"captured batch size. "
|
||||
"Could be related to mamba cache not padded correctly"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
) -> None:
|
||||
"""
|
||||
This test is for verifying that the hybrid inner state management doesn't
|
||||
collapse in case where the number of incoming requests and
|
||||
finished_requests_ids is larger than the maximum mamba block capacity.
|
||||
|
||||
This could generally happen due to the fact that hybrid does support
|
||||
statelessness mechanism where it can clean up new incoming requests in
|
||||
a single step.
|
||||
"""
|
||||
try:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
|
||||
except ValueError:
|
||||
pytest.fail(
|
||||
"Hybrid inner state wasn't cleaned up properly between"
|
||||
"steps finished requests registered unnecessarily "
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_state_cleanup(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
) -> None:
|
||||
"""
|
||||
This test is for verifying that the Hybrid state is cleaned up between
|
||||
steps.
|
||||
|
||||
If it's not cleaned, an error would be expected.
|
||||
"""
|
||||
try:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
for _ in range(10):
|
||||
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
|
||||
except ValueError:
|
||||
pytest.fail(
|
||||
"Hybrid inner state wasn't cleaned up between states, "
|
||||
"could be related to finished_requests_ids"
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_distributed_correctness(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model, tensor_parallel_size=1, max_num_seqs=MAX_NUM_SEQS
|
||||
) as vllm_model:
|
||||
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model, tensor_parallel_size=2, max_num_seqs=MAX_NUM_SEQS
|
||||
) as vllm_model:
|
||||
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=vllm_outputs_tp_1,
|
||||
outputs_1_lst=vllm_outputs_tp_2,
|
||||
name_0="vllm_tp_1",
|
||||
name_1="vllm_tp_2",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_full_cuda_graph(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize(
|
||||
"cache_dtype_param", ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]
|
||||
)
|
||||
def test_fp32_cache_state(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
cache_dtype_param: str,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model, max_num_seqs=MAX_NUM_SEQS, **{cache_dtype_param: "float32"}
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
# Helper functions for the APC tests
|
||||
def _get_vllm_runner_params(
|
||||
model: str,
|
||||
max_model_len: int,
|
||||
tensor_parallel_size: int = 1,
|
||||
):
|
||||
return {
|
||||
"model_name": model,
|
||||
"enable_chunked_prefill": True,
|
||||
"enable_prefix_caching": False,
|
||||
"max_model_len": max_model_len,
|
||||
"tensor_parallel_size": tensor_parallel_size,
|
||||
"gpu_memory_utilization": 0.4,
|
||||
}
|
||||
|
||||
|
||||
def _get_vLLM_output(
|
||||
vllm_runner,
|
||||
kwargs,
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
num_repetitions=1,
|
||||
vllm_model=None,
|
||||
):
|
||||
outs = []
|
||||
if vllm_model is None:
|
||||
vllm_model = vllm_runner(**kwargs)
|
||||
for _ in range(num_repetitions):
|
||||
if num_logprobs < 0:
|
||||
vllm_output = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
else:
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs
|
||||
)
|
||||
outs.append(vllm_output)
|
||||
|
||||
return outs, vllm_model
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_single_prompt(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_single_prompt_block_align_alignment(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts. This custom prompt is used, as it causes the most issues
|
||||
generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
|
||||
# Retrieve the default mamba state block size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
|
||||
|
||||
# In case the hybrid model does not have the
|
||||
# "mamba_block_size" assume a fixed constant
|
||||
if mamba_block_size is None:
|
||||
mamba_block_size = 512
|
||||
|
||||
mamba_block_size_multiplier = 10
|
||||
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
|
||||
vllm_runner_kwargs["max_num_batched_tokens"] = (
|
||||
mamba_block_size_multiplier * mamba_block_size - offsets
|
||||
)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
# Check alignment of the output logits when using APC
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_multiple_prompts_all_cached_outputs(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_multiple_prompts_block_align_alignment(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts. This custom prompt is used, as it causes the most issues
|
||||
prompt_text = "The president of the United States is "
|
||||
prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31]
|
||||
generated_prompts = [
|
||||
prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets
|
||||
]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
|
||||
# Retrieve the default mamba state block size
|
||||
mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size
|
||||
|
||||
# In case the hybrid model does not have the
|
||||
# "mamba_block_size" assume a fixed constant
|
||||
if mamba_block_size is None:
|
||||
mamba_block_size = 512
|
||||
|
||||
mamba_block_size_multiplier = 10
|
||||
for offsets in [-3, 3, mamba_block_size // 4 + 3, mamba_block_size // 2 - 3]:
|
||||
vllm_runner_kwargs["max_num_batched_tokens"] = (
|
||||
mamba_block_size_multiplier * mamba_block_size - offsets
|
||||
)
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
)
|
||||
|
||||
# Check alignment of the output logits when using APC
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("n_repetitions", [2])
|
||||
# If num_logprobs is set to -1, then the stringent version
|
||||
# of the test is executed using `check_outputs_equal`
|
||||
# instead of `check_logprobs_close`
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_apc_multiple_prompts_partial_cached_outputs(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
n_repetitions: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
) -> None:
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
compare_operator: Callable = (
|
||||
check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore
|
||||
)
|
||||
|
||||
# Sample prompts.
|
||||
generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts]
|
||||
|
||||
max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts)
|
||||
vllm_runner_kwargs = _get_vllm_runner_params(
|
||||
model, max_model_len, tensor_parallel_size=tensor_parallel_size
|
||||
)
|
||||
vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
|
||||
|
||||
vllm_outputs_no_cache, _ = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
# Cache only part of all the prompts
|
||||
vllm_runner_kwargs["enable_prefix_caching"] = True
|
||||
vllm_outputs_partial_cache, vllm_model = _get_vLLM_output(
|
||||
vllm_runner, vllm_runner_kwargs, generated_prompts[:3], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0][:3],
|
||||
outputs_1_lst=vllm_outputs_partial_cache[0],
|
||||
name_0="vllm_no_cache",
|
||||
name_1="vllm_partial_cache",
|
||||
)
|
||||
|
||||
vllm_outputs_cache_rep, _ = _get_vLLM_output(
|
||||
vllm_runner,
|
||||
vllm_runner_kwargs,
|
||||
generated_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
n_repetitions,
|
||||
vllm_model=vllm_model,
|
||||
)
|
||||
|
||||
for r_idx, vllm_outputs_cache_itn in enumerate(vllm_outputs_cache_rep):
|
||||
# In the first repetition, the caches are filled
|
||||
# In the second repetition, these caches are reused
|
||||
|
||||
compare_operator(
|
||||
outputs_0_lst=vllm_outputs_no_cache[0],
|
||||
outputs_1_lst=vllm_outputs_cache_itn,
|
||||
name_0="vllm_no_cache",
|
||||
name_1=f"vllm_cache_it_{r_idx + 1}",
|
||||
)
|
||||
352
tests/models/language/generation/test_mistral.py
Normal file
352
tests/models/language/generation/test_mistral.py
Normal file
@@ -0,0 +1,352 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.tool_parsers.mistral_tool_parser import (
|
||||
MistralToolCall,
|
||||
MistralToolParser,
|
||||
)
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
]
|
||||
|
||||
MISTRAL_FORMAT_MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
# uses the v3-Tekken tokenizer
|
||||
"mistralai/Ministral-8B-Instruct-2410",
|
||||
# Mistral-Nemo is too big for CI, but passes locally
|
||||
# "mistralai/Mistral-Nemo-Instruct-2407"
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
SYMBOLIC_LANG_PROMPTS = [
|
||||
"勇敢な船乗りについての詩を書く", # japanese
|
||||
"寫一首關於勇敢的水手的詩", # chinese
|
||||
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
|
||||
"Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625
|
||||
]
|
||||
|
||||
# for function calling
|
||||
TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description": "The city to find the weather for, e.g. "
|
||||
"'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type": "string",
|
||||
"description": "the two-letter abbreviation for the state that "
|
||||
"the city is in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"description": "Rewrites text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"required": [],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The input text to rewrite.",
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
MSGS = [
|
||||
{"role": "system", "content": "You are an assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Could you please rewrite the below article? \n\n My English needs "
|
||||
"improvving, maybe I make errors.",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "",
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "bbc5b7ede",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "rewrite",
|
||||
"arguments": '{"text":"My English needs improvving, maybe '
|
||||
'I make errors."}',
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
|
||||
'I make errors."}',
|
||||
"tool_call_id": "bbc5b7ede",
|
||||
"name": "rewrite",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "---\n\nMy English needs improving, maybe I make errors",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
SAMPLE_JSON_SCHEMA = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"age": {"type": "integer"},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {"type": "string", "maxLength": 10},
|
||||
"minItems": 3,
|
||||
},
|
||||
"work_history": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {"type": "string"},
|
||||
"duration": {"type": "number"},
|
||||
"position": {"type": "string"},
|
||||
},
|
||||
"required": ["company", "position"],
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["name", "age", "skills", "work_history"],
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# TODO(sang): Sliding window should be tested separately.
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype, tokenizer_mode="mistral") as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_mistral_format(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
) as mistral_format_model:
|
||||
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="hf",
|
||||
load_format="safetensors",
|
||||
config_format="hf",
|
||||
) as hf_format_model:
|
||||
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_format_outputs,
|
||||
outputs_1_lst=mistral_format_outputs,
|
||||
name_0="hf",
|
||||
name_1="mistral",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_symbolic_languages(vllm_runner, model: str, dtype: str) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=8192,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
) as vllm_model:
|
||||
for prompt in SYMBOLIC_LANG_PROMPTS:
|
||||
msg = {"role": "user", "content": prompt}
|
||||
outputs = vllm_model.llm.chat([msg], sampling_params=SAMPLING_PARAMS)
|
||||
assert "<EFBFBD>" not in outputs[0].outputs[0].text.strip()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
) as vllm_model:
|
||||
msgs = copy.deepcopy(MSGS)
|
||||
outputs = vllm_model.llm.chat(
|
||||
msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS
|
||||
)
|
||||
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
tool_parser = MistralToolParser(tokenizer)
|
||||
|
||||
model_output = outputs[0].outputs[0].text.strip()
|
||||
assert model_output.startswith(tool_parser.bot_token), model_output
|
||||
parsed_message = tool_parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert parsed_message.tools_called
|
||||
|
||||
assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id)
|
||||
assert parsed_message.tool_calls[0].function.name == "get_current_weather"
|
||||
assert (
|
||||
parsed_message.tool_calls[0].function.arguments
|
||||
== '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'
|
||||
) # noqa
|
||||
assert parsed_message.content is None
|
||||
|
||||
|
||||
def test_mistral_function_call_nested_json():
|
||||
"""Ensure that the function-name regex captures the entire outermost
|
||||
JSON block, including nested braces."""
|
||||
|
||||
# Create a minimal stub tokenizer that provides the few attributes the
|
||||
# parser accesses (`version` and `get_vocab`).
|
||||
class _StubMistralTokenizer(MistralTokenizer):
|
||||
version = 11 # Satisfy the version check
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def get_vocab():
|
||||
# Provide the special TOOL_CALLS token expected by the parser.
|
||||
return {"[TOOL_CALLS]": 0}
|
||||
|
||||
tokenizer = _StubMistralTokenizer()
|
||||
parser = MistralToolParser(tokenizer)
|
||||
|
||||
# Craft a model output featuring nested JSON inside the arguments.
|
||||
args_dict = {
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
|
||||
}
|
||||
|
||||
model_output = f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}"
|
||||
|
||||
parsed = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
# Assertions: the tool call is detected and the full nested JSON is parsed
|
||||
# without truncation.
|
||||
assert parsed.tools_called
|
||||
|
||||
assert MistralToolCall.is_valid_id(parsed.tool_calls[0].id)
|
||||
assert parsed.tool_calls[0].function.name == "get_current_weather"
|
||||
assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
|
||||
# No additional content outside the tool call should be returned.
|
||||
assert parsed.content is None
|
||||
|
||||
# multiple calls
|
||||
multiple_args_dict = [
|
||||
{
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
"sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}},
|
||||
},
|
||||
{},
|
||||
{"a": 0},
|
||||
{"a": 1, "b": "c"},
|
||||
]
|
||||
names = ["get_current_weather", "get_current_weather_2", "random", "random_2"]
|
||||
|
||||
model_output = "".join(
|
||||
[
|
||||
f"{parser.bot_token}{name}{json.dumps(args)}"
|
||||
for name, args in zip(names, multiple_args_dict)
|
||||
]
|
||||
)
|
||||
|
||||
parsed = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
# Assertions: the tool call is detected and the full nested JSON is parsed
|
||||
# without truncation.
|
||||
assert parsed.tools_called
|
||||
assert len(parsed.tool_calls) == len(multiple_args_dict)
|
||||
|
||||
for i, tool_call in enumerate(parsed.tool_calls):
|
||||
assert MistralToolCall.is_valid_id(tool_call.id)
|
||||
assert tool_call.function.name == names[i]
|
||||
assert json.loads(tool_call.function.arguments) == multiple_args_dict[i]
|
||||
# No additional content outside the tool call should be returned.
|
||||
assert parsed.content is None
|
||||
96
tests/models/language/generation/test_phimoe.py
Normal file
96
tests/models/language/generation/test_phimoe.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
]
|
||||
|
||||
|
||||
def test_phimoe_routing_function():
|
||||
from vllm.model_executor.models.phimoe import phimoe_routing_function
|
||||
|
||||
test_case = {
|
||||
0: {
|
||||
"hidden_states": torch.tensor(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
|
||||
).view(4, 2),
|
||||
"gating_output": torch.tensor(
|
||||
[0.1, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk": 2,
|
||||
"renormalize": False,
|
||||
},
|
||||
1: {
|
||||
"hidden_states": torch.tensor(
|
||||
[1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32, requires_grad=False
|
||||
).view(4, 2),
|
||||
"gating_output": torch.tensor(
|
||||
[0.4, 0.2, 0.3, 0.4], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk": 2,
|
||||
"renormalize": False,
|
||||
},
|
||||
}
|
||||
|
||||
ground_truth = {
|
||||
0: {
|
||||
"topk_weights": torch.tensor(
|
||||
[1.0, 1.0], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk_ids": torch.tensor([3, 2], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
1: {
|
||||
"topk_weights": torch.tensor(
|
||||
[0.5, 1.0], dtype=torch.float32, requires_grad=False
|
||||
),
|
||||
"topk_ids": torch.tensor([0, 3], dtype=torch.long, requires_grad=False),
|
||||
},
|
||||
}
|
||||
|
||||
for test_id in test_case:
|
||||
topk_weights, topk_ids = phimoe_routing_function(**test_case[test_id])
|
||||
assert torch.allclose(topk_weights, ground_truth[test_id]["topk_weights"])
|
||||
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
condition=current_platform.is_cpu(),
|
||||
reason="This test takes a lot time to run on CPU, "
|
||||
"and vllm CI's disk space is not enough for this model.",
|
||||
)
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
Reference in New Issue
Block a user