Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
MODEL_NAMES = [
"Qwen/Qwen3-1.7B",
"google/gemma-3-1b-it",
]
FP8_KV_MODEL_NAMES = [
"Qwen/Qwen3-1.7B",
]
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUES = {
"Qwen/Qwen3-1.7B": 0.68,
"google/gemma-3-1b-it": 0.25,
}
def run_test(model_name, more_args=None):
"""Run the end to end accuracy test."""
model_args = f"pretrained={model_name},max_model_len=4096"
if more_args is not None:
model_args = "{},{}".format(model_args, more_args)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks="gsm8k",
batch_size="auto",
)
measured_value = results["results"][TASK][FILTER]
assert model_name in EXPECTED_VALUES, (
f"Cannot find the expected value for the model {model_name=}"
)
expected_value = EXPECTED_VALUES[model_name]
assert (
measured_value - RTOL < expected_value
and measured_value + RTOL > expected_value
), f"Expected: {expected_value} | Measured: {measured_value}"
# TODO: [AlexM] Fix it with new CI/CD tests
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model):
"""Run with the V1 Engine."""
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=64"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(model, more_args)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
"""Run with the V1 Engine."""
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(model, more_args)

View File

@@ -0,0 +1,212 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.sampling_params import SamplingParams
from ..openai.test_vision import TEST_IMAGE_ASSETS
@pytest.fixture(scope="function")
def text_llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, seed=0)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.fixture(scope="function")
def llm_for_failure_test():
"""
Fixture for testing issue #26081.
Uses a small max_model_len to easily trigger length errors.
"""
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
seed=0,
max_model_len=128,
disable_log_stats=True,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
def test_chat(text_llm):
prompt1 = "Explain the concept of entropy."
messages = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": prompt1},
]
outputs = text_llm.chat(messages)
assert len(outputs) == 1
def test_multi_chat(text_llm):
prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
conversation1 = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": prompt1},
]
conversation2 = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": prompt2},
]
messages = [conversation1, conversation2]
outputs = text_llm.chat(messages)
assert len(outputs) == 2
@pytest.fixture(scope="function")
def vision_llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
trust_remote_code=True,
limit_mm_per_prompt={"image": 2},
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.parametrize(
"image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
messages = [
{
"role": "user",
"content": [
*(
{"type": "image_url", "image_url": {"url": image_url}}
for image_url in image_urls
),
{"type": "text", "text": "What's in this image?"},
],
}
]
outputs = vision_llm.chat(messages)
assert len(outputs) >= 0
def test_llm_chat_tokenization_no_double_bos(text_llm):
"""
LLM.chat() should not add special tokens when using chat templates.
Check we get a single BOS token for llama chat.
"""
messages = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "Hello!"},
]
outputs = text_llm.chat(messages)
assert len(outputs) == 1
prompt_token_ids = outputs[0].prompt_token_ids
assert prompt_token_ids is not None
bos_token = text_llm.get_tokenizer().bos_token_id
# Ensure we have a single BOS
assert prompt_token_ids[0] == bos_token
assert prompt_token_ids[1] != bos_token, "Double BOS"
@pytest.fixture(scope="function")
def thinking_llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model="Qwen/Qwen3-0.6B",
max_model_len=4096,
enforce_eager=True,
seed=0,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
messages = [
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": "What is 1+1?"},
]
outputs = thinking_llm.chat(
messages,
chat_template_kwargs={"enable_thinking": enable_thinking},
)
assert len(outputs) == 1
prompt_token_ids = outputs[0].prompt_token_ids
assert prompt_token_ids is not None
think_id = thinking_llm.get_tokenizer().get_vocab()["<think>"]
if enable_thinking:
assert think_id not in prompt_token_ids
else:
# The chat template includes dummy thinking process
assert think_id in prompt_token_ids
def test_chat_batch_failure_cleanup(llm_for_failure_test):
"""
Tests that if a batch call to llm.chat() fails mid-way
(e.g., due to one invalid prompt), the requests that
were already enqueued are properly aborted and do not
pollute the queue for subsequent calls.
(Fixes Issue #26081)
"""
llm = llm_for_failure_test
valid_msg = [{"role": "user", "content": "Hello"}]
long_text = "This is a very long text to test the error " * 50
invalid_msg = [{"role": "user", "content": long_text}]
batch_1 = [
valid_msg,
valid_msg,
invalid_msg,
]
batch_2 = [
valid_msg,
valid_msg,
]
sampling_params = SamplingParams(temperature=0, max_tokens=10)
with pytest.raises(ValueError, match="longer than the maximum model length"):
llm.chat(batch_1, sampling_params=sampling_params)
outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
assert len(outputs_2) == len(batch_2)
assert llm.llm_engine.get_num_unfinished_requests() == 0

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
from ...utils import create_new_process_for_each_test
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
if torch.cuda.device_count() < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
if tp_size == 1 and backend == "ray":
pytest.skip("Skip duplicate test case")
if tp_size == 1:
backend = None
# intentionally define the method and class in the test function,
# to test if they can be serialized and sent to the workers
def echo_rank(self):
return self.rank
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
llm = LLM(
model="hmellor/tiny-random-LlamaForCausalLM",
enforce_eager=True,
load_format="dummy",
tensor_parallel_size=tp_size,
distributed_executor_backend=backend,
)
assert llm.collective_rpc(echo_rank) == list(range(tp_size))

View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
import pytest
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
MODEL_NAME = "distilbert/distilgpt2"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
TOKEN_IDS = [
[0],
[0, 1],
[0, 2, 1],
[0, 3, 1, 2],
]
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True,
)
yield weakref.proxy(llm)
del llm
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
sampling_params = [
SamplingParams(temperature=0.01, top_p=0.95),
SamplingParams(temperature=0.3, top_p=0.95),
SamplingParams(temperature=0.7, top_p=0.95),
SamplingParams(temperature=0.99, top_p=0.95),
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, sampling_params=sampling_params)
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3])
# Single SamplingParams should be applied to every prompt
single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params)
assert len(PROMPTS) == len(outputs)
# sampling_params is None, default params should be applied
outputs = llm.generate(PROMPTS, sampling_params=None)
assert len(PROMPTS) == len(outputs)
def test_multiple_priority(llm: LLM):
# Generate works when priority is None
outputs = llm.generate(PROMPTS, sampling_params=None, priority=None)
assert len(PROMPTS) == len(outputs)
# Generate works when length of priority is same as the len(PROMPTS)
outputs = llm.generate(PROMPTS, sampling_params=None, priority=[0] * len(PROMPTS))
assert len(PROMPTS) == len(outputs)
# Exception raised, if the length of priority does not match the length of prompts
with pytest.raises(ValueError):
outputs = llm.generate(
PROMPTS, sampling_params=None, priority=[0] * (len(PROMPTS) - 1)
)
# Exception raised, if the priority list is empty
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, sampling_params=None, priority=[])
def test_max_model_len():
max_model_len = 20
llm = LLM(
model=MODEL_NAME,
max_model_len=max_model_len,
gpu_memory_utilization=0.10,
enforce_eager=True, # reduce test time
)
sampling_params = SamplingParams(max_tokens=max_model_len + 10)
outputs = llm.generate(PROMPTS, sampling_params)
for output in outputs:
num_total_tokens = len(output.prompt_token_ids) + len(
output.outputs[0].token_ids
)
# Total tokens must not exceed max_model_len.
# It can be less if generation finishes due to other reasons (e.g., EOS)
# before reaching the absolute model length limit.
assert num_total_tokens <= max_model_len
def test_log_stats():
llm = LLM(
model=MODEL_NAME,
disable_log_stats=False,
gpu_memory_utilization=0.10,
enforce_eager=True, # reduce test time
)
outputs = llm.generate(PROMPTS, sampling_params=None)
# disable_log_stats is False, every output should have metrics
assert all(output.metrics is not None for output in outputs)

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
def test_gpu_memory_utilization():
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# makes sure gpu_memory_utilization is per-instance limit,
# not a global limit
llms = [
LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3, enforce_eager=True)
for i in range(3)
]
for llm in llms:
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import pytest
import regex as re
from vllm import LLM
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.v1.metrics import loggers as stat_loggers
from vllm.v1.metrics.reader import Counter, Metric
from ..openai.test_vision import TEST_IMAGE_ASSETS
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
return [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
]
def _get_counter_value(metrics: list[Metric], name: str):
metric = next(m for m in metrics if m.name == name)
assert isinstance(metric, Counter)
return metric.value
def _get_mm_cache_stats(metrics: list[Metric]):
mm_cache_queries = _get_counter_value(metrics, "vllm:mm_cache_queries")
mm_cache_hits = _get_counter_value(metrics, "vllm:mm_cache_hits")
return mm_cache_queries, mm_cache_hits
def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
caplog_vllm.clear()
with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
llm.llm_engine.do_log_stats()
assert len(caplog_vllm.records) == 1
msg = caplog_vllm.records[0].getMessage()
assert "MM cache hit rate" in msg
match = re.search(r"MM cache hit rate: ([0-9.]+)%", msg)
assert match is not None
return float(match.group(1))
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
def test_mm_cache_stats(
num_gpus_available,
image_urls,
mm_processor_cache_type,
caplog_vllm,
):
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
mm_processor_cache_type=mm_processor_cache_type,
disable_log_stats=False,
limit_mm_per_prompt={"image": 2},
)
llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (1, 0)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
llm.chat(_make_messages(image_urls[1]))
assert _get_mm_cache_stats(llm.get_metrics()) == (2, 0)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (3, 1)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(33.3)
# NOTE: This only resets hit rate stats in CachingMetrics
# The raw queries and hits counts remain unaffected
llm.reset_mm_cache()
llm.chat(_make_messages(image_urls[0]))
assert _get_mm_cache_stats(llm.get_metrics()) == (4, 1)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)
llm.chat(_make_messages(image_urls[1]))
assert _get_mm_cache_stats(llm.get_metrics()) == (5, 1)
assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(0.0)

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import LLM
def test_empty_prompt():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
llm.generate([""])
def test_out_of_vocab_token():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match="out of vocabulary"):
llm.generate({"prompt_token_ids": [999999]})
def test_require_mm_embeds():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
enforce_eager=True,
enable_mm_embeds=False,
)
with pytest.raises(ValueError, match="--enable-mm-embeds"):
llm.generate(
{
"prompt": "<image>",
"multi_modal_data": {"image": torch.empty(1, 1, 1)},
}
)