Sync from v0.13
This commit is contained in:
0
tests/entrypoints/llm/__init__.py
Normal file
0
tests/entrypoints/llm/__init__.py
Normal file
94
tests/entrypoints/llm/test_accuracy.py
Normal file
94
tests/entrypoints/llm/test_accuracy.py
Normal file
@@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file tests the accuracy of the vLLM server via LMEval.
|
||||
It uses local-completions, which interacts with vLLM
|
||||
through the OAI API with N concurrent connections.
|
||||
This simulates real-world usage of the API and makes
|
||||
sure that the zmq frontend mp RPC message passing and
|
||||
AsyncLLMEngine are working correctly.
|
||||
"""
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Models the default accuracy test is parametrized over.
MODEL_NAMES = [
"Qwen/Qwen3-1.7B",
"google/gemma-3-1b-it",
]
# Models the FP8 KV-cache accuracy test is parametrized over.
FP8_KV_MODEL_NAMES = [
"Qwen/Qwen3-1.7B",
]
# NOTE(review): not referenced in the visible part of this file.
NUM_CONCURRENT = 500
# Key of the task whose results run_test reads from the lm_eval output.
TASK = "gsm8k"
# Metric key read from that task's results.
FILTER = "exact_match,strict-match"
# Allowed deviation from the expected score; run_test compares it against
# the plain (unscaled) difference between measured and expected values.
RTOL = 0.03
# Expected score per model name; run_test asserts the model is listed here.
EXPECTED_VALUES = {
"Qwen/Qwen3-1.7B": 0.68,
"google/gemma-3-1b-it": 0.25,
}
|
||||
|
||||
|
||||
def run_test(model_name, more_args=None):
    """Run the end-to-end accuracy check for one model.

    The model is evaluated through lm_eval's ``vllm`` backend on ``TASK`` and
    the ``FILTER`` metric is compared with ``EXPECTED_VALUES[model_name]``.

    Args:
        model_name: Model id; must be a key of ``EXPECTED_VALUES``.
        more_args: Optional comma-separated ``key=value`` string appended to
            the lm_eval ``model_args``.

    Raises:
        AssertionError: If the model has no expected value, or the measured
            score is not strictly within ``RTOL`` of the expected one.
    """
    # Fail fast: there is no point running the evaluation when there is
    # nothing to compare its result against.
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}"
    )
    expected_value = EXPECTED_VALUES[model_name]

    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Use the same constant that indexes the results below so the two
        # can never drift apart.
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    # RTOL is applied as an absolute band around the expected score.
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} | Measured: {measured_value}"
    )
|
||||
|
||||
|
||||
# TODO: [AlexM] Fix it with new CI/CD tests
# Extra model-args fragment appended to the TPU argument string by the tests
# in this file when non-empty; the empty string disables it.
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model):
    """Check accuracy of each model in ``MODEL_NAMES`` with the V1 engine.

    On TPU, extra model args are passed to limit compilation time; on any
    other platform ``run_test`` is called without extra args.
    """
    extra_args = None

    if current_platform.is_tpu():
        # Limit compilation time for TPU V1
        tpu_args = ["max_model_len=2048,max_num_seqs=64"]

        # Add TP test (if provided)
        if TPU_TP_TEST_STR:
            tpu_args.append(TPU_TP_TEST_STR)

        extra_args = ",".join(tpu_args)

    run_test(model, extra_args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
    """Check accuracy of ``FP8_KV_MODEL_NAMES`` with an FP8 KV cache on TPU.

    NOTE(review): ``kv_cache_dtype=fp8`` is only passed on the TPU path; on
    other platforms ``run_test`` is called without extra args - confirm this
    is intended.
    """
    extra_args = None

    if current_platform.is_tpu():
        # Limit compilation time for TPU V1
        tpu_args = ["max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"]

        # Add TP test (if provided)
        if TPU_TP_TEST_STR:
            tpu_args.append(TPU_TP_TEST_STR)

        extra_args = ",".join(tpu_args)

    run_test(model, extra_args)
|
||||
212
tests/entrypoints/llm/test_chat.py
Normal file
212
tests/entrypoints/llm/test_chat.py
Normal file
@@ -0,0 +1,212 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ..openai.test_vision import TEST_IMAGE_ASSETS
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def text_llm():
    """Yield a per-test proxy to a Llama-3.2-1B-Instruct ``LLM``."""
    engine = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        seed=0,
    )

    # pytest caches the fixture value, so hand out only a weak proxy; the
    # single strong reference stays here and can be dropped on teardown.
    yield weakref.proxy(engine)

    # Teardown: release the engine, then clean up distributed state/memory.
    del engine
    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def llm_for_failure_test():
    """
    Fixture for testing issue #26081.

    Builds the model with a small max_model_len so that length errors are
    easy to trigger, and yields a weak proxy to it.
    """
    engine_kwargs = dict(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        seed=0,
        max_model_len=128,
        disable_log_stats=True,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches the fixture value, so hand out only a weak proxy; the
    # single strong reference stays here and can be dropped on teardown.
    yield weakref.proxy(engine)

    # Teardown: release the engine, then clean up distributed state/memory.
    del engine
    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def test_chat(text_llm):
    """A single conversation passed to ``chat`` yields exactly one output."""
    system_turn = {"role": "system", "content": "You are a helpful assistant"}
    user_turn = {"role": "user", "content": "Explain the concept of entropy."}

    outputs = text_llm.chat([system_turn, user_turn])

    assert len(outputs) == 1
|
||||
|
||||
|
||||
def test_multi_chat(text_llm):
    """Two conversations passed to ``chat`` in one call yield two outputs."""
    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # One [system, user] conversation per prompt, in prompt order.
    conversations = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": user_prompt},
        ]
        for user_prompt in user_prompts
    ]

    outputs = text_llm.chat(conversations)

    assert len(outputs) == 2
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def vision_llm():
    """Yield a per-test proxy to a Phi-3.5-vision ``LLM`` (2 images/prompt)."""
    engine_kwargs = dict(
        model="microsoft/Phi-3.5-vision-instruct",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
        seed=0,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches the fixture value, so hand out only a weak proxy; the
    # single strong reference stays here and can be dropped on teardown.
    yield weakref.proxy(engine)

    # Teardown: release the engine, then clean up distributed state/memory.
    del engine
    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "image_urls", [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]], indirect=True
)
def test_chat_multi_image(vision_llm, image_urls: list[str]):
    """A single user turn carrying several images yields exactly one output.

    The message content is one ``image_url`` part per entry of *image_urls*
    followed by a text question.
    """
    messages = [
        {
            "role": "user",
            "content": [
                *(
                    {"type": "image_url", "image_url": {"url": image_url}}
                    for image_url in image_urls
                ),
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]
    outputs = vision_llm.chat(messages)
    # `len(outputs) >= 0` could never fail; one conversation must produce
    # exactly one output, as in the text-only chat test.
    assert len(outputs) == 1
|
||||
|
||||
|
||||
def test_llm_chat_tokenization_no_double_bos(text_llm):
    """
    LLM.chat() should not add special tokens when using chat templates.

    The prompt must start with the BOS token and the token right after it
    must not be BOS again.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello!"},
    ]

    outputs = text_llm.chat(conversation)
    assert len(outputs) == 1

    prompt_token_ids = outputs[0].prompt_token_ids
    assert prompt_token_ids is not None

    bos_id = text_llm.get_tokenizer().bos_token_id

    # Ensure we have a single BOS
    first_id = prompt_token_ids[0]
    assert first_id == bos_id
    second_id = prompt_token_ids[1]
    assert second_id != bos_id, "Double BOS"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def thinking_llm():
    """Yield a per-test proxy to a Qwen3-0.6B ``LLM``."""
    engine_kwargs = dict(
        model="Qwen/Qwen3-0.6B",
        max_model_len=4096,
        enforce_eager=True,
        seed=0,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches the fixture value, so hand out only a weak proxy; the
    # single strong reference stays here and can be dropped on teardown.
    yield weakref.proxy(engine)

    # Teardown: release the engine, then clean up distributed state/memory.
    del engine
    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enable_thinking", [True, False])
def test_chat_extra_kwargs(thinking_llm, enable_thinking):
    """``chat_template_kwargs`` passed to ``chat`` must shape the prompt.

    The ``<think>`` token is expected in the prompt only when thinking is
    disabled.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "What is 1+1?"},
    ]
    template_kwargs = {"enable_thinking": enable_thinking}

    outputs = thinking_llm.chat(conversation, chat_template_kwargs=template_kwargs)
    assert len(outputs) == 1

    prompt_token_ids = outputs[0].prompt_token_ids
    assert prompt_token_ids is not None

    vocab = thinking_llm.get_tokenizer().get_vocab()
    think_id = vocab["<think>"]

    if not enable_thinking:
        # The chat template includes dummy thinking process
        assert think_id in prompt_token_ids
    else:
        assert think_id not in prompt_token_ids
|
||||
|
||||
|
||||
def test_chat_batch_failure_cleanup(llm_for_failure_test):
    """
    Tests that if a batch call to llm.chat() fails mid-way
    (e.g., due to one invalid prompt), the requests that
    were already enqueued are properly aborted and do not
    pollute the queue for subsequent calls.
    (Fixes Issue #26081)
    """
    llm = llm_for_failure_test
    sampling_params = SamplingParams(temperature=0, max_tokens=10)

    valid_msg = [{"role": "user", "content": "Hello"}]
    # Repeated until the prompt is too long for the fixture's model length.
    long_text = "This is a very long text to test the error " * 50
    invalid_msg = [{"role": "user", "content": long_text}]

    # The failing batch: two valid conversations followed by the invalid one.
    failing_batch = [valid_msg] * 2 + [invalid_msg]
    # The follow-up batch: valid conversations only.
    clean_batch = [valid_msg] * 2

    with pytest.raises(ValueError, match="longer than the maximum model length"):
        llm.chat(failing_batch, sampling_params=sampling_params)

    follow_up_outputs = llm.chat(clean_batch, sampling_params=sampling_params)

    assert len(follow_up_outputs) == len(clean_batch)
    assert llm.llm_engine.get_num_unfinished_requests() == 0
|
||||
36
tests/entrypoints/llm/test_collective_rpc.py
Normal file
36
tests/entrypoints/llm/test_collective_rpc.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
    """``collective_rpc`` with a locally defined callable returns every rank.

    Skipped when fewer than *tp_size* GPUs are visible, and for the
    ``tp_size == 1`` / ``ray`` combination. With ``tp_size == 1`` no executor
    backend is passed.
    """
    single_device = tp_size == 1

    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    if single_device and backend == "ray":
        pytest.skip("Skip duplicate test case")

    executor_backend = None if single_device else backend

    # intentionally define the method in the test function,
    # to test if it can be serialized and sent to the workers
    def echo_rank(self):
        return self.rank

    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    llm = LLM(
        model="hmellor/tiny-random-LlamaForCausalLM",
        enforce_eager=True,
        load_format="dummy",
        tensor_parallel_size=tp_size,
        distributed_executor_backend=executor_backend,
    )

    reported_ranks = llm.collective_rpc(echo_rank)
    assert reported_ranks == list(range(tp_size))
|
||||
124
tests/entrypoints/llm/test_generate.py
Normal file
124
tests/entrypoints/llm/test_generate.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
# Model loaded by the `llm` fixture and by the tests that build their own LLM.
MODEL_NAME = "distilbert/distilgpt2"

# Text prompts passed to `generate` by the tests in this module.
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

# NOTE(review): not referenced in the visible part of this file.
TOKEN_IDS = [
[0],
[0, 1],
[0, 2, 1],
[0, 3, 1, 2],
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def llm():
    """Yield a module-scoped proxy to an ``LLM`` built from ``MODEL_NAME``."""
    engine_kwargs = dict(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )
    engine = LLM(**engine_kwargs)

    # pytest caches the fixture value, so hand out only a weak proxy; the
    # single strong reference stays here and can be dropped on teardown.
    yield weakref.proxy(engine)

    # Teardown: release the engine, then clean up distributed state/memory.
    del engine
    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
    """``generate`` accepts a per-prompt list, a single value, or ``None``.

    A list of sampling params whose length differs from the number of
    prompts must raise ``ValueError``.
    """
    expected_count = len(PROMPTS)
    per_prompt_params = [
        SamplingParams(temperature=temperature, top_p=0.95)
        for temperature in (0.01, 0.3, 0.7, 0.99)
    ]

    # Multiple SamplingParams should be matched with each prompt
    outputs = llm.generate(PROMPTS, sampling_params=per_prompt_params)
    assert expected_count == len(outputs)

    # Exception raised, if the size of params does not match the size of prompts
    with pytest.raises(ValueError):
        llm.generate(PROMPTS, sampling_params=per_prompt_params[:3])

    # Single SamplingParams should be applied to every prompt
    shared_params = SamplingParams(temperature=0.3, top_p=0.95)
    outputs = llm.generate(PROMPTS, sampling_params=shared_params)
    assert expected_count == len(outputs)

    # sampling_params is None, default params should be applied
    outputs = llm.generate(PROMPTS, sampling_params=None)
    assert expected_count == len(outputs)
|
||||
|
||||
|
||||
def test_multiple_priority(llm: LLM):
    """``priority`` may be ``None`` or a list with one entry per prompt.

    A priority list of any other length, including an empty one, must raise
    ``ValueError``.
    """
    prompt_count = len(PROMPTS)

    # Generate works when priority is None
    outputs = llm.generate(PROMPTS, sampling_params=None, priority=None)
    assert prompt_count == len(outputs)

    # Generate works when length of priority is same as the len(PROMPTS)
    matching_priority = [0] * prompt_count
    outputs = llm.generate(PROMPTS, sampling_params=None, priority=matching_priority)
    assert prompt_count == len(outputs)

    # Exception raised, if the length of priority does not match the length of prompts
    short_priority = [0] * (prompt_count - 1)
    with pytest.raises(ValueError):
        llm.generate(PROMPTS, sampling_params=None, priority=short_priority)

    # Exception raised, if the priority list is empty
    with pytest.raises(ValueError):
        llm.generate(PROMPTS, sampling_params=None, priority=[])
|
||||
|
||||
|
||||
def test_max_model_len():
    """Prompt plus generated tokens never exceed the configured max_model_len.

    ``max_tokens`` is set above ``max_model_len`` so that the model length is
    the limit that applies.
    """
    max_model_len = 20
    engine = LLM(
        model=MODEL_NAME,
        max_model_len=max_model_len,
        gpu_memory_utilization=0.10,
        enforce_eager=True,  # reduce test time
    )
    sampling_params = SamplingParams(max_tokens=max_model_len + 10)

    for output in engine.generate(PROMPTS, sampling_params):
        prompt_len = len(output.prompt_token_ids)
        generated_len = len(output.outputs[0].token_ids)
        # Total tokens must not exceed max_model_len.
        # It can be less if generation finishes due to other reasons (e.g., EOS)
        # before reaching the absolute model length limit.
        assert prompt_len + generated_len <= max_model_len
|
||||
|
||||
|
||||
def test_log_stats():
    """With ``disable_log_stats=False`` every output carries metrics."""
    engine = LLM(
        model=MODEL_NAME,
        disable_log_stats=False,
        gpu_memory_utilization=0.10,
        enforce_eager=True,  # reduce test time
    )
    outputs = engine.generate(PROMPTS, sampling_params=None)

    # disable_log_stats is False, every output should have metrics
    outputs_without_metrics = [out for out in outputs if out.metrics is None]
    assert not outputs_without_metrics
|
||||
27
tests/entrypoints/llm/test_gpu_utilization.py
Normal file
27
tests/entrypoints/llm/test_gpu_utilization.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def test_gpu_memory_utilization():
    """Three LLMs, each asking for 30% of GPU memory, can coexist and generate.

    All three instances are built before any of them generates.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # makes sure gpu_memory_utilization is per-instance limit,
    # not a global limit
    engines = []
    for _ in range(3):
        engines.append(
            LLM(
                model="facebook/opt-125m",
                gpu_memory_utilization=0.3,
                enforce_eager=True,
            )
        )

    for engine in engines:
        for output in engine.generate(prompts, sampling_params):
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
98
tests/entrypoints/llm/test_mm_cache_stats.py
Normal file
98
tests/entrypoints/llm/test_mm_cache_stats.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from vllm.v1.metrics import loggers as stat_loggers
|
||||
from vllm.v1.metrics.reader import Counter, Metric
|
||||
|
||||
from ..openai.test_vision import TEST_IMAGE_ASSETS
|
||||
|
||||
|
||||
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
    """Build a one-turn conversation whose user message is a single image URL."""
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    user_turn = {"role": "user", "content": [image_part]}
    return [user_turn]
|
||||
|
||||
|
||||
def _get_counter_value(metrics: list[Metric], name: str):
    """Return the value of the ``Counter`` metric called *name*.

    Args:
        metrics: Metrics to search; the first one whose ``name`` matches wins.
        name: Metric name to look up.

    Raises:
        AssertionError: If no metric has that name, or the match is not a
            ``Counter``.
    """
    # A default of None avoids a bare StopIteration, which would say nothing
    # about which metric was missing.
    metric = next((m for m in metrics if m.name == name), None)
    assert metric is not None, f"Metric {name!r} not found"
    assert isinstance(metric, Counter)
    return metric.value
|
||||
|
||||
|
||||
def _get_mm_cache_stats(metrics: list[Metric]):
    """Return ``(queries, hits)`` read from the MM cache counters."""
    counter_names = ("vllm:mm_cache_queries", "vllm:mm_cache_hits")
    queries, hits = (_get_counter_value(metrics, name) for name in counter_names)
    return queries, hits
|
||||
|
||||
|
||||
def _get_mm_cache_log(llm: LLM, caplog_vllm: pytest.LogCaptureFixture) -> float:
    """Trigger stats logging and return the logged MM cache hit rate.

    Asserts that exactly one record is captured from the stat loggers module
    and that it contains an ``MM cache hit rate: <number>%`` entry; returns
    that number as a float.
    """
    caplog_vllm.clear()
    with caplog_vllm.at_level(logging.INFO, logger=stat_loggers.__name__):
        llm.llm_engine.do_log_stats()

    records = caplog_vllm.records
    assert len(records) == 1

    msg = records[0].getMessage()
    assert "MM cache hit rate" in msg

    hit_rate_match = re.search(r"MM cache hit rate: ([0-9.]+)%", msg)
    assert hit_rate_match is not None
    hit_rate_text = hit_rate_match.group(1)
    return float(hit_rate_text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_ASSETS[:2]], indirect=True)
@pytest.mark.parametrize("mm_processor_cache_type", ["lru", "shm"])
def test_mm_cache_stats(
    num_gpus_available,
    image_urls,
    mm_processor_cache_type,
    caplog_vllm,
):
    """MM cache counters and the logged hit rate track repeated image chats.

    Each step chats with one image, then checks the ``(queries, hits)``
    counters and the hit rate parsed from the stats log.
    """
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        mm_processor_cache_type=mm_processor_cache_type,
        disable_log_stats=False,
        limit_mm_per_prompt={"image": 2},
    )

    def chat_and_check(url_index, expected_counts, expected_hit_rate):
        # One chat with the chosen image, then verify counters and log.
        llm.chat(_make_messages(image_urls[url_index]))
        assert _get_mm_cache_stats(llm.get_metrics()) == expected_counts
        assert _get_mm_cache_log(llm, caplog_vllm) == pytest.approx(
            expected_hit_rate
        )

    chat_and_check(0, (1, 0), 0.0)
    chat_and_check(1, (2, 0), 0.0)
    chat_and_check(0, (3, 1), 33.3)

    # NOTE: This only resets hit rate stats in CachingMetrics
    # The raw queries and hits counts remain unaffected
    llm.reset_mm_cache()

    chat_and_check(0, (4, 1), 0.0)
    chat_and_check(1, (5, 1), 0.0)
|
||||
34
tests/entrypoints/llm/test_prompt_validation.py
Normal file
34
tests/entrypoints/llm/test_prompt_validation.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
|
||||
def test_empty_prompt():
    """Generating from an empty string prompt raises ``ValueError``."""
    engine = LLM(model="openai-community/gpt2", enforce_eager=True)
    empty_prompts = [""]
    with pytest.raises(ValueError, match="decoder prompt cannot be empty"):
        engine.generate(empty_prompts)
|
||||
|
||||
|
||||
def test_out_of_vocab_token():
    """A token-id prompt with an out-of-vocabulary id raises ``ValueError``."""
    engine = LLM(model="openai-community/gpt2", enforce_eager=True)
    token_prompt = {"prompt_token_ids": [999999]}
    with pytest.raises(ValueError, match="out of vocabulary"):
        engine.generate(token_prompt)
|
||||
|
||||
|
||||
def test_require_mm_embeds():
    """Tensor image input is rejected when ``enable_mm_embeds`` is off.

    The error message must mention ``--enable-mm-embeds``.
    """
    engine = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        enforce_eager=True,
        enable_mm_embeds=False,
    )
    # A raw tensor passed as the image payload.
    tensor_prompt = {
        "prompt": "<image>",
        "multi_modal_data": {"image": torch.empty(1, 1, 1)},
    }
    with pytest.raises(ValueError, match="--enable-mm-embeds"):
        engine.generate(tensor_prompt)
|
||||
Reference in New Issue
Block a user