Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_latency():
    """Smoke-test the `vllm bench latency` CLI end-to-end.

    The workload is deliberately tiny (32 input tokens, 1 output token,
    dummy weights, eager mode) so the test exercises CLI plumbing rather
    than actual performance.
    """
    args = [
        "vllm",
        "bench",
        "latency",
        "--model",
        MODEL_NAME,
        "--input-len",
        "32",
        "--output-len",
        "1",
        "--enforce-eager",
        "--load-format",
        "dummy",
    ]
    proc = subprocess.run(args, capture_output=True, text=True)
    # Echo CLI output into the pytest log so failures are debuggable.
    print(proc.stdout)
    print(proc.stderr)
    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"

View File

@@ -0,0 +1,249 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import tempfile
from pathlib import Path
import pytest
from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
class TestParameterSweepItem:
    """Test ParameterSweepItem functionality."""

    @pytest.mark.parametrize(
        "input_dict,expected",
        [
            (
                {"compilation_config.use_inductor_graph_partition": False},
                "--compilation-config.use_inductor_graph_partition=false",
            ),
            (
                {"compilation_config.use_inductor_graph_partition": True},
                "--compilation-config.use_inductor_graph_partition=true",
            ),
        ],
    )
    def test_nested_boolean_params(self, input_dict, expected):
        """Test that nested boolean params use =true/false syntax."""
        # Dotted (nested) keys cannot use the --no-<flag> convention, so the
        # boolean is serialized inline as lowercase true/false.
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert expected in cmd

    @pytest.mark.parametrize(
        "input_dict,expected",
        [
            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
            ({"enable_prefix_caching": True}, "--enable-prefix-caching"),
            ({"disable_log_stats": False}, "--no-disable-log-stats"),
            ({"disable_log_stats": True}, "--disable-log-stats"),
        ],
    )
    def test_non_nested_boolean_params(self, input_dict, expected):
        """Test that non-nested boolean params use --no- prefix."""
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert expected in cmd

    @pytest.mark.parametrize(
        "compilation_config",
        [
            {"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
            {
                "cudagraph_mode": "piecewise",
                "mode": 3,
                "use_inductor_graph_partition": False,
            },
        ],
    )
    def test_nested_dict_value(self, compilation_config):
        """Test that nested dict values are serialized as JSON."""
        item = ParameterSweepItem.from_record(
            {"compilation_config": compilation_config}
        )
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert "--compilation-config" in cmd
        # The dict should be JSON serialized; round-trip it through
        # json.loads to compare structurally rather than by string.
        idx = cmd.index("--compilation-config")
        assert json.loads(cmd[idx + 1]) == compilation_config

    @pytest.mark.parametrize(
        "input_dict,expected_key,expected_value",
        [
            ({"model": "test-model"}, "--model", "test-model"),
            ({"max_tokens": 100}, "--max-tokens", "100"),
            ({"temperature": 0.7}, "--temperature", "0.7"),
        ],
    )
    def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
        """Test that string and numeric values are handled correctly."""
        # Non-boolean values are emitted as a "--key value" pair with
        # numbers stringified.
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve"])
        assert expected_key in cmd
        assert expected_value in cmd

    @pytest.mark.parametrize(
        "input_dict,expected_key,key_idx_offset",
        [
            ({"max_tokens": 200}, "--max-tokens", 1),
            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
        ],
    )
    def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
        """Test that existing parameters in cmd are replaced."""
        # key_idx_offset selects the branch: 1 for a "--key value" pair,
        # 0 for a bare boolean flag.
        item = ParameterSweepItem.from_record(input_dict)
        if key_idx_offset == 1:
            # Key-value pair
            cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
            assert expected_key in cmd
            idx = cmd.index(expected_key)
            assert cmd[idx + 1] == "200"
            assert "100" not in cmd
        else:
            # Boolean flag
            cmd = item.apply_to_cmd(
                ["vllm", "serve", "--enable-prefix-caching", "model"]
            )
            assert expected_key in cmd
            assert "--enable-prefix-caching" not in cmd
class TestParameterSweep:
    """Test ParameterSweep functionality."""

    def test_from_records_list(self):
        """Test creating ParameterSweep from a list of records."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        # The sweep behaves like a sequence of its input records.
        assert len(sweep) == 2
        assert sweep[0]["max_tokens"] == 100
        assert sweep[1]["max_tokens"] == 200

    def test_read_from_dict(self):
        """Test creating ParameterSweep from a dict format."""
        # Dict keys become the per-experiment _benchmark_name field.
        data = {
            "experiment1": {"max_tokens": 100, "temperature": 0.7},
            "experiment2": {"max_tokens": 200, "temperature": 0.9},
        }
        sweep = ParameterSweep.read_from_dict(data)
        assert len(sweep) == 2
        # Check that items have the _benchmark_name field
        names = {item["_benchmark_name"] for item in sweep}
        assert names == {"experiment1", "experiment2"}
        # Check that parameters are preserved
        for item in sweep:
            if item["_benchmark_name"] == "experiment1":
                assert item["max_tokens"] == 100
                assert item["temperature"] == 0.7
            elif item["_benchmark_name"] == "experiment2":
                assert item["max_tokens"] == 200
                assert item["temperature"] == 0.9

    def test_read_json_list_format(self):
        """Test reading JSON file with list format."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        # delete=False so read_json can reopen the file by path after the
        # with-block closes it; cleanup happens in the finally clause.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(records, f)
            temp_path = Path(f.name)
        try:
            sweep = ParameterSweep.read_json(temp_path)
            assert len(sweep) == 2
            assert sweep[0]["max_tokens"] == 100
            assert sweep[1]["max_tokens"] == 200
        finally:
            temp_path.unlink()

    def test_read_json_dict_format(self):
        """Test reading JSON file with dict format."""
        data = {
            "experiment1": {"max_tokens": 100, "temperature": 0.7},
            "experiment2": {"max_tokens": 200, "temperature": 0.9},
        }
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(data, f)
            temp_path = Path(f.name)
        try:
            sweep = ParameterSweep.read_json(temp_path)
            assert len(sweep) == 2
            # Check that items have the _benchmark_name field
            names = {item["_benchmark_name"] for item in sweep}
            assert names == {"experiment1", "experiment2"}
        finally:
            temp_path.unlink()

    def test_unique_benchmark_names_validation(self):
        """Test that duplicate _benchmark_name values raise an error."""
        # Test with duplicate names in list format
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"_benchmark_name": "exp1", "max_tokens": 200},
        ]
        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
            ParameterSweep.from_records(records)

    def test_unique_benchmark_names_multiple_duplicates(self):
        """Test validation with multiple duplicate names."""
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"_benchmark_name": "exp1", "max_tokens": 200},
            {"_benchmark_name": "exp2", "max_tokens": 300},
            {"_benchmark_name": "exp2", "max_tokens": 400},
        ]
        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
            ParameterSweep.from_records(records)

    def test_no_benchmark_names_allowed(self):
        """Test that records without _benchmark_name are allowed."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2

    def test_mixed_benchmark_names_allowed(self):
        """Test that mixing records with and without _benchmark_name is allowed."""
        # Uniqueness is only enforced among records that do carry a name.
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2
class TestParameterSweepItemKeyNormalization:
    """Tests for how ParameterSweepItem normalizes record keys into CLI flags."""

    def test_underscore_to_hyphen_conversion(self):
        """A plain key's underscores become hyphens in the generated flag."""
        sweep_item = ParameterSweepItem.from_record({"max_tokens": 100})
        argv = sweep_item.apply_to_cmd(["vllm", "serve"])
        assert "--max-tokens" in argv

    def test_nested_key_preserves_suffix(self):
        """Only the prefix of a dotted key is hyphenated; the suffix keeps underscores."""
        record = {"compilation_config.some_nested_param": "value"}
        sweep_item = ParameterSweepItem.from_record(record)
        argv = sweep_item.apply_to_cmd(["vllm", "serve"])
        # The prefix (compilation_config) is converted to hyphens, while the
        # part after the dot (some_nested_param) is left untouched.
        matching = [arg for arg in argv if "compilation-config.some_nested_param" in arg]
        assert matching

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pandas as pd
import pytest
from vllm.benchmarks.sweep.plot import (
PlotEqualTo,
PlotFilterBase,
PlotFilters,
PlotGreaterThan,
PlotGreaterThanOrEqualTo,
PlotLessThan,
PlotLessThanOrEqualTo,
PlotNotEqualTo,
)
class TestPlotFilters:
    """Test PlotFilter functionality including 'inf' edge case."""

    def setup_method(self):
        """Create sample DataFrames for testing."""
        # DataFrame with numeric values
        self.df_numeric = pd.DataFrame(
            {
                "request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
                "value": [10, 20, 30, 40, 50],
            }
        )
        # DataFrame with float('inf') - note: string "inf" values are coerced
        # to float when loading data, so we only test with float('inf')
        self.df_inf_float = pd.DataFrame(
            {
                "request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
                "value": [10, 20, 30, 40, 50],
            }
        )

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("5.0", 1),
            ("10.0", 1),
            ("1.0", 1),
        ],
    )
    def test_equal_to_numeric(self, target, expected_count):
        """Test PlotEqualTo with numeric values."""
        # Targets are given as strings, mirroring how filters arrive from
        # parsed CLI text.
        filter_obj = PlotEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    def test_equal_to_inf_float(self):
        """Test PlotEqualTo with float('inf')."""
        filter_obj = PlotEqualTo("request_rate", "inf")
        result = filter_obj.apply(self.df_inf_float)
        # Should match both float('inf') entries because float('inf') == float('inf')
        assert len(result) == 2

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("5.0", 4),  # All except 5.0
            ("1.0", 4),  # All except 1.0
        ],
    )
    def test_not_equal_to_numeric(self, target, expected_count):
        """Test PlotNotEqualTo with numeric values."""
        filter_obj = PlotNotEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    def test_not_equal_to_inf_float(self):
        """Test PlotNotEqualTo with float('inf')."""
        filter_obj = PlotNotEqualTo("request_rate", "inf")
        result = filter_obj.apply(self.df_inf_float)
        # Should exclude float('inf') entries
        assert len(result) == 3

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 2),  # 1.0, 5.0
            ("50.0", 3),  # 1.0, 5.0, 10.0
            ("5.0", 1),  # 1.0
        ],
    )
    def test_less_than(self, target, expected_count):
        """Test PlotLessThan with numeric values."""
        filter_obj = PlotLessThan("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 3),  # 1.0, 5.0, 10.0
            ("5.0", 2),  # 1.0, 5.0
        ],
    )
    def test_less_than_or_equal_to(self, target, expected_count):
        """Test PlotLessThanOrEqualTo with numeric values."""
        filter_obj = PlotLessThanOrEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 2),  # 50.0, 100.0
            ("5.0", 3),  # 10.0, 50.0, 100.0
        ],
    )
    def test_greater_than(self, target, expected_count):
        """Test PlotGreaterThan with numeric values."""
        filter_obj = PlotGreaterThan("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 3),  # 10.0, 50.0, 100.0
            ("5.0", 4),  # 5.0, 10.0, 50.0, 100.0
        ],
    )
    def test_greater_than_or_equal_to(self, target, expected_count):
        """Test PlotGreaterThanOrEqualTo with numeric values."""
        filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "filter_str,expected_var,expected_target,expected_type",
        [
            ("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
            ("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
            ("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
            ("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
            ("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
            ("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
            ("request_rate==inf", "request_rate", "inf", PlotEqualTo),
            # Quoted target: the quotes around 'inf' are stripped in parsing.
            ("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
        ],
    )
    def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
        """Test parsing filter strings."""
        filter_obj = PlotFilterBase.parse_str(filter_str)
        assert isinstance(filter_obj, expected_type)
        assert filter_obj.var == expected_var
        assert filter_obj.target == expected_target

    def test_parse_str_inf_edge_case(self):
        """Test parsing 'inf' string in filter."""
        # 'inf' stays a string at parse time; coercion happens at apply time.
        filter_obj = PlotFilterBase.parse_str("request_rate==inf")
        assert isinstance(filter_obj, PlotEqualTo)
        assert filter_obj.var == "request_rate"
        assert filter_obj.target == "inf"

    def test_parse_multiple_filters(self):
        """Test parsing multiple filters."""
        # Filters are comma-separated within a single string.
        filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
        assert len(filters) == 2
        assert isinstance(filters[0], PlotGreaterThan)
        assert isinstance(filters[1], PlotLessThanOrEqualTo)

    def test_parse_empty_filter(self):
        """Test parsing empty filter string."""
        filters = PlotFilters.parse_str("")
        assert len(filters) == 0

View File

@@ -0,0 +1,484 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Any, NamedTuple, cast
import numpy as np
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (
RandomDataset,
RandomMultiModalDataset,
SampleRequest,
)
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Session-wide tokenizer fixture; gpt2 is small and commonly available."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
# Workload description shared by the determinism tests; a NamedTuple keeps it
# immutable and tuple-comparable.
Params = NamedTuple(
    "Params",
    [
        ("num_requests", int),
        ("prefix_len", int),
        ("range_ratio", float),
        ("input_len", int),
        ("output_len", int),
    ],
)
@pytest.fixture(scope="session")
def random_dataset_params() -> Params:
    """Small fixed workload so the determinism tests run quickly."""
    return Params(
        num_requests=16, prefix_len=7, range_ratio=0.3, input_len=50, output_len=20
    )
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
    """Project a SampleRequest into a comparable tuple."""
    # Only the fields relevant to the determinism checks are kept.
    return (req.prompt, req.prompt_len, req.expected_output_len)
def _collect_samples(
    dataset: RandomDataset,
    tokenizer: PreTrainedTokenizerBase,
    num_requests: int = 16,
    prefix_len: int = 7,
    range_ratio: float = 0.3,
    input_len: int = 50,
    output_len: int = 20,
) -> list[tuple[str, int, int]]:
    """Draw samples from *dataset* and reduce each one to a fingerprint tuple."""
    drawn = dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
    )
    return list(map(_fingerprint_sample, drawn))
@pytest.mark.benchmark
def test_random_dataset_same_seed(
    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
) -> None:
    """Same seed should yield identical outputs, even if global RNGs change.

    This guards against accidental reliance on Python's random or np.random
    in RandomDataset after moving to numpy.default_rng.
    """
    p = random_dataset_params
    common_seed = 123
    dataset_a = RandomDataset(random_seed=common_seed)
    dataset_b = RandomDataset(random_seed=common_seed)
    a = _collect_samples(
        dataset_a,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    # Perturb global RNG state to ensure isolation
    random.seed(999)
    _ = [random.random() for _ in range(100)]
    np.random.seed(888)
    _ = [np.random.random() for _ in range(100)]
    b = _collect_samples(
        dataset_b,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    # dataset_b must reproduce dataset_a exactly despite the perturbation.
    assert a == b
@pytest.mark.benchmark
def test_random_dataset_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
) -> None:
    """Different seeds should change outputs with overwhelming likelihood."""
    p = random_dataset_params
    seed_a = 0
    dataset_a = RandomDataset(random_seed=seed_a)
    a = _collect_samples(
        dataset_a,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    seed_b = 999
    dataset_b = RandomDataset(random_seed=seed_b)
    # Perturb global RNG with same seed as dataset_a to ensure isolation
    # (if the dataset leaked into global RNGs, b could collide with a).
    random.seed(seed_a)
    np.random.seed(seed_a)
    b = _collect_samples(
        dataset_b,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    assert a != b
# -----------------------------
# RandomMultiModalDataset tests
# -----------------------------
def _mm_fingerprint_sample(
    req: SampleRequest,
) -> tuple[str, int, int, int, list[str]]:
    """Create a compact fingerprint for multimodal samples.

    Captures the prompt string, its token length, the expected output
    length, the number of attached multimodal items, and a short per-item
    tag of the form '<kind>:<url prefix>' (e.g. 'image:data:image/jpeg;b').
    Only a 22-character URL prefix is kept to avoid huge base64 strings.
    """
    items = req.multi_modal_data or []
    tags: list[str] = []
    for entry in items:
        tag = "unknown:"
        if isinstance(entry, dict):
            kind = entry.get("type")
            if kind == "image_url":
                url = entry.get("image_url", {}).get("url", "")
                tag = f"image:{url[:22]}"
            elif kind == "video_url":
                url = entry.get("video_url", {}).get("url", "")
                tag = f"video:{url[:22]}"
        tags.append(tag)
    return (
        req.prompt,
        req.prompt_len,
        req.expected_output_len,
        len(items),
        tags,
    )
def _collect_mm_samples(
    dataset: RandomMultiModalDataset,
    tokenizer: PreTrainedTokenizerBase,
    *,
    num_requests: int = 8,
    prefix_len: int = 3,
    range_ratio: float = 0.0,
    input_len: int = 20,
    output_len: int = 5,
    base_items_per_request: int = 2,
    num_mm_items_range_ratio: float = 0.0,
    limit_mm_per_prompt: dict[str, int] | None = None,
    bucket_config: dict[tuple[int, int, int], float] | None = None,
    enable_multimodal_chat: bool = False,
) -> list[SampleRequest]:
    """Call dataset.sample with keyword-only overrides and test-friendly defaults."""
    # None sentinels avoid mutable default arguments; resolve them per call.
    if limit_mm_per_prompt is None:
        limit_mm_per_prompt = {"image": 5, "video": 0}
    if bucket_config is None:
        bucket_config = {(32, 32, 1): 0.5, (52, 64, 1): 0.5}
    return dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
        base_items_per_request=base_items_per_request,
        num_mm_items_range_ratio=num_mm_items_range_ratio,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        enable_multimodal_chat=enable_multimodal_chat,
    )
@pytest.mark.benchmark
def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Two datasets built with the same seed must produce identical samples."""
    shared_seed = 42
    dataset_one = RandomMultiModalDataset(random_seed=shared_seed)
    dataset_two = RandomMultiModalDataset(random_seed=shared_seed)
    first = _collect_mm_samples(dataset_one, hf_tokenizer)
    second = _collect_mm_samples(dataset_two, hf_tokenizer)
    first_prints = [_mm_fingerprint_sample(sample) for sample in first]
    second_prints = [_mm_fingerprint_sample(sample) for sample in second]
    assert first_prints == second_prints
@pytest.mark.benchmark
def test_random_mm_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Differently-seeded datasets should produce diverging sample streams."""
    dataset_low = RandomMultiModalDataset(random_seed=0)
    dataset_high = RandomMultiModalDataset(random_seed=999)
    low_samples = _collect_mm_samples(dataset_low, hf_tokenizer)
    high_samples = _collect_mm_samples(dataset_high, hf_tokenizer)
    low_prints = [_mm_fingerprint_sample(sample) for sample in low_samples]
    high_prints = [_mm_fingerprint_sample(sample) for sample in high_samples]
    assert low_prints != high_prints
@pytest.mark.benchmark
def test_random_mm_respects_limits(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Requesting more items than limit_mm_per_prompt allows must raise."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Requesting 3 items with a per-prompt limit of 1 should error per current
    # design (dataset refuses to silently clamp below the requested baseline).
    with pytest.raises(ValueError):
        _collect_mm_samples(
            ds,
            hf_tokenizer,
            num_requests=12,
            base_items_per_request=3,
            num_mm_items_range_ratio=0.0,
            limit_mm_per_prompt={"image": 1, "video": 0},
            bucket_config={(32, 32, 1): 1.0},
        )
@pytest.mark.benchmark
def test_random_mm_zero_prob_entries_are_removed(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Buckets with probability 0 must never contribute sampled items."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Second bucket has zero probability and should be ignored after
    # normalization
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=6,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 10, "video": 0},
        bucket_config={(32, 32, 1): 1.0, (52, 64, 1): 0.0},
    )
    for s in samples:
        assert isinstance(s.multi_modal_data, list)
        # cast() is for the type checker only; no runtime conversion happens.
        typed_mm = cast(list[dict[str, Any]], s.multi_modal_data)
        for it in typed_mm:
            # Only the single-frame (image) bucket should remain.
            assert it.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """A baseline of zero items per request must yield empty mm payloads."""
    dataset = RandomMultiModalDataset(random_seed=0)
    samples = _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=0,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 5, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    assert all(sample.multi_modal_data == [] for sample in samples)
@pytest.mark.benchmark
def test_random_mm_num_items_per_prompt(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """With range ratio 0, every request gets exactly the baseline item count."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Fixed number of images per prompt
    # set num_mm_items_range_ratio to 0.0
    # TODO: modify video values when video sampling is implemented
    samples_fixed_items = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=3,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 3, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Must have 5 requests each with 3 mm items per prompt
    assert len(samples_fixed_items) == 5
    for s in samples_fixed_items:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 3
        for it in mm_data:
            assert it.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_bucket_config_not_mutated(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sampling must not mutate the caller-provided bucket_config dict."""
    ds = RandomMultiModalDataset(random_seed=0)
    # This bucket config is not normalized to sum to 1
    # and has more buckets than requested images
    original = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
    # Keep a snapshot to compare after sampling
    snapshot = dict(original)
    _ = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=4,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 1, "video": 0},
        bucket_config=original,
    )
    # Ensure the original dict content is unchanged
    assert original == snapshot
    # NOTE(review): everything below exercises varying item counts and looks
    # like it belongs in test_random_mm_num_items_per_prompt rather than
    # here — consider moving it there.
    # Vary number of mm items per prompt
    # set num_mm_items_range_ratio to 0.5
    samples_varying_items = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.5,
        limit_mm_per_prompt={"image": 4, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Must have 5 requests each with less than 4 mm items per prompt
    # but at least 1 mm item per prompt
    assert len(samples_varying_items) == 5
    for s in samples_varying_items:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) <= 4
        assert len(mm_data) >= 1
        for it in mm_data:
            assert it.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_video_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Test video sampling functionality in RandomMultiModalDataset."""
    ds = RandomMultiModalDataset(random_seed=42)
    # Test with video bucket configuration; a bucket is a
    # (height, width, num_frames) triple mapped to a sampling probability.
    bucket_config = {
        (64, 64, 1): 0.3,  # Images
        (64, 64, 8): 0.7,  # Videos
    }
    limit_mm_per_prompt = {"image": 2, "video": 2}
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    assert len(samples) == 5
    # Check that we have both images and videos
    video_count = 0
    image_count = 0
    for s in samples:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 1
        item = mm_data[0]
        if item.get("type") == "video_url":
            video_count += 1
            # Verify video URL format
            url = item.get("video_url", {}).get("url", "")
            assert url.startswith("data:video/mp4;base64,")
        elif item.get("type") == "image_url":
            image_count += 1
            # Verify image URL format
            url = item.get("image_url", {}).get("url", "")
            assert url.startswith("data:image/jpeg;base64,")
    # Should have some videos due to 0.7 probability
    # (fixed seed keeps this assertion deterministic)
    assert video_count > 0
    assert image_count > 0
@pytest.mark.benchmark
def test_random_mm_video_only_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Test sampling with only video buckets."""
    ds = RandomMultiModalDataset(random_seed=42)
    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }
    limit_mm_per_prompt = {"image": 0, "video": 1}
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    assert len(samples) == 3
    for s in samples:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 1
        item = mm_data[0]
        # Every item must be a base64-encoded mp4 data URL.
        assert item.get("type") == "video_url"
        url = item.get("video_url", {}).get("url", "")
        assert url.startswith("data:video/mp4;base64,")
@pytest.mark.benchmark
def test_random_mm_video_deterministic_sampling(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Test that video sampling is deterministic with same seed."""
    seed = 123
    ds_a = RandomMultiModalDataset(random_seed=seed)
    ds_b = RandomMultiModalDataset(random_seed=seed)
    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }
    limit_mm_per_prompt = {"image": 0, "video": 1}
    a = _collect_mm_samples(
        ds_a,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    b = _collect_mm_samples(
        ds_b,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    # Fingerprints (prompt, lengths, item tags) must match exactly.
    fa = [_mm_fingerprint_sample(s) for s in a]
    fb = [_mm_fingerprint_sample(s) for s in b]
    assert fa == fb

View File

@@ -0,0 +1,398 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
from tempfile import NamedTemporaryFile
from typing import Any, cast
import cv2
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import RandomMultiModalDataset, SampleRequest
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Use a small, commonly available tokenizer."""
    # Session scope: load the tokenizer once for the whole test run.
    return AutoTokenizer.from_pretrained("gpt2")
@pytest.fixture
def video_dataset() -> RandomMultiModalDataset:
    """Create a RandomMultiModalDataset instance for testing."""
    # Fixed seed keeps each test's dataset deterministic.
    return RandomMultiModalDataset(random_seed=42)
@pytest.mark.benchmark
def test_generate_synthetic_video_different_seeds():
    """Test that different seeds produce different videos."""
    dataset1 = RandomMultiModalDataset(random_seed=123)
    dataset2 = RandomMultiModalDataset(random_seed=456)
    width, height, num_frames = 64, 48, 8
    video1 = dataset1.generate_synthetic_video(width, height, num_frames)
    video2 = dataset2.generate_synthetic_video(width, height, num_frames)
    # Videos should be different due to different seeds
    # (compare raw encoded bytes; equality would mean the seed is ignored).
    assert video1["bytes"] != video2["bytes"]
@pytest.mark.benchmark
def test_map_config_to_modality(video_dataset: RandomMultiModalDataset):
    """Test modality mapping for different configurations."""
    # A config is a (height, width, num_frames) triple.
    # Test image configuration (num_frames = 1)
    assert video_dataset.map_config_to_modality((256, 256, 1)) == "image"
    assert video_dataset.map_config_to_modality((720, 1280, 1)) == "image"
    # Test video configurations (num_frames > 1)
    assert video_dataset.map_config_to_modality((256, 256, 8)) == "video"
    assert video_dataset.map_config_to_modality((720, 1280, 16)) == "video"
    assert video_dataset.map_config_to_modality((64, 64, 32)) == "video"
    # Test invalid configurations (num_frames < 1)
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.map_config_to_modality((256, 256, 0))
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.map_config_to_modality((256, 256, -1))
@pytest.mark.benchmark
def test_generate_mm_item_video(video_dataset: RandomMultiModalDataset):
    """Test generating multimodal items for video configurations."""
    # Test video item generation
    video_config = (64, 48, 8)  # height, width, num_frames
    result = video_dataset.generate_mm_item(video_config)
    # Check the result structure matches OpenAI API format
    assert isinstance(result, dict)
    assert result["type"] == "video_url"
    assert "video_url" in result
    assert "url" in result["video_url"]
    # Check that the URL is a data URL with base64 encoded video
    url = result["video_url"]["url"]
    assert url.startswith("data:video/mp4;base64,")
    # Decode and verify the video content
    base64_data = url.split(",")[1]
    video_bytes = base64.b64decode(base64_data)
    assert len(video_bytes) > 0
    # Verify the video can be decoded; write to a temp file because
    # cv2.VideoCapture opens by path.
    with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        temp_path = temp_file.name
        temp_file.write(video_bytes)
    try:
        cap = cv2.VideoCapture(temp_path)
        assert cap.isOpened()
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Config order is (height=64, width=48, num_frames=8), so OpenCV
        # reports width 48 and height 64.
        assert frame_count == 8
        assert frame_width == 48
        assert frame_height == 64
        cap.release()
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
@pytest.mark.benchmark
def test_generate_mm_item_image(video_dataset: RandomMultiModalDataset):
    """Test generating multimodal items for image configurations."""
    # Test image item generation
    image_config = (64, 48, 1)  # height, width, num_frames=1
    result = video_dataset.generate_mm_item(image_config)
    # Check the result structure matches OpenAI API format
    assert isinstance(result, dict)
    assert result["type"] == "image_url"
    assert "image_url" in result
    assert "url" in result["image_url"]
    # Check that the URL is a data URL with base64 encoded image
    url = result["image_url"]["url"]
    assert url.startswith("data:image/jpeg;base64,")
@pytest.mark.benchmark
def test_generate_mm_item_invalid_config(video_dataset: RandomMultiModalDataset):
    """Test error handling for invalid configurations."""
    # num_frames == 0 is neither an image (1) nor a video (>1).
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.generate_mm_item((256, 256, 0))
@pytest.mark.benchmark
def test_sample_with_video_buckets(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test sampling with video bucket configurations."""
    # Configure bucket with video probability > 0
    bucket_config = {
        (64, 64, 1): 0.3,  # Images
        (64, 64, 8): 0.7,  # Videos
    }
    limit_mm_per_prompt = {"image": 5, "video": 3}
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 5
    # Check that samples contain both images and videos
    video_count = 0
    image_count = 0
    for sample in samples:
        assert isinstance(sample, SampleRequest)
        assert sample.multi_modal_data is not None
        assert isinstance(sample.multi_modal_data, list)
        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
        assert len(mm_data) == 2  # base_items_per_request
        for item in mm_data:
            if item["type"] == "video_url":
                video_count += 1
                # Verify video URL format
                url = item["video_url"]["url"]
                assert url.startswith("data:video/mp4;base64,")
            elif item["type"] == "image_url":
                image_count += 1
                # Verify image URL format
                url = item["image_url"]["url"]
                assert url.startswith("data:image/jpeg;base64,")
    # Should have some videos due to 0.7 probability
    # (fixed fixture seed keeps this deterministic)
    assert video_count > 0
    assert image_count > 0
@pytest.mark.benchmark
def test_sample_video_only_buckets(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test sampling with only video buckets."""
    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }
    limit_mm_per_prompt = {"image": 0, "video": 2}
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 3
    for sample in samples:
        assert isinstance(sample, SampleRequest)
        assert sample.multi_modal_data is not None
        assert isinstance(sample.multi_modal_data, list)
        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
        assert len(mm_data) == 1
        item = mm_data[0]
        # With a single all-video bucket, every item must be an mp4 data URL.
        assert item["type"] == "video_url"
        url = item["video_url"]["url"]
        assert url.startswith("data:video/mp4;base64,")
@pytest.mark.benchmark
def test_sample_respects_video_limits(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test that sampling respects video limits per prompt."""
    # Videos-only bucket combined with a per-prompt limit of a single video.
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},
        bucket_config={(64, 64, 8): 1.0},  # Only videos
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 3
    for req in samples:
        items = cast(list[dict[str, Any]], req.multi_modal_data)
        # Never more items than the per-prompt video limit allows.
        assert len(items) <= 1
@pytest.mark.benchmark
def test_sample_mixed_buckets_with_zero_probability(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test sampling with mixed buckets including zero probability entries."""

    def _decoded_video_size(data_url: str) -> tuple[int, int]:
        # Round-trip the base64 payload through a temp .mp4 file so OpenCV
        # can report the encoded frame dimensions.
        payload = base64.b64decode(data_url.split(",")[1])
        with NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:  # noqa
            tmp_path = tmp.name
            tmp.write(payload)
        try:
            cap = cv2.VideoCapture(tmp_path)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            cap.release()
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
        return width, height

    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=4,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 2, "video": 2},
        bucket_config={
            (64, 64, 1): 0.5,  # Images
            (64, 64, 8): 0.5,  # Videos
            (128, 128, 16): 0.0,  # Zero probability videos (should be ignored)
        },
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 4
    # Only the 64x64 video bucket has nonzero probability, so every decoded
    # video must be 64x64 — a 128x128 video would mean the zero-probability
    # bucket leaked into sampling.
    for req in samples:
        for entry in cast(list[dict[str, Any]], req.multi_modal_data):
            if entry["type"] == "video_url":
                assert _decoded_video_size(entry["video_url"]["url"]) == (64, 64)
@pytest.mark.benchmark
def test_sample_deterministic_with_videos(hf_tokenizer: PreTrainedTokenizerBase):
    """Test that sampling with videos is deterministic with same seed."""
    # Identical sampling parameters for both datasets; only the (equal)
    # seeds could introduce divergence.
    shared_kwargs = dict(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 2, "video": 2},
        bucket_config={
            (64, 64, 1): 0.3,  # Images
            (64, 64, 8): 0.7,  # Videos
        },
        input_len=20,
        output_len=5,
    )
    first = RandomMultiModalDataset(random_seed=123).sample(**shared_kwargs)
    second = RandomMultiModalDataset(random_seed=123).sample(**shared_kwargs)
    assert len(first) == len(second)
    # Equal seeds must yield identical multimodal payloads.
    for a, b in zip(first, second):
        assert a.multi_modal_data == b.multi_modal_data
@pytest.mark.benchmark
def test_sample_different_seeds_produce_different_videos(
    hf_tokenizer: PreTrainedTokenizerBase,
):
    """Test that different seeds produce different video content."""
    # Same sampling parameters; only the seeds differ.
    common_kwargs = dict(
        tokenizer=hf_tokenizer,
        num_requests=2,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},
        bucket_config={(64, 64, 8): 1.0},  # Only videos
        input_len=20,
        output_len=5,
    )
    first = RandomMultiModalDataset(random_seed=123).sample(**common_kwargs)
    second = RandomMultiModalDataset(random_seed=456).sample(**common_kwargs)
    # Different seeds must not reproduce the same encoded video bytes.
    for a, b in zip(first, second):
        items_a = cast(list[dict[str, Any]], a.multi_modal_data)
        items_b = cast(list[dict[str, Any]], b.multi_modal_data)
        assert len(items_a) == len(items_b) == 1
        assert items_a[0]["video_url"]["url"] != items_b[0]["video_url"]["url"]

View File

@@ -0,0 +1,77 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
from ..utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM OpenAI-compatible server for the bench-serve tests."""
    # Keep startup cheap: small context, eager execution, dummy weights.
    server_args = [
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
@pytest.mark.benchmark
def test_bench_serve(server):
    """Smoke-test `vllm bench serve` against the module-scoped server."""
    # Assemble the CLI invocation option group by option group.
    command = ["vllm", "bench", "serve"]
    command += ["--model", MODEL_NAME]
    command += ["--host", server.host, "--port", str(server.port)]
    command += ["--dataset-name", "random"]
    command += ["--random-input-len", "32"]
    command += ["--random-output-len", "4"]
    command += ["--num-prompts", "5"]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface the benchmark's own output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)
    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
@pytest.mark.benchmark
def test_bench_serve_chat(server):
    """Smoke-test `vllm bench serve` through the chat-completions endpoint."""
    # Same invocation as the plain serve test, plus the chat endpoint/backend.
    command = ["vllm", "bench", "serve"]
    command += ["--model", MODEL_NAME]
    command += ["--host", server.host, "--port", str(server.port)]
    command += ["--dataset-name", "random"]
    command += ["--random-input-len", "32"]
    command += ["--random-output-len", "4"]
    command += ["--num-prompts", "5"]
    command += ["--endpoint", "/v1/chat/completions"]
    command += ["--backend", "openai-chat"]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface the benchmark's own output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)
    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"

View File

@@ -0,0 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_throughput():
    """Smoke-test the `vllm bench throughput` CLI end to end."""
    # Minimal workload: tiny input/output lengths, eager mode, dummy weights.
    command = ["vllm", "bench", "throughput"]
    command += ["--model", MODEL_NAME]
    command += ["--input-len", "32"]
    command += ["--output-len", "1"]
    command += ["--enforce-eager"]
    command += ["--load-format", "dummy"]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface the benchmark's own output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)
    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"