Sync from v0.13
This commit is contained in:
0
tests/benchmarks/__init__.py
Normal file
0
tests/benchmarks/__init__.py
Normal file
30
tests/benchmarks/test_latency_cli.py
Normal file
30
tests/benchmarks/test_latency_cli.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_bench_latency():
    """Smoke-test the ``vllm bench latency`` CLI end to end.

    Uses a tiny input/output length, eager mode, and the dummy load format
    (flags below) so the run stays cheap; only the exit code is asserted.
    """
    result = subprocess.run(
        [
            "vllm",
            "bench",
            "latency",
            "--model",
            MODEL_NAME,
            "--input-len",
            "32",
            "--output-len",
            "1",
            "--enforce-eager",
            "--load-format",
            "dummy",
        ],
        capture_output=True,
        text=True,
    )
    # Surface the CLI output in the pytest log to ease debugging on failure.
    print(result.stdout)
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
|
||||
249
tests/benchmarks/test_param_sweep.py
Normal file
249
tests/benchmarks/test_param_sweep.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
|
||||
|
||||
|
||||
class TestParameterSweepItem:
    """Test ParameterSweepItem functionality.

    Covers how a sweep record is rendered onto a ``vllm serve`` command
    line: boolean flags, nested (dotted) keys, dict values, and in-place
    replacement of flags already present in the base command.
    """

    @pytest.mark.parametrize(
        "input_dict,expected",
        [
            (
                {"compilation_config.use_inductor_graph_partition": False},
                "--compilation-config.use_inductor_graph_partition=false",
            ),
            (
                {"compilation_config.use_inductor_graph_partition": True},
                "--compilation-config.use_inductor_graph_partition=true",
            ),
        ],
    )
    def test_nested_boolean_params(self, input_dict, expected):
        """Test that nested boolean params use =true/false syntax."""
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert expected in cmd

    @pytest.mark.parametrize(
        "input_dict,expected",
        [
            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
            ({"enable_prefix_caching": True}, "--enable-prefix-caching"),
            ({"disable_log_stats": False}, "--no-disable-log-stats"),
            ({"disable_log_stats": True}, "--disable-log-stats"),
        ],
    )
    def test_non_nested_boolean_params(self, input_dict, expected):
        """Test that non-nested boolean params use --no- prefix."""
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert expected in cmd

    @pytest.mark.parametrize(
        "compilation_config",
        [
            {"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
            {
                "cudagraph_mode": "piecewise",
                "mode": 3,
                "use_inductor_graph_partition": False,
            },
        ],
    )
    def test_nested_dict_value(self, compilation_config):
        """Test that nested dict values are serialized as JSON."""
        item = ParameterSweepItem.from_record(
            {"compilation_config": compilation_config}
        )
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert "--compilation-config" in cmd
        # The dict should be JSON serialized; round-trip it to compare by
        # value rather than by exact string formatting.
        idx = cmd.index("--compilation-config")
        assert json.loads(cmd[idx + 1]) == compilation_config

    @pytest.mark.parametrize(
        "input_dict,expected_key,expected_value",
        [
            ({"model": "test-model"}, "--model", "test-model"),
            ({"max_tokens": 100}, "--max-tokens", "100"),
            ({"temperature": 0.7}, "--temperature", "0.7"),
        ],
    )
    def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
        """Test that string and numeric values are handled correctly."""
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve"])
        assert expected_key in cmd
        assert expected_value in cmd

    @pytest.mark.parametrize(
        "input_dict,expected_key,key_idx_offset",
        [
            # key_idx_offset distinguishes key-value pairs (value follows
            # the key, offset 1) from bare boolean flags (offset 0).
            ({"max_tokens": 200}, "--max-tokens", 1),
            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
        ],
    )
    def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
        """Test that existing parameters in cmd are replaced."""
        item = ParameterSweepItem.from_record(input_dict)

        if key_idx_offset == 1:
            # Key-value pair: the old value ("100") must be gone entirely.
            cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
            assert expected_key in cmd
            idx = cmd.index(expected_key)
            assert cmd[idx + 1] == "200"
            assert "100" not in cmd
        else:
            # Boolean flag: the positive flag is swapped for its negation.
            cmd = item.apply_to_cmd(
                ["vllm", "serve", "--enable-prefix-caching", "model"]
            )
            assert expected_key in cmd
            assert "--enable-prefix-caching" not in cmd
|
||||
|
||||
|
||||
class TestParameterSweep:
    """Test ParameterSweep functionality.

    Covers construction from list/dict records, JSON file loading, and
    validation of the optional ``_benchmark_name`` field (must be unique
    when present, may be absent or mixed).
    """

    def test_from_records_list(self):
        """Test creating ParameterSweep from a list of records."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2
        # Order of records must be preserved.
        assert sweep[0]["max_tokens"] == 100
        assert sweep[1]["max_tokens"] == 200

    def test_read_from_dict(self):
        """Test creating ParameterSweep from a dict format."""
        data = {
            "experiment1": {"max_tokens": 100, "temperature": 0.7},
            "experiment2": {"max_tokens": 200, "temperature": 0.9},
        }
        sweep = ParameterSweep.read_from_dict(data)
        assert len(sweep) == 2

        # Check that items have the _benchmark_name field
        # (dict keys become the benchmark names).
        names = {item["_benchmark_name"] for item in sweep}
        assert names == {"experiment1", "experiment2"}

        # Check that parameters are preserved
        for item in sweep:
            if item["_benchmark_name"] == "experiment1":
                assert item["max_tokens"] == 100
                assert item["temperature"] == 0.7
            elif item["_benchmark_name"] == "experiment2":
                assert item["max_tokens"] == 200
                assert item["temperature"] == 0.9

    def test_read_json_list_format(self):
        """Test reading JSON file with list format."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]

        # delete=False so the file can be re-opened by read_json on
        # platforms where an open NamedTemporaryFile cannot be reopened.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(records, f)
            temp_path = Path(f.name)

        try:
            sweep = ParameterSweep.read_json(temp_path)
            assert len(sweep) == 2
            assert sweep[0]["max_tokens"] == 100
            assert sweep[1]["max_tokens"] == 200
        finally:
            temp_path.unlink()

    def test_read_json_dict_format(self):
        """Test reading JSON file with dict format."""
        data = {
            "experiment1": {"max_tokens": 100, "temperature": 0.7},
            "experiment2": {"max_tokens": 200, "temperature": 0.9},
        }

        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(data, f)
            temp_path = Path(f.name)

        try:
            sweep = ParameterSweep.read_json(temp_path)
            assert len(sweep) == 2

            # Check that items have the _benchmark_name field
            names = {item["_benchmark_name"] for item in sweep}
            assert names == {"experiment1", "experiment2"}
        finally:
            temp_path.unlink()

    def test_unique_benchmark_names_validation(self):
        """Test that duplicate _benchmark_name values raise an error."""
        # Test with duplicate names in list format
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"_benchmark_name": "exp1", "max_tokens": 200},
        ]

        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
            ParameterSweep.from_records(records)

    def test_unique_benchmark_names_multiple_duplicates(self):
        """Test validation with multiple duplicate names."""
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"_benchmark_name": "exp1", "max_tokens": 200},
            {"_benchmark_name": "exp2", "max_tokens": 300},
            {"_benchmark_name": "exp2", "max_tokens": 400},
        ]

        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
            ParameterSweep.from_records(records)

    def test_no_benchmark_names_allowed(self):
        """Test that records without _benchmark_name are allowed."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2

    def test_mixed_benchmark_names_allowed(self):
        """Test that mixing records with and without _benchmark_name is allowed."""
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2
|
||||
|
||||
|
||||
class TestParameterSweepItemKeyNormalization:
    """Test key normalization in ParameterSweepItem."""

    def test_underscore_to_hyphen_conversion(self):
        """Test that underscores are converted to hyphens in CLI."""
        item = ParameterSweepItem.from_record({"max_tokens": 100})
        cmd = item.apply_to_cmd(["vllm", "serve"])
        assert "--max-tokens" in cmd

    def test_nested_key_preserves_suffix(self):
        """Test that nested keys preserve the suffix format."""
        # The suffix after the dot should preserve underscores
        item = ParameterSweepItem.from_record(
            {"compilation_config.some_nested_param": "value"}
        )
        cmd = item.apply_to_cmd(["vllm", "serve"])
        # The prefix (compilation_config) gets converted to hyphens,
        # but the suffix (some_nested_param) is preserved
        assert any("compilation-config.some_nested_param" in arg for arg in cmd)
|
||||
171
tests/benchmarks/test_plot_filters.py
Normal file
171
tests/benchmarks/test_plot_filters.py
Normal file
@@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from vllm.benchmarks.sweep.plot import (
|
||||
PlotEqualTo,
|
||||
PlotFilterBase,
|
||||
PlotFilters,
|
||||
PlotGreaterThan,
|
||||
PlotGreaterThanOrEqualTo,
|
||||
PlotLessThan,
|
||||
PlotLessThanOrEqualTo,
|
||||
PlotNotEqualTo,
|
||||
)
|
||||
|
||||
|
||||
class TestPlotFilters:
    """Test PlotFilter functionality including 'inf' edge case.

    Filter targets are always given as strings (as they would arrive from
    the CLI); the filters are expected to compare them against numeric
    DataFrame columns, including float('inf').
    """

    def setup_method(self):
        """Create sample DataFrames for testing."""
        # DataFrame with numeric values
        self.df_numeric = pd.DataFrame(
            {
                "request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
                "value": [10, 20, 30, 40, 50],
            }
        )

        # DataFrame with float('inf') - note: string "inf" values are coerced
        # to float when loading data, so we only test with float('inf')
        self.df_inf_float = pd.DataFrame(
            {
                "request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
                "value": [10, 20, 30, 40, 50],
            }
        )

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("5.0", 1),
            ("10.0", 1),
            ("1.0", 1),
        ],
    )
    def test_equal_to_numeric(self, target, expected_count):
        """Test PlotEqualTo with numeric values."""
        filter_obj = PlotEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    def test_equal_to_inf_float(self):
        """Test PlotEqualTo with float('inf')."""
        filter_obj = PlotEqualTo("request_rate", "inf")
        result = filter_obj.apply(self.df_inf_float)
        # Should match both float('inf') entries because float('inf') == float('inf')
        assert len(result) == 2

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("5.0", 4),  # All except 5.0
            ("1.0", 4),  # All except 1.0
        ],
    )
    def test_not_equal_to_numeric(self, target, expected_count):
        """Test PlotNotEqualTo with numeric values."""
        filter_obj = PlotNotEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    def test_not_equal_to_inf_float(self):
        """Test PlotNotEqualTo with float('inf')."""
        filter_obj = PlotNotEqualTo("request_rate", "inf")
        result = filter_obj.apply(self.df_inf_float)
        # Should exclude float('inf') entries
        assert len(result) == 3

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 2),  # 1.0, 5.0
            ("50.0", 3),  # 1.0, 5.0, 10.0
            ("5.0", 1),  # 1.0
        ],
    )
    def test_less_than(self, target, expected_count):
        """Test PlotLessThan with numeric values."""
        filter_obj = PlotLessThan("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 3),  # 1.0, 5.0, 10.0
            ("5.0", 2),  # 1.0, 5.0
        ],
    )
    def test_less_than_or_equal_to(self, target, expected_count):
        """Test PlotLessThanOrEqualTo with numeric values."""
        filter_obj = PlotLessThanOrEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 2),  # 50.0, 100.0
            ("5.0", 3),  # 10.0, 50.0, 100.0
        ],
    )
    def test_greater_than(self, target, expected_count):
        """Test PlotGreaterThan with numeric values."""
        filter_obj = PlotGreaterThan("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 3),  # 10.0, 50.0, 100.0
            ("5.0", 4),  # 5.0, 10.0, 50.0, 100.0
        ],
    )
    def test_greater_than_or_equal_to(self, target, expected_count):
        """Test PlotGreaterThanOrEqualTo with numeric values."""
        filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "filter_str,expected_var,expected_target,expected_type",
        [
            ("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
            ("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
            ("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
            ("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
            ("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
            ("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
            ("request_rate==inf", "request_rate", "inf", PlotEqualTo),
            # Quotes around the target are expected to be stripped.
            ("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
        ],
    )
    def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
        """Test parsing filter strings."""
        filter_obj = PlotFilterBase.parse_str(filter_str)
        assert isinstance(filter_obj, expected_type)
        assert filter_obj.var == expected_var
        assert filter_obj.target == expected_target

    def test_parse_str_inf_edge_case(self):
        """Test parsing 'inf' string in filter."""
        filter_obj = PlotFilterBase.parse_str("request_rate==inf")
        assert isinstance(filter_obj, PlotEqualTo)
        assert filter_obj.var == "request_rate"
        # Target stays the literal string "inf"; coercion happens at apply().
        assert filter_obj.target == "inf"

    def test_parse_multiple_filters(self):
        """Test parsing multiple filters."""
        # Comma-separated filter expressions.
        filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
        assert len(filters) == 2
        assert isinstance(filters[0], PlotGreaterThan)
        assert isinstance(filters[1], PlotLessThanOrEqualTo)

    def test_parse_empty_filter(self):
        """Test parsing empty filter string."""
        filters = PlotFilters.parse_str("")
        assert len(filters) == 0
|
||||
484
tests/benchmarks/test_random_dataset.py
Normal file
484
tests/benchmarks/test_random_dataset.py
Normal file
@@ -0,0 +1,484 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
from typing import Any, NamedTuple, cast
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.benchmarks.datasets import (
|
||||
RandomDataset,
|
||||
RandomMultiModalDataset,
|
||||
SampleRequest,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Session-scoped HF tokenizer shared by all tests in this module."""
    # Use a small, commonly available tokenizer
    return AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
|
||||
class Params(NamedTuple):
    """Bundle of arguments forwarded to RandomDataset.sample() in these tests."""

    num_requests: int
    prefix_len: int
    range_ratio: float
    input_len: int
    output_len: int
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def random_dataset_params() -> Params:
    """Default sampling parameters shared by the determinism tests below."""
    return Params(
        num_requests=16, prefix_len=7, range_ratio=0.3, input_len=50, output_len=20
    )
|
||||
|
||||
|
||||
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
    """Project a SampleRequest into a comparable tuple.

    The tuple (prompt, prompt_len, expected_output_len) is enough to
    detect any difference between two sampled requests in these tests.
    """
    prompt = req.prompt
    prompt_len = req.prompt_len
    output_len = req.expected_output_len
    return (prompt, prompt_len, output_len)
|
||||
|
||||
|
||||
def _collect_samples(
    dataset: RandomDataset,
    tokenizer: PreTrainedTokenizerBase,
    num_requests: int = 16,
    prefix_len: int = 7,
    range_ratio: float = 0.3,
    input_len: int = 50,
    output_len: int = 20,
) -> list[tuple[str, int, int]]:
    """Sample from *dataset* and return one fingerprint tuple per request.

    Thin wrapper around RandomDataset.sample() that reduces each
    SampleRequest via _fingerprint_sample so results can be compared
    with plain equality.
    """
    samples = dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
    )
    return [_fingerprint_sample(s) for s in samples]
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_dataset_same_seed(
    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
) -> None:
    """Same seed should yield identical outputs, even if global RNGs change.

    This guards against accidental reliance on Python's random or np.random
    in RandomDataset after moving to numpy.default_rng.
    """
    p = random_dataset_params
    common_seed = 123
    dataset_a = RandomDataset(random_seed=common_seed)
    dataset_b = RandomDataset(random_seed=common_seed)
    a = _collect_samples(
        dataset_a,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )

    # Perturb global RNG state to ensure isolation: if RandomDataset leaked
    # a dependency on the global generators, dataset_b would now diverge.
    random.seed(999)
    _ = [random.random() for _ in range(100)]
    np.random.seed(888)
    _ = [np.random.random() for _ in range(100)]

    b = _collect_samples(
        dataset_b,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    assert a == b
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_dataset_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
) -> None:
    """Different seeds should change outputs with overwhelming likelihood."""
    p = random_dataset_params
    seed_a = 0
    dataset_a = RandomDataset(random_seed=seed_a)
    a = _collect_samples(
        dataset_a,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )

    seed_b = 999
    dataset_b = RandomDataset(random_seed=seed_b)
    # Perturb global RNG with same seed as dataset_a to ensure isolation:
    # if the dataset read from the global generators, b would equal a.
    random.seed(seed_a)
    np.random.seed(seed_a)
    b = _collect_samples(
        dataset_b,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    assert a != b
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# RandomMultiModalDataset tests
|
||||
# -----------------------------
|
||||
|
||||
|
||||
def _mm_fingerprint_sample(
    req: SampleRequest,
) -> tuple[str, int, int, int, list[str]]:
    """Create a compact fingerprint for multimodal samples.

    Includes:
    - prompt string
    - prompt_len
    - expected_output_len
    - count of multimodal items
    - per-item type and URL prefix (e.g., 'data:image/jpeg;base64,')
    """
    mm_items = req.multi_modal_data or []
    prefixes: list[str] = []
    for entry in mm_items:
        kind = entry.get("type") if isinstance(entry, dict) else None
        if kind == "image_url":
            # Only keep a short identifying prefix to avoid huge strings
            url = entry.get("image_url", {}).get("url", "")
            prefixes.append(f"image:{url[:22]}")
        elif kind == "video_url":
            url = entry.get("video_url", {}).get("url", "")
            prefixes.append(f"video:{url[:22]}")
        else:
            prefixes.append("unknown:")
    return (
        req.prompt,
        req.prompt_len,
        req.expected_output_len,
        len(mm_items),
        prefixes,
    )
|
||||
|
||||
|
||||
def _collect_mm_samples(
    dataset: RandomMultiModalDataset,
    tokenizer: PreTrainedTokenizerBase,
    *,
    num_requests: int = 8,
    prefix_len: int = 3,
    range_ratio: float = 0.0,
    input_len: int = 20,
    output_len: int = 5,
    base_items_per_request: int = 2,
    num_mm_items_range_ratio: float = 0.0,
    limit_mm_per_prompt: dict[str, int] | None = None,
    bucket_config: dict[tuple[int, int, int], float] | None = None,
    enable_multimodal_chat: bool = False,
) -> list[SampleRequest]:
    """Call RandomMultiModalDataset.sample() with test-friendly defaults.

    ``limit_mm_per_prompt`` and ``bucket_config`` default to None (rather
    than mutable dict defaults) and are filled in per call.
    """
    if limit_mm_per_prompt is None:
        limit_mm_per_prompt = {"image": 5, "video": 0}
    if bucket_config is None:
        # Two image buckets with equal probability; keys are presumably
        # (height, width, num_frames) tuples -- TODO confirm against
        # RandomMultiModalDataset.
        bucket_config = {(32, 32, 1): 0.5, (52, 64, 1): 0.5}
    return dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
        base_items_per_request=base_items_per_request,
        num_mm_items_range_ratio=num_mm_items_range_ratio,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        enable_multimodal_chat=enable_multimodal_chat,
    )
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Two datasets built with the same seed must produce identical samples."""
    seed = 42
    ds_a = RandomMultiModalDataset(random_seed=seed)
    ds_b = RandomMultiModalDataset(random_seed=seed)
    a = _collect_mm_samples(ds_a, hf_tokenizer)
    b = _collect_mm_samples(ds_b, hf_tokenizer)
    # Compare compact fingerprints rather than full SampleRequest objects.
    fa = [_mm_fingerprint_sample(s) for s in a]
    fb = [_mm_fingerprint_sample(s) for s in b]
    assert fa == fb
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Different seeds must yield different multimodal samples."""
    ds_a = RandomMultiModalDataset(random_seed=0)
    ds_b = RandomMultiModalDataset(random_seed=999)
    a = _collect_mm_samples(ds_a, hf_tokenizer)
    b = _collect_mm_samples(ds_b, hf_tokenizer)
    fa = [_mm_fingerprint_sample(s) for s in a]
    fb = [_mm_fingerprint_sample(s) for s in b]
    assert fa != fb
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_respects_limits(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """A per-prompt item limit below the requested baseline must raise."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Requesting 3 items with a per-prompt limit of 1 should error per current
    # design (dataset refuses to silently clamp below the requested baseline).
    with pytest.raises(ValueError):
        _collect_mm_samples(
            ds,
            hf_tokenizer,
            num_requests=12,
            base_items_per_request=3,
            num_mm_items_range_ratio=0.0,
            limit_mm_per_prompt={"image": 1, "video": 0},
            bucket_config={(32, 32, 1): 1.0},
        )
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_zero_prob_entries_are_removed(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Buckets with probability 0 must never be sampled from."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Second bucket has zero probability and should be ignored after
    # normalization
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=6,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 10, "video": 0},
        bucket_config={(32, 32, 1): 1.0, (52, 64, 1): 0.0},
    )
    # With only image buckets active, every mm item must be an image_url.
    for s in samples:
        assert isinstance(s.multi_modal_data, list)
        typed_mm = cast(list[dict[str, Any]], s.multi_modal_data)
        for it in typed_mm:
            assert it.get("type") == "image_url"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """base_items_per_request=0 must yield requests with no mm items."""
    ds = RandomMultiModalDataset(random_seed=0)
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=0,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 5, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    for s in samples:
        assert s.multi_modal_data == []
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_num_items_per_prompt(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """With range ratio 0, every prompt gets exactly the base item count."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Fixed number of images per prompt
    # set num_mm_items_range_ratio to 0.0
    # TODO: modify video values when video sampling is implemented
    samples_fixed_items = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=3,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 3, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Must have 5 requests each with 3 mm items per prompt
    assert len(samples_fixed_items) == 5
    for s in samples_fixed_items:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 3
        for it in mm_data:
            assert it.get("type") == "image_url"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_bucket_config_not_mutated(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """sample() must not mutate the caller's bucket_config dict."""
    ds = RandomMultiModalDataset(random_seed=0)
    # This bucket config is not normalized to sum to 1
    # and has more buckets than requested images
    original = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
    # Keep a snapshot to compare after sampling
    snapshot = dict(original)

    _ = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=4,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 1, "video": 0},
        bucket_config=original,
    )

    # Ensure the original dict content is unchanged
    assert original == snapshot

    # NOTE(review): the section below exercises varying item counts and
    # looks like it belongs in test_random_mm_num_items_per_prompt rather
    # than this mutation test -- confirm intent before moving it.
    # Vary number of mm items per prompt
    # set num_mm_items_range_ratio to 0.5
    samples_varying_items = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.5,
        limit_mm_per_prompt={"image": 4, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Must have 5 requests each with less than 4 mm items per prompt
    # but at least 1 mm item per prompt
    assert len(samples_varying_items) == 5
    for s in samples_varying_items:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) <= 4
        assert len(mm_data) >= 1
        for it in mm_data:
            assert it.get("type") == "image_url"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_video_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Test video sampling functionality in RandomMultiModalDataset."""
    ds = RandomMultiModalDataset(random_seed=42)

    # Test with video bucket configuration
    bucket_config = {
        (64, 64, 1): 0.3,  # Images
        (64, 64, 8): 0.7,  # Videos
    }

    limit_mm_per_prompt = {"image": 2, "video": 2}

    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )

    assert len(samples) == 5

    # Check that we have both images and videos
    video_count = 0
    image_count = 0

    for s in samples:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 1

        item = mm_data[0]
        if item.get("type") == "video_url":
            video_count += 1
            # Verify video URL format (base64 data URI)
            url = item.get("video_url", {}).get("url", "")
            assert url.startswith("data:video/mp4;base64,")
        elif item.get("type") == "image_url":
            image_count += 1
            # Verify image URL format
            url = item.get("image_url", {}).get("url", "")
            assert url.startswith("data:image/jpeg;base64,")

    # Should have some videos due to 0.7 probability
    # NOTE(review): probabilistic for arbitrary seeds, but deterministic
    # here because the dataset seed is fixed at 42.
    assert video_count > 0
    assert image_count > 0
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_video_only_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Test sampling with only video buckets."""
    ds = RandomMultiModalDataset(random_seed=42)

    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }

    limit_mm_per_prompt = {"image": 0, "video": 1}

    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )

    assert len(samples) == 3

    # Every item must be a video encoded as a base64 data URI.
    for s in samples:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 1

        item = mm_data[0]
        assert item.get("type") == "video_url"
        url = item.get("video_url", {}).get("url", "")
        assert url.startswith("data:video/mp4;base64,")
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_random_mm_video_deterministic_sampling(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Two datasets seeded identically must produce identical video samples."""
    seed = 123

    # Both runs share the exact same sampling parameters; only the dataset
    # instances differ, and they carry the same seed.
    shared_kwargs = dict(
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},
        bucket_config={(64, 64, 8): 1.0},  # videos only
    )

    run_a = _collect_mm_samples(
        RandomMultiModalDataset(random_seed=seed), hf_tokenizer, **shared_kwargs
    )
    run_b = _collect_mm_samples(
        RandomMultiModalDataset(random_seed=seed), hf_tokenizer, **shared_kwargs
    )

    fingerprints_a = [_mm_fingerprint_sample(s) for s in run_a]
    fingerprints_b = [_mm_fingerprint_sample(s) for s in run_b]
    assert fingerprints_a == fingerprints_b
|
||||
398
tests/benchmarks/test_random_multimodal_dataset_video.py
Normal file
398
tests/benchmarks/test_random_multimodal_dataset_video.py
Normal file
@@ -0,0 +1,398 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import os
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import Any, cast
|
||||
|
||||
import cv2
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.benchmarks.datasets import RandomMultiModalDataset, SampleRequest
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Provide a small, widely available tokenizer, shared for the session."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
def video_dataset() -> RandomMultiModalDataset:
    """Seeded RandomMultiModalDataset, rebuilt per test for isolation."""
    dataset = RandomMultiModalDataset(random_seed=42)
    return dataset
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_generate_synthetic_video_different_seeds():
    """Distinct seeds must yield distinct synthetic video bytes."""
    width, height, num_frames = 64, 48, 8

    clip_a = RandomMultiModalDataset(random_seed=123).generate_synthetic_video(
        width, height, num_frames
    )
    clip_b = RandomMultiModalDataset(random_seed=456).generate_synthetic_video(
        width, height, num_frames
    )

    # Same geometry, different seeds -> different encoded payloads.
    assert clip_a["bytes"] != clip_b["bytes"]
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_map_config_to_modality(video_dataset: RandomMultiModalDataset):
    """num_frames == 1 maps to image, > 1 to video, <= 0 is rejected."""
    # Single-frame configurations are images.
    for cfg in [(256, 256, 1), (720, 1280, 1)]:
        assert video_dataset.map_config_to_modality(cfg) == "image"

    # Multi-frame configurations are videos.
    for cfg in [(256, 256, 8), (720, 1280, 16), (64, 64, 32)]:
        assert video_dataset.map_config_to_modality(cfg) == "video"

    # Zero or negative frame counts are invalid.
    for bad_cfg in [(256, 256, 0), (256, 256, -1)]:
        with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
            video_dataset.map_config_to_modality(bad_cfg)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_generate_mm_item_video(video_dataset: RandomMultiModalDataset):
    """A video config must yield a decodable base64 MP4 data URL."""
    # (height, width, num_frames) — frames > 1 selects the video modality.
    entry = video_dataset.generate_mm_item((64, 48, 8))

    # OpenAI chat API shape: {"type": "video_url", "video_url": {"url": ...}}.
    assert isinstance(entry, dict)
    assert entry["type"] == "video_url"
    assert "video_url" in entry
    assert "url" in entry["video_url"]

    data_url = entry["video_url"]["url"]
    assert data_url.startswith("data:video/mp4;base64,")

    # Decode the payload and make sure it is non-empty.
    payload = base64.b64decode(data_url.split(",")[1])
    assert len(payload) > 0

    # Round-trip through OpenCV to confirm the MP4 decodes with the
    # requested geometry (64 rows x 48 cols x 8 frames).
    with NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        tmp_path = tmp.name
        tmp.write(payload)

    try:
        cap = cv2.VideoCapture(tmp_path)
        assert cap.isOpened()

        assert int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) == 8
        assert int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) == 48
        assert int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) == 64

        cap.release()
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_generate_mm_item_image(video_dataset: RandomMultiModalDataset):
    """A single-frame config must yield an OpenAI-style image_url item."""
    # num_frames == 1 selects the image modality.
    entry = video_dataset.generate_mm_item((64, 48, 1))

    # OpenAI chat API shape: {"type": "image_url", "image_url": {"url": ...}}.
    assert isinstance(entry, dict)
    assert entry["type"] == "image_url"
    assert "image_url" in entry
    assert "url" in entry["image_url"]

    # The URL must be a base64-encoded JPEG data URL.
    assert entry["image_url"]["url"].startswith("data:image/jpeg;base64,")
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_generate_mm_item_invalid_config(video_dataset: RandomMultiModalDataset):
    """A zero-frame configuration must be rejected with ValueError."""
    bad_config = (256, 256, 0)
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.generate_mm_item(bad_config)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_sample_with_video_buckets(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Mixed image/video buckets should yield both modalities across requests."""
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 5, "video": 3},
        bucket_config={(64, 64, 1): 0.3, (64, 64, 8): 0.7},  # images + videos
        input_len=20,
        output_len=5,
    )

    assert len(samples) == 5

    n_videos = 0
    n_images = 0
    for req in samples:
        assert isinstance(req, SampleRequest)
        assert req.multi_modal_data is not None
        assert isinstance(req.multi_modal_data, list)

        attachments = cast(list[dict[str, Any]], req.multi_modal_data)
        assert len(attachments) == 2  # matches base_items_per_request

        for entry in attachments:
            if entry["type"] == "video_url":
                n_videos += 1
                # Videos are base64 MP4 data URLs.
                assert entry["video_url"]["url"].startswith("data:video/mp4;base64,")
            elif entry["type"] == "image_url":
                n_images += 1
                # Images are base64 JPEG data URLs.
                assert entry["image_url"]["url"].startswith("data:image/jpeg;base64,")

    # With a 0.7 video weight over 10 items, both modalities should appear.
    assert n_videos > 0
    assert n_images > 0
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_sample_video_only_buckets(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """With a single video bucket, every request carries exactly one video."""
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 2},
        bucket_config={(64, 64, 8): 1.0},  # videos only
        input_len=20,
        output_len=5,
    )

    assert len(samples) == 3

    for req in samples:
        assert isinstance(req, SampleRequest)
        assert req.multi_modal_data is not None
        assert isinstance(req.multi_modal_data, list)

        attachments = cast(list[dict[str, Any]], req.multi_modal_data)
        assert len(attachments) == 1

        entry = attachments[0]
        assert entry["type"] == "video_url"
        assert entry["video_url"]["url"].startswith("data:video/mp4;base64,")
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_sample_respects_video_limits(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """The per-prompt video cap must bound the number of sampled items."""
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},  # tight cap: one video
        bucket_config={(64, 64, 8): 1.0},  # videos only
        input_len=20,
        output_len=5,
    )

    assert len(samples) == 3

    for req in samples:
        attachments = cast(list[dict[str, Any]], req.multi_modal_data)
        # Never more items than the video limit allows.
        assert len(attachments) <= 1
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_sample_mixed_buckets_with_zero_probability(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Buckets weighted 0.0 must never be drawn from."""
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=4,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 2, "video": 2},
        bucket_config={
            (64, 64, 1): 0.5,  # images
            (64, 64, 8): 0.5,  # videos
            (128, 128, 16): 0.0,  # zero weight — must be ignored
        },
        input_len=20,
        output_len=5,
    )

    assert len(samples) == 4

    # Every emitted video must come from the 64x64 bucket, never 128x128.
    for req in samples:
        attachments = cast(list[dict[str, Any]], req.multi_modal_data)
        for entry in attachments:
            if entry["type"] != "video_url":
                continue

            # Decode the data URL and inspect the real frame geometry.
            payload = base64.b64decode(entry["video_url"]["url"].split(",")[1])

            with NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:  # noqa
                tmp_path = tmp.name
                tmp.write(payload)

            try:
                cap = cv2.VideoCapture(tmp_path)
                frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                cap.release()

                assert frame_w == 64
                assert frame_h == 64
            finally:
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_sample_deterministic_with_videos(hf_tokenizer: PreTrainedTokenizerBase):
    """Identical seeds must reproduce identical multimodal samples."""
    # One shared parameter set; only the (identically seeded) datasets differ.
    common = dict(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 2, "video": 2},
        bucket_config={(64, 64, 1): 0.3, (64, 64, 8): 0.7},  # images + videos
        input_len=20,
        output_len=5,
    )

    samples1 = RandomMultiModalDataset(random_seed=123).sample(**common)
    samples2 = RandomMultiModalDataset(random_seed=123).sample(**common)

    assert len(samples1) == len(samples2)

    # Per-request multimodal payloads must match exactly.
    for s1, s2 in zip(samples1, samples2):
        assert s1.multi_modal_data == s2.multi_modal_data
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_sample_different_seeds_produce_different_videos(
    hf_tokenizer: PreTrainedTokenizerBase,
):
    """Different dataset seeds must generate different video payloads."""
    # Same sampling parameters for both runs; only the seeds differ.
    common = dict(
        tokenizer=hf_tokenizer,
        num_requests=2,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},
        bucket_config={(64, 64, 8): 1.0},  # videos only
        input_len=20,
        output_len=5,
    )

    samples1 = RandomMultiModalDataset(random_seed=123).sample(**common)
    samples2 = RandomMultiModalDataset(random_seed=456).sample(**common)

    # Pairwise, the encoded video data URLs must differ.
    for s1, s2 in zip(samples1, samples2):
        mm1 = cast(list[dict[str, Any]], s1.multi_modal_data)
        mm2 = cast(list[dict[str, Any]], s2.multi_modal_data)

        assert len(mm1) == len(mm2) == 1
        assert mm1[0]["video_url"]["url"] != mm2[0]["video_url"]["url"]
|
||||
77
tests/benchmarks/test_serve_cli.py
Normal file
77
tests/benchmarks/test_serve_cli.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Launch one shared OpenAI-compatible vLLM server for the whole module."""
    cli_args = [
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote:
        yield remote
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_bench_serve(server):
    """Run `vllm bench serve` against the live server and expect success."""
    # Flat argv, grouped flag/value pairs for readability.
    command = [
        "vllm", "bench", "serve",
        "--model", MODEL_NAME,
        "--host", server.host,
        "--port", str(server.port),
        "--dataset-name", "random",
        "--random-input-len", "32",
        "--random-output-len", "4",
        "--num-prompts", "5",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface benchmark output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_bench_serve_chat(server):
    """Run `vllm bench serve` through the chat completions endpoint."""
    # Flat argv, grouped flag/value pairs for readability.
    command = [
        "vllm", "bench", "serve",
        "--model", MODEL_NAME,
        "--host", server.host,
        "--port", str(server.port),
        "--dataset-name", "random",
        "--random-input-len", "32",
        "--random-output-len", "4",
        "--num-prompts", "5",
        "--endpoint", "/v1/chat/completions",
        "--backend", "openai-chat",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface benchmark output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
|
||||
30
tests/benchmarks/test_throughput_cli.py
Normal file
30
tests/benchmarks/test_throughput_cli.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
def test_bench_throughput():
    """Run `vllm bench throughput` end-to-end and expect a zero exit code."""
    # Flat argv, grouped flag/value pairs for readability; dummy weights and
    # eager mode keep the run cheap.
    command = [
        "vllm", "bench", "throughput",
        "--model", MODEL_NAME,
        "--input-len", "32",
        "--output-len", "1",
        "--enforce-eager",
        "--load-format", "dummy",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface benchmark output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
|
||||
Reference in New Issue
Block a user