Sync from v0.13
This commit is contained in:
484
tests/benchmarks/test_random_dataset.py
Normal file
484
tests/benchmarks/test_random_dataset.py
Normal file
@@ -0,0 +1,484 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
from typing import Any, NamedTuple, cast
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.benchmarks.datasets import (
|
||||
RandomDataset,
|
||||
RandomMultiModalDataset,
|
||||
SampleRequest,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def hf_tokenizer() -> PreTrainedTokenizerBase:
|
||||
# Use a small, commonly available tokenizer
|
||||
return AutoTokenizer.from_pretrained("gpt2")
|
||||
|
||||
|
||||
class Params(NamedTuple):
|
||||
num_requests: int
|
||||
prefix_len: int
|
||||
range_ratio: float
|
||||
input_len: int
|
||||
output_len: int
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def random_dataset_params() -> Params:
|
||||
return Params(
|
||||
num_requests=16, prefix_len=7, range_ratio=0.3, input_len=50, output_len=20
|
||||
)
|
||||
|
||||
|
||||
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
|
||||
"""Project a SampleRequest into a comparable tuple."""
|
||||
return (req.prompt, req.prompt_len, req.expected_output_len)
|
||||
|
||||
|
||||
def _collect_samples(
|
||||
dataset: RandomDataset,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int = 16,
|
||||
prefix_len: int = 7,
|
||||
range_ratio: float = 0.3,
|
||||
input_len: int = 50,
|
||||
output_len: int = 20,
|
||||
) -> list[tuple[str, int, int]]:
|
||||
samples = dataset.sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=num_requests,
|
||||
prefix_len=prefix_len,
|
||||
range_ratio=range_ratio,
|
||||
input_len=input_len,
|
||||
output_len=output_len,
|
||||
)
|
||||
return [_fingerprint_sample(s) for s in samples]
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_dataset_same_seed(
|
||||
hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
|
||||
) -> None:
|
||||
"""Same seed should yield identical outputs, even if global RNGs change.
|
||||
|
||||
This guards against accidental reliance on Python's random or np.random
|
||||
in RandomDataset after moving to numpy.default_rng.
|
||||
"""
|
||||
p = random_dataset_params
|
||||
common_seed = 123
|
||||
dataset_a = RandomDataset(random_seed=common_seed)
|
||||
dataset_b = RandomDataset(random_seed=common_seed)
|
||||
a = _collect_samples(
|
||||
dataset_a,
|
||||
hf_tokenizer,
|
||||
num_requests=p.num_requests,
|
||||
prefix_len=p.prefix_len,
|
||||
range_ratio=p.range_ratio,
|
||||
input_len=p.input_len,
|
||||
output_len=p.output_len,
|
||||
)
|
||||
|
||||
# Perturb global RNG state to ensure isolation
|
||||
random.seed(999)
|
||||
_ = [random.random() for _ in range(100)]
|
||||
np.random.seed(888)
|
||||
_ = [np.random.random() for _ in range(100)]
|
||||
|
||||
b = _collect_samples(
|
||||
dataset_b,
|
||||
hf_tokenizer,
|
||||
num_requests=p.num_requests,
|
||||
prefix_len=p.prefix_len,
|
||||
range_ratio=p.range_ratio,
|
||||
input_len=p.input_len,
|
||||
output_len=p.output_len,
|
||||
)
|
||||
assert a == b
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_dataset_different_seeds(
|
||||
hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
|
||||
) -> None:
|
||||
"""Different seeds should change outputs with overwhelming likelihood."""
|
||||
p = random_dataset_params
|
||||
seed_a = 0
|
||||
dataset_a = RandomDataset(random_seed=seed_a)
|
||||
a = _collect_samples(
|
||||
dataset_a,
|
||||
hf_tokenizer,
|
||||
num_requests=p.num_requests,
|
||||
prefix_len=p.prefix_len,
|
||||
range_ratio=p.range_ratio,
|
||||
input_len=p.input_len,
|
||||
output_len=p.output_len,
|
||||
)
|
||||
|
||||
seed_b = 999
|
||||
dataset_b = RandomDataset(random_seed=seed_b)
|
||||
# Perturb global RNG with same seed as dataset_a to ensure isolation
|
||||
random.seed(seed_a)
|
||||
np.random.seed(seed_a)
|
||||
b = _collect_samples(
|
||||
dataset_b,
|
||||
hf_tokenizer,
|
||||
num_requests=p.num_requests,
|
||||
prefix_len=p.prefix_len,
|
||||
range_ratio=p.range_ratio,
|
||||
input_len=p.input_len,
|
||||
output_len=p.output_len,
|
||||
)
|
||||
assert a != b
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# RandomMultiModalDataset tests
|
||||
# -----------------------------
|
||||
|
||||
|
||||
def _mm_fingerprint_sample(
|
||||
req: SampleRequest,
|
||||
) -> tuple[str, int, int, int, list[str]]:
|
||||
"""Create a compact fingerprint for multimodal samples.
|
||||
|
||||
Includes:
|
||||
- prompt string
|
||||
- prompt_len
|
||||
- expected_output_len
|
||||
- count of multimodal items
|
||||
- per-item type and URL prefix (e.g., 'data:image/jpeg;base64,')
|
||||
"""
|
||||
items = req.multi_modal_data or []
|
||||
item_prefixes: list[str] = []
|
||||
for it in items:
|
||||
if isinstance(it, dict) and it.get("type") == "image_url":
|
||||
url = it.get("image_url", {}).get("url", "")
|
||||
# Only keep a short identifying prefix to avoid huge strings
|
||||
item_prefixes.append(f"image:{url[:22]}")
|
||||
elif isinstance(it, dict) and it.get("type") == "video_url":
|
||||
url = it.get("video_url", {}).get("url", "")
|
||||
item_prefixes.append(f"video:{url[:22]}")
|
||||
else:
|
||||
item_prefixes.append("unknown:")
|
||||
return (
|
||||
req.prompt,
|
||||
req.prompt_len,
|
||||
req.expected_output_len,
|
||||
len(items),
|
||||
item_prefixes,
|
||||
)
|
||||
|
||||
|
||||
def _collect_mm_samples(
|
||||
dataset: RandomMultiModalDataset,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
*,
|
||||
num_requests: int = 8,
|
||||
prefix_len: int = 3,
|
||||
range_ratio: float = 0.0,
|
||||
input_len: int = 20,
|
||||
output_len: int = 5,
|
||||
base_items_per_request: int = 2,
|
||||
num_mm_items_range_ratio: float = 0.0,
|
||||
limit_mm_per_prompt: dict[str, int] | None = None,
|
||||
bucket_config: dict[tuple[int, int, int], float] | None = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
) -> list[SampleRequest]:
|
||||
if limit_mm_per_prompt is None:
|
||||
limit_mm_per_prompt = {"image": 5, "video": 0}
|
||||
if bucket_config is None:
|
||||
bucket_config = {(32, 32, 1): 0.5, (52, 64, 1): 0.5}
|
||||
return dataset.sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=num_requests,
|
||||
prefix_len=prefix_len,
|
||||
range_ratio=range_ratio,
|
||||
input_len=input_len,
|
||||
output_len=output_len,
|
||||
base_items_per_request=base_items_per_request,
|
||||
num_mm_items_range_ratio=num_mm_items_range_ratio,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
bucket_config=bucket_config,
|
||||
enable_multimodal_chat=enable_multimodal_chat,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
||||
seed = 42
|
||||
ds_a = RandomMultiModalDataset(random_seed=seed)
|
||||
ds_b = RandomMultiModalDataset(random_seed=seed)
|
||||
a = _collect_mm_samples(ds_a, hf_tokenizer)
|
||||
b = _collect_mm_samples(ds_b, hf_tokenizer)
|
||||
fa = [_mm_fingerprint_sample(s) for s in a]
|
||||
fb = [_mm_fingerprint_sample(s) for s in b]
|
||||
assert fa == fb
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_different_seeds(
|
||||
hf_tokenizer: PreTrainedTokenizerBase,
|
||||
) -> None:
|
||||
ds_a = RandomMultiModalDataset(random_seed=0)
|
||||
ds_b = RandomMultiModalDataset(random_seed=999)
|
||||
a = _collect_mm_samples(ds_a, hf_tokenizer)
|
||||
b = _collect_mm_samples(ds_b, hf_tokenizer)
|
||||
fa = [_mm_fingerprint_sample(s) for s in a]
|
||||
fb = [_mm_fingerprint_sample(s) for s in b]
|
||||
assert fa != fb
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_respects_limits(
|
||||
hf_tokenizer: PreTrainedTokenizerBase,
|
||||
) -> None:
|
||||
ds = RandomMultiModalDataset(random_seed=0)
|
||||
# Requesting 3 items with a per-prompt limit of 1 should error per current
|
||||
# design (dataset refuses to silently clamp below the requested baseline).
|
||||
with pytest.raises(ValueError):
|
||||
_collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=12,
|
||||
base_items_per_request=3,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt={"image": 1, "video": 0},
|
||||
bucket_config={(32, 32, 1): 1.0},
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_zero_prob_entries_are_removed(
|
||||
hf_tokenizer: PreTrainedTokenizerBase,
|
||||
) -> None:
|
||||
ds = RandomMultiModalDataset(random_seed=0)
|
||||
# Second bucket has zero probability and should be ignored after
|
||||
# normalization
|
||||
samples = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=6,
|
||||
base_items_per_request=2,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt={"image": 10, "video": 0},
|
||||
bucket_config={(32, 32, 1): 1.0, (52, 64, 1): 0.0},
|
||||
)
|
||||
for s in samples:
|
||||
assert isinstance(s.multi_modal_data, list)
|
||||
typed_mm = cast(list[dict[str, Any]], s.multi_modal_data)
|
||||
for it in typed_mm:
|
||||
assert it.get("type") == "image_url"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
||||
ds = RandomMultiModalDataset(random_seed=0)
|
||||
samples = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=5,
|
||||
base_items_per_request=0,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt={"image": 5, "video": 0},
|
||||
bucket_config={(32, 32, 1): 1.0},
|
||||
)
|
||||
for s in samples:
|
||||
assert s.multi_modal_data == []
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_num_items_per_prompt(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
||||
ds = RandomMultiModalDataset(random_seed=0)
|
||||
# Fixed number of images per prompt
|
||||
# set num_mm_items_range_ratio to 0.0
|
||||
# TODO: modify video values when video sampling is implemented
|
||||
samples_fixed_items = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=5,
|
||||
base_items_per_request=3,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt={"image": 3, "video": 0},
|
||||
bucket_config={(32, 32, 1): 1.0},
|
||||
)
|
||||
# Must have 5 requests each with 3 mm items per prompt
|
||||
assert len(samples_fixed_items) == 5
|
||||
for s in samples_fixed_items:
|
||||
mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
|
||||
assert len(mm_data) == 3
|
||||
for it in mm_data:
|
||||
assert it.get("type") == "image_url"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_bucket_config_not_mutated(
|
||||
hf_tokenizer: PreTrainedTokenizerBase,
|
||||
) -> None:
|
||||
ds = RandomMultiModalDataset(random_seed=0)
|
||||
# This bucket config is not normalized to sum to 1
|
||||
# and has more buckets than requested images
|
||||
original = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
|
||||
# Keep a snapshot to compare after sampling
|
||||
snapshot = dict(original)
|
||||
|
||||
_ = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=4,
|
||||
base_items_per_request=1,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt={"image": 1, "video": 0},
|
||||
bucket_config=original,
|
||||
)
|
||||
|
||||
# Ensure the original dict content is unchanged
|
||||
assert original == snapshot
|
||||
|
||||
# Vary number of mm items per prompt
|
||||
# set num_mm_items_range_ratio to 0.5
|
||||
samples_varying_items = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=5,
|
||||
base_items_per_request=2,
|
||||
num_mm_items_range_ratio=0.5,
|
||||
limit_mm_per_prompt={"image": 4, "video": 0},
|
||||
bucket_config={(32, 32, 1): 1.0},
|
||||
)
|
||||
# Must have 5 requests each with less than 4 mm items per prompt
|
||||
# but at least 1 mm item per prompt
|
||||
assert len(samples_varying_items) == 5
|
||||
for s in samples_varying_items:
|
||||
mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
|
||||
assert len(mm_data) <= 4
|
||||
assert len(mm_data) >= 1
|
||||
for it in mm_data:
|
||||
assert it.get("type") == "image_url"
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_video_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
||||
"""Test video sampling functionality in RandomMultiModalDataset."""
|
||||
ds = RandomMultiModalDataset(random_seed=42)
|
||||
|
||||
# Test with video bucket configuration
|
||||
bucket_config = {
|
||||
(64, 64, 1): 0.3, # Images
|
||||
(64, 64, 8): 0.7, # Videos
|
||||
}
|
||||
|
||||
limit_mm_per_prompt = {"image": 2, "video": 2}
|
||||
|
||||
samples = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=5,
|
||||
base_items_per_request=1,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
bucket_config=bucket_config,
|
||||
)
|
||||
|
||||
assert len(samples) == 5
|
||||
|
||||
# Check that we have both images and videos
|
||||
video_count = 0
|
||||
image_count = 0
|
||||
|
||||
for s in samples:
|
||||
mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
|
||||
assert len(mm_data) == 1
|
||||
|
||||
item = mm_data[0]
|
||||
if item.get("type") == "video_url":
|
||||
video_count += 1
|
||||
# Verify video URL format
|
||||
url = item.get("video_url", {}).get("url", "")
|
||||
assert url.startswith("data:video/mp4;base64,")
|
||||
elif item.get("type") == "image_url":
|
||||
image_count += 1
|
||||
# Verify image URL format
|
||||
url = item.get("image_url", {}).get("url", "")
|
||||
assert url.startswith("data:image/jpeg;base64,")
|
||||
|
||||
# Should have some videos due to 0.7 probability
|
||||
assert video_count > 0
|
||||
assert image_count > 0
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_video_only_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
|
||||
"""Test sampling with only video buckets."""
|
||||
ds = RandomMultiModalDataset(random_seed=42)
|
||||
|
||||
bucket_config = {
|
||||
(64, 64, 8): 1.0, # Only videos
|
||||
}
|
||||
|
||||
limit_mm_per_prompt = {"image": 0, "video": 1}
|
||||
|
||||
samples = _collect_mm_samples(
|
||||
ds,
|
||||
hf_tokenizer,
|
||||
num_requests=3,
|
||||
base_items_per_request=1,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
bucket_config=bucket_config,
|
||||
)
|
||||
|
||||
assert len(samples) == 3
|
||||
|
||||
for s in samples:
|
||||
mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
|
||||
assert len(mm_data) == 1
|
||||
|
||||
item = mm_data[0]
|
||||
assert item.get("type") == "video_url"
|
||||
url = item.get("video_url", {}).get("url", "")
|
||||
assert url.startswith("data:video/mp4;base64,")
|
||||
|
||||
|
||||
@pytest.mark.benchmark
|
||||
def test_random_mm_video_deterministic_sampling(
|
||||
hf_tokenizer: PreTrainedTokenizerBase,
|
||||
) -> None:
|
||||
"""Test that video sampling is deterministic with same seed."""
|
||||
seed = 123
|
||||
ds_a = RandomMultiModalDataset(random_seed=seed)
|
||||
ds_b = RandomMultiModalDataset(random_seed=seed)
|
||||
|
||||
bucket_config = {
|
||||
(64, 64, 8): 1.0, # Only videos
|
||||
}
|
||||
|
||||
limit_mm_per_prompt = {"image": 0, "video": 1}
|
||||
|
||||
a = _collect_mm_samples(
|
||||
ds_a,
|
||||
hf_tokenizer,
|
||||
num_requests=3,
|
||||
base_items_per_request=1,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
bucket_config=bucket_config,
|
||||
)
|
||||
|
||||
b = _collect_mm_samples(
|
||||
ds_b,
|
||||
hf_tokenizer,
|
||||
num_requests=3,
|
||||
base_items_per_request=1,
|
||||
num_mm_items_range_ratio=0.0,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
bucket_config=bucket_config,
|
||||
)
|
||||
|
||||
fa = [_mm_fingerprint_sample(s) for s in a]
|
||||
fb = [_mm_fingerprint_sample(s) for s in b]
|
||||
assert fa == fb
|
||||
Reference in New Issue
Block a user