Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_latency():
    """Smoke-test the `vllm bench latency` CLI end-to-end.

    The workload is deliberately tiny (32 input tokens, 1 output token,
    dummy weights, eager mode) so the test exercises CLI plumbing rather
    than actual performance.
    """
    args = [
        "vllm",
        "bench",
        "latency",
        "--model",
        MODEL_NAME,
        "--input-len",
        "32",
        "--output-len",
        "1",
        "--enforce-eager",
        "--load-format",
        "dummy",
    ]
    proc = subprocess.run(args, capture_output=True, text=True)
    # Echo CLI output into the pytest log so failures are debuggable.
    print(proc.stdout)
    print(proc.stderr)
    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"

View File

@@ -0,0 +1,249 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import tempfile
from pathlib import Path
import pytest
from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem
class TestParameterSweepItem:
    """Test ParameterSweepItem functionality."""

    @pytest.mark.parametrize(
        "input_dict,expected",
        [
            (
                {"compilation_config.use_inductor_graph_partition": False},
                "--compilation-config.use_inductor_graph_partition=false",
            ),
            (
                {"compilation_config.use_inductor_graph_partition": True},
                "--compilation-config.use_inductor_graph_partition=true",
            ),
        ],
    )
    def test_nested_boolean_params(self, input_dict, expected):
        """Test that nested boolean params use =true/false syntax."""
        # Dotted (nested) keys cannot use the --no-<flag> convention, so the
        # boolean is serialized inline as lowercase true/false.
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert expected in cmd

    @pytest.mark.parametrize(
        "input_dict,expected",
        [
            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching"),
            ({"enable_prefix_caching": True}, "--enable-prefix-caching"),
            ({"disable_log_stats": False}, "--no-disable-log-stats"),
            ({"disable_log_stats": True}, "--disable-log-stats"),
        ],
    )
    def test_non_nested_boolean_params(self, input_dict, expected):
        """Test that non-nested boolean params use --no- prefix."""
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert expected in cmd

    @pytest.mark.parametrize(
        "compilation_config",
        [
            {"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True},
            {
                "cudagraph_mode": "piecewise",
                "mode": 3,
                "use_inductor_graph_partition": False,
            },
        ],
    )
    def test_nested_dict_value(self, compilation_config):
        """Test that nested dict values are serialized as JSON."""
        item = ParameterSweepItem.from_record(
            {"compilation_config": compilation_config}
        )
        cmd = item.apply_to_cmd(["vllm", "serve", "model"])
        assert "--compilation-config" in cmd
        # The dict should be JSON serialized; round-trip it through
        # json.loads to compare structurally rather than by string.
        idx = cmd.index("--compilation-config")
        assert json.loads(cmd[idx + 1]) == compilation_config

    @pytest.mark.parametrize(
        "input_dict,expected_key,expected_value",
        [
            ({"model": "test-model"}, "--model", "test-model"),
            ({"max_tokens": 100}, "--max-tokens", "100"),
            ({"temperature": 0.7}, "--temperature", "0.7"),
        ],
    )
    def test_string_and_numeric_values(self, input_dict, expected_key, expected_value):
        """Test that string and numeric values are handled correctly."""
        # Non-boolean values are emitted as a "--key value" pair with
        # numbers stringified.
        item = ParameterSweepItem.from_record(input_dict)
        cmd = item.apply_to_cmd(["vllm", "serve"])
        assert expected_key in cmd
        assert expected_value in cmd

    @pytest.mark.parametrize(
        "input_dict,expected_key,key_idx_offset",
        [
            ({"max_tokens": 200}, "--max-tokens", 1),
            ({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0),
        ],
    )
    def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset):
        """Test that existing parameters in cmd are replaced."""
        # key_idx_offset selects the branch: 1 for a "--key value" pair,
        # 0 for a bare boolean flag.
        item = ParameterSweepItem.from_record(input_dict)
        if key_idx_offset == 1:
            # Key-value pair
            cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"])
            assert expected_key in cmd
            idx = cmd.index(expected_key)
            assert cmd[idx + 1] == "200"
            assert "100" not in cmd
        else:
            # Boolean flag
            cmd = item.apply_to_cmd(
                ["vllm", "serve", "--enable-prefix-caching", "model"]
            )
            assert expected_key in cmd
            assert "--enable-prefix-caching" not in cmd
class TestParameterSweep:
    """Test ParameterSweep functionality."""

    def test_from_records_list(self):
        """Test creating ParameterSweep from a list of records."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        # The sweep behaves like a sequence of its input records.
        assert len(sweep) == 2
        assert sweep[0]["max_tokens"] == 100
        assert sweep[1]["max_tokens"] == 200

    def test_read_from_dict(self):
        """Test creating ParameterSweep from a dict format."""
        # Dict keys become the per-experiment _benchmark_name field.
        data = {
            "experiment1": {"max_tokens": 100, "temperature": 0.7},
            "experiment2": {"max_tokens": 200, "temperature": 0.9},
        }
        sweep = ParameterSweep.read_from_dict(data)
        assert len(sweep) == 2
        # Check that items have the _benchmark_name field
        names = {item["_benchmark_name"] for item in sweep}
        assert names == {"experiment1", "experiment2"}
        # Check that parameters are preserved
        for item in sweep:
            if item["_benchmark_name"] == "experiment1":
                assert item["max_tokens"] == 100
                assert item["temperature"] == 0.7
            elif item["_benchmark_name"] == "experiment2":
                assert item["max_tokens"] == 200
                assert item["temperature"] == 0.9

    def test_read_json_list_format(self):
        """Test reading JSON file with list format."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        # delete=False so read_json can reopen the file by path after the
        # with-block closes it; cleanup happens in the finally clause.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(records, f)
            temp_path = Path(f.name)
        try:
            sweep = ParameterSweep.read_json(temp_path)
            assert len(sweep) == 2
            assert sweep[0]["max_tokens"] == 100
            assert sweep[1]["max_tokens"] == 200
        finally:
            temp_path.unlink()

    def test_read_json_dict_format(self):
        """Test reading JSON file with dict format."""
        data = {
            "experiment1": {"max_tokens": 100, "temperature": 0.7},
            "experiment2": {"max_tokens": 200, "temperature": 0.9},
        }
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(data, f)
            temp_path = Path(f.name)
        try:
            sweep = ParameterSweep.read_json(temp_path)
            assert len(sweep) == 2
            # Check that items have the _benchmark_name field
            names = {item["_benchmark_name"] for item in sweep}
            assert names == {"experiment1", "experiment2"}
        finally:
            temp_path.unlink()

    def test_unique_benchmark_names_validation(self):
        """Test that duplicate _benchmark_name values raise an error."""
        # Test with duplicate names in list format
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"_benchmark_name": "exp1", "max_tokens": 200},
        ]
        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
            ParameterSweep.from_records(records)

    def test_unique_benchmark_names_multiple_duplicates(self):
        """Test validation with multiple duplicate names."""
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"_benchmark_name": "exp1", "max_tokens": 200},
            {"_benchmark_name": "exp2", "max_tokens": 300},
            {"_benchmark_name": "exp2", "max_tokens": 400},
        ]
        with pytest.raises(ValueError, match="Duplicate _benchmark_name values"):
            ParameterSweep.from_records(records)

    def test_no_benchmark_names_allowed(self):
        """Test that records without _benchmark_name are allowed."""
        records = [
            {"max_tokens": 100, "temperature": 0.7},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2

    def test_mixed_benchmark_names_allowed(self):
        """Test that mixing records with and without _benchmark_name is allowed."""
        # Uniqueness is only enforced among records that do carry a name.
        records = [
            {"_benchmark_name": "exp1", "max_tokens": 100},
            {"max_tokens": 200, "temperature": 0.9},
        ]
        sweep = ParameterSweep.from_records(records)
        assert len(sweep) == 2
class TestParameterSweepItemKeyNormalization:
    """Tests for how ParameterSweepItem normalizes record keys into CLI flags."""

    def test_underscore_to_hyphen_conversion(self):
        """A plain key's underscores become hyphens in the generated flag."""
        sweep_item = ParameterSweepItem.from_record({"max_tokens": 100})
        argv = sweep_item.apply_to_cmd(["vllm", "serve"])
        assert "--max-tokens" in argv

    def test_nested_key_preserves_suffix(self):
        """Only the prefix of a dotted key is hyphenated; the suffix keeps underscores."""
        record = {"compilation_config.some_nested_param": "value"}
        sweep_item = ParameterSweepItem.from_record(record)
        argv = sweep_item.apply_to_cmd(["vllm", "serve"])
        # The prefix (compilation_config) is converted to hyphens, while the
        # part after the dot (some_nested_param) is left untouched.
        matching = [arg for arg in argv if "compilation-config.some_nested_param" in arg]
        assert matching

View File

@@ -0,0 +1,171 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pandas as pd
import pytest
from vllm.benchmarks.sweep.plot import (
PlotEqualTo,
PlotFilterBase,
PlotFilters,
PlotGreaterThan,
PlotGreaterThanOrEqualTo,
PlotLessThan,
PlotLessThanOrEqualTo,
PlotNotEqualTo,
)
class TestPlotFilters:
    """Test PlotFilter functionality including 'inf' edge case."""

    def setup_method(self):
        """Create sample DataFrames for testing."""
        # DataFrame with numeric values
        self.df_numeric = pd.DataFrame(
            {
                "request_rate": [1.0, 5.0, 10.0, 50.0, 100.0],
                "value": [10, 20, 30, 40, 50],
            }
        )
        # DataFrame with float('inf') - note: string "inf" values are coerced
        # to float when loading data, so we only test with float('inf')
        self.df_inf_float = pd.DataFrame(
            {
                "request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")],
                "value": [10, 20, 30, 40, 50],
            }
        )

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("5.0", 1),
            ("10.0", 1),
            ("1.0", 1),
        ],
    )
    def test_equal_to_numeric(self, target, expected_count):
        """Test PlotEqualTo with numeric values."""
        # Targets are given as strings, mirroring how filters arrive from
        # parsed CLI text.
        filter_obj = PlotEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    def test_equal_to_inf_float(self):
        """Test PlotEqualTo with float('inf')."""
        filter_obj = PlotEqualTo("request_rate", "inf")
        result = filter_obj.apply(self.df_inf_float)
        # Should match both float('inf') entries because float('inf') == float('inf')
        assert len(result) == 2

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("5.0", 4),  # All except 5.0
            ("1.0", 4),  # All except 1.0
        ],
    )
    def test_not_equal_to_numeric(self, target, expected_count):
        """Test PlotNotEqualTo with numeric values."""
        filter_obj = PlotNotEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    def test_not_equal_to_inf_float(self):
        """Test PlotNotEqualTo with float('inf')."""
        filter_obj = PlotNotEqualTo("request_rate", "inf")
        result = filter_obj.apply(self.df_inf_float)
        # Should exclude float('inf') entries
        assert len(result) == 3

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 2),  # 1.0, 5.0
            ("50.0", 3),  # 1.0, 5.0, 10.0
            ("5.0", 1),  # 1.0
        ],
    )
    def test_less_than(self, target, expected_count):
        """Test PlotLessThan with numeric values."""
        filter_obj = PlotLessThan("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 3),  # 1.0, 5.0, 10.0
            ("5.0", 2),  # 1.0, 5.0
        ],
    )
    def test_less_than_or_equal_to(self, target, expected_count):
        """Test PlotLessThanOrEqualTo with numeric values."""
        filter_obj = PlotLessThanOrEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 2),  # 50.0, 100.0
            ("5.0", 3),  # 10.0, 50.0, 100.0
        ],
    )
    def test_greater_than(self, target, expected_count):
        """Test PlotGreaterThan with numeric values."""
        filter_obj = PlotGreaterThan("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "target,expected_count",
        [
            ("10.0", 3),  # 10.0, 50.0, 100.0
            ("5.0", 4),  # 5.0, 10.0, 50.0, 100.0
        ],
    )
    def test_greater_than_or_equal_to(self, target, expected_count):
        """Test PlotGreaterThanOrEqualTo with numeric values."""
        filter_obj = PlotGreaterThanOrEqualTo("request_rate", target)
        result = filter_obj.apply(self.df_numeric)
        assert len(result) == expected_count

    @pytest.mark.parametrize(
        "filter_str,expected_var,expected_target,expected_type",
        [
            ("request_rate==5.0", "request_rate", "5.0", PlotEqualTo),
            ("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo),
            ("request_rate<50.0", "request_rate", "50.0", PlotLessThan),
            ("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo),
            ("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan),
            ("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo),
            ("request_rate==inf", "request_rate", "inf", PlotEqualTo),
            # Quoted target: the quotes around 'inf' are stripped in parsing.
            ("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo),
        ],
    )
    def test_parse_str(self, filter_str, expected_var, expected_target, expected_type):
        """Test parsing filter strings."""
        filter_obj = PlotFilterBase.parse_str(filter_str)
        assert isinstance(filter_obj, expected_type)
        assert filter_obj.var == expected_var
        assert filter_obj.target == expected_target

    def test_parse_str_inf_edge_case(self):
        """Test parsing 'inf' string in filter."""
        # 'inf' stays a string at parse time; coercion happens at apply time.
        filter_obj = PlotFilterBase.parse_str("request_rate==inf")
        assert isinstance(filter_obj, PlotEqualTo)
        assert filter_obj.var == "request_rate"
        assert filter_obj.target == "inf"

    def test_parse_multiple_filters(self):
        """Test parsing multiple filters."""
        # Filters are comma-separated within a single string.
        filters = PlotFilters.parse_str("request_rate>5.0,value<=40")
        assert len(filters) == 2
        assert isinstance(filters[0], PlotGreaterThan)
        assert isinstance(filters[1], PlotLessThanOrEqualTo)

    def test_parse_empty_filter(self):
        """Test parsing empty filter string."""
        filters = PlotFilters.parse_str("")
        assert len(filters) == 0

View File

@@ -0,0 +1,484 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Any, NamedTuple, cast
import numpy as np
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (
RandomDataset,
RandomMultiModalDataset,
SampleRequest,
)
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Session-wide tokenizer fixture; gpt2 is small and commonly available."""
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
# Workload description shared by the determinism tests; a NamedTuple keeps it
# immutable and tuple-comparable.
Params = NamedTuple(
    "Params",
    [
        ("num_requests", int),
        ("prefix_len", int),
        ("range_ratio", float),
        ("input_len", int),
        ("output_len", int),
    ],
)
@pytest.fixture(scope="session")
def random_dataset_params() -> Params:
    """Small fixed workload so the determinism tests run quickly."""
    return Params(
        num_requests=16, prefix_len=7, range_ratio=0.3, input_len=50, output_len=20
    )
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
    """Project a SampleRequest into a comparable tuple."""
    # Only the fields relevant to the determinism checks are kept.
    return (req.prompt, req.prompt_len, req.expected_output_len)
def _collect_samples(
    dataset: RandomDataset,
    tokenizer: PreTrainedTokenizerBase,
    num_requests: int = 16,
    prefix_len: int = 7,
    range_ratio: float = 0.3,
    input_len: int = 50,
    output_len: int = 20,
) -> list[tuple[str, int, int]]:
    """Draw samples from *dataset* and reduce each one to a fingerprint tuple."""
    drawn = dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
    )
    return list(map(_fingerprint_sample, drawn))
@pytest.mark.benchmark
def test_random_dataset_same_seed(
    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
) -> None:
    """Same seed should yield identical outputs, even if global RNGs change.

    This guards against accidental reliance on Python's random or np.random
    in RandomDataset after moving to numpy.default_rng.
    """
    p = random_dataset_params
    common_seed = 123
    dataset_a = RandomDataset(random_seed=common_seed)
    dataset_b = RandomDataset(random_seed=common_seed)
    a = _collect_samples(
        dataset_a,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    # Perturb global RNG state to ensure isolation
    random.seed(999)
    _ = [random.random() for _ in range(100)]
    np.random.seed(888)
    _ = [np.random.random() for _ in range(100)]
    b = _collect_samples(
        dataset_b,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    # dataset_b must reproduce dataset_a exactly despite the perturbation.
    assert a == b
@pytest.mark.benchmark
def test_random_dataset_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase, random_dataset_params: Params
) -> None:
    """Different seeds should change outputs with overwhelming likelihood."""
    p = random_dataset_params
    seed_a = 0
    dataset_a = RandomDataset(random_seed=seed_a)
    a = _collect_samples(
        dataset_a,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    seed_b = 999
    dataset_b = RandomDataset(random_seed=seed_b)
    # Perturb global RNG with same seed as dataset_a to ensure isolation
    # (if the dataset leaked into global RNGs, b could collide with a).
    random.seed(seed_a)
    np.random.seed(seed_a)
    b = _collect_samples(
        dataset_b,
        hf_tokenizer,
        num_requests=p.num_requests,
        prefix_len=p.prefix_len,
        range_ratio=p.range_ratio,
        input_len=p.input_len,
        output_len=p.output_len,
    )
    assert a != b
# -----------------------------
# RandomMultiModalDataset tests
# -----------------------------
def _mm_fingerprint_sample(
    req: SampleRequest,
) -> tuple[str, int, int, int, list[str]]:
    """Create a compact fingerprint for multimodal samples.

    Captures the prompt string, its token length, the expected output
    length, the number of attached multimodal items, and a short per-item
    tag of the form '<kind>:<url prefix>' (e.g. 'image:data:image/jpeg;b').
    Only a 22-character URL prefix is kept to avoid huge base64 strings.
    """
    items = req.multi_modal_data or []
    tags: list[str] = []
    for entry in items:
        tag = "unknown:"
        if isinstance(entry, dict):
            kind = entry.get("type")
            if kind == "image_url":
                url = entry.get("image_url", {}).get("url", "")
                tag = f"image:{url[:22]}"
            elif kind == "video_url":
                url = entry.get("video_url", {}).get("url", "")
                tag = f"video:{url[:22]}"
        tags.append(tag)
    return (
        req.prompt,
        req.prompt_len,
        req.expected_output_len,
        len(items),
        tags,
    )
def _collect_mm_samples(
    dataset: RandomMultiModalDataset,
    tokenizer: PreTrainedTokenizerBase,
    *,
    num_requests: int = 8,
    prefix_len: int = 3,
    range_ratio: float = 0.0,
    input_len: int = 20,
    output_len: int = 5,
    base_items_per_request: int = 2,
    num_mm_items_range_ratio: float = 0.0,
    limit_mm_per_prompt: dict[str, int] | None = None,
    bucket_config: dict[tuple[int, int, int], float] | None = None,
    enable_multimodal_chat: bool = False,
) -> list[SampleRequest]:
    """Call dataset.sample with keyword-only overrides and test-friendly defaults."""
    # None sentinels avoid mutable default arguments; resolve them per call.
    if limit_mm_per_prompt is None:
        limit_mm_per_prompt = {"image": 5, "video": 0}
    if bucket_config is None:
        bucket_config = {(32, 32, 1): 0.5, (52, 64, 1): 0.5}
    return dataset.sample(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
        base_items_per_request=base_items_per_request,
        num_mm_items_range_ratio=num_mm_items_range_ratio,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        enable_multimodal_chat=enable_multimodal_chat,
    )
@pytest.mark.benchmark
def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Two datasets built with the same seed must produce identical samples."""
    shared_seed = 42
    dataset_one = RandomMultiModalDataset(random_seed=shared_seed)
    dataset_two = RandomMultiModalDataset(random_seed=shared_seed)
    first = _collect_mm_samples(dataset_one, hf_tokenizer)
    second = _collect_mm_samples(dataset_two, hf_tokenizer)
    first_prints = [_mm_fingerprint_sample(sample) for sample in first]
    second_prints = [_mm_fingerprint_sample(sample) for sample in second]
    assert first_prints == second_prints
@pytest.mark.benchmark
def test_random_mm_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Differently-seeded datasets should produce diverging sample streams."""
    dataset_low = RandomMultiModalDataset(random_seed=0)
    dataset_high = RandomMultiModalDataset(random_seed=999)
    low_samples = _collect_mm_samples(dataset_low, hf_tokenizer)
    high_samples = _collect_mm_samples(dataset_high, hf_tokenizer)
    low_prints = [_mm_fingerprint_sample(sample) for sample in low_samples]
    high_prints = [_mm_fingerprint_sample(sample) for sample in high_samples]
    assert low_prints != high_prints
@pytest.mark.benchmark
def test_random_mm_respects_limits(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Requesting more items than limit_mm_per_prompt allows must raise."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Requesting 3 items with a per-prompt limit of 1 should error per current
    # design (dataset refuses to silently clamp below the requested baseline).
    with pytest.raises(ValueError):
        _collect_mm_samples(
            ds,
            hf_tokenizer,
            num_requests=12,
            base_items_per_request=3,
            num_mm_items_range_ratio=0.0,
            limit_mm_per_prompt={"image": 1, "video": 0},
            bucket_config={(32, 32, 1): 1.0},
        )
@pytest.mark.benchmark
def test_random_mm_zero_prob_entries_are_removed(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Buckets with probability 0 must never contribute sampled items."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Second bucket has zero probability and should be ignored after
    # normalization
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=6,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 10, "video": 0},
        bucket_config={(32, 32, 1): 1.0, (52, 64, 1): 0.0},
    )
    for s in samples:
        assert isinstance(s.multi_modal_data, list)
        # cast() is for the type checker only; no runtime conversion happens.
        typed_mm = cast(list[dict[str, Any]], s.multi_modal_data)
        for it in typed_mm:
            # Only the single-frame (image) bucket should remain.
            assert it.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """A baseline of zero items per request must yield empty mm payloads."""
    dataset = RandomMultiModalDataset(random_seed=0)
    samples = _collect_mm_samples(
        dataset,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=0,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 5, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    assert all(sample.multi_modal_data == [] for sample in samples)
@pytest.mark.benchmark
def test_random_mm_num_items_per_prompt(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """With range ratio 0, every request gets exactly the baseline item count."""
    ds = RandomMultiModalDataset(random_seed=0)
    # Fixed number of images per prompt
    # set num_mm_items_range_ratio to 0.0
    # TODO: modify video values when video sampling is implemented
    samples_fixed_items = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=3,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 3, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Must have 5 requests each with 3 mm items per prompt
    assert len(samples_fixed_items) == 5
    for s in samples_fixed_items:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 3
        for it in mm_data:
            assert it.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_bucket_config_not_mutated(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sampling must not mutate the caller-provided bucket_config dict."""
    ds = RandomMultiModalDataset(random_seed=0)
    # This bucket config is not normalized to sum to 1
    # and has more buckets than requested images
    original = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
    # Keep a snapshot to compare after sampling
    snapshot = dict(original)
    _ = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=4,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 1, "video": 0},
        bucket_config=original,
    )
    # Ensure the original dict content is unchanged
    assert original == snapshot
    # NOTE(review): everything below exercises varying item counts and looks
    # like it belongs in test_random_mm_num_items_per_prompt rather than
    # here — consider moving it there.
    # Vary number of mm items per prompt
    # set num_mm_items_range_ratio to 0.5
    samples_varying_items = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.5,
        limit_mm_per_prompt={"image": 4, "video": 0},
        bucket_config={(32, 32, 1): 1.0},
    )
    # Must have 5 requests each with less than 4 mm items per prompt
    # but at least 1 mm item per prompt
    assert len(samples_varying_items) == 5
    for s in samples_varying_items:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) <= 4
        assert len(mm_data) >= 1
        for it in mm_data:
            assert it.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_video_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Test video sampling functionality in RandomMultiModalDataset."""
    ds = RandomMultiModalDataset(random_seed=42)
    # Test with video bucket configuration; a bucket is a
    # (height, width, num_frames) triple mapped to a sampling probability.
    bucket_config = {
        (64, 64, 1): 0.3,  # Images
        (64, 64, 8): 0.7,  # Videos
    }
    limit_mm_per_prompt = {"image": 2, "video": 2}
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=5,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    assert len(samples) == 5
    # Check that we have both images and videos
    video_count = 0
    image_count = 0
    for s in samples:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 1
        item = mm_data[0]
        if item.get("type") == "video_url":
            video_count += 1
            # Verify video URL format
            url = item.get("video_url", {}).get("url", "")
            assert url.startswith("data:video/mp4;base64,")
        elif item.get("type") == "image_url":
            image_count += 1
            # Verify image URL format
            url = item.get("image_url", {}).get("url", "")
            assert url.startswith("data:image/jpeg;base64,")
    # Should have some videos due to 0.7 probability
    # (fixed seed keeps this assertion deterministic)
    assert video_count > 0
    assert image_count > 0
@pytest.mark.benchmark
def test_random_mm_video_only_sampling(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Test sampling with only video buckets."""
    ds = RandomMultiModalDataset(random_seed=42)
    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }
    limit_mm_per_prompt = {"image": 0, "video": 1}
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    assert len(samples) == 3
    for s in samples:
        mm_data = cast(list[dict[str, Any]], s.multi_modal_data)
        assert len(mm_data) == 1
        item = mm_data[0]
        # Every item must be a base64-encoded mp4 data URL.
        assert item.get("type") == "video_url"
        url = item.get("video_url", {}).get("url", "")
        assert url.startswith("data:video/mp4;base64,")
@pytest.mark.benchmark
def test_random_mm_video_deterministic_sampling(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Test that video sampling is deterministic with same seed."""
    seed = 123
    ds_a = RandomMultiModalDataset(random_seed=seed)
    ds_b = RandomMultiModalDataset(random_seed=seed)
    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }
    limit_mm_per_prompt = {"image": 0, "video": 1}
    a = _collect_mm_samples(
        ds_a,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    b = _collect_mm_samples(
        ds_b,
        hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
    )
    # Fingerprints (prompt, lengths, item tags) must match exactly.
    fa = [_mm_fingerprint_sample(s) for s in a]
    fb = [_mm_fingerprint_sample(s) for s in b]
    assert fa == fb

View File

@@ -0,0 +1,398 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
from tempfile import NamedTemporaryFile
from typing import Any, cast
import cv2
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import RandomMultiModalDataset, SampleRequest
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Use a small, commonly available tokenizer."""
    # Session scope: load the tokenizer once for the whole test run.
    return AutoTokenizer.from_pretrained("gpt2")
@pytest.fixture
def video_dataset() -> RandomMultiModalDataset:
    """Create a RandomMultiModalDataset instance for testing."""
    # Fixed seed keeps each test's dataset deterministic.
    return RandomMultiModalDataset(random_seed=42)
@pytest.mark.benchmark
def test_generate_synthetic_video_different_seeds():
    """Test that different seeds produce different videos."""
    dataset1 = RandomMultiModalDataset(random_seed=123)
    dataset2 = RandomMultiModalDataset(random_seed=456)
    width, height, num_frames = 64, 48, 8
    video1 = dataset1.generate_synthetic_video(width, height, num_frames)
    video2 = dataset2.generate_synthetic_video(width, height, num_frames)
    # Videos should be different due to different seeds
    # (compare raw encoded bytes; equality would mean the seed is ignored).
    assert video1["bytes"] != video2["bytes"]
@pytest.mark.benchmark
def test_map_config_to_modality(video_dataset: RandomMultiModalDataset):
    """Test modality mapping for different configurations."""
    # A config is a (height, width, num_frames) triple.
    # Test image configuration (num_frames = 1)
    assert video_dataset.map_config_to_modality((256, 256, 1)) == "image"
    assert video_dataset.map_config_to_modality((720, 1280, 1)) == "image"
    # Test video configurations (num_frames > 1)
    assert video_dataset.map_config_to_modality((256, 256, 8)) == "video"
    assert video_dataset.map_config_to_modality((720, 1280, 16)) == "video"
    assert video_dataset.map_config_to_modality((64, 64, 32)) == "video"
    # Test invalid configurations (num_frames < 1)
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.map_config_to_modality((256, 256, 0))
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.map_config_to_modality((256, 256, -1))
@pytest.mark.benchmark
def test_generate_mm_item_video(video_dataset: RandomMultiModalDataset):
    """Test generating multimodal items for video configurations."""
    # Test video item generation
    video_config = (64, 48, 8)  # height, width, num_frames
    result = video_dataset.generate_mm_item(video_config)
    # Check the result structure matches OpenAI API format
    assert isinstance(result, dict)
    assert result["type"] == "video_url"
    assert "video_url" in result
    assert "url" in result["video_url"]
    # Check that the URL is a data URL with base64 encoded video
    url = result["video_url"]["url"]
    assert url.startswith("data:video/mp4;base64,")
    # Decode and verify the video content
    base64_data = url.split(",")[1]
    video_bytes = base64.b64decode(base64_data)
    assert len(video_bytes) > 0
    # Verify the video can be decoded; write to a temp file because
    # cv2.VideoCapture opens by path.
    with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        temp_path = temp_file.name
        temp_file.write(video_bytes)
    try:
        cap = cv2.VideoCapture(temp_path)
        assert cap.isOpened()
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Config order is (height=64, width=48, num_frames=8), so OpenCV
        # reports width 48 and height 64.
        assert frame_count == 8
        assert frame_width == 48
        assert frame_height == 64
        cap.release()
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
@pytest.mark.benchmark
def test_generate_mm_item_image(video_dataset: RandomMultiModalDataset):
    """Test generating multimodal items for image configurations."""
    # Test image item generation
    image_config = (64, 48, 1)  # height, width, num_frames=1
    result = video_dataset.generate_mm_item(image_config)
    # Check the result structure matches OpenAI API format
    assert isinstance(result, dict)
    assert result["type"] == "image_url"
    assert "image_url" in result
    assert "url" in result["image_url"]
    # Check that the URL is a data URL with base64 encoded image
    url = result["image_url"]["url"]
    assert url.startswith("data:image/jpeg;base64,")
@pytest.mark.benchmark
def test_generate_mm_item_invalid_config(video_dataset: RandomMultiModalDataset):
    """Test error handling for invalid configurations."""
    # num_frames == 0 is neither an image (1) nor a video (>1).
    with pytest.raises(ValueError, match="Invalid multimodal item configuration"):
        video_dataset.generate_mm_item((256, 256, 0))
@pytest.mark.benchmark
def test_sample_with_video_buckets(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test sampling with video bucket configurations."""
    # Configure bucket with video probability > 0
    bucket_config = {
        (64, 64, 1): 0.3,  # Images
        (64, 64, 8): 0.7,  # Videos
    }
    limit_mm_per_prompt = {"image": 5, "video": 3}
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 5
    # Check that samples contain both images and videos
    video_count = 0
    image_count = 0
    for sample in samples:
        assert isinstance(sample, SampleRequest)
        assert sample.multi_modal_data is not None
        assert isinstance(sample.multi_modal_data, list)
        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
        assert len(mm_data) == 2  # base_items_per_request
        for item in mm_data:
            if item["type"] == "video_url":
                video_count += 1
                # Verify video URL format
                url = item["video_url"]["url"]
                assert url.startswith("data:video/mp4;base64,")
            elif item["type"] == "image_url":
                image_count += 1
                # Verify image URL format
                url = item["image_url"]["url"]
                assert url.startswith("data:image/jpeg;base64,")
    # Should have some videos due to 0.7 probability
    # (fixed fixture seed keeps this deterministic)
    assert video_count > 0
    assert image_count > 0
@pytest.mark.benchmark
def test_sample_video_only_buckets(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test sampling with only video buckets."""
    bucket_config = {
        (64, 64, 8): 1.0,  # Only videos
    }
    limit_mm_per_prompt = {"image": 0, "video": 2}
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt=limit_mm_per_prompt,
        bucket_config=bucket_config,
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 3
    for sample in samples:
        assert isinstance(sample, SampleRequest)
        assert sample.multi_modal_data is not None
        assert isinstance(sample.multi_modal_data, list)
        mm_data = cast(list[dict[str, Any]], sample.multi_modal_data)
        assert len(mm_data) == 1
        item = mm_data[0]
        # With a single all-video bucket, every item must be an mp4 data URL.
        assert item["type"] == "video_url"
        url = item["video_url"]["url"]
        assert url.startswith("data:video/mp4;base64,")
@pytest.mark.benchmark
def test_sample_respects_video_limits(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test that sampling respects video limits per prompt."""
    # Videos-only bucket combined with a per-prompt limit of a single video.
    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},
        bucket_config={(64, 64, 8): 1.0},  # Only videos
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 3
    for req in samples:
        items = cast(list[dict[str, Any]], req.multi_modal_data)
        # Never more items than the per-prompt video limit allows.
        assert len(items) <= 1
@pytest.mark.benchmark
def test_sample_mixed_buckets_with_zero_probability(
    video_dataset: RandomMultiModalDataset, hf_tokenizer: PreTrainedTokenizerBase
):
    """Test sampling with mixed buckets including zero probability entries."""

    def _decoded_video_size(data_url: str) -> tuple[int, int]:
        # Round-trip the base64 payload through a temp .mp4 file so OpenCV
        # can report the encoded frame dimensions.
        payload = base64.b64decode(data_url.split(",")[1])
        with NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:  # noqa
            tmp_path = tmp.name
            tmp.write(payload)
        try:
            cap = cv2.VideoCapture(tmp_path)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            cap.release()
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
        return width, height

    samples = video_dataset.sample(
        tokenizer=hf_tokenizer,
        num_requests=4,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 2, "video": 2},
        bucket_config={
            (64, 64, 1): 0.5,  # Images
            (64, 64, 8): 0.5,  # Videos
            (128, 128, 16): 0.0,  # Zero probability videos (should be ignored)
        },
        input_len=20,
        output_len=5,
    )
    assert len(samples) == 4
    # Only the 64x64 video bucket has nonzero probability, so every decoded
    # video must be 64x64 — a 128x128 video would mean the zero-probability
    # bucket leaked into sampling.
    for req in samples:
        for entry in cast(list[dict[str, Any]], req.multi_modal_data):
            if entry["type"] == "video_url":
                assert _decoded_video_size(entry["video_url"]["url"]) == (64, 64)
@pytest.mark.benchmark
def test_sample_deterministic_with_videos(hf_tokenizer: PreTrainedTokenizerBase):
    """Test that sampling with videos is deterministic with same seed."""
    # Identical sampling parameters for both datasets; only the (equal)
    # seeds could introduce divergence.
    shared_kwargs = dict(
        tokenizer=hf_tokenizer,
        num_requests=3,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 2, "video": 2},
        bucket_config={
            (64, 64, 1): 0.3,  # Images
            (64, 64, 8): 0.7,  # Videos
        },
        input_len=20,
        output_len=5,
    )
    first = RandomMultiModalDataset(random_seed=123).sample(**shared_kwargs)
    second = RandomMultiModalDataset(random_seed=123).sample(**shared_kwargs)
    assert len(first) == len(second)
    # Equal seeds must yield identical multimodal payloads.
    for a, b in zip(first, second):
        assert a.multi_modal_data == b.multi_modal_data
@pytest.mark.benchmark
def test_sample_different_seeds_produce_different_videos(
    hf_tokenizer: PreTrainedTokenizerBase,
):
    """Test that different seeds produce different video content."""
    # Same sampling parameters; only the seeds differ.
    common_kwargs = dict(
        tokenizer=hf_tokenizer,
        num_requests=2,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={"image": 0, "video": 1},
        bucket_config={(64, 64, 8): 1.0},  # Only videos
        input_len=20,
        output_len=5,
    )
    first = RandomMultiModalDataset(random_seed=123).sample(**common_kwargs)
    second = RandomMultiModalDataset(random_seed=456).sample(**common_kwargs)
    # Different seeds must not reproduce the same encoded video bytes.
    for a, b in zip(first, second):
        items_a = cast(list[dict[str, Any]], a.multi_modal_data)
        items_b = cast(list[dict[str, Any]], b.multi_modal_data)
        assert len(items_a) == len(items_b) == 1
        assert items_a[0]["video_url"]["url"] != items_b[0]["video_url"]["url"]

View File

@@ -0,0 +1,77 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
from ..utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM OpenAI-compatible server for the bench-serve tests."""
    # Keep startup cheap: small context, eager execution, dummy weights.
    server_args = [
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
@pytest.mark.benchmark
def test_bench_serve(server):
    """Smoke-test `vllm bench serve` against the module-scoped server."""
    # Assemble the CLI invocation option group by option group.
    command = ["vllm", "bench", "serve"]
    command += ["--model", MODEL_NAME]
    command += ["--host", server.host, "--port", str(server.port)]
    command += ["--dataset-name", "random"]
    command += ["--random-input-len", "32"]
    command += ["--random-output-len", "4"]
    command += ["--num-prompts", "5"]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface the benchmark's own output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)
    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
@pytest.mark.benchmark
def test_bench_serve_chat(server):
    """Smoke-test `vllm bench serve` through the chat-completions endpoint."""
    # Same invocation as the plain serve test, plus the chat endpoint/backend.
    command = ["vllm", "bench", "serve"]
    command += ["--model", MODEL_NAME]
    command += ["--host", server.host, "--port", str(server.port)]
    command += ["--dataset-name", "random"]
    command += ["--random-input-len", "32"]
    command += ["--random-output-len", "4"]
    command += ["--num-prompts", "5"]
    command += ["--endpoint", "/v1/chat/completions"]
    command += ["--backend", "openai-chat"]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface the benchmark's own output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)
    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"

View File

@@ -0,0 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_throughput():
    """Smoke-test the `vllm bench throughput` CLI end to end."""
    # Minimal workload: tiny input/output lengths, eager mode, dummy weights.
    command = ["vllm", "bench", "throughput"]
    command += ["--model", MODEL_NAME]
    command += ["--input-len", "32"]
    command += ["--output-len", "1"]
    command += ["--enforce-eager"]
    command += ["--load-format", "dummy"]
    result = subprocess.run(command, capture_output=True, text=True)
    # Surface the benchmark's own output in the pytest log for debugging.
    print(result.stdout)
    print(result.stderr)
    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"