Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -0,0 +1,150 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""A basic performance regression test for TPUs
+
+Run `pytest tests/v1/tpu/test_perf.py`.
+"""
+
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+from vllm.platforms import current_platform
+from vllm.sampling_params import SamplingParams
+from vllm.tokenizers import get_tokenizer
+
+if TYPE_CHECKING:
+    from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
+
+
+@dataclass
+class TestParams:
+    model: str
+    num_prompts: int
+    prefix_len: int
+    decode_len: int
+    expected_avg_time: float
+    err_tol: float
+
+
+TEST_PARAMS = [
+    # TODO: Cannot run a series of tests because:
+    #   RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:
+    #   open(/dev/vfio/0): Device or resource busy: Device or resource busy;
+    #   Couldn't open iommu group /dev/vfio/0
+    # => Investigate
+    # TestParams(
+    #     model="Qwen/Qwen2.5-1.5B-Instruct",
+    #     num_prompts=1,
+    #     prefix_len=10,
+    #     decode_len=5,
+    #     expected_avg_time=0.03,
+    #     err_tol=0.01,
+    # ),
+    # TestParams(
+    #     model="Qwen/Qwen2.5-1.5B-Instruct",
+    #     num_prompts=10,
+    #     prefix_len=100,
+    #     decode_len=50,
+    #     expected_avg_time=0.234,
+    #     err_tol=0.020,
+    # ),
+    TestParams(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        num_prompts=64,
+        prefix_len=500,
+        decode_len=50,
+        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
+        # tpu: v5lite (old vllm CI/CD)
+        # expected_avg_time=1.4,
+        # err_tol=0.30,
+        # (This is the active CI/CD instance)
+        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
+        # tpu: v6e (current vllm CI/CD)
+        expected_avg_time=1.7,  # measured with VLLM_XLA_CACHE_PATH=
+        err_tol=0.20,
+    ),
+]
+
+NUM_WARMUPS = 5
+NUM_RUNS = 10
+
+MAX_MODEL_LEN = 1024
+MAX_NUM_SEQS = 32
+GPU_UTIL = 0.9
+
+
+@pytest.mark.skipif(
+    not current_platform.is_tpu(),
+    reason="This is a basic performance test for TPU only",
+)
+@pytest.mark.parametrize("params", TEST_PARAMS)
+def test_perf(
+    vllm_runner: type[VllmRunner],
+    params: TestParams,
+) -> None:
+    tokenizer = get_tokenizer(
+        params.model, tokenizer_mode="auto", trust_remote_code=True
+    )
+
+    prompts = []
+    for i in range(params.num_prompts):
+        prefix_token_ids = np.random.randint(
+            0, tokenizer.vocab_size, size=params.prefix_len
+        ).tolist()
+        prompt = tokenizer.decode(prefix_token_ids)
+        prompts.append(prompt)
+
+    print(
+        "-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(
+            len(prompts), params.prefix_len, params.decode_len
+        )
+    )
+
+    sampling_params = SamplingParams(
+        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+    )
+
+    with vllm_runner(
+        params.model,
+        max_num_batched_tokens=MAX_MODEL_LEN,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=MAX_NUM_SEQS,
+        gpu_memory_utilization=GPU_UTIL,
+        enforce_eager=False,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        print("  -- Warmup / Compile")
+        for i in range(NUM_WARMUPS):
+            _ = vllm_model.generate(prompts, sampling_params)
+
+        print("  -- Benchmarking... ")
+        times = []
+        for i in range(NUM_RUNS):
+            start_time = time.time()
+            _ = vllm_model.generate(prompts, sampling_params)
+            times.append(time.time() - start_time)
+
+        avg_time = sum(times) / len(times)
+
+        print("  -- avg_time = {}".format(avg_time))
+        print(
+            "  -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol
+            )
+        )
+        diff = avg_time - params.expected_avg_time
+        ok = diff < params.err_tol
+        if diff < -params.err_tol:
+            print(
+                "  !! WARNING !! Performance has improved by {}, "
+                "it may be necessary to fine-tune the "
+                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
+            )
+
+        assert ok, " !! ERROR !! Regression detected"