Files
sglang/test/srt/xpu/test_intel_xpu_backend.py
2025-10-21 11:41:28 +08:00

66 lines
1.7 KiB
Python

"""
Usage:
python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
"""
import os
import unittest
from functools import wraps
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
CustomTestCase,
is_in_ci,
run_bench_one_batch,
)
def intel_xpu_benchmark(extra_args=None, min_throughput=None):
def decorator(test_func):
@wraps(test_func)
def wrapper(self):
common_args = [
"--disable-radix",
"--trust-remote-code",
"--mem-fraction-static",
"0.3",
"--batch-size",
"1",
"--device",
"xpu",
]
full_args = common_args + (extra_args or [])
model = test_func(self)
prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
model, full_args
)
print(f"{model=}")
print(f"{prefill_latency=}")
print(f"{decode_throughput=}")
print(f"{decode_latency=}")
if is_in_ci() and min_throughput is not None:
self.assertGreater(decode_throughput, min_throughput)
return wrapper
return decorator
class TestIntelXPUBackend(CustomTestCase):
@intel_xpu_benchmark(min_throughput=10)
def test_latency_qwen_model(self):
return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
@intel_xpu_benchmark(["--attention-backend", "intel_xpu", "--page-size", "128"])
def test_attention_backend(self):
return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE
if __name__ == "__main__":
unittest.main()