Files
sglang/test/srt/cpu/test_intel_amx_attention_backend_c.py
2025-10-15 19:22:32 -07:00

54 lines
1.2 KiB
Python

"""
For intel_amx attention backend w8a8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_2.TestIntelAMXAttnBackendQuant.test_latency_w8a8_default_model
"""
import unittest
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
CustomTestCase,
intel_amx_benchmark,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
@intel_amx_benchmark(
extra_args=[
"--batch-size",
"4",
"--quantization",
"w8a8_int8",
"--mem-fraction-static",
"0.1",
],
min_throughput=100,
)
def test_latency_w8a8_default_model(self):
return DEFAULT_MODEL_NAME_FOR_TEST_W8A8
@intel_amx_benchmark(
extra_args=[
"--batch-size",
"4",
"--quantization",
"w8a8_int8",
"--mem-fraction-static",
"0.9",
"--max-total-tokens",
"65536",
"--tp",
"6",
],
min_throughput=100,
)
def test_latency_w8a8_moe_model(self):
return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
if __name__ == "__main__":
unittest.main()