54 lines
1.2 KiB
Python
54 lines
1.2 KiB
Python
"""
|
|
For intel_amx attention backend w8a8 tests
|
|
Usage:
|
|
python3 -m unittest test_intel_amx_attention_backend_2.TestIntelAMXAttnBackendQuant.test_latency_w8a8_default_model
|
|
"""
|
|
|
|
import unittest
|
|
|
|
from sglang.test.test_utils import (
|
|
DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
|
|
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
|
|
CustomTestCase,
|
|
intel_amx_benchmark,
|
|
)
|
|
|
|
|
|
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """Latency tests for w8a8-quantized models on the intel_amx backend.

    Each test method only returns the name of the model under test; the
    ``intel_amx_benchmark`` decorator is handed the extra command-line
    arguments and the ``min_throughput`` value for that model.
    NOTE(review): the decorator presumably launches the benchmark and checks
    the measured throughput against ``min_throughput`` -- confirm against
    ``sglang.test.test_utils``.
    """

    # Default w8a8 model: int8 quantization with a small static memory
    # fraction.
    @intel_amx_benchmark(
        extra_args=[
            "--batch-size", "4",
            "--quantization", "w8a8_int8",
            "--mem-fraction-static", "0.1",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    # MoE w8a8 model: same quantization, but with a larger static memory
    # fraction, an explicit total-token limit and a "--tp" value of 6.
    @intel_amx_benchmark(
        extra_args=[
            "--batch-size", "4",
            "--quantization", "w8a8_int8",
            "--mem-fraction-static", "0.9",
            "--max-total-tokens", "65536",
            "--tp", "6",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
|
|
|
|
|
|
# Script entry point: run every test case in this module through unittest's
# command-line runner when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
|