diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 95a842684..2e75909e9 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -16,7 +16,7 @@ import unittest from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from datetime import datetime -from functools import partial +from functools import partial, wraps from pathlib import Path from types import SimpleNamespace from typing import Any, Awaitable, Callable, List, Optional, Tuple @@ -1807,3 +1807,33 @@ def write_results_to_json(model, metrics, mode="a"): with open("results.json", "w") as f: json.dump(existing_results, f, indent=2) + + +def intel_amx_benchmark(extra_args=None, min_throughput=None): + def decorator(test_func): + @wraps(test_func) + def wrapper(self): + common_args = [ + "--attention-backend", + "intel_amx", + "--disable-radix", + "--trust-remote-code", + ] + full_args = common_args + (extra_args or []) + + model = test_func(self) + prefill_latency, decode_throughput, decode_latency = run_bench_one_batch( + model, full_args + ) + + print(f"{model=}") + print(f"{prefill_latency=}") + print(f"{decode_throughput=}") + print(f"{decode_latency=}") + + if is_in_ci() and min_throughput is not None: + self.assertGreater(decode_throughput, min_throughput) + + return wrapper + + return decorator diff --git a/test/srt/test_cpu_graph.py b/test/srt/cpu/test_cpu_graph.py similarity index 97% rename from test/srt/test_cpu_graph.py rename to test/srt/cpu/test_cpu_graph.py index 4e3c40539..1adc0e893 100644 --- a/test/srt/test_cpu_graph.py +++ b/test/srt/cpu/test_cpu_graph.py @@ -8,8 +8,6 @@ import os import unittest from types import SimpleNamespace -from test_intel_amx_attention_backend import intel_amx_benchmark - from sglang.srt.utils import get_cpu_ids_by_node, kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( @@ -17,6 +15,7 @@ from sglang.test.test_utils import ( 
python3 -m unittest test_intel_amx_attention_backend_a.TestIntelAMXAttnBackend.test_latency_default_model
python3 -m unittest test_intel_amx_attention_backend_b.TestIntelAMXAttnBackendQuant.test_latency_fp8_qwen
+ def test_latency_w8a8_default_model(self): + return DEFAULT_MODEL_NAME_FOR_TEST_W8A8 + + @intel_amx_benchmark( + extra_args=[ + "--batch-size", + "4", + "--quantization", + "w8a8_int8", + "--mem-fraction-static", + "0.9", + "--max-total-tokens", + "65536", + "--tp", + "6", + ], + min_throughput=100, + ) + def test_latency_w8a8_moe_model(self): + return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 781803c1b..37eeecdc9 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -312,8 +312,10 @@ suite_xeon = { TestFile("cpu/test_rope.py"), TestFile("cpu/test_shared_expert.py"), TestFile("cpu/test_topk.py"), - TestFile("test_cpu_graph.py"), - TestFile("test_intel_amx_attention_backend.py"), + TestFile("cpu/test_cpu_graph.py"), + TestFile("cpu/test_intel_amx_attention_backend_a.py"), + TestFile("cpu/test_intel_amx_attention_backend_b.py"), + TestFile("cpu/test_intel_amx_attention_backend_c.py"), ], } diff --git a/test/srt/test_intel_amx_attention_backend.py b/test/srt/test_intel_amx_attention_backend.py deleted file mode 100644 index 5534c57f9..000000000 --- a/test/srt/test_intel_amx_attention_backend.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Usage: -python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu -""" - -import unittest -from functools import wraps -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MLA_MODEL_NAME_FOR_TEST, - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE, - DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8, - DEFAULT_MODEL_NAME_FOR_TEST_W8A8, - DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - CustomTestCase, - is_in_ci, - popen_launch_server, - run_bench_one_batch, -) - - -def 
intel_amx_benchmark(extra_args=None, min_throughput=None): - def decorator(test_func): - @wraps(test_func) - def wrapper(self): - common_args = [ - "--attention-backend", - "intel_amx", - "--disable-radix", - "--trust-remote-code", - ] - full_args = common_args + (extra_args or []) - - model = test_func(self) - prefill_latency, decode_throughput, decode_latency = run_bench_one_batch( - model, full_args - ) - - print(f"{model=}") - print(f"{prefill_latency=}") - print(f"{decode_throughput=}") - print(f"{decode_latency=}") - - if is_in_ci() and min_throughput is not None: - self.assertGreater(decode_throughput, min_throughput) - - return wrapper - - return decorator - - -class TestIntelAMXAttnBackend(CustomTestCase): - - @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=10) - def test_latency_mla_model(self): - return DEFAULT_MLA_MODEL_NAME_FOR_TEST - - @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=40) - def test_latency_default_model(self): - return DEFAULT_MODEL_NAME_FOR_TEST - - @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=150) - def test_latency_fp8_qwen(self): - return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 - - @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=50) - def test_latency_fp8_moe_model(self): - return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE - - @intel_amx_benchmark( - extra_args=["--batch-size", "4", "--quantization", "w8a8_int8"], - min_throughput=100, - ) - def test_latency_w8a8_default_model(self): - return DEFAULT_MODEL_NAME_FOR_TEST_W8A8 - - @intel_amx_benchmark( - extra_args=[ - "--batch-size", - "4", - "--quantization", - "w8a8_int8", - "--mem-fraction-static", - "0.9", - "--max-total-tokens", - "65536", - "--tp", - "6", - ], - min_throughput=100, - ) - def test_latency_w8a8_moe_model(self): - return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE - - def test_mmlu(self): - model = DEFAULT_MLA_MODEL_NAME_FOR_TEST - base_url = DEFAULT_URL_FOR_TEST - process = 
popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--attention-backend", - "intel_amx", - "--mem-fraction-static", - "0.3", - "--disable-radix", - "--trust-remote-code", - "--disable-overlap-schedule", - ], - ) - - try: - args = SimpleNamespace( - base_url=base_url, - model=model, - eval_name="mmlu", - num_examples=64, - num_threads=32, - ) - metrics = run_eval(args) - if is_in_ci(): - self.assertGreater(metrics["score"], 0.45) - finally: - kill_process_tree(process.pid) - - -if __name__ == "__main__": - unittest.main()