Split test_intel_amx_attention_backend.py to avoid CI timeouts (#11370)

Co-authored-by: Ma Mingfei <mingfei.ma@intel.com>
This commit is contained in:
YanbingJiang
2025-10-16 10:22:32 +08:00
committed by GitHub
parent 476c67d7fc
commit cbac499750
7 changed files with 197 additions and 140 deletions

View File

@@ -16,7 +16,7 @@ import unittest
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from datetime import datetime
from functools import partial
from functools import partial, wraps
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Awaitable, Callable, List, Optional, Tuple
@@ -1807,3 +1807,33 @@ def write_results_to_json(model, metrics, mode="a"):
with open("results.json", "w") as f:
json.dump(existing_results, f, indent=2)
def intel_amx_benchmark(extra_args=None, min_throughput=None):
    """Decorator factory for intel_amx attention-backend latency tests.

    The decorated test returns a model name; the generated wrapper runs
    ``run_bench_one_batch`` on that model with the shared intel_amx CLI flags
    plus *extra_args*, prints the measured numbers, and — only in CI and only
    when *min_throughput* is given — asserts the decode throughput floor.
    """
    # Flags common to every intel_amx benchmark run; never mutated, so it is
    # safe to share one list across all wrapped tests.
    base_flags = [
        "--attention-backend",
        "intel_amx",
        "--disable-radix",
        "--trust-remote-code",
    ]

    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            bench_args = base_flags + list(extra_args or [])
            model = test_func(self)
            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
                model, bench_args
            )
            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")
            # Throughput is only enforced in CI, and only when a floor is set.
            if is_in_ci() and min_throughput is not None:
                self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator

View File

@@ -8,8 +8,6 @@ import os
import unittest
from types import SimpleNamespace
from test_intel_amx_attention_backend import intel_amx_benchmark
from sglang.srt.utils import get_cpu_ids_by_node, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
@@ -17,6 +15,7 @@ from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
intel_amx_benchmark,
is_in_ci,
popen_launch_server,
)

View File

@@ -0,0 +1,73 @@
"""
Usage:
python3 -m unittest test_intel_amx_attention_backend_a.TestIntelAMXAttnBackend.test_latency_default_model
"""
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MLA_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
intel_amx_benchmark,
is_in_ci,
popen_launch_server,
)
class TestIntelAMXAttnBackend(CustomTestCase):
    """Latency benchmarks and an MMLU accuracy check for the intel_amx backend."""

    @intel_amx_benchmark(
        extra_args="--batch-size 4 --mem-fraction-static 0.3".split(),
        min_throughput=10,
    )
    def test_latency_mla_model(self):
        # The decorator performs the benchmark; the test only names the model.
        return DEFAULT_MLA_MODEL_NAME_FOR_TEST

    @intel_amx_benchmark(
        extra_args="--batch-size 4 --mem-fraction-static 0.1".split(),
        min_throughput=40,
    )
    def test_latency_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST

    def test_mmlu(self):
        """Serve the MLA model on intel_amx and, in CI, require MMLU score > 0.45."""
        model_name = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        url = DEFAULT_URL_FOR_TEST
        server = popen_launch_server(
            model_name,
            url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--attention-backend",
                "intel_amx",
                "--mem-fraction-static",
                "0.3",
                "--disable-radix",
                "--trust-remote-code",
                "--disable-overlap-schedule",
            ],
        )
        try:
            eval_args = SimpleNamespace(
                base_url=url,
                model=model_name,
                eval_name="mmlu",
                num_examples=64,
                num_threads=32,
            )
            metrics = run_eval(eval_args)
            # The score gate only applies in CI.
            if is_in_ci():
                self.assertGreater(metrics["score"], 0.45)
        finally:
            # Always tear the server down, even if the eval raised.
            kill_process_tree(server.pid)
# Allow running this test module directly: `python3 <file>` executes the tests.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,35 @@
"""
For intel_amx attention backend FP8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_b.TestIntelAMXAttnBackendQuant.test_latency_fp8_qwen
"""
import unittest
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8,
CustomTestCase,
intel_amx_benchmark,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """FP8 latency benchmarks on the intel_amx attention backend."""

    @intel_amx_benchmark(
        extra_args="--batch-size 4 --mem-fraction-static 0.1".split(),
        min_throughput=150,
    )
    def test_latency_fp8_qwen(self):
        # The decorator runs the benchmark; the test supplies the model name.
        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8

    @intel_amx_benchmark(
        extra_args="--batch-size 4 --mem-fraction-static 0.1".split(),
        min_throughput=50,
    )
    def test_latency_fp8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
# Allow running this test module directly: `python3 <file>` executes the tests.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,53 @@
"""
For intel_amx attention backend w8a8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_c.TestIntelAMXAttnBackendQuant.test_latency_w8a8_default_model
"""
import unittest
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
CustomTestCase,
intel_amx_benchmark,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """W8A8 (int8) latency benchmarks for the intel_amx attention backend."""

    @intel_amx_benchmark(
        extra_args=(
            "--batch-size 4 --quantization w8a8_int8 --mem-fraction-static 0.1"
        ).split(),
        min_throughput=100,
    )
    def test_latency_w8a8_default_model(self):
        # The decorator runs the benchmark; the test supplies the model name.
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    @intel_amx_benchmark(
        extra_args=(
            "--batch-size 4 --quantization w8a8_int8 "
            "--mem-fraction-static 0.9 --max-total-tokens 65536 --tp 6"
        ).split(),
        min_throughput=100,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
# Allow running this test module directly: `python3 <file>` executes the tests.
if __name__ == "__main__":
    unittest.main()

View File

@@ -312,8 +312,10 @@ suite_xeon = {
TestFile("cpu/test_rope.py"),
TestFile("cpu/test_shared_expert.py"),
TestFile("cpu/test_topk.py"),
TestFile("test_cpu_graph.py"),
TestFile("test_intel_amx_attention_backend.py"),
TestFile("cpu/test_cpu_graph.py"),
TestFile("cpu/test_intel_amx_attention_backend_a.py"),
TestFile("cpu/test_intel_amx_attention_backend_b.py"),
TestFile("cpu/test_intel_amx_attention_backend_c.py"),
],
}

View File

@@ -1,135 +0,0 @@
"""
Usage:
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu
"""
import unittest
from functools import wraps
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MLA_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_one_batch,
)
def intel_amx_benchmark(extra_args=None, min_throughput=None):
    """Decorate a test so its returned model is benchmarked on intel_amx.

    The wrapper builds the shared backend flags plus *extra_args*, runs
    ``run_bench_one_batch``, prints the measurements, and — in CI, when
    *min_throughput* is provided — asserts the decode-throughput floor.
    """

    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            # Shared backend flags, with the per-test extras unpacked in place.
            cli_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
                *(extra_args or []),
            ]
            model = test_func(self)
            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
                model, cli_args
            )
            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")
            # Only gate on throughput inside CI, and only when a floor is set.
            if is_in_ci() and min_throughput is not None:
                self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator
class TestIntelAMXAttnBackend(CustomTestCase):
    """Latency benchmarks across model flavors plus an MMLU check, all on intel_amx."""

    @intel_amx_benchmark(extra_args="--batch-size 4".split(), min_throughput=10)
    def test_latency_mla_model(self):
        # Each latency test only names its model; the decorator benchmarks it.
        return DEFAULT_MLA_MODEL_NAME_FOR_TEST

    @intel_amx_benchmark(extra_args="--batch-size 4".split(), min_throughput=40)
    def test_latency_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST

    @intel_amx_benchmark(extra_args="--batch-size 4".split(), min_throughput=150)
    def test_latency_fp8_qwen(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8

    @intel_amx_benchmark(extra_args="--batch-size 4".split(), min_throughput=50)
    def test_latency_fp8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE

    @intel_amx_benchmark(
        extra_args="--batch-size 4 --quantization w8a8_int8".split(),
        min_throughput=100,
    )
    def test_latency_w8a8_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    @intel_amx_benchmark(
        extra_args=(
            "--batch-size 4 --quantization w8a8_int8 "
            "--mem-fraction-static 0.9 --max-total-tokens 65536 --tp 6"
        ).split(),
        min_throughput=100,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE

    def test_mmlu(self):
        """Serve the MLA model on intel_amx and, in CI, require MMLU score > 0.45."""
        model_name = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        url = DEFAULT_URL_FOR_TEST
        server = popen_launch_server(
            model_name,
            url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--attention-backend",
                "intel_amx",
                "--mem-fraction-static",
                "0.3",
                "--disable-radix",
                "--trust-remote-code",
                "--disable-overlap-schedule",
            ],
        )
        try:
            eval_args = SimpleNamespace(
                base_url=url,
                model=model_name,
                eval_name="mmlu",
                num_examples=64,
                num_threads=32,
            )
            metrics = run_eval(eval_args)
            # The score gate only applies in CI.
            if is_in_ci():
                self.assertGreater(metrics["score"], 0.45)
        finally:
            # Always tear the server down, even if the eval raised.
            kill_process_tree(server.pid)
# Allow running this test module directly: `python3 <file>` executes the tests.
if __name__ == "__main__":
    unittest.main()