From 4de039534337857768ac80b1d11cd8ed4aede04b Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Fri, 4 Jul 2025 13:25:50 +0800 Subject: [PATCH] Add V2-lite model test (#7390) Co-authored-by: DiweiSun <105627594+DiweiSun@users.noreply.github.com> --- python/sglang/test/test_utils.py | 18 ++++- test/srt/models/test_dummy_grok_models.py | 2 +- test/srt/run_suite.py | 1 + test/srt/test_bench_one_batch.py | 2 +- test/srt/test_flashmla.py | 2 +- test/srt/test_intel_amx_attention_backend.py | 79 ++++++++++++++++++++ 6 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 test/srt/test_intel_amx_attention_backend.py diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 7c335c79d..23d44eb35 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -5,6 +5,7 @@ import copy import logging import os import random +import re import subprocess import threading import time @@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args): print(f"Output: {output}", flush=True) print(f"Error: {error}", flush=True) - lastline = output.split("\n")[-3] - output_throughput = float(lastline.split(" ")[-2]) + # Return prefill_latency, decode_throughput, decode_latency + prefill_line = output.split("\n")[-9] + decode_line = output.split("\n")[-3] + pattern = ( + r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)" + ) + match = re.search(pattern, prefill_line) + if match: + prefill_latency = float(match.group("latency")) + match = re.search(pattern, decode_line) + if match: + decode_latency = float(match.group("latency")) + decode_throughput = float(match.group("throughput")) finally: kill_process_tree(process.pid) - return output_throughput + return prefill_latency, decode_throughput, decode_latency def run_bench_offline_throughput(model, other_args): diff --git a/test/srt/models/test_dummy_grok_models.py b/test/srt/models/test_dummy_grok_models.py index 290c49164..bebf9949c 100644 --- 
a/test/srt/models/test_dummy_grok_models.py +++ b/test/srt/models/test_dummy_grok_models.py @@ -6,7 +6,7 @@ from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch class TestDummyGrok1(CustomTestCase): def test_dummy_grok_1(self): - output_throughput = run_bench_one_batch( + _, output_throughput, _ = run_bench_one_batch( None, [ "--model", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b98e37ca8..33935fc14 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -198,6 +198,7 @@ suites = { TestFile("cpu/test_rope.py"), TestFile("cpu/test_shared_expert.py"), TestFile("cpu/test_topk.py"), + TestFile("test_intel_amx_attention_backend.py"), ], "nightly": [ TestFile("test_nightly_gsm8k_eval.py"), diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index 2de46ee51..7ec33a559 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -20,7 +20,7 @@ from sglang.test.test_utils import ( class TestBenchOneBatch(CustomTestCase): def test_bs1_small(self): - output_throughput = run_bench_one_batch( + _, output_throughput, _ = run_bench_one_batch( DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"] ) self.assertGreater(output_throughput, 50) diff --git a/test/srt/test_flashmla.py b/test/srt/test_flashmla.py index bc17b3119..184e20ff2 100644 --- a/test/srt/test_flashmla.py +++ b/test/srt/test_flashmla.py @@ -67,7 +67,7 @@ class TestFlashMLAAttnBackend(unittest.TestCase): class TestFlashMLAAttnLatency(unittest.TestCase): def test_latency(self): - output_throughput = run_bench_one_batch( + _, output_throughput, _ = run_bench_one_batch( DEFAULT_MODEL_NAME_FOR_TEST_MLA, [ "--attention-backend", diff --git a/test/srt/test_intel_amx_attention_backend.py b/test/srt/test_intel_amx_attention_backend.py new file mode 100644 index 000000000..4c2bc130e --- /dev/null +++ b/test/srt/test_intel_amx_attention_backend.py @@ -0,0 +1,79 @@ +""" +Usage: +python3 -m unittest 
test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu +""" + +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MLA_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_ci, + popen_launch_server, + run_bench_one_batch, +) + + +class TestIntelAMXAttnBackend(CustomTestCase): + def test_latency(self): + prefill_latency, decode_throughput, decode_latency = run_bench_one_batch( + DEFAULT_MLA_MODEL_NAME_FOR_TEST, + [ + "--attention-backend", + "intel_amx", + "--mem-fraction-static", + "0.05", + "--disable-radix", + "--trust-remote-code", + "--batch-size", + "4", + ], + ) + + print(f"{prefill_latency=}") + print(f"{decode_throughput=}") + print(f"{decode_latency=}") + + if is_in_ci(): + self.assertGreater(decode_throughput, 10) + + def test_mmlu(self): + model = DEFAULT_MLA_MODEL_NAME_FOR_TEST + base_url = DEFAULT_URL_FOR_TEST + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--attention-backend", + "intel_amx", + "--mem-fraction-static", + "0.05", + "--disable-radix", + "--trust-remote-code", + "--disable-overlap-schedule", + ], + ) + + try: + args = SimpleNamespace( + base_url=base_url, + model=model, + eval_name="mmlu", + num_examples=64, + num_threads=32, + ) + + metrics = run_eval(args) + self.assertGreater(metrics["score"], 0.5) + finally: + kill_process_tree(process.pid) + + +if __name__ == "__main__": + unittest.main()