diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 62c61f86e..f9fac05ad 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -123,7 +123,7 @@ jobs:
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

       - name: Benchmark online latency
         timeout-minutes: 10
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index a15c2f5b0..ea7fa30b3 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -1970,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "MixtralForCausalLM",
         "Gemma2ForCausalLM",
         "Gemma3ForConditionalGeneration",
     }
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index a74f9c160..659dd2814 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -64,7 +64,7 @@ suites = {
         TestFile("test_retract_decode.py", 54),
         TestFile("test_server_args.py", 1),
         TestFile("test_skip_tokenizer_init.py", 117),
-        TestFile("test_srt_engine.py", 237),
+        TestFile("test_srt_engine.py", 261),
         TestFile("test_srt_endpoint.py", 130),
         TestFile("test_torch_compile.py", 76),
         TestFile("test_torch_compile_moe.py", 172),
diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py
index 9f5a56566..1d3e972ec 100644
--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -4,7 +4,6 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     CustomTestCase,
-    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
@@ -12,15 +11,15 @@ from sglang.test.test_utils import (


 class TestBenchOneBatch(CustomTestCase):
-    def test_bs1(self):
+    def test_bs1_default(self):
         output_throughput = run_bench_one_batch(
             DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
         )

         if is_in_ci():
             write_github_step_summary(
-                f"### test_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"### test_bs1_default (llama-3.1-8b)\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
             )
         self.assertGreater(output_throughput, 135)
@@ -32,9 +31,9 @@ class TestBenchOneBatch(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_tp2_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
             )
-        self.assertGreater(output_throughput, 124)
+        self.assertGreater(output_throughput, 125)

     def test_torch_compile_tp2_bs1(self):
         output_throughput = run_bench_one_batch(
@@ -45,9 +44,9 @@ class TestBenchOneBatch(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_torch_compile_tp2_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
             )
-        self.assertGreater(output_throughput, 225)
+        self.assertGreater(output_throughput, 220)


 if __name__ == "__main__":
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
index e22df62a9..645394bf0 100644
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -98,7 +98,7 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_with_triton_attention_backend\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-        self.assertGreater(res["output_throughput"], 3600)
+        self.assertGreater(res["output_throughput"], 3700)

     def test_offline_throughput_default_fp8(self):
         res = run_bench_serving(
@@ -113,7 +113,7 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_default_fp8\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-        self.assertGreater(res["output_throughput"], 4200)
+        self.assertGreater(res["output_throughput"], 4300)

     def test_online_latency_default(self):
         res = run_bench_serving(
@@ -126,7 +126,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_default\n"
-                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
             )
         self.assertLess(res["median_e2e_latency_ms"], 11000)
         self.assertLess(res["median_ttft_ms"], 86)
@@ -161,8 +161,8 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_eagle\n"
-                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
-                f'accept_length : {res["accept_length"]:.2f} \n'
+                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f'accept_length: {res["accept_length"]:.2f} \n'
             )
         self.assertLess(res["median_e2e_latency_ms"], 900)
         self.assertGreater(res["accept_length"], 3.0)
diff --git a/test/srt/test_full_deepseek_v3.py b/test/srt/test_full_deepseek_v3.py
index 552e0e36b..c5de0e562 100644
--- a/test/srt/test_full_deepseek_v3.py
+++ b/test/srt/test_full_deepseek_v3.py
@@ -2,7 +2,6 @@ import unittest
 from types import SimpleNamespace

 import requests
-import torch

 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -49,7 +48,7 @@ class TestDeepseekV3(CustomTestCase):
         metrics = run_eval_few_shot_gsm8k(args)
         print(f"{metrics=}")

-        self.assertGreater(metrics["accuracy"], 0.94)
+        self.assertGreater(metrics["accuracy"], 0.935)


 class TestBenchOneBatch(CustomTestCase):
@@ -58,11 +57,11 @@ class TestBenchOneBatch(CustomTestCase):
             FULL_DEEPSEEK_V3_MODEL_PATH,
             ["--trust-remote-code", "--tp", "8", "--cuda-graph-max-bs", "2"],
         )
-        print(f"output_throughput : {output_throughput:.2f} token/s")
+        print(f"{output_throughput=:.2f} token/s")
+
         if is_in_ci():
             write_github_step_summary(
-                f"### test_bs1\n"
-                f"output_throughput : {output_throughput:.2f} token/s\n"
+                f"### test_bs1 (deepseek-v3)\n" f"{output_throughput=:.2f} token/s\n"
             )
         self.assertGreater(output_throughput, 70)
@@ -121,6 +120,13 @@ class TestDeepseekV3MTP(CustomTestCase):
         print(f"{avg_spec_accept_length=}")
         self.assertGreater(avg_spec_accept_length, 3.2)

+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+                f"{avg_spec_accept_length=:.2f}\n"
+            )
+

 if __name__ == "__main__":
     unittest.main()