From 2cadd51d11a7fddf7c15833f6fca617428af7ef2 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sat, 8 Mar 2025 05:23:06 -0800 Subject: [PATCH] Test no vllm custom allreduce (#4210) --- .github/workflows/pr-test.yml | 2 ++ test/srt/test_bench_one_batch.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 225c215c8..5ac065973 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -269,6 +269,8 @@ jobs: cd test/srt python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 10 run: | diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index 1d50b5747..f4140b89f 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -11,7 +11,9 @@ from sglang.test.test_utils import ( class TestBenchOneBatch(unittest.TestCase): def test_bs1(self): - output_throughput = run_bench_one_batch(DEFAULT_MODEL_NAME_FOR_TEST, []) + output_throughput = run_bench_one_batch( + DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"] + ) if is_in_ci(): write_github_step_summary( @@ -22,7 +24,7 @@ class TestBenchOneBatch(unittest.TestCase): def test_moe_tp2_bs1(self): output_throughput = run_bench_one_batch( - DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2"] + DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"] ) if is_in_ci():