From b6256e8bc98b6f3c8991257a7ab139ba969b4345 Mon Sep 17 00:00:00 2001 From: starmountain1997 <77533802+starmountain1997@users.noreply.github.com> Date: Tue, 3 Feb 2026 08:42:58 +0800 Subject: [PATCH] Revert "[CI] fix DS3.2 single node cudagraph_sizes config (#6241)" (#6497) # What this PR does / why we need it? This PR reverts commit 8134146ab62f90badfb6bde04cc2b4a44d9aeb13, which modified the DeepSeek V3.2 (W8A8) single-node nightly test configuration, as there is no constraint between tp_size and MTP. # Does this PR introduce any user-facing change? No. This PR only affects CI/CD test configurations and does not introduce any user-facing changes. # How was this patch tested? N/A for a revert PR. The changes restore the previously known-working configuration. - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0 Signed-off-by: guozr Co-authored-by: guozr --- .../single_node/models/test_deepseek_v3_2_w8a8.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py index 30cc0020..7559e2da 100644 --- a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py +++ b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py @@ -54,7 +54,7 @@ aisbench_cases = [{ "max_out_len": 1500, "batch_size": 4, "request_rate": 11.2, - "baseline": 110.5681, + "baseline": 134, "threshold": 0.97 }] @@ -80,12 +80,13 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None: str(tp_size), "--data-parallel-size", str(dp_size), "--port", str(port), "--max-model-len", "8192", "--max-num-batched-tokens", - "8192", "--max-num-seqs", "8", "--trust-remote-code", "--quantization", - "ascend", "--gpu-memory-utilization", "0.98", "--compilation-config", - '{"cudagraph_capture_sizes":[8, 16, 24, 32], "cudagraph_mode":"FULL_DECODE_ONLY"}', "--speculative-config", - '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}', - "--additional-config", '{"layer_sharding": ["q_b_proj", "o_proj"]}', + "8192", "--max-num-seqs", "4", 
"--trust-remote-code", "--quantization", + "ascend", "--gpu-memory-utilization", "0.92", "--compilation-config", + '{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}', "--speculative-config", - '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}', - "--additional-config", '{"layer_sharding": ["q_b_proj", "o_proj"]}', + '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}', + "--additional-config", + '{"layer_sharding": ["q_b_proj", "o_proj"]}', "--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32" ] request_keyword_args: dict[str, Any] = {