From 8134146ab62f90badfb6bde04cc2b4a44d9aeb13 Mon Sep 17 00:00:00 2001
From: starmountain1997 <77533802+starmountain1997@users.noreply.github.com>
Date: Mon, 2 Feb 2026 11:47:32 +0800
Subject: [PATCH] [CI] fix DS3.2 single node cudagraph_sizes config (#6241)
# What this PR does / why we need it?
This PR fixes the single-node nightly test for DeepSeek V3.2 (W8A8)
model to ensure CI stability. The changes include:
1. Simplified nightly test matrix (nightly_test_a3.yaml):
- Temporarily reduced to only run deepseek3_2-w8a8 test case for
debugging
- Changed trigger from schedule/workflow_dispatch to support
push/pull_request for faster iteration
2. Updated DeepSeek V3.2 test configuration
(test_deepseek_v3_2_w8a8.py):
- Adjusted cudagraph_capture_sizes from [3, 6, 9, 12] to [8, 16, 24, 32]
for better performance
- Increased max-num-seqs from 4 to 8
- Increased gpu-memory-utilization from 0.92 to 0.98
- Increased num_speculative_tokens from 2 to 3
3. Added PR checkout step (_e2e_nightly_single_node.yaml):
- Added ability to checkout a specific PR (#6241) for testing
# Does this PR introduce any user-facing change?
No. This PR only affects CI/CD test configurations and does not
introduce any user-facing changes.
# How was this patch tested?
Mock nightly test has passed, see
[here](https://github.com/vllm-project/vllm-ascend/actions/runs/21574655952/job/62159656622?pr=6241).
- vLLM version: v0.14.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60
---------
Signed-off-by: guozr
Co-authored-by: guozr
---
.../single_node/models/test_deepseek_v3_2_w8a8.py | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
index 7559e2da..30cc0020 100644
--- a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
+++ b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
@@ -54,7 +54,7 @@ aisbench_cases = [{
"max_out_len": 1500,
"batch_size": 4,
"request_rate": 11.2,
- "baseline": 134,
+ "baseline": 110.5681,
"threshold": 0.97
}]
@@ -80,13 +80,12 @@ async def test_models(model: str, tp_size: int, dp_size: int) -> None:
str(tp_size), "--data-parallel-size",
str(dp_size), "--port",
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
- "8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization",
- "ascend", "--gpu-memory-utilization", "0.92", "--compilation-config",
- '{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
+ "8192", "--max-num-seqs", "8", "--trust-remote-code", "--quantization",
+ "ascend", "--gpu-memory-utilization", "0.98", "--compilation-config",
+ '{"cudagraph_capture_sizes":[8, 16, 24, 32], "cudagraph_mode":"FULL_DECODE_ONLY"}',
"--speculative-config",
- '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
- "--additional-config",
- '{"layer_sharding": ["q_b_proj", "o_proj"]}',
+ '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}',
+ "--additional-config", '{"layer_sharding": ["q_b_proj", "o_proj"]}',
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
]
request_keyword_args: dict[str, Any] = {