From 2cb9195ff0fce9f4c1c8360b56121d6588684faf Mon Sep 17 00:00:00 2001
From: Nagisa125 <166619298+Nagisa125@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:28:46 +0800
Subject: [PATCH] [Releases/v0.18.0][CI] Updated the parameters for the
 single-node test to fix the OOM issue for DeepSeek-V3.2 (#7862)

### What this PR does / why we need it?
Fix the OOM (Out-of-Memory) error in the single-node-deepseek-v3-2-w8a8
nightly test of vllm-ascend:

- Reduced the value of HCCL_BUFFSIZE

- Lowered the gpu-memory-utilization

Optimize service-side performance:
Updated service-oriented configuration parameters (e.g., max-num-seqs,
cudagraph_capture_sizes, batch_size) to improve the inference
performance,so that the performance is closer to the optimal performance
of the current mainline.
Align performance baseline with main branch:
Updated the performance baseline according to the latest performance
data

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
The test has passed.

https://github.com/vllm-project/vllm-ascend/actions/runs/23734079080/job/69134387320?pr=7793

---------

Signed-off-by: wyh145 <1987244901@qq.com>
---
 .../models/configs/DeepSeek-V3.2-W8A8.yaml       | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml
index b2374079..1ab0b3ea 100644
--- a/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml
+++ b/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml
@@ -9,7 +9,7 @@ test_cases:
       HCCL_OP_EXPANSION_MODE: "AIV"
       OMP_PROC_BIND: "false"
       OMP_NUM_THREADS: "1"
-      HCCL_BUFFSIZE: "1024"
+      HCCL_BUFFSIZE: "256"
       VLLM_ASCEND_ENABLE_MLAPO: "1"
       PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
       VLLM_ASCEND_ENABLE_FLASHCOMM1: "1"
@@ -28,14 +28,14 @@ test_cases:
       - "--max-num-batched-tokens"
       - "8192"
       - "--max-num-seqs"
-      - "4"
+      - "8"
       - "--trust-remote-code"
       - "--quantization"
       - "ascend"
       - "--gpu-memory-utilization"
-      - "0.98"
+      - "0.93"
       - "--compilation-config"
-      - '{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}'
+      - '{"cudagraph_capture_sizes":[4, 8, 16, 20, 24, 28, 32], "cudagraph_mode":"FULL_DECODE_ONLY"}'
       - "--speculative-config"
       - '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
       - "--additional-config"
@@ -63,16 +63,16 @@ test_cases:
         max_out_len: 1500
         batch_size: 1
         request_rate: 11.2
-        baseline: 134
+        baseline: 1
         threshold: 0.97
       perf_2:
         case_type: performance
         dataset_path: vllm-ascend/GSM8K-in3500-bs400
         request_conf: vllm_api_stream_chat
         dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
-        num_prompts: 100
+        num_prompts: 128
         max_out_len: 1500
-        batch_size: 4
+        batch_size: 32
         request_rate: 11.2
-        baseline: 134
+        baseline: 210
         threshold: 0.97