From 325cb16e3fa1da0838009fb58e1d9ce447555853 Mon Sep 17 00:00:00 2001 From: dsxsteven <36877507+dsxsteven@users.noreply.github.com> Date: Wed, 28 Jan 2026 16:36:24 +0800 Subject: [PATCH] [BugFix][CI] Fix DeepSeek-R1-W8A8-longseq nightly CI (#6297) ### What this PR does / why we need it? The precision issue arose because the kv cache of the p-node had not been fetched for an extended period (>6 min) and was forcibly freed. To avoid this problem, the batch size was reduced and the timeout period was also extended. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd Signed-off-by: dsxsteven --- .../multi_node/config/DeepSeek-R1-W8A8-longseq.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml index 5cbc108c..1021a8db 100644 --- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml +++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml @@ -13,6 +13,7 @@ env_common: HCCL_DETERMINISTIC: True TASK_QUEUE_ENABLE: 1 HCCL_OP_RETRY_ENABLE: "L0:0, L1:0" + VLLM_NIXL_ABORT_REQUEST_TIMEOUT: 300000 disaggregated_prefill: enabled: true @@ -34,7 +35,7 @@ deployment: --enable-expert-parallel --seed 1024 --quantization ascend - --max-num-seqs 4 + --max-num-seqs 3 --max-model-len 32768 --max-num-batched-tokens 16384 --trust-remote-code @@ -71,12 +72,12 @@ deployment: --enable-expert-parallel --seed 1024 --quantization ascend - --max-num-seqs 4 + --max-num-seqs 8 --max-model-len 32768 --max-num-batched-tokens 256 --trust-remote-code --gpu-memory-utilization 0.85 - --compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}' + --compilation_config '{"cudagraph_capture_sizes":[4,8,16,32],"cudagraph_mode": 
"FULL_DECODE_ONLY"}' --enable-chunked-prefill --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' --kv-transfer-config @@ -103,6 +104,6 @@ benchmarks: request_conf: vllm_api_general_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt max_out_len: 24576 - batch_size: 512 + batch_size: 16 baseline: 95 threshold: 5