From 325cb16e3fa1da0838009fb58e1d9ce447555853 Mon Sep 17 00:00:00 2001 From: dsxsteven <36877507+dsxsteven@users.noreply.github.com> Date: Wed, 28 Jan 2026 16:36:24 +0800 Subject: [PATCH] [BugFix][CI] Fix DeepSeek-R1-W8A8-longseq nightly CI (#6297) ### What this PR does / why we need it? The precision issue arose because the kv cache of the p-node had not been fetched for an extended period (>6 min) and was forcibly freed. To avoid this problem, the batch size was reduced and the timeout period was also extended. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd Signed-off-by: dsxsteven --- .../multi_node/config/DeepSeek-R1-W8A8-longseq.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml index 5cbc108c..1021a8db 100644 --- a/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml +++ b/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml @@ -13,6 +13,7 @@ env_common: HCCL_DETERMINISTIC: True TASK_QUEUE_ENABLE: 1 HCCL_OP_RETRY_ENABLE: "L0:0, L1:0" + VLLM_NIXL_ABORT_REQUEST_TIMEOUT: 300000 disaggregated_prefill: enabled: true @@ -34,7 +35,7 @@ deployment: --enable-expert-parallel --seed 1024 --quantization ascend - --max-num-seqs 4 + --max-num-seqs 3 --max-model-len 32768 --max-num-batched-tokens 16384 --trust-remote-code @@ -71,12 +72,12 @@ deployment: --enable-expert-parallel --seed 1024 --quantization ascend - --max-num-seqs 4 + --max-num-seqs 8 --max-model-len 32768 --max-num-batched-tokens 256 --trust-remote-code --gpu-memory-utilization 0.85 - --compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}' + --compilation_config '{"cudagraph_capture_sizes":[4,8,16,32],"cudagraph_mode": 
"FULL_DECODE_ONLY"}' --enable-chunked-prefill --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' --kv-transfer-config @@ -103,6 +104,6 @@ benchmarks: request_conf: vllm_api_general_chat dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt max_out_len: 24576 - batch_size: 512 + batch_size: 16 baseline: 95 threshold: 5