From 59a75263396ad0889aa2c85c25438a18cb16640f Mon Sep 17 00:00:00 2001
From: weiguihua2 <weiguihua2@huawei.com>
Date: Wed, 1 Apr 2026 08:58:21 +0800
Subject: [PATCH] [CI][Misc] modify ds3.2+dcp ci (#7841)

### What this PR does / why we need it?

The current decode context parallel (DCP) implementation all-gathers the
KV cache, which degrades performance significantly and can cause the CI
to hang. This PR temporarily removes the DeepSeek-V3.2-W8A8-cp nightly
multi-node test (its workflow matrix entry and its config file,
including the performance and accuracy benchmarks) to prevent CI hangs
until the optimization is complete.

pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Verified that the configuration file remains valid and that the CI no
longer attempts to run the problematic benchmarks.

pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842

---------

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
---
 .../workflows/schedule_nightly_test_a3.yaml   |  3 -
 .../config/DeepSeek-V3_2-W8A8-cp.yaml         | 91 -------------------
 2 files changed, 94 deletions(-)
 delete mode 100644 tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml

diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml
index 79b19805..66a5818e 100644
--- a/.github/workflows/schedule_nightly_test_a3.yaml
+++ b/.github/workflows/schedule_nightly_test_a3.yaml
@@ -104,9 +104,6 @@ jobs:
           - name: multi-node-qwenw8a8-2node-longseq
             config_file_path: Qwen3-235B-W8A8-longseq.yaml
             size: 2
-          - name: multi-node-deepseek-V3_2-W8A8-cp
-            config_file_path: DeepSeek-V3_2-W8A8-cp.yaml
-            size: 2
           - name: multi-node-qwen-disagg-pd
             config_file_path: Qwen3-235B-disagg-pd.yaml
             size: 2
diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml
deleted file mode 100644
index 77978d4a..00000000
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml
+++ /dev/null
@@ -1,91 +0,0 @@
-test_name: "test DeepSeek-V3.2-W8A8 for PCP&DCP"
-model: "vllm-ascend/DeepSeek-V3.2-W8A8"
-num_nodes: 2
-npu_per_node: 16
-env_common:
-  HCCL_OP_EXPANSION_MODE: "AIV"
-
-  VLLM_USE_MODELSCOPE: true
-  HCCL_BUFFSIZE: 1024
-  SERVER_PORT: 8080
-  OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 1
-  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
-  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
-  ASCEND_A3_EBA_ENABLE: 1
-
-
-deployment:
-  -
-    server_cmd: >
-      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
-      --host 0.0.0.0
-      --port $SERVER_PORT
-      --data-parallel-size 4
-      --data-parallel-size-local 2
-      --data-parallel-address $LOCAL_IP
-      --data-parallel-rpc-port 13399
-      --tensor-parallel-size 8
-      --decode-context-parallel-size 8
-      --quantization ascend
-      --seed 1024
-      --enable-expert-parallel
-      --max-num-seqs 16
-      --max-model-len 8192
-      --max-num-batched-tokens 4096
-      --no-enable-prefix-caching
-      --gpu-memory-utilization 0.85
-      --trust-remote-code
-      --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
-      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' 
-      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
-      --tokenizer-mode deepseek_v32
-      --reasoning-parser deepseek_v3
-
-  -
-    server_cmd: >
-      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
-      --headless
-      --data-parallel-size 4
-      --data-parallel-rpc-port 13399
-      --data-parallel-size-local 2
-      --data-parallel-start-rank 2
-      --data-parallel-address $MASTER_IP
-      --tensor-parallel-size 8
-      --decode-context-parallel-size 8
-      --quantization ascend
-      --seed 1024
-      --enable-expert-parallel
-      --max-num-seqs 16
-      --max-model-len 8192
-      --max-num-batched-tokens 4096
-      --no-enable-prefix-caching
-      --gpu-memory-utilization 0.85
-      --trust-remote-code
-      --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
-      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' 
-      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
-      --tokenizer-mode deepseek_v32
-      --reasoning-parser deepseek_v3
-benchmarks:
-  perf:
-    case_type: performance
-    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
-    request_conf: vllm_api_stream_chat
-    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
-    num_prompts: 512
-    max_out_len: 3000
-    batch_size: 512
-    request_rate: 11.2
-    baseline: 1253.8466 
-    threshold: 0.97
-    
-  acc:
-    case_type: accuracy
-    dataset_path: vllm-ascend/gsm8k-lite
-    request_conf: vllm_api_general_chat
-    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
-    max_out_len: 4096
-    batch_size: 64
-    baseline: 95
-    threshold: 5