[CI][Misc] modify ds3.2+dcp ci (#7841)

### What this PR does / why we need it? The current DCP solution all-gathers the KV cache, which degrades performance significantly and can cause the CI to hang. This PR temporarily removes the DeepSeek-V3.2-W8A8-cp nightly test (its performance and accuracy benchmarks) to prevent CI hangs until the optimization is complete. pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Verified that the configuration file remains valid and that the CI no longer attempts to run the problematic benchmarks. pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842 --------- Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2026-04-01 08:58:21 +08:00
parent ef9964389f
commit 59a7526339
2 changed files with 0 additions and 94 deletions
--- a/.github/workflows/schedule_nightly_test_a3.yaml
+++ b/.github/workflows/schedule_nightly_test_a3.yaml
@@ -104,9 +104,6 @@ jobs:
          - name: multi-node-qwenw8a8-2node-longseq
            config_file_path: Qwen3-235B-W8A8-longseq.yaml
            size: 2
-          - name: multi-node-deepseek-V3_2-W8A8-cp
-            config_file_path: DeepSeek-V3_2-W8A8-cp.yaml
-            size: 2
          - name: multi-node-qwen-disagg-pd
            config_file_path: Qwen3-235B-disagg-pd.yaml
            size: 2
--- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml
+++ b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml
@@ -1,91 +0,0 @@
-test_name: "test DeepSeek-V3.2-W8A8 for PCP&DCP"
-model: "vllm-ascend/DeepSeek-V3.2-W8A8"
-num_nodes: 2
-npu_per_node: 16
-env_common:
-  HCCL_OP_EXPANSION_MODE: "AIV"
-
-  VLLM_USE_MODELSCOPE: true
-  HCCL_BUFFSIZE: 1024
-  SERVER_PORT: 8080
-  OMP_PROC_BIND: false
-  OMP_NUM_THREADS: 1
-  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
-  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
-  ASCEND_A3_EBA_ENABLE: 1
-
-
-deployment:
-  -
-    server_cmd: >
-      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
-      --host 0.0.0.0
-      --port $SERVER_PORT
-      --data-parallel-size 4
-      --data-parallel-size-local 2
-      --data-parallel-address $LOCAL_IP
-      --data-parallel-rpc-port 13399
-      --tensor-parallel-size 8
-      --decode-context-parallel-size 8
-      --quantization ascend
-      --seed 1024
-      --enable-expert-parallel
-      --max-num-seqs 16
-      --max-model-len 8192
-      --max-num-batched-tokens 4096
-      --no-enable-prefix-caching
-      --gpu-memory-utilization 0.85
-      --trust-remote-code
-      --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
-      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' 
-      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
-      --tokenizer-mode deepseek_v32
-      --reasoning-parser deepseek_v3
-
-  -
-    server_cmd: >
-      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
-      --headless
-      --data-parallel-size 4
-      --data-parallel-rpc-port 13399
-      --data-parallel-size-local 2
-      --data-parallel-start-rank 2
-      --data-parallel-address $MASTER_IP
-      --tensor-parallel-size 8
-      --decode-context-parallel-size 8
-      --quantization ascend
-      --seed 1024
-      --enable-expert-parallel
-      --max-num-seqs 16
-      --max-model-len 8192
-      --max-num-batched-tokens 4096
-      --no-enable-prefix-caching
-      --gpu-memory-utilization 0.85
-      --trust-remote-code
-      --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
-      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' 
-      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
-      --tokenizer-mode deepseek_v32
-      --reasoning-parser deepseek_v3
-benchmarks:
-  perf:
-    case_type: performance
-    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
-    request_conf: vllm_api_stream_chat
-    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
-    num_prompts: 512
-    max_out_len: 3000
-    batch_size: 512
-    request_rate: 11.2
-    baseline: 1253.8466 
-    threshold: 0.97
-    
-  acc:
-    case_type: accuracy
-    dataset_path: vllm-ascend/gsm8k-lite
-    request_conf: vllm_api_general_chat
-    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
-    max_out_len: 4096
-    batch_size: 64
-    baseline: 95
-    threshold: 5