From 59a75263396ad0889aa2c85c25438a18cb16640f Mon Sep 17 00:00:00 2001 From: weiguihua2 Date: Wed, 1 Apr 2026 08:58:21 +0800 Subject: [PATCH] [CI][Misc] modify ds3.2+dcp ci (#7841) ### What this PR does / why we need it? Due to the current dcp solution of allgathering the KV cache, the performance deteriorates significantly, and the CI may get stuck. This PR temporarily removes the performance and accuracy benchmarks for DeepSeek-V3.2-W8A8-cp to prevent CI hangs until optimization is complete. pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Verified that the configuration file remains valid and that the CI no longer attempts to run the problematic benchmarks. pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842 --------- Signed-off-by: weiguihua2 --- .../workflows/schedule_nightly_test_a3.yaml | 3 - .../config/DeepSeek-V3_2-W8A8-cp.yaml | 91 ------------------- 2 files changed, 94 deletions(-) delete mode 100644 tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml index 79b19805..66a5818e 100644 --- a/.github/workflows/schedule_nightly_test_a3.yaml +++ b/.github/workflows/schedule_nightly_test_a3.yaml @@ -104,9 +104,6 @@ jobs: - name: multi-node-qwenw8a8-2node-longseq config_file_path: Qwen3-235B-W8A8-longseq.yaml size: 2 - - name: multi-node-deepseek-V3_2-W8A8-cp - config_file_path: DeepSeek-V3_2-W8A8-cp.yaml - size: 2 - name: multi-node-qwen-disagg-pd config_file_path: Qwen3-235B-disagg-pd.yaml size: 2 diff --git a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml b/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml deleted file mode 100644 index 77978d4a..00000000 --- a/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml +++ /dev/null @@ -1,91 +0,0 @@ -test_name: "test DeepSeek-V3.2-W8A8 for 
PCP&DCP" -model: "vllm-ascend/DeepSeek-V3.2-W8A8" -num_nodes: 2 -npu_per_node: 16 -env_common: - HCCL_OP_EXPANSION_MODE: "AIV" - - VLLM_USE_MODELSCOPE: true - HCCL_BUFFSIZE: 1024 - SERVER_PORT: 8080 - OMP_PROC_BIND: false - OMP_NUM_THREADS: 1 - PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" - VLLM_ASCEND_ENABLE_FLASHCOMM1: 1 - ASCEND_A3_EBA_ENABLE: 1 - - -deployment: - - - server_cmd: > - vllm serve vllm-ascend/DeepSeek-V3.2-W8A8 - --host 0.0.0.0 - --port $SERVER_PORT - --data-parallel-size 4 - --data-parallel-size-local 2 - --data-parallel-address $LOCAL_IP - --data-parallel-rpc-port 13399 - --tensor-parallel-size 8 - --decode-context-parallel-size 8 - --quantization ascend - --seed 1024 - --enable-expert-parallel - --max-num-seqs 16 - --max-model-len 8192 - --max-num-batched-tokens 4096 - --no-enable-prefix-caching - --gpu-memory-utilization 0.85 - --trust-remote-code - --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}' - --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}' - --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' - --tokenizer-mode deepseek_v32 - --reasoning-parser deepseek_v3 - - - - server_cmd: > - vllm serve vllm-ascend/DeepSeek-V3.2-W8A8 - --headless - --data-parallel-size 4 - --data-parallel-rpc-port 13399 - --data-parallel-size-local 2 - --data-parallel-start-rank 2 - --data-parallel-address $MASTER_IP - --tensor-parallel-size 8 - --decode-context-parallel-size 8 - --quantization ascend - --seed 1024 - --enable-expert-parallel - --max-num-seqs 16 - --max-model-len 8192 - --max-num-batched-tokens 4096 - --no-enable-prefix-caching - --gpu-memory-utilization 0.85 - --trust-remote-code - --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}' - --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": 
"FULL_DECODE_ONLY"}' - --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}' - --tokenizer-mode deepseek_v32 - --reasoning-parser deepseek_v3 -benchmarks: - perf: - case_type: performance - dataset_path: vllm-ascend/GSM8K-in3500-bs2800 - request_conf: vllm_api_stream_chat - dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf - num_prompts: 512 - max_out_len: 3000 - batch_size: 512 - request_rate: 11.2 - baseline: 1253.8466 - threshold: 0.97 - - acc: - case_type: accuracy - dataset_path: vllm-ascend/gsm8k-lite - request_conf: vllm_api_general_chat - dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt - max_out_len: 4096 - batch_size: 64 - baseline: 95 - threshold: 5