[CI][Misc] modify ds3.2+dcp ci (#7841)

### What this PR does / why we need it?

Due to the current dcp solution of allgathering the KV cache, the
performance deteriorates significantly, and the CI may get stuck. This
PR temporarily removes the performance and accuracy benchmarks for
DeepSeek-V3.2-W8A8-cp to prevent CI hangs until optimization is
complete.

pcik-from:https://github.com/vllm-project/vllm-ascend/pull/7842

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Verified that the configuration file remains valid and that the CI no
longer attempts to run the problematic benchmarks.

pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842

---------

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
weiguihua2
2026-04-01 08:58:21 +08:00
committed by GitHub
parent ef9964389f
commit 59a7526339
2 changed files with 0 additions and 94 deletions

View File

@@ -104,9 +104,6 @@ jobs:
- name: multi-node-qwenw8a8-2node-longseq
config_file_path: Qwen3-235B-W8A8-longseq.yaml
size: 2
- name: multi-node-deepseek-V3_2-W8A8-cp
config_file_path: DeepSeek-V3_2-W8A8-cp.yaml
size: 2
- name: multi-node-qwen-disagg-pd
config_file_path: Qwen3-235B-disagg-pd.yaml
size: 2

View File

@@ -1,91 +0,0 @@
test_name: "test DeepSeek-V3.2-W8A8 for PCP&DCP"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
HCCL_OP_EXPANSION_MODE: "AIV"
VLLM_USE_MODELSCOPE: true
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
OMP_PROC_BIND: false
OMP_NUM_THREADS: 1
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
ASCEND_A3_EBA_ENABLE: 1
deployment:
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 4
--data-parallel-size-local 2
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13399
--tensor-parallel-size 8
--decode-context-parallel-size 8
--quantization ascend
--seed 1024
--enable-expert-parallel
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 4096
--no-enable-prefix-caching
--gpu-memory-utilization 0.85
--trust-remote-code
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
--tokenizer-mode deepseek_v32
--reasoning-parser deepseek_v3
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
--headless
--data-parallel-size 4
--data-parallel-rpc-port 13399
--data-parallel-size-local 2
--data-parallel-start-rank 2
--data-parallel-address $MASTER_IP
--tensor-parallel-size 8
--decode-context-parallel-size 8
--quantization ascend
--seed 1024
--enable-expert-parallel
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 4096
--no-enable-prefix-caching
--gpu-memory-utilization 0.85
--trust-remote-code
--speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
--compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
--additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
--tokenizer-mode deepseek_v32
--reasoning-parser deepseek_v3
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs2800
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 512
max_out_len: 3000
batch_size: 512
request_rate: 11.2
baseline: 1253.8466
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 4096
batch_size: 64
baseline: 95
threshold: 5