[CI][Misc] modify ds3.2+dcp ci (#7841)
### What this PR does / why we need it?
Because the current DCP solution all-gathers the KV cache, performance deteriorates significantly and the CI can get stuck. This PR temporarily removes the performance and accuracy benchmarks for DeepSeek-V3.2-W8A8-cp to prevent CI hangs until the optimization is complete.

pick-from: https://github.com/vllm-project/vllm-ascend/pull/7842

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Verified that the configuration file remains valid and that the CI no longer attempts to run the problematic benchmarks.

---------

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
@@ -104,9 +104,6 @@ jobs:
       - name: multi-node-qwenw8a8-2node-longseq
         config_file_path: Qwen3-235B-W8A8-longseq.yaml
         size: 2
-      - name: multi-node-deepseek-V3_2-W8A8-cp
-        config_file_path: DeepSeek-V3_2-W8A8-cp.yaml
-        size: 2
       - name: multi-node-qwen-disagg-pd
         config_file_path: Qwen3-235B-disagg-pd.yaml
         size: 2
@@ -1,91 +0,0 @@
test_name: "test DeepSeek-V3.2-W8A8 for PCP&DCP"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  ASCEND_A3_EBA_ENABLE: 1
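For reference, a minimal sketch of how the same environment could be set up when launching one of the server commands below by hand. The variable names and values are taken directly from `env_common`; how the CI harness actually injects them into the process environment is an assumption.

```bash
# Hypothetical manual setup mirroring env_common; the injection mechanism
# used by the CI harness is an assumption, only the values come from the config.
export HCCL_OP_EXPANSION_MODE=AIV
export VLLM_USE_MODELSCOPE=true
export HCCL_BUFFSIZE=1024
export SERVER_PORT=8080
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=1
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export ASCEND_A3_EBA_ENABLE=1
```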
deployment:
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13399
      --tensor-parallel-size 8
      --decode-context-parallel-size 8
      --quantization ascend
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.85
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
  -
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --headless
      --data-parallel-size 4
      --data-parallel-rpc-port 13399
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --tensor-parallel-size 8
      --decode-context-parallel-size 8
      --quantization ascend
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.85
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
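Once both ranks are up, the head node serves vLLM's OpenAI-compatible API on `$SERVER_PORT` (8080 in this config). A minimal smoke-test sketch; the `127.0.0.1` host assumes the request is issued on the head node itself, and the prompt is purely illustrative:

```bash
# Quick smoke test against the OpenAI-compatible chat endpoint started above.
# Host/IP is an assumption; the model name is the path passed to `vllm serve`.
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "vllm-ascend/DeepSeek-V3.2-W8A8",
        "messages": [{"role": "user", "content": "What is 12 * 7?"}],
        "max_tokens": 64
      }'
```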
benchmarks:
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 3000
    batch_size: 512
    request_rate: 11.2
    baseline: 1253.8466
    threshold: 0.97
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 64
    baseline: 95
    threshold: 5
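A plausible reading of the `baseline`/`threshold` pairs, stated here only as an assumption about the benchmark harness: the perf case passes when the measured metric (treated as higher-is-better) reaches at least baseline × threshold (≈ 1216 in the same units as the baseline), and the acc case passes when the GSM8K score stays within ±5 points of 95. A sketch of such a gate:

```bash
# Hypothetical gate logic; the actual comparison performed by the benchmark
# harness is an assumption based on the baseline/threshold fields above.
# Usage: ./check_gates.sh <measured_perf> <measured_acc>
perf_baseline=1253.8466
perf_threshold=0.97
acc_baseline=95
acc_threshold=5

measured_perf=$1   # metric reported by the perf run (assumed higher-is-better)
measured_acc=$2    # GSM8K score reported by the acc run

awk -v m="$measured_perf" -v b="$perf_baseline" -v t="$perf_threshold" \
  'BEGIN { exit !(m >= b * t) }' || { echo "perf gate failed"; exit 1; }
awk -v m="$measured_acc" -v b="$acc_baseline" -v t="$acc_threshold" \
  'BEGIN { d = m - b; if (d < 0) d = -d; exit !(d <= t) }' || { echo "acc gate failed"; exit 1; }
echo "both gates passed"
```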