Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-cp.yaml
Qiu cb7c419bc0 [Feat](sfa,dcp) support dcp for sfa (#6563)
### What this PR does / why we need it?
This PR adds DCP (decode context parallel) support to the SFA backend.

Please note that, due to operator constraints, the current implementation
has to all-gather the entire KV cache and modify the block table to
satisfy the operator's input requirements. This significantly increases
communication overhead and peak memory usage, so the current approach is
only a temporary workaround; it will be refactored once the operator
provides proper support.
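
For intuition, a minimal sketch of what this workaround amounts to is shown below. It is **not** the PR's actual code: the function, the tensor names, and the round-robin block layout are illustrative assumptions.

```python
# Illustrative sketch only -- NOT the actual vllm-ascend implementation.
# Assumes torch.distributed is initialized, `dcp_group` is the decode-context-
# parallel process group, and blocks are distributed round-robin across DCP
# ranks (a simplifying assumption); all names here are hypothetical.
import torch
import torch.distributed as dist


def gather_kv_for_sfa(kv_cache: torch.Tensor,
                      block_table: torch.Tensor,
                      dcp_group: dist.ProcessGroup):
    """All-gather every rank's KV cache and remap the block table so the
    attention operator sees one contiguous cache."""
    world_size = dist.get_world_size(dcp_group)

    # 1. All-gather the *entire* KV cache -- the source of the extra
    #    communication and peak-memory cost noted above.
    gathered = [torch.empty_like(kv_cache) for _ in range(world_size)]
    dist.all_gather(gathered, kv_cache, group=dcp_group)
    full_kv = torch.cat(gathered, dim=0)  # all ranks' blocks, concatenated

    # 2. Remap global block IDs into the concatenated cache. Under the
    #    assumed round-robin layout, global block g lives at slot
    #    (g % world_size) * num_local_blocks + (g // world_size).
    num_local_blocks = kv_cache.shape[0]
    owner = block_table % world_size
    local = block_table // world_size
    remapped_table = owner * num_local_blocks + local
    return full_kv, remapped_table
```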

Additionally, because of the above limitations,
`cp_kv_cache_interleave_size` is currently required to be equal to
`block_size`. This restriction will also be removed after the refactor.
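
The implied configuration check, as a hedged sketch (the validation function itself is hypothetical; the parameter names come from the text above):

```python
# Hypothetical validation mirroring the restriction described above.
def validate_sfa_dcp_config(cp_kv_cache_interleave_size: int,
                            block_size: int) -> None:
    if cp_kv_cache_interleave_size != block_size:
        raise ValueError(
            "SFA with DCP currently requires cp_kv_cache_interleave_size "
            f"({cp_kv_cache_interleave_size}) to equal block_size "
            f"({block_size}); this restriction will be lifted by the "
            "planned refactor."
        )
```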

#### Test
Accuracy test using DeepSeek-V3.2-Exp-W8A8 with dp2tp8dcp8:

| dataset | version | metric | mode | vllm-api-general-stream |
| ------- | ------- | ------ | ---- | ----------------------- |
| gsm8kdataset | - | accuracy | gen | 96.35 |

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

---------

Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
2026-02-09 18:52:25 +08:00


test_name: "test DeepSeek-V3.2-W8A8 for PCP&DCP"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  ASCEND_A3_EBA_ENABLE: 1
deployment:
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13399
      --tensor-parallel-size 8
      --decode-context-parallel-size 8
      --quantization ascend
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.85
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --headless
      --data-parallel-size 4
      --data-parallel-rpc-port 13399
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --tensor-parallel-size 8
      --decode-context-parallel-size 8
      --quantization ascend
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.85
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
benchmarks:
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 3000
    batch_size: 512
    request_rate: 11.2
    baseline: 1253.8466
    threshold: 0.97
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 64
    baseline: 95
    threshold: 5
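
For completeness, a minimal smoke test against the endpoint this config brings up might look like the sketch below. It assumes the head node is reachable at localhost on SERVER_PORT (8080, per `env_common`) and uses the OpenAI-compatible API that `vllm serve` exposes:

```python
# Hypothetical smoke test for the deployment above; adjust host/port to match
# where the head node actually runs (SERVER_PORT is 8080 in env_common).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="vllm-ascend/DeepSeek-V3.2-W8A8",
    messages=[{"role": "user",
               "content": "A train travels 60 km in 1.5 hours. "
                          "What is its average speed in km/h?"}],
    max_tokens=256,
    temperature=0.0,
)
print(resp.choices[0].message.content)
```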