# PR description (kept for context):
# This PR adds DCP support to the SFA backend. Due to operator constraints, the
# current implementation has to all-gather the entire KV cache and modify the
# block table to satisfy the operator input requirements. This results in
# significantly increased communication overhead and peak memory usage, so it
# is only a temporary workaround and will be refactored once the operator
# provides proper support. Because of the above limitation,
# `cp_kv_cache_interleave_size` is currently required to equal `block_size`;
# this restriction will also be removed after the refactor.
#
# Accuracy test using DeepSeek-V3.2-Exp-W8A8 with dp2tp8dcp8:
#   | dataset      | metric   | mode | vllm-api-general-stream |
#   | gsm8kdataset | accuracy | gen  | 96.35                   |
#
# vLLM version: v0.15.0
# vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0
#
# Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
---
# Multi-node e2e test: DeepSeek-V3.2-W8A8 with tensor parallel, expert
# parallel, and decode-context parallel (PCP & DCP) across 2 nodes x 16 NPUs.
test_name: "test DeepSeek-V3.2-W8A8 for PCP&DCP"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"

num_nodes: 2
npu_per_node: 16

# Environment variables applied to every node.
# NOTE(review): boolean/integer values here are presumably stringified by the
# test harness before being exported as env vars — confirm against the runner.
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  ASCEND_A3_EBA_ENABLE: 1

# One entry per node; each `server_cmd` is a folded scalar so it runs as a
# single shell command line.
deployment:
  # Head node: serves the API and hosts local DP ranks 0-1.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13399
      --tensor-parallel-size 8
      --decode-context-parallel-size 8
      --quantization ascend
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.85
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3

  # Worker node: headless, hosts local DP ranks starting at rank 2 and
  # connects back to the head node at $MASTER_IP.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --headless
      --data-parallel-size 4
      --data-parallel-rpc-port 13399
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --tensor-parallel-size 8
      --decode-context-parallel-size 8
      --quantization ascend
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.85
      --trust-remote-code
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --compilation-config '{"cudagraph_capture_sizes": [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48], "cudagraph_mode": "FULL_DECODE_ONLY"}'
      --additional-config '{"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3

benchmarks:
  # Performance run against the streaming chat endpoint; `baseline` is the
  # reference throughput figure and `threshold` the allowed fraction of it.
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 3000
    batch_size: 512
    request_rate: 11.2
    baseline: 1253.8466
    threshold: 0.97

  # Accuracy run on gsm8k-lite; presumably passes when the score is within
  # `threshold` points of `baseline` — confirm semantics with the runner.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 64
    baseline: 95
    threshold: 5