Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml
zhangxinyuehfad 566c367a10 [CI] Add DeepSeek-V3.2 large EP nightly ci (#6378)
### What this PR does / why we need it?

Add DeepSeek-V3.2 nightly ci

Fix PD routing to exclude headless nodes when collecting
prefiller/decoder IPs

- vLLM version: v0.14.1
- vLLM main: dc917cceb8

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2026-03-04 16:15:56 +08:00

234 lines
7.7 KiB
YAML

# Nightly multi-node case: DeepSeek-V3.2 W8A8 served with expert parallelism
# and disaggregated prefill.
test_name: 'test DeepSeek-V3.2-W8A8-EP disaggregated_prefill'

# Same model identifier that every `vllm serve` command in `deployment` uses.
model: 'vllm-ascend/DeepSeek-V3.2-W8A8'

# Matches the four entries listed under `deployment`.
num_nodes: 4

# In every server_cmd, tensor-parallel size x local data-parallel size is 16
# (16 x 1 on the prefill entries, 4 x 4 on the decode entries).
npu_per_node: 16
# Environment variables declared once for the whole test; each `deployment`
# entry additionally declares its own `envs` (presumably merged on top of
# these by the test harness — confirm).
# NOTE(review): `true` / `false` below are YAML booleans, not strings; the
# text that ends up in the process environment depends on how the harness
# stringifies them — confirm before quoting or changing them.
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: true
  # Referenced as $SERVER_PORT by every server_cmd in `deployment`.
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_BUFFSIZE: 1024
  VLLM_TORCH_PROFILER_WITH_STACK: 0
  ASCEND_AGGREGATE_ENABLE: 1
  ASCEND_TRANSPORT_PRINT: 1
  ACL_OP_INIT_MODE: 1
  ASCEND_A3_ENABLE: 1
  # Timeouts (units are defined by the consuming component — TODO confirm).
  VLLM_NIXL_ABORT_REQUEST_TIMEOUT: 300000
  VLLM_ENGINE_READY_TIMEOUT_S: 1800
  HCCL_CONNECT_TIMEOUT: 1200
  # Intra-node transport selection: PCIe switched on, RoCE switched off.
  HCCL_INTRA_PCIE_ENABLE: 1
  HCCL_INTRA_ROCE_ENABLE: 0
# Prefill/decode split. The indices presumably select entries of the
# `deployment` list by position (confirm against the harness): entries 0-1
# are launched with kv_role "kv_producer", entries 2-3 with "kv_consumer".
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0, 1]
  decoder_host_index: [2, 3]
# Per-node launch settings: one sequence item per server process, each with
# its own extra environment (`envs`) and launch command (`server_cmd`).
deployment:
# Entry 0 — prefill side (kv_role "kv_producer"), data-parallel rank 0 of 2,
# tensor-parallel size 16. It has no --headless flag and uses its own
# $LOCAL_IP as the data-parallel address.
- envs:
    VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  # Folded scalar (`>`): the equally indented lines below are joined with
  # single spaces into one command line, so the multi-line JSON arguments end
  # up inside their shell single quotes on that one line.
  server_cmd: >
    vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 2
    --data-parallel-start-rank 0
    --data-parallel-size-local 1
    --data-parallel-address $LOCAL_IP
    --tensor-parallel-size 16
    --enable-expert-parallel
    --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
    --seed 1024
    --quantization ascend
    --max-num-seqs 64
    --max-model-len 68000
    --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.82
    --enforce-eager
    --no-enable-prefix-caching
    --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
    --tokenizer-mode deepseek_v32
    --reasoning-parser deepseek_v3
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_producer",
    "kv_port": "30000",
    "engine_id": "0",
    "kv_connector_extra_config": {
    "use_ascend_direct": true,
    "prefill": {
    "dp_size": 2,
    "tp_size": 16
    },
    "decode": {
    "dp_size": 8,
    "tp_size": 4
    }
    }
    }'
# Entry 1 — prefill side (kv_role "kv_producer"), data-parallel rank 1 of 2,
# tensor-parallel size 16. Launched with --headless and pointed at $MASTER_IP
# as the data-parallel address; it shares engine_id "0" and kv_port "30000"
# with the other prefill entry.
- envs:
    VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  # Folded scalar (`>`): the equally indented lines below are joined with
  # single spaces into one command line.
  server_cmd: >
    vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
    --host 0.0.0.0
    --headless
    --port $SERVER_PORT
    --data-parallel-size 2
    --data-parallel-start-rank 1
    --data-parallel-size-local 1
    --data-parallel-address $MASTER_IP
    --tensor-parallel-size 16
    --enable-expert-parallel
    --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
    --seed 1024
    --quantization ascend
    --max-num-seqs 64
    --max-model-len 68000
    --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.82
    --enforce-eager
    --no-enable-prefix-caching
    --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
    --tokenizer-mode deepseek_v32
    --reasoning-parser deepseek_v3
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_producer",
    "kv_port": "30000",
    "engine_id": "0",
    "kv_connector_extra_config": {
    "use_ascend_direct": true,
    "prefill": {
    "dp_size": 2,
    "tp_size": 16
    },
    "decode": {
    "dp_size": 8,
    "tp_size": 4
    }
    }
    }'
# Entry 2 — decode side (kv_role "kv_consumer"), 4 local data-parallel ranks
# starting at rank 0 of 8, tensor-parallel size 4. It has no --headless flag
# and uses its own $LOCAL_IP as the data-parallel address.
# Sizing: --max-num-seqs 4 with 2 speculative tokens gives 4 x (1 + 2) = 12,
# which equals --max-num-batched-tokens and the largest graph capture size;
# the capture sizes are the multiples of 3 up to 12 (presumably one size per
# possible batch of 1-4 sequences — confirm).
- envs:
    VLLM_ASCEND_ENABLE_MLAPO: 1
    TASK_QUEUE_ENABLE: 1
  # Folded scalar (`>`): the equally indented lines below are joined with
  # single spaces into one command line.
  server_cmd: >
    vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 8
    --data-parallel-size-local 4
    --data-parallel-start-rank 0
    --data-parallel-address $LOCAL_IP
    --data-parallel-rpc-port 13389
    --tensor-parallel-size 4
    --enable-expert-parallel
    --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
    --seed 1024
    --quantization ascend
    --max-model-len 68000
    --max-num-batched-tokens 12
    --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
    --trust-remote-code
    --max-num-seqs 4
    --gpu-memory-utilization 0.95
    --no-enable-prefix-caching
    --async-scheduling
    --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
    --tokenizer-mode deepseek_v32
    --reasoning-parser deepseek_v3
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_consumer",
    "kv_port": "30200",
    "engine_id": "1",
    "kv_connector_extra_config": {
    "use_ascend_direct": true,
    "prefill": {
    "dp_size": 2,
    "tp_size": 16
    },
    "decode": {
    "dp_size": 8,
    "tp_size": 4
    }
    }
    }'
# Entry 3 — decode side (kv_role "kv_consumer"), 4 local data-parallel ranks
# starting at rank 4 of 8, tensor-parallel size 4. Launched with --headless
# and pointed at $MASTER_IP as the data-parallel address; it uses the same
# data-parallel RPC port (13389), engine_id "1" and kv_port "30200" as the
# other decode entry.
- envs:
    VLLM_ASCEND_ENABLE_MLAPO: 1
    TASK_QUEUE_ENABLE: 1
  # Folded scalar (`>`): the equally indented lines below are joined with
  # single spaces into one command line.
  server_cmd: >
    vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
    --host 0.0.0.0
    --headless
    --port $SERVER_PORT
    --data-parallel-size 8
    --data-parallel-size-local 4
    --data-parallel-start-rank 4
    --data-parallel-address $MASTER_IP
    --data-parallel-rpc-port 13389
    --tensor-parallel-size 4
    --enable-expert-parallel
    --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
    --seed 1024
    --quantization ascend
    --max-model-len 68000
    --max-num-batched-tokens 12
    --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
    --trust-remote-code
    --max-num-seqs 4
    --gpu-memory-utilization 0.95
    --no-enable-prefix-caching
    --async-scheduling
    --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
    --tokenizer-mode deepseek_v32
    --reasoning-parser deepseek_v3
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_consumer",
    "kv_port": "30200",
    "engine_id": "1",
    "kv_connector_extra_config": {
    "use_ascend_direct": true,
    "prefill": {
    "dp_size": 2,
    "tp_size": 16
    },
    "decode": {
    "dp_size": 8,
    "tp_size": 4
    }
    }
    }'
# Benchmark cases for this deployment. `perf` and `acc` are separate
# mappings so that their identically named keys (case_type, dataset_path,
# baseline, threshold, ...) do not collide.
# NOTE(review): the units of `baseline` and the meaning of `threshold` are
# defined by the benchmark harness; 0.97 looks like a fraction of the
# baseline and 5 like an absolute tolerance — confirm.
benchmarks:
  # Performance case: 512 prompts sent through the streaming chat request
  # configuration at a request rate of 11.2.
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 1500
    batch_size: 512
    request_rate: 11.2
    baseline: 1146
    threshold: 0.97
  # Accuracy case: gsm8k-lite through the general chat request configuration.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 64
    baseline: 95
    threshold: 5