### What this PR does / why we need it?

This PR renames the environment variable VLLM_NIXL_ABORT_REQUEST_TIMEOUT to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT to align with the Mooncake connector naming convention. It also updates the documentation and test configurations to reflect this change and adjusts the suggested timeout value in the documentation to 480 seconds for consistency.

### Does this PR introduce _any_ user-facing change?

Yes. The environment variable for configuring the abort request timeout has been renamed. Users should update their environment settings from VLLM_NIXL_ABORT_REQUEST_TIMEOUT to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT.

### How was this patch tested?

The changes were verified by updating the corresponding test configuration files and ensuring consistency across the documentation.

---------

Signed-off-by: herizhen <1270637059@qq.com>
Signed-off-by: herizhen <59841270+herizhen@users.noreply.github.com>
220 lines, 7.2 KiB, YAML
# Multi-node E2E test configuration: DeepSeek-V3.2-W8A8 with expert parallelism
# and disaggregated prefill. Four nodes total: deployment entries 1-2 are
# prefiller nodes (kv_producer), entries 3-4 are decoder nodes (kv_consumer).
test_name: "test DeepSeek-V3.2-W8A8-EP disaggregated_prefill"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
num_nodes: 4
npu_per_node: 16

# Environment variables shared by every node.
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: true
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_BUFFSIZE: 1024
  VLLM_TORCH_PROFILER_WITH_STACK: 0
  ASCEND_AGGREGATE_ENABLE: 1
  ASCEND_TRANSPORT_PRINT: 1
  ACL_OP_INIT_MODE: 1
  ASCEND_A3_ENABLE: 1
  # Mooncake connector abort timeout (seconds) — renamed from
  # VLLM_NIXL_ABORT_REQUEST_TIMEOUT in this PR.
  VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: 480
  VLLM_ENGINE_READY_TIMEOUT_S: 1800
  HCCL_CONNECT_TIMEOUT: 1200
  HCCL_INTRA_PCIE_ENABLE: 1
  HCCL_INTRA_ROCE_ENABLE: 0

# Which node indices act as prefillers vs. decoders.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0, 1]
  decoder_host_index: [2, 3]

deployment:
  # --- Prefiller node 0 (DP rank 0, kv_producer) ---
  - envs:
      VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 2
      --data-parallel-start-rank 0
      --data-parallel-size-local 1
      --data-parallel-address $LOCAL_IP
      --tensor-parallel-size 16
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-num-seqs 64
      --max-model-len 85000
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.82
      --enforce-eager
      --no-enable-prefix-caching
      --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'

  # --- Prefiller node 1 (headless, DP rank 1, kv_producer) ---
  - envs:
      VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --headless
      --port $SERVER_PORT
      --data-parallel-size 2
      --data-parallel-start-rank 1
      --data-parallel-size-local 1
      --data-parallel-address $MASTER_IP
      --tensor-parallel-size 16
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-num-seqs 64
      --max-model-len 85000
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.82
      --enforce-eager
      --no-enable-prefix-caching
      --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'

  # --- Decoder node 0 (DP ranks 0-3, kv_consumer) ---
  - envs:
      VLLM_ASCEND_ENABLE_MLAPO: 1
      TASK_QUEUE_ENABLE: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 8
      --data-parallel-size-local 4
      --data-parallel-start-rank 0
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-model-len 85000
      --max-num-batched-tokens 12
      --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
      --trust-remote-code
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --no-enable-prefix-caching
      --async-scheduling
      --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30200",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'

  # --- Decoder node 1 (headless, DP ranks 4-7, kv_consumer) ---
  - envs:
      VLLM_ASCEND_ENABLE_MLAPO: 1
      TASK_QUEUE_ENABLE: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --headless
      --port $SERVER_PORT
      --data-parallel-size 8
      --data-parallel-size-local 4
      --data-parallel-start-rank 4
      --data-parallel-address $MASTER_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-model-len 85000
      --max-num-batched-tokens 12
      --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
      --trust-remote-code
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --no-enable-prefix-caching
      --async-scheduling
      --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30200",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'

# Accuracy benchmark run against the deployed cluster.
benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/aime2025
    request_conf: vllm_api_general_chat
    dataset_conf: aime2025/aime2025_gen_0_shot_chat_prompt
    max_out_len: 72348
    batch_size: 32
    baseline: 66.67
    threshold: 7