Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP-aime2025.yaml
herizhen ff76c6780e [releases/v0.18.0][Doc][Misc] Modifying Configuration Parameters (#8618)
### What this PR does / why we need it?
This PR renames the environment variable VLLM_NIXL_ABORT_REQUEST_TIMEOUT
to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT to align with the Mooncake
connector naming convention. It also updates the documentation and test
configurations to reflect this change and adjusts the suggested timeout
value in the documentation to 480 seconds for consistency.

### Does this PR introduce _any_ user-facing change?
Yes. The environment variable for configuring the abort request timeout
has been renamed. Users should update their environment settings from
VLLM_NIXL_ABORT_REQUEST_TIMEOUT to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT.

### How was this patch tested?
The changes were verified by updating the corresponding test
configuration files and ensuring consistency across the documentation.

---------

Signed-off-by: herizhen <1270637059@qq.com>
Signed-off-by: herizhen <59841270+herizhen@users.noreply.github.com>
2026-04-23 16:23:31 +08:00

220 lines
7.2 KiB
YAML

test_name: "test DeepSeek-V3.2-W8A8-EP disaggregated_prefill"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
# 4 nodes x 16 NPUs = 64 devices total; split 2 prefill / 2 decode below.
num_nodes: 4
npu_per_node: 16

# Environment variables exported on every node before launching the servers.
# NOTE(review): numeric/boolean values are left untyped as in the original;
# presumably the harness stringifies them when exporting — confirm before
# quoting (e.g. VLLM_USE_MODELSCOPE: "true").
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: true
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_BUFFSIZE: 1024
  VLLM_TORCH_PROFILER_WITH_STACK: 0
  ASCEND_AGGREGATE_ENABLE: 1
  ASCEND_TRANSPORT_PRINT: 1
  ACL_OP_INIT_MODE: 1
  ASCEND_A3_ENABLE: 1
  # Renamed from VLLM_NIXL_ABORT_REQUEST_TIMEOUT; 480 s per the docs update.
  VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: 480
  VLLM_ENGINE_READY_TIMEOUT_S: 1800
  HCCL_CONNECT_TIMEOUT: 1200
  HCCL_INTRA_PCIE_ENABLE: 1
  HCCL_INTRA_ROCE_ENABLE: 0
# Prefill/decode disaggregation: node indices into the num_nodes pool.
# Nodes 0-1 run the kv_producer (prefill) servers, nodes 2-3 the
# kv_consumer (decode) servers — matches the deployment entries below.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0, 1]
  decoder_host_index: [2, 3]
# One entry per node, in host-index order. The folded scalar (>) joins the
# command onto logical lines; the quoted JSON survives folding as a single
# shell argument.
deployment:
  # Node 0 — prefiller, DP coordinator (rank 0, local size 1, TP 16).
  -
    envs:
      VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 2
      --data-parallel-start-rank 0
      --data-parallel-size-local 1
      --data-parallel-address $LOCAL_IP
      --tensor-parallel-size 16
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-num-seqs 64
      --max-model-len 85000
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.82
      --enforce-eager
      --no-enable-prefix-caching
      --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
  # Node 1 — prefiller, headless worker (DP rank 1) joining the coordinator
  # at $MASTER_IP.
  -
    envs:
      VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --headless
      --port $SERVER_PORT
      --data-parallel-size 2
      --data-parallel-start-rank 1
      --data-parallel-size-local 1
      --data-parallel-address $MASTER_IP
      --tensor-parallel-size 16
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-num-seqs 64
      --max-model-len 85000
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.82
      --enforce-eager
      --no-enable-prefix-caching
      --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
  # Node 2 — decoder, DP coordinator (ranks 0-3 local, DP 8 x TP 4) with
  # full-decode-only graph capture and async scheduling.
  -
    envs:
      VLLM_ASCEND_ENABLE_MLAPO: 1
      TASK_QUEUE_ENABLE: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 8
      --data-parallel-size-local 4
      --data-parallel-start-rank 0
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-model-len 85000
      --max-num-batched-tokens 12
      --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
      --trust-remote-code
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --no-enable-prefix-caching
      --async-scheduling
      --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30200",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
  # Node 3 — decoder, headless worker (DP ranks 4-7) joining $MASTER_IP.
  -
    envs:
      VLLM_ASCEND_ENABLE_MLAPO: 1
      TASK_QUEUE_ENABLE: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --headless
      --port $SERVER_PORT
      --data-parallel-size 8
      --data-parallel-size-local 4
      --data-parallel-start-rank 4
      --data-parallel-address $MASTER_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-model-len 85000
      --max-num-batched-tokens 12
      --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
      --trust-remote-code
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --no-enable-prefix-caching
      --async-scheduling
      --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30200",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
# Accuracy benchmark on AIME 2025 (0-shot chat prompting).
benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/aime2025
    request_conf: vllm_api_general_chat
    dataset_conf: aime2025/aime2025_gen_0_shot_chat_prompt
    max_out_len: 72348
    batch_size: 32
    # Expected score with allowed deviation; presumably percentage points —
    # TODO confirm units against the benchmark harness.
    baseline: 66.67
    threshold: 7