Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml
herizhen ff76c6780e [releases/v0.18.0][Doc][Misc] Modifying Configuration Parameters (#8618)
### What this PR does / why we need it?
This PR renames the environment variable VLLM_NIXL_ABORT_REQUEST_TIMEOUT
to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT to align with the Mooncake
connector naming convention. It also updates the documentation and test
configurations to reflect this change and adjusts the suggested timeout
value in the documentation to 480 seconds for consistency.

### Does this PR introduce _any_ user-facing change?
Yes. The environment variable for configuring the abort request timeout
has been renamed. Users should update their environment settings from
VLLM_NIXL_ABORT_REQUEST_TIMEOUT to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT.

### How was this patch tested?
The changes were verified by updating the corresponding test
configuration files and ensuring consistency across the documentation.

---------

Signed-off-by: herizhen <1270637059@qq.com>
Signed-off-by: herizhen <59841270+herizhen@users.noreply.github.com>
2026-04-23 16:23:31 +08:00

230 lines
7.6 KiB
YAML

# Multi-node e2e test: DeepSeek-V3.2 W8A8 with expert parallelism, served in
# disaggregated-prefill mode across 4 nodes (2 prefiller + 2 decoder, 16 NPUs each).
test_name: "test DeepSeek-V3.2-W8A8-EP disaggregated_prefill"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
num_nodes: 4
npu_per_node: 16
# Environment variables exported on every node before launching the servers.
# Boolean-ish values are quoted lowercase strings on purpose: a bare `true`
# parses as a YAML bool and commonly stringifies to "True" on export, which
# not every consumer accepts — quoting pins the exact text the process sees.
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: "true"
  SERVER_PORT: 8080
  OMP_PROC_BIND: "false"
  OMP_NUM_THREADS: 10
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_BUFFSIZE: 1024
  VLLM_TORCH_PROFILER_WITH_STACK: 0
  ASCEND_AGGREGATE_ENABLE: 1
  ASCEND_TRANSPORT_PRINT: 1
  ACL_OP_INIT_MODE: 1
  ASCEND_A3_ENABLE: 1
  # Abort timeout (seconds) for the Mooncake KV connector; renamed from
  # VLLM_NIXL_ABORT_REQUEST_TIMEOUT (see PR header above this config).
  VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: 480
  VLLM_ENGINE_READY_TIMEOUT_S: 1800
  HCCL_CONNECT_TIMEOUT: 1200
  HCCL_INTRA_PCIE_ENABLE: 1
  HCCL_INTRA_ROCE_ENABLE: 0
# Disaggregated prefill topology: which node indices host each role.
disaggregated_prefill:
  enabled: true
  # 0-based indices into the node list; nodes 0-1 prefill, nodes 2-3 decode.
  prefiller_host_index: [0, 1]
  decoder_host_index: [2, 3]
# One deployment entry per node, in node-index order: entries 0-1 are the
# prefillers (kv_producer), entries 2-3 the decoders (kv_consumer).
deployment:
  # Node 0: prefiller head — DP rank 0 of 2, TP 16 (one full node per DP rank).
  -
    envs:
      VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
    # Folded scalar (>): newlines become spaces, so this is one shell command.
    # Comments cannot be placed inside the scalar — they would become content.
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 2
      --data-parallel-start-rank 0
      --data-parallel-size-local 1
      --data-parallel-address $LOCAL_IP
      --tensor-parallel-size 16
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-num-seqs 64
      --max-model-len 68000
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.82
      --enforce-eager
      --no-enable-prefix-caching
      --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
  # Node 1: prefiller worker — --headless joins the head's DP group at
  # $MASTER_IP as DP rank 1; otherwise identical to the head's command.
  -
    envs:
      VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --headless
      --port $SERVER_PORT
      --data-parallel-size 2
      --data-parallel-start-rank 1
      --data-parallel-size-local 1
      --data-parallel-address $MASTER_IP
      --tensor-parallel-size 16
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-num-seqs 64
      --max-model-len 68000
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.82
      --enforce-eager
      --no-enable-prefix-caching
      --additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
  # Node 2: decoder head — kv_consumer with DP 8 / TP 4 (4 local DP ranks per
  # node across the 2 decoder nodes). Runs with full-decode-only graph capture
  # and async scheduling instead of the prefillers' eager mode.
  -
    envs:
      VLLM_ASCEND_ENABLE_MLAPO: 1
      TASK_QUEUE_ENABLE: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 8
      --data-parallel-size-local 4
      --data-parallel-start-rank 0
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-model-len 68000
      --max-num-batched-tokens 12
      --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
      --trust-remote-code
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --no-enable-prefix-caching
      --async-scheduling
      --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30200",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
  # Node 3: decoder worker — --headless, hosts DP ranks 4-7 and joins the
  # decoder head at $MASTER_IP; otherwise identical to the decoder head.
  -
    envs:
      VLLM_ASCEND_ENABLE_MLAPO: 1
      TASK_QUEUE_ENABLE: 1
    server_cmd: >
      vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
      --host 0.0.0.0
      --headless
      --port $SERVER_PORT
      --data-parallel-size 8
      --data-parallel-size-local 4
      --data-parallel-start-rank 4
      --data-parallel-address $MASTER_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 4
      --enable-expert-parallel
      --speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
      --seed 1024
      --quantization ascend
      --max-model-len 68000
      --max-num-batched-tokens 12
      --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
      --trust-remote-code
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --no-enable-prefix-caching
      --async-scheduling
      --additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
      --tokenizer-mode deepseek_v32
      --reasoning-parser deepseek_v3
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30200",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 2,
      "tp_size": 16
      },
      "decode": {
      "dp_size": 8,
      "tp_size": 4
      }
      }
      }'
# Benchmark suites run against the deployed cluster once it is serving.
benchmarks:
  # Performance run: streamed chat requests at a fixed request rate.
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 1500
    batch_size: 512
    request_rate: 11.2
    baseline: 1146
    # NOTE(review): 0.97 reads as a ratio against `baseline`, while
    # acc.threshold below (5) reads as an absolute delta — the two keys
    # appear to have different units; confirm against the test harness.
    threshold: 0.97
  # Accuracy run: non-streamed chat requests scored against gsm8k-lite.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 64
    baseline: 95
    threshold: 5