Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml
herizhen ff76c6780e [releases/v0.18.0][Doc][Misc] Modifying Configuration Parameters (#8618)
### What this PR does / why we need it?
This PR renames the environment variable VLLM_NIXL_ABORT_REQUEST_TIMEOUT
to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT to align with the Mooncake
connector naming convention. It also updates the documentation and test
configurations to reflect this change and adjusts the suggested timeout
value in the documentation to 480 seconds for consistency.

### Does this PR introduce _any_ user-facing change?
Yes. The environment variable for configuring the abort request timeout
has been renamed. Users should update their environment settings from
VLLM_NIXL_ABORT_REQUEST_TIMEOUT to VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT.

### How was this patch tested?
The changes were verified by updating the corresponding test
configuration files and ensuring consistency across the documentation.

---------

Signed-off-by: herizhen <1270637059@qq.com>
Signed-off-by: herizhen <59841270+herizhen@users.noreply.github.com>
2026-04-23 16:23:31 +08:00

110 lines
3.4 KiB
YAML

# E2E nightly multi-node test: DeepSeek-R1 W8A8 long-sequence disaggregated prefill.
test_name: "test DeepSeek-R1-W8A8-longseq disaggregated_prefill"
# Model id served by both deployment roles below (ModelScope fetch is enabled in env_common).
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
# Two hosts; per disaggregated_prefill below, host 0 prefills and host 1 decodes.
num_nodes: 2
npu_per_node: 16
# Environment variables exported on every node before launching the servers.
# All boolean-looking values are quoted: environment variables are strings, and
# unquoted `true`/`false`/`True` would be parsed as YAML booleans (and `True` is
# a non-canonical truthy that yamllint rejects), so the exported text could be
# re-serialized in a different spelling than intended.
env_common:
  HCCL_OP_EXPANSION_MODE: AIV
  VLLM_USE_MODELSCOPE: "true"
  HCCL_BUFFSIZE: 768
  SERVER_PORT: 8080
  OMP_PROC_BIND: "false"
  OMP_NUM_THREADS: 1
  # Quoted: the value contains a bare colon and a boolean-looking token.
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_DETERMINISTIC: "True"
  TASK_QUEUE_ENABLE: 1
  HCCL_OP_RETRY_ENABLE: "L0:0, L1:0"
  # Seconds before an in-flight KV-transfer request is aborted by the Mooncake
  # connector (renamed from VLLM_NIXL_ABORT_REQUEST_TIMEOUT).
  VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: 480
# Prefill/decode disaggregation: the hosts listed here index into the
# `deployment` entries below — presumably by position; confirm against harness.
disaggregated_prefill:
  enabled: true
  prefiller_host_index:
    - 0
  decoder_host_index:
    - 1
deployment:
  # Entry 0 — prefiller (kv_producer): dp=1, tp=8, prefill-CP=2, eager mode,
  # large prefill batch (16384 tokens). The embedded JSON arguments are single-
  # quoted for the shell; the folded scalar (>) joins all lines into one command.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 1
      --decode-context-parallel-size 8
      --prefill-context-parallel-size 2
      --tensor-parallel-size 8
      --cp-kv-cache-interleave-size 128
      --enforce-eager
      --enable-expert-parallel
      --seed 1024
      --quantization ascend
      --max-num-seqs 3
      --max-model-len 32768
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --enable-chunked-prefill
      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 1,
      "tp_size": 8
      },
      "decode": {
      "dp_size": 2,
      "tp_size": 8
      }
      }
      }'
  # Entry 1 — decoder (kv_consumer): dp=2, tp=8, decode-CP=2, graph-captured
  # decode (FULL_DECODE_ONLY), small per-step token budget (256).
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 2
      --decode-context-parallel-size 2
      --prefill-context-parallel-size 1
      --tensor-parallel-size 8
      --cp-kv-cache-interleave-size 128
      --enable-expert-parallel
      --seed 1024
      --quantization ascend
      --max-num-seqs 8
      --max-model-len 32768
      --max-num-batched-tokens 256
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --compilation_config '{"cudagraph_capture_sizes":[4,8,16,32],"cudagraph_mode": "FULL_DECODE_ONLY"}'
      --enable-chunked-prefill
      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30100",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 1,
      "tp_size": 8
      },
      "decode": {
      "dp_size": 2,
      "tp_size": 8
      }
      }
      }'
benchmarks:
  # GSM8K accuracy run over the OpenAI-compatible chat API.
  # NOTE(review): baseline/threshold presumably mean the score must stay within
  # `threshold` points of `baseline` — confirm against the benchmark harness.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 24576
    batch_size: 16
    baseline: 95
    threshold: 5