Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml
Nengjun Ma 297f6deb09 [CI] Align multi-node nightly test parameters with corresponding tutorials document (#5756)
### What this PR does / why we need it?
Align the multi-node nightly test parameters with the tutorial documents.

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
Tested locally and with the nightly e2e multi-node test cases.

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
2026-01-12 09:00:31 +08:00

111 lines
3.5 KiB
YAML

# Test identity and cluster shape for this nightly multi-node case.
test_name: 'test DeepSeek-R1-W8A8-longseq disaggregated_prefill'
# Same model identifier that both server_cmd entries pass to `vllm serve`.
model: 'vllm-ascend/DeepSeek-R1-0528-W8A8'
# Two nodes with 16 NPUs each; the deployment list below has two entries,
# presumably one per node (confirm against the test harness).
num_nodes: 2
npu_per_node: 16
# Environment variables shared by the deployment entries; both server_cmd
# strings expand $SERVER_PORT, which is defined here.
# NOTE(review): unquoted true/false load as YAML booleans and the bare
# numbers as integers, not strings; confirm the harness stringifies them
# the way each variable expects before quoting or retyping anything.
env_common:
  HCCL_OP_EXPANSION_MODE: AIV
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 768
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1
  # Quoted so the embedded colon can never be read as a mapping separator.
  PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
  HCCL_DETERMINISTIC: true
  TASK_QUEUE_ENABLE: 1
  HCCL_OP_RETRY_ENABLE: 'L0:0, L1:0'
# Prefill/decode disaggregation: host index 0 is listed as the prefiller and
# host index 1 as the decoder. These presumably index the deployment entries
# in order (entry 0 is the kv_producer, entry 1 the kv_consumer) - confirm.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0]
  decoder_host_index: [1]
# One `vllm serve` command per entry. Both entries serve the same model on
# $SERVER_PORT with tensor-parallel size 8, seed 1024, max-num-seqs 4 and
# max-model-len 32768, and both carry the same kv_connector_extra_config
# (prefill dp 1 / tp 8, decode dp 2 / tp 8), which matches the
# --data-parallel-size / --tensor-parallel-size flags of the two commands.
#
# Each command is a folded scalar (`>`): every continuation line is kept at
# the same indent so the whole command folds into a single line, with the
# JSON arguments protected by the shell single quotes. Do not put `#`
# comments inside the scalar - they would become part of the command.
deployment:
  # Entry 0: kv_role "kv_producer" on kv_port 30000, engine_id 0.
  # Runs eagerly (--enforce-eager) with prefill context parallel size 2,
  # decode context parallel size 8 and a 16384-token batch budget.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 1
      --decode-context-parallel-size 8
      --prefill-context-parallel-size 2
      --tensor-parallel-size 8
      --cp-kv-cache-interleave-size 128
      --enforce-eager
      --enable-expert-parallel
      --seed 1024
      --quantization ascend
      --max-num-seqs 4
      --max-model-len 32768
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --enable-chunked-prefill
      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 1,
      "tp_size": 8
      },
      "decode": {
      "dp_size": 2,
      "tp_size": 8
      }
      }
      }'
  # Entry 1: kv_role "kv_consumer" on kv_port 30100, engine_id 1.
  # Data parallel size 2, decode context parallel size 2, a 256-token batch
  # budget, and graph capture in FULL_DECODE_ONLY mode for sizes 4/8/12/16.
  # NOTE(review): --compilation_config is spelled with an underscore while
  # every other flag uses hyphens; confirm the CLI accepts both spellings.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 2
      --decode-context-parallel-size 2
      --prefill-context-parallel-size 1
      --tensor-parallel-size 8
      --cp-kv-cache-interleave-size 128
      --enable-expert-parallel
      --seed 1024
      --quantization ascend
      --max-num-seqs 4
      --max-model-len 32768
      --max-num-batched-tokens 256
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}'
      --enable-chunked-prefill
      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30100",
      "engine_id": "1",
      "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
      "kv_connector_extra_config": {
      "prefill": {
      "dp_size": 1,
      "tp_size": 8
      },
      "decode": {
      "dp_size": 2,
      "tp_size": 8
      }
      }
      }'
# Benchmarks run against the deployed servers. `acc` is an accuracy case on
# the gsm8k dataset using the zero-shot chain-of-thought chat prompt config.
benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 24576
    batch_size: 512
    # Presumably the expected score and the allowed deviation from it -
    # confirm units and comparison rule with the benchmark harness.
    baseline: 95
    threshold: 5