Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-longseq.yaml
dsxsteven 325cb16e3f [BugFix][CI]Fix DeepSeek-R1-W8A8-longseq nightly CI (#6297)
### What this PR does / why we need it?
The precision issue arose because the KV cache on the prefill node had not
been fetched for an extended period (>6 min) and was forcibly freed. To
avoid this problem, the batch size was reduced and the timeout period was
extended.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.1
- vLLM main:
dc917cceb8

Signed-off-by: dsxsteven <dsxsteven@sina.com>
2026-01-28 16:36:24 +08:00

110 lines
3.4 KiB
YAML

# Nightly disaggregated-prefill test: DeepSeek-R1 W8A8, long sequences,
# 2 nodes x 16 NPUs (node 0 = prefiller, node 1 = decoder).
test_name: "test DeepSeek-R1-W8A8-longseq disaggregated_prefill"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 2
npu_per_node: 16

# Environment variables exported on every node. All values are quoted so the
# YAML loader keeps them as strings: environment variables are strings, and
# unquoted `true` / `True` / `768` would be typed as bool/int and then
# re-stringified in a loader-dependent way (e.g. Python str(True) -> "True").
# Each quoted value preserves the exact literal text of the original.
env_common:
  HCCL_OP_EXPANSION_MODE: "AIV"
  VLLM_USE_MODELSCOPE: "true"
  HCCL_BUFFSIZE: "768"
  SERVER_PORT: "8080"
  OMP_PROC_BIND: "false"
  OMP_NUM_THREADS: "1"
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_DETERMINISTIC: "True"
  TASK_QUEUE_ENABLE: "1"
  HCCL_OP_RETRY_ENABLE: "L0:0, L1:0"
  # Extended abort timeout (presumably milliseconds — confirm against vLLM
  # docs): keeps the prefiller's KV cache alive long enough for the decoder
  # to fetch it instead of being forcibly freed (see PR #6297).
  VLLM_NIXL_ABORT_REQUEST_TIMEOUT: "300000"

# Prefill/decode disaggregation: host index 0 serves as the prefiller,
# host index 1 as the decoder.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0]
  decoder_host_index: [1]
# One deployment entry per node; entry order matches the host indices above
# (entry 0 = prefiller / kv_producer on node 0, entry 1 = decoder /
# kv_consumer on node 1). Each `server_cmd` is a folded scalar (>): all
# continuation lines sit at the same indent so the whole command — including
# the quoted JSON configs — folds onto a single shell line.
deployment:
  # Prefiller: eager mode, small batch (--max-num-seqs 3) so produced KV
  # blocks are fetched promptly rather than sitting unconsumed (PR #6297).
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 1
      --decode-context-parallel-size 8
      --prefill-context-parallel-size 2
      --tensor-parallel-size 8
      --cp-kv-cache-interleave-size 128
      --enforce-eager
      --enable-expert-parallel
      --seed 1024
      --quantization ascend
      --max-num-seqs 3
      --max-model-len 32768
      --max-num-batched-tokens 16384
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --enable-chunked-prefill
      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_producer",
      "kv_port": "30000",
      "engine_id": "0",
      "kv_connector_extra_config": {
      "prefill": {"dp_size": 1, "tp_size": 8},
      "decode": {"dp_size": 2, "tp_size": 8}
      }
      }'
  # Decoder: dp=2, graph capture enabled (FULL_DECODE_ONLY), tight
  # --max-num-batched-tokens since prefill happens on the producer node.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 2
      --decode-context-parallel-size 2
      --prefill-context-parallel-size 1
      --tensor-parallel-size 8
      --cp-kv-cache-interleave-size 128
      --enable-expert-parallel
      --seed 1024
      --quantization ascend
      --max-num-seqs 8
      --max-model-len 32768
      --max-num-batched-tokens 256
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --compilation_config '{"cudagraph_capture_sizes":[4,8,16,32],"cudagraph_mode": "FULL_DECODE_ONLY"}'
      --enable-chunked-prefill
      --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
      --kv-transfer-config
      '{"kv_connector": "MooncakeConnectorV1",
      "kv_role": "kv_consumer",
      "kv_port": "30100",
      "engine_id": "1",
      "kv_connector_extra_config": {
      "prefill": {"dp_size": 1, "tp_size": 8},
      "decode": {"dp_size": 2, "tp_size": 8}
      }
      }'
# Post-deployment accuracy check: GSM8K, 0-shot chain-of-thought chat prompt,
# served through the general chat API.
benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 24576
    batch_size: 16
    # Expected score 95 with an allowed deviation of 5 — presumably the run
    # passes when accuracy >= baseline - threshold; confirm against the
    # benchmark harness.
    baseline: 95
    threshold: 5