Files
xc-llm-ascend/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-EPLB.yaml
wangxiyuan f10acddb78 drop ascend scheduler (#4498)
The Ascend scheduler was originally added for the non-chunked-prefill case,
because the NPU ops didn't work well with chunked prefill.

Now that the ops work better with chunked prefill, it's time to remove the
Ascend scheduler and use the vLLM default scheduler.

- vLLM version: v0.11.2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-29 16:18:34 +08:00

166 lines
6.2 KiB
YAML

# Identity and cluster shape of this test case.
# The name carries the EPLB suffix because this config enables dynamic EPLB
# (DYNAMIC_EPLB in env_common and "dynamic_eplb": true on every server);
# the previous generic name presumably came from the non-EPLB config.
test_name: "test DeepSeek-R1-W8A8-EPLB disaggregated_prefill"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
# `deployment` has four server entries - presumably one per node; confirm.
num_nodes: 4
npu_per_node: 16
# Environment variables, keyed by variable name.
# NOTE(review): presumably exported on every node before server_cmd runs;
# confirm against the test harness. SERVER_PORT is referenced by the
# server commands as $SERVER_PORT.
env_common:
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
  # Quoted so the embedded colon is never taken for a mapping indicator.
  PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
  # Canonical lowercase boolean; same parsed value as the former `True`.
  HCCL_DETERMINISTIC: true
  TASK_QUEUE_ENABLE: 1
  HCCL_OP_RETRY_ENABLE: "L0:0, L1:0, L2:0"
  DYNAMIC_EPLB: true
# Disaggregated-prefill settings.
# NOTE(review): the host indices appear to refer to entries of `deployment`
# (entries 0 and 1 use kv_role kv_producer, entry 2 uses kv_consumer; entry 3
# is the headless part of the same decoder engine and is not listed) - confirm.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0, 1]
  decoder_host_index: [2]
  ranktable_gen_path: "examples/disaggregated_prefill_v1/gen_ranktable.py"
  ranktable_path: "/tmp/ranktable.json"
deployment:
# Prefill server, engine_id "0": kv_role kv_producer, data-parallel 2 x
# tensor-parallel 8, eager mode, torchair graph disabled.
# The folded scalar (`>`) joins the lines below with single spaces into one
# command line, so every line must keep the same indentation.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 2
    --data-parallel-size-local 2
    --tensor-parallel-size 8
    --enforce-eager
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 4
    --max-model-len 36864
    --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_producer",
    "kv_parallel_size": 1,
    "kv_port": "20001",
    "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
# Prefill server, engine_id "1": same flags as the engine_id "0" prefill
# server (kv_role kv_producer, data-parallel 2 x tensor-parallel 8, eager).
# Keep every line of the folded scalar at the same indentation so the
# command folds into a single line.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 2
    --data-parallel-size-local 2
    --tensor-parallel-size 8
    --enforce-eager
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 4
    --max-model-len 36864
    --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_producer",
    "kv_parallel_size": 1,
    "kv_port": "20001",
    "engine_id": "1",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
# Decode server, engine_id "2": kv_role kv_consumer, tensor-parallel 1,
# 16 local data-parallel ranks starting at rank 0 of 32. Listens on
# $SERVER_PORT and uses $LOCAL_IP:13389 as the data-parallel address;
# torchair graph mode is enabled with graph_batch_sizes [28].
# Keep every line of the folded scalar at the same indentation so the
# command folds into a single line.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 32
    --data-parallel-size-local 16
    --data-parallel-start-rank 0
    --data-parallel-address $LOCAL_IP
    --data-parallel-rpc-port 13389
    --tensor-parallel-size 1
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 28
    --max-model-len 36864
    --max-num-batched-tokens 256
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_consumer",
    "kv_parallel_size": 1,
    "kv_port": "20001",
    "engine_id": "2",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
# Headless decode worker, engine_id "2": kv_role kv_consumer, tensor-parallel
# 1, 16 local data-parallel ranks starting at rank 16 of 32. No --host/--port;
# it uses $MASTER_IP:13389 as the data-parallel address.
# NOTE(review): $MASTER_IP must resolve to the node that runs the non-headless
# engine_id "2" server (which passes $LOCAL_IP with the same rpc port) -
# confirm how the harness sets it in disaggregated-prefill mode.
# Keep every line of the folded scalar at the same indentation so the
# command folds into a single line.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --headless
    --data-parallel-size 32
    --data-parallel-size-local 16
    --data-parallel-start-rank 16
    --data-parallel-address $MASTER_IP
    --data-parallel-rpc-port 13389
    --tensor-parallel-size 1
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 28
    --max-model-len 36864
    --max-num-batched-tokens 256
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_consumer",
    "kv_parallel_size": 1,
    "kv_port": "20001",
    "engine_id": "2",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
# Benchmark cases run against the deployed servers, keyed by case name.
benchmarks:
  # Performance case (case_type: performance).
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 2800
    max_out_len: 1500
    batch_size: 700
    request_rate: 11.2
    # NOTE(review): baseline/threshold semantics are defined by the harness;
    # presumably the result must reach threshold x baseline - confirm.
    baseline: 1
    threshold: 0.97
  # Accuracy case (case_type: accuracy) on GSM8K.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 512
    # NOTE(review): presumably expected score 95 with an allowed deviation
    # of 5 - confirm against the harness.
    baseline: 95
    threshold: 5