Ascend scheduler was added for non chunk prefill case before, since that the npu ops didn't work well with chunked prefill. Now the ops with chunked prefill work better, it's time to remove the ascend scheduler to use vLLM default scheduler. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
165 lines
5.8 KiB
YAML
165 lines
5.8 KiB
YAML
# Human-readable name of this multi-node test case.
test_name: 'test DeepSeek-R1-W8A8 disaggregated_prefill'
# Model identifier; the same string is passed to `vllm serve` in every
# deployment entry of this file.
model: 'vllm-ascend/DeepSeek-R1-0528-W8A8'
# Cluster shape: 4 nodes with 16 NPUs each.
num_nodes: 4
npu_per_node: 16

# Environment variables for the test; presumably exported on every node
# (name suggests "common") -- TODO confirm against the test harness.
# NOTE(review): unquoted true/false values load as YAML booleans, not strings;
# confirm the harness stringifies them the way each tool expects.
env_common:
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
  # Quoted so the embedded colon can never be read as a mapping separator.
  PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
  # Lowercase is the canonical YAML boolean spelling (yamllint `truthy`) and
  # matches the other booleans in this mapping.
  HCCL_DETERMINISTIC: true
  TASK_QUEUE_ENABLE: 1
  HCCL_OP_RETRY_ENABLE: 'L0:0, L1:0, L2:0'

# Disaggregated-prefill settings: the host indices listed as prefillers and
# decoders, plus the paths of the rank-table generator script and the
# rank-table JSON file.
disaggregated_prefill:
  enabled: true
  prefiller_host_index:
    - 0
    - 1
  decoder_host_index:
    - 2
  ranktable_gen_path: 'examples/disaggregated_prefill_v1/gen_ranktable.py'
  ranktable_path: '/tmp/ranktable.json'

# One `server_cmd` per deployment entry. Each command is a folded scalar, so
# its lines are joined with single spaces into one shell command line.
deployment:
# Entry 0: kv-transfer-config declares the "kv_producer" role; eager mode,
# data-parallel 2 x tensor-parallel 8, torchair graph disabled.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0 --port $SERVER_PORT
    --data-parallel-size 2 --data-parallel-size-local 2
    --tensor-parallel-size 8
    --enforce-eager
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 4 --max-model-len 36864 --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu", "kv_role": "kv_producer",
    "kv_parallel_size": 1, "kv_port": "20001", "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'

# Entry 1: second "kv_producer" entry; its command is identical to the
# entry before it (eager mode, data-parallel 2 x tensor-parallel 8).
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0 --port $SERVER_PORT
    --data-parallel-size 2 --data-parallel-size-local 2
    --tensor-parallel-size 8
    --enforce-eager
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 4 --max-model-len 36864 --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu", "kv_role": "kv_producer",
    "kv_parallel_size": 1, "kv_port": "20001", "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
# Entry 2: "kv_consumer" role with torchair graph mode enabled. Serves on
# $SERVER_PORT and starts at data-parallel rank 0 (16 local ranks of 32),
# using $LOCAL_IP as the data-parallel address.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0 --port $SERVER_PORT
    --data-parallel-size 32 --data-parallel-size-local 16
    --data-parallel-start-rank 0
    --data-parallel-address $LOCAL_IP --data-parallel-rpc-port 13389
    --tensor-parallel-size 1
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 28 --max-model-len 36864 --max-num-batched-tokens 256
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu", "kv_role": "kv_consumer",
    "kv_parallel_size": 1, "kv_port": "20001", "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
# Entry 3: headless "kv_consumer" entry (no --host/--port). Starts at
# data-parallel rank 16 (16 local ranks of 32) and points at $MASTER_IP
# on the same RPC port 13389 as the rank-0 entry.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --headless
    --data-parallel-size 32 --data-parallel-size-local 16
    --data-parallel-start-rank 16
    --data-parallel-address $MASTER_IP --data-parallel-rpc-port 13389
    --tensor-parallel-size 1
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 28 --max-model-len 36864 --max-num-batched-tokens 256
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
    --kv-transfer-config
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu", "kv_role": "kv_consumer",
    "kv_parallel_size": 1, "kv_port": "20001", "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }'
    --additional-config
    '{"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
# Benchmark cases: `perf` (case_type performance) and `acc` (case_type
# accuracy), each with its own dataset, request settings, baseline and
# threshold.
benchmarks:
  # Performance case: 2800 prompts at request rate 11.2, batch size 700.
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 2800
    max_out_len: 1500
    batch_size: 700
    request_rate: 11.2
    # NOTE(review): how baseline/threshold are combined is defined by the
    # harness, not by this file -- confirm before changing either value.
    baseline: 1
    threshold: 0.97
  # Accuracy case on the gsm8k dataset with a chat-prompt configuration.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 512
    baseline: 95
    threshold: 5