The Ascend scheduler was previously added for the non-chunked-prefill case, since the NPU ops did not work well with chunked prefill. Now that the ops work better with chunked prefill, it is time to remove the Ascend scheduler and use the vLLM default scheduler. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
65 lines
2.0 KiB
YAML
65 lines
2.0 KiB
YAML
# Multi-node accuracy test: DeepSeek-R1 W8A8 with torchair graph mode on A2.
test_name: "test DeepSeek-R1-W8A8 torchair on A2"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"

# Two nodes with 8 NPUs each (16 NPUs total).
num_nodes: 2
npu_per_node: 8

# Environment variables applied on every node.
env_common:
  # Boolean-looking env values are quoted: environment variables are
  # strings, and a bare true/false would be parsed as a YAML boolean
  # (the yamllint "truthy" trap) before reaching the consumer.
  VLLM_USE_MODELSCOPE: "true"
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: "false"
  OMP_NUM_THREADS: 10
# One server command per node. The first entry is the data-parallel
# master (binds $SERVER_PORT, advertises $LOCAL_IP); the second runs
# --headless and joins the master at $MASTER_IP starting at DP rank 2.
# Folded scalars (>) collapse each flag list into a single shell line.
deployment:
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13399
      --no-enable-prefix-caching
      --max-num-seqs 16
      --tensor-parallel-size 4
      --max-model-len 36864
      --max-num-batched-tokens 6000
      --enable-expert-parallel
      --trust-remote-code
      --quantization ascend
      --gpu-memory-utilization 0.9
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'

  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --headless
      --data-parallel-size 4
      --data-parallel-rpc-port 13399
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --no-enable-prefix-caching
      --max-num-seqs 16
      --tensor-parallel-size 4
      --max-model-len 36864
      --max-num-batched-tokens 6000
      --enable-expert-parallel
      --trust-remote-code
      --quantization ascend
      --gpu-memory-utilization 0.9
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
      --additional-config '{"torchair_graph_config":{"enabled":true,"enable_multistream_moe":true},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
# Accuracy benchmark: GSM8K served over the chat API with a zero-shot
# chain-of-thought prompt configuration.
benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 512
    # NOTE(review): presumably "pass if measured accuracy is within
    # `threshold` points of `baseline`" — confirm against the benchmark
    # runner's comparison logic.
    baseline: 95
    threshold: 5