The Ascend scheduler was originally added for the non-chunked-prefill case, because the NPU ops did not work well with chunked prefill. Now that the ops work better with chunked prefill, it is time to remove the Ascend scheduler and use the vLLM default scheduler. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
58 lines
1.7 KiB
YAML
58 lines
1.7 KiB
YAML
# Test-case identity: a human-readable name and the model being served.
test_name: "test DeepSeek-R1-W8A8 on A2"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
# Cluster shape: 2 nodes x 8 NPUs = 16 devices, which equals the deployment's
# --data-parallel-size 4 multiplied by --tensor-parallel-size 4.
num_nodes: 2
npu_per_node: 8
|
|
# Environment settings grouped under one mapping (presumably exported on every
# node before the server command runs — confirm against the test runner).
# NOTE(review): the unquoted true/false/number values load as YAML booleans and
# integers, not strings; quote them if the consumer needs exact string values.
env_common:
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  # Referenced as $SERVER_PORT by the first deployment entry's --port flag.
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
|
|
|
|
|
|
# Server launch commands, one per sequence entry (presumably one entry per
# node, matching num_nodes: 2 — confirm against the test runner).
# Both entries use --data-parallel-size 4 with --data-parallel-size-local 2 and
# the same rpc port 13399. All model/engine flags from
# --no-enable-prefix-caching onward are identical in the two commands; keep
# them in sync when editing either one.
# Each command is a folded scalar (>): the flag lines are joined with single
# spaces into one command line.
deployment:
  # First entry: serves the HTTP API on 0.0.0.0:$SERVER_PORT and passes
  # $LOCAL_IP as the data-parallel address.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13399
      --no-enable-prefix-caching
      --max-num-seqs 16
      --tensor-parallel-size 4
      --max-model-len 36864
      --max-num-batched-tokens 6000
      --enable-expert-parallel
      --trust-remote-code
      --quantization ascend
      --gpu-memory-utilization 0.9
      --enforce-eager
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
  # Second entry: runs with --headless (no --host/--port), starts at
  # data-parallel rank 2, and passes $MASTER_IP as the data-parallel address.
  - server_cmd: >
      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
      --headless
      --data-parallel-size 4
      --data-parallel-rpc-port 13399
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --no-enable-prefix-caching
      --max-num-seqs 16
      --tensor-parallel-size 4
      --max-model-len 36864
      --max-num-batched-tokens 6000
      --enable-expert-parallel
      --trust-remote-code
      --quantization ascend
      --gpu-memory-utilization 0.9
      --enforce-eager
      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
|
benchmarks:
|