The Ascend scheduler was originally added for the non-chunked-prefill case, because the NPU ops did not work well with chunked prefill. Now that the ops work better with chunked prefill, it is time to remove the Ascend scheduler and use the vLLM default scheduler. - vLLM version: v0.11.2 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
32 lines
820 B
Bash
32 lines
820 B
Bash
# Launch script: exports the communication/runtime environment variables and
# then starts `vllm serve` for Qwen/Qwen1.5-MoE-A2.7B with data parallelism
# (size 2), tensor parallelism (size 4) and expert parallelism enabled.
#
# NOTE(review): 2.0.0.0 and eth0 look like placeholders for the node IP and
# NIC name -- confirm and adjust for the target machine.

# Address and network interface names handed to the communication backends.
export HCCL_IF_IP=2.0.0.0
export GLOO_SOCKET_IFNAME="eth0"
export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0"

# OpenMP runtime: no thread-to-core binding, 100 threads.
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100

# Presumably makes vLLM fetch the model from ModelScope -- verify.
export VLLM_USE_MODELSCOPE=true

# Presumably keeps Ascend kernel launches non-blocking (0 = off) -- verify.
export ASCEND_LAUNCH_BLOCKING=0

# Every argument line except the last must end with a backslash and nothing
# may be placed between the lines, otherwise the command is cut short.
vllm serve Qwen/Qwen1.5-MoE-A2.7B \
  --host 0.0.0.0 \
  --port 20002 \
  --served-model-name Qwen \
  --data-parallel-size 2 \
  --data-parallel-size-local 2 \
  --data-parallel-address 2.0.0.0 \
  --data-parallel-rpc-port 13389 \
  --tensor-parallel-size 4 \
  --enable-expert-parallel \
  --no-enable-prefix-caching \
  --max-num-seqs 16 \
  --max-model-len 4096 \
  --max-num-batched-tokens 4096 \
  --gpu-memory-utilization 0.9 \
  --trust-remote-code \
  --enforce-eager \
  --additional-config '{"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'