Files
xc-llm-ascend/examples/run_dp_attention_etp16.sh
wangxiyuan 787010a637 [Test] Remove VLLM_USE_V1 in example and tests (#1733)
V1 is enabled by default, so there is no need to set it by hand anymore.
This PR removes the now-redundant setting from the examples and tests.

- vLLM version: v0.9.2
- vLLM main:
9ad0a4588b

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 12:49:57 +08:00

22 lines
840 B
Bash

# Starts the vLLM OpenAI-compatible API server as a detached background
# process. Everything the server prints (stdout and stderr) goes to ./run.log.

# Exported before the Ascend environment scripts are loaded, as in the
# established ordering of this script.
TASK_QUEUE_ENABLE=1
export TASK_QUEUE_ENABLE

# Pull the Ascend toolkit and ATB environment definitions into this shell.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

# Set after the environment scripts so these values are the ones in effect
# when the server starts.
export ASCEND_LAUNCH_BLOCKING=0 VLLM_VERSION=0.9.1

# Server arguments, kept in an array so each group can be annotated.
server_args=(
  # Model path, the name it is served under, and how it is loaded.
  --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM
  --served-model-name auto
  --quantization ascend
  --trust-remote-code

  # Process layout and listening port. -tp / -dp are vLLM's short flags for
  # tensor- and data-parallel size (8 x 2 here).
  --distributed-executor-backend=mp
  --port 8006
  -tp=8
  -dp=2

  # Batching and sequence limits. Note that max-num-seqs (24) matches the
  # single entry in graph_batch_sizes below.
  --max-num-seqs 24
  --max-model-len 32768
  --max-num-batched-tokens 32768
  --block-size 128
  --no-enable-prefix-caching

  # Extra JSON configuration passed through verbatim: torchair graph mode with
  # cached graphs, the ascend scheduler, and expert tensor parallel size 16.
  --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}'

  --gpu-memory-utilization 0.96
)

# nohup makes the server ignore hangups; stdout and stderr are both written
# to run.log, and the trailing & puts the job in the background.
nohup python -m vllm.entrypoints.openai.api_server "${server_args[@]}" \
  > run.log 2>&1 &

# Drop the job from this shell's job table so the shell no longer tracks it.
disown