etp best a2 (#1101)
### What this PR does / why we need it? Single machine, 16 cards, DeepSeek-R1, attention (tp8/dp2) / moe (etp): best performance. Relies on: vllm-ascend commit id: da9acfca6053352730fce75fb772e214755d0341, vllm commit id: b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc + https://github.com/vllm-project/vllm-ascend/pull/910 + [Reduce _npu_flash_attention mask to 128x128 for memory savings] https://github.com/vllm-project/vllm-ascend/pull/1100 + [Reduce memory usage by splitting tokens in fused_experts] --------- Signed-off-by: ttanzhiqiang <389825161@qq.com>
This commit is contained in:
23
examples/run_dp_attention_etp16.sh
Normal file
23
examples/run_dp_attention_etp16.sh
Normal file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env bash
#
# Launch the vLLM OpenAI-compatible API server for the DeepSeek-R1 W8A8
# checkpoint on Ascend hardware, using tensor parallel 8 x data parallel 2
# with expert tensor parallel size 16.
#
# The server is started in the background under nohup; its stdout and stderr
# are written to ./run.log (relative to the current working directory).
#
# Usage:
#   bash examples/run_dp_attention_etp16.sh
#
# Requires:
#   - /usr/local/Ascend/ascend-toolkit/set_env.sh
#   - /usr/local/Ascend/nnal/atb/set_env.sh
#   - the model directory passed to --model below
#
# NOTE: this script needs bash (arrays, disown); do not run it with plain sh.

# Runtime switches exported to the server process. The values are the ones
# shipped with this tuned configuration.
# NOTE(review): exact semantics are defined by vLLM / vllm-ascend / torch_npu,
# not by this script -- confirm against their documentation before changing.
export VLLM_ENABLE_MC2=0
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1

# Load the Ascend toolkit and NNAL/ATB environments into this shell so the
# server process inherits them.
# shellcheck source=/dev/null
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# shellcheck source=/dev/null
source /usr/local/Ascend/nnal/atb/set_env.sh

export ASCEND_LAUNCH_BLOCKING=0
export VLLM_VERSION=0.9.0

# Server command-line arguments, kept in an array so each option stays a
# separate word and the JSON blob is passed through untouched.
server_args=(
  --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM
  --quantization ascend
  --trust-remote-code
  --distributed-executor-backend=mp
  --port 8006
  # Parallel layout: tensor parallel 8, data parallel 2.
  -tp=8
  -dp=2
  # Batching / sequence limits.
  --max-num-seqs 24
  --max-model-len 32768
  --max-num-batched-tokens 32768
  --block-size 128
  --no-enable-prefix-caching
  # graph_batch_sizes matches --max-num-seqs (24) above.
  --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}'
  --gpu-memory-utilization 0.96
)

# Use "> file 2>&1" instead of the bash-only "&>" so that both streams reach
# run.log even if the script is accidentally interpreted by a POSIX sh, where
# "&>" would background the command and leave the log empty.
nohup python -m vllm.entrypoints.openai.api_server "${server_args[@]}" \
  > run.log 2>&1 &

# Remove the background job from this shell's job table.
disown
|
||||
Reference in New Issue
Block a user