etp best a2 (#1101)

### What this PR does / why we need it? Single machine, 16 cards, DeepSeek-R1: attention (tp8/dp2) / MoE (etp). Best performance relies on: vllm-ascend commit id: da9acfca6053352730fce75fb772e214755d0341, vllm commit id: b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc + https://github.com/vllm-project/vllm-ascend/pull/910 + [Reduce _npu_flash_attention mask to 128x128 for memory savings] https://github.com/vllm-project/vllm-ascend/pull/1100 + [Reduce memory usage by splitting tokens in fused_experts] --------- Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-06-11 10:40:50 +08:00
parent 860a5ef7fd
commit 980cd81466
2 changed files with 79 additions and 0 deletions
--- a/examples/run_dp_attention_etp16.sh
+++ b/examples/run_dp_attention_etp16.sh
@@ -0,0 +1,23 @@
+export VLLM_ENABLE_MC2=0
+export VLLM_USE_V1=1
+export TASK_QUEUE_ENABLE=1
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnal/atb/set_env.sh
+export ASCEND_LAUNCH_BLOCKING=0
+export VLLM_VERSION=0.9.0
+
+nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
+    --quantization ascend \
+    --trust-remote-code \
+    --distributed-executor-backend=mp \
+    --port 8006 \
+    -tp=8 \
+    -dp=2 \
+    --max-num-seqs 24 \
+    --max-model-len 32768 \
+    --max-num-batched-tokens 32768 \
+    --block-size 128 \
+    --no-enable-prefix-caching \
+    --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \
+    --gpu-memory-utilization 0.96 &> run.log &
+disown