etp best a2 (#1101)

### What this PR does / why we need it?
Best performance for single-machine, 16-card DeepSeek-R1: attention uses
TP8/DP2 and MoE uses ETP (expert tensor parallelism).

Relies on:
vllm-ascend commit id: da9acfca6053352730fce75fb772e214755d0341
vllm commit id: b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc
plus https://github.com/vllm-project/vllm-ascend/pull/910 [Reduce
_npu_flash_attention mask to 128x128 for memory savings]
and https://github.com/vllm-project/vllm-ascend/pull/1100 [Reduce memory
usage by splitting tokens in fused_experts]


---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
This commit is contained in:
ttanzhiqiang
2025-06-11 10:40:50 +08:00
committed by GitHub
parent 860a5ef7fd
commit 980cd81466
2 changed files with 79 additions and 0 deletions

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Launch the vLLM OpenAI-compatible API server for DeepSeek-R1 (W8A8 quantized)
# on Ascend NPUs: attention TP=8 / DP=2, MoE expert tensor parallel size 16.
# The server runs in the background; logs go to run.log.

# Disable MC2 fused communication; use the vLLM V1 engine.
export VLLM_ENABLE_MC2=0
export VLLM_USE_V1=1
# Enable the Ascend task queue for operator dispatch.
export TASK_QUEUE_ENABLE=1

# Bring the Ascend toolkit and ATB (Ascend Transformer Boost) into the environment.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

# 0 = launch NPU kernels asynchronously (set to 1 only when debugging).
export ASCEND_LAUNCH_BLOCKING=0
export VLLM_VERSION=0.9.0

# NOTE: -tp / -dp are the short aliases for --tensor-parallel-size /
# --data-parallel-size. torchair graph mode is enabled with a cached graph
# for batch size 24 (matching --max-num-seqs); expert_tensor_parallel_size=16
# spans all 16 cards for MoE layers.
nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
    --quantization ascend \
    --trust-remote-code \
    --distributed-executor-backend=mp \
    --port 8006 \
    -tp=8 \
    -dp=2 \
    --max-num-seqs 24 \
    --max-model-len 32768 \
    --max-num-batched-tokens 32768 \
    --block-size 128 \
    --no-enable-prefix-caching \
    --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \
    --gpu-memory-utilization 0.96 &> run.log &
# Detach the background job so it survives shell exit.
disown

View File

@@ -0,0 +1,56 @@
#!/bin/bash
# Benchmark sweep for the local vLLM serving endpoint (port 8006): runs
# benchmark_serving.py for every (concurrency, request-rate) combination,
# collects the raw output, then produces a small TPOT / throughput summary.

# Concurrency levels to test.
concurrency_array=(48)
# Best-performing request rates (requests/s).
rate_array=(0.7)

# Raw benchmark output is accumulated here.
result_file="benchmark_results.txt"
echo "Benchmark Results" > "$result_file"
echo "===================" >> "$result_file"

# Loop through all combinations.
for concurrency in "${concurrency_array[@]}"; do
    for rate in "${rate_array[@]}"; do
        echo "Testing with concurrency=$concurrency, rate=$rate"
        {
            echo ""
            echo "Concurrency: $concurrency, Request Rate: $rate"
            echo "-------------------"
        } >> "$result_file"

        # Run one benchmark pass; tee keeps progress visible on the console
        # while appending everything (including stderr) to the result file.
        python /mnt/deepseek/vllm/benchmarks/benchmark_serving.py \
            --backend vllm \
            --trust-remote-code \
            --model /mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
            --dataset-name random \
            --random-input-len 4096 \
            --random-output-len 1536 \
            --ignore-eos \
            --num-prompts 400 \
            --max-concurrency "$concurrency" \
            --request-rate "$rate" \
            --metric-percentiles 90 \
            --base-url http://localhost:8006 2>&1 | tee -a "$result_file"

        # Let the system cool down between runs.
        sleep 30
    done
done

# ---- Post-processing ------------------------------------------------------
# The original awk post-processing split the metric line itself on ':'
# ($1 was "Mean TPOT (ms)", $3 was empty), so its "Concurrency/Rate" labels
# were meaningless. Instead, emit each run's header line together with the
# matching metric line so every value is attributed to its configuration.
analysis_file="analysis_results.txt"
echo "Analysis Results" > "$analysis_file"
echo "=================" >> "$analysis_file"

# Mean time-per-output-token for each run.
echo "TPOT Analysis:" >> "$analysis_file"
grep -E "^Concurrency:|Mean TPOT" "$result_file" >> "$analysis_file"

# Output-token throughput for each run.
echo -e "\nThroughput Analysis:" >> "$analysis_file"
grep -E "^Concurrency:|Output token throughput" "$result_file" >> "$analysis_file"

echo "Testing completed. Results saved in $result_file and analysis in $analysis_file"