diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh
new file mode 100644
index 0000000..b736492
--- /dev/null
+++ b/examples/run_dp_attention_etp16.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Start a vLLM OpenAI-compatible API server for the DeepSeek-R1 W8A8 model
+# in the background (tp=8, dp=2, expert_tensor_parallel_size=16, port 8006).
+# Server output is written to run.log in the current directory.
+
+export VLLM_ENABLE_MC2=0
+export VLLM_USE_V1=1
+export TASK_QUEUE_ENABLE=1
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnal/atb/set_env.sh
+export ASCEND_LAUNCH_BLOCKING=0
+export VLLM_VERSION=0.9.0
+
+# nohup + disown keep the server running after this shell exits.
+nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
+    --quantization ascend \
+    --trust-remote-code \
+    --distributed-executor-backend=mp \
+    --port 8006 \
+    -tp=8 \
+    -dp=2 \
+    --max-num-seqs 24 \
+    --max-model-len 32768 \
+    --max-num-batched-tokens 32768 \
+    --block-size 128 \
+    --no-enable-prefix-caching \
+    --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \
+    --gpu-memory-utilization 0.96 &> run.log &
+disown
diff --git a/examples/run_dp_attention_etp16_benmark.sh b/examples/run_dp_attention_etp16_benmark.sh
new file mode 100644
index 0000000..ab72b3b
--- /dev/null
+++ b/examples/run_dp_attention_etp16_benmark.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# Benchmark a vLLM server listening on http://localhost:8006 for every
+# combination of max concurrency and request rate listed below, then
+# summarize Mean TPOT and output token throughput per combination.
+
+# Concurrency array
+concurrency_array=(48)
+# Best rate
+rate_array=(0.7)
+
+# Result files
+result_file="benchmark_results.txt"
+analysis_file="analysis_results.txt"
+
+#######################################
+# Print one summary line per benchmark run for a single metric.
+# Each metric line in the result file is paired with the most recent
+# "Concurrency: C, Request Rate: R" header written by the loop below.
+# Globals:   result_file (read)
+# Arguments: $1 - metric label to look for, e.g. "Mean TPOT"
+#            $2 - unit appended to the value, e.g. "ms"
+# Outputs:   "Concurrency C, Rate R: VALUE UNIT" lines on stdout
+#######################################
+summarize_metric() {
+    local label=$1 unit=$2
+    awk -F':' -v label="$label" -v unit="$unit" '
+        function trim(s) { gsub(/^[ \t\r]+|[ \t\r]+$/, "", s); return s }
+        /^Concurrency: .*, Request Rate: / {
+            conc = $2
+            sub(/,.*$/, "", conc)
+            conc = trim(conc)
+            rate = trim($3)
+            next
+        }
+        index($0, label) {
+            printf "Concurrency %s, Rate %s: %s %s\n", conc, rate, trim($NF), unit
+        }
+    ' "$result_file"
+}
+
+echo "Benchmark Results" > "$result_file"
+echo "===================" >> "$result_file"
+
+# Loop through all combinations
+for concurrency in "${concurrency_array[@]}"; do
+    for rate in "${rate_array[@]}"; do
+        echo "Testing with concurrency=$concurrency, rate=$rate"
+        echo "" >> "$result_file"
+        echo "Concurrency: $concurrency, Request Rate: $rate" >> "$result_file"
+        echo "-------------------" >> "$result_file"
+
+        # Run benchmark test
+        python /mnt/deepseek/vllm/benchmarks/benchmark_serving.py \
+            --backend vllm \
+            --trust-remote-code \
+            --model /mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
+            --dataset-name random \
+            --random-input-len 4096 \
+            --random-output-len 1536 \
+            --ignore-eos \
+            --num-prompts 400 \
+            --max-concurrency "$concurrency" \
+            --request-rate "$rate" \
+            --metric-percentiles 90 \
+            --base-url http://localhost:8006 2>&1 | tee -a "$result_file"
+
+        # tee succeeds even when the benchmark fails, so check the first
+        # pipeline stage explicitly and report the failure on stderr.
+        status=${PIPESTATUS[0]}
+        if (( status != 0 )); then
+            echo "Benchmark exited with status $status (concurrency=$concurrency, rate=$rate)" >&2
+        fi
+
+        # Wait for system cool down
+        sleep 30
+    done
+done
+
+# Analyze results
+echo "Analysis Results" > "$analysis_file"
+echo "=================" >> "$analysis_file"
+
+# Extract and analyze TPOT data
+echo "TPOT Analysis:" >> "$analysis_file"
+summarize_metric "Mean TPOT" "ms" >> "$analysis_file"
+
+# Extract and analyze throughput data
+printf '\nThroughput Analysis:\n' >> "$analysis_file"
+summarize_metric "Output token throughput" "tokens/s" >> "$analysis_file"
+
+echo "Testing completed. Results saved in $result_file and analysis in $analysis_file"