[Doc]Add benchmark scripts (#74)

### What this PR does / why we need it? The purpose of this PR is to add benchmark scripts for npu, developers can easily run performance tests on their own machines with one line of code . --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-03-21 15:54:34 +08:00
parent befbee5883
commit 9a175ca0fc
6 changed files with 397 additions and 0 deletions
--- a/benchmarks/scripts/run-performance-benchmarks.sh
+++ b/benchmarks/scripts/run-performance-benchmarks.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+
+
+check_npus() {
+  # shellcheck disable=SC2155
+  declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ')
+  
+  if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then
+    echo "Need at least 1 NPU to run benchmarking."
+    exit 1
+  else
+    echo "found NPU conut: $npu_count"
+  fi
+
+  npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}')
+
+  echo "NPU type is: $npu_type"
+}
+
+ensure_sharegpt_downloaded() {
+  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
+  if [ ! -f "$FILE" ]; then
+    echo "$FILE not found, downloading from hf-mirror ..."
+    wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
+  else
+    echo "$FILE already exists."
+  fi
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args
+  args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -X POST localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+get_cur_npu_id() {
+    npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}'
+}
+
+kill_npu_processes() {
+  ps -aux
+  lsof -t -i:8000 | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+  
+  sleep 4
+  rm -rf ~/.config/vllm
+
+}
+
+
+run_latency_tests() {
+  # run latency tests using `benchmark_latency.py`
+  # $1: a json file specifying latency test cases
+
+  local latency_test_file
+  latency_test_file=$1
+
+  # Iterate over latency tests
+  jq -c '.[]' "$latency_test_file" | while read -r params; do
+    # get the test name, and append the NPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^latency_ ]]; then
+      echo "In latency-test.json, test_name must start with \"latency_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    latency_params=$(echo "$params" | jq -r '.parameters')
+    latency_args=$(json2args "$latency_params")
+
+    latency_command="python3 vllm_benchmarks/benchmark_latency.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $latency_args"
+
+    echo "Running test case $test_name"
+    echo "Latency command: $latency_command"
+
+    # run the benchmark
+    eval "$latency_command"
+
+    kill_npu_processes
+
+  done
+}
+
+run_throughput_tests() {
+  # run throughput tests using `benchmark_throughput.py`
+  # $1: a json file specifying throughput test cases
+
+  local throughput_test_file
+  throughput_test_file=$1
+
+  # Iterate over throughput tests
+  jq -c '.[]' "$throughput_test_file" | while read -r params; do
+    # get the test name, and append the NPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^throughput_ ]]; then
+      echo "In throughput-test.json, test_name must start with \"throughput_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get arguments
+    throughput_params=$(echo "$params" | jq -r '.parameters')
+    throughput_args=$(json2args "$throughput_params")
+
+    throughput_command="python3 vllm_benchmarks/benchmark_throughput.py \
+      --output-json $RESULTS_FOLDER/${test_name}.json \
+      $throughput_args"
+
+    echo "Running test case $test_name"
+    echo "Throughput command: $throughput_command"
+
+    # run the benchmark
+    eval "$throughput_command"
+
+    kill_npu_processes
+
+  done
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the NPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In serving-test.json, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if server model and client model is aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $test_name."
+      continue
+    fi
+
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+    server_pid=$!
+
+    # wait until the server is alive
+    if wait_for_server; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 vllm_benchmarks/benchmark_serving.py \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      bash -c "$client_command"
+    done
+
+    # clean up
+    kill -9 $server_pid
+    kill_npu_processes
+  done
+}
+
+cleanup() {
+  rm -rf ./vllm_benchmarks
+}
+
+get_benchmarks_scripts() {
+  git clone -b main --depth=1 git@github.com:vllm-project/vllm.git && \
+  mv vllm/benchmarks vllm_benchmarks
+  rm -rf ./vllm
+}
+
+main() {
+
+  START_TIME=$(date +%s)
+  check_npus
+
+  # dependencies
+  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+  (which jq) || (apt-get update && apt-get -y install jq)
+  (which lsof) || (apt-get update && apt-get install -y lsof)
+
+  # get the current IP address, required by benchmark_serving.py
+  # shellcheck disable=SC2155
+  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+  # turn of the reporting of the status of each request, to clean up the terminal output
+  export VLLM_LOG_LEVEL="WARNING"
+
+  # prepare for benchmarking
+  cd benchmarks || exit 1
+  get_benchmarks_scripts
+  trap cleanup EXIT
+
+  QUICK_BENCHMARK_ROOT=./
+
+  declare -g RESULTS_FOLDER=results
+  mkdir -p $RESULTS_FOLDER
+
+  ensure_sharegpt_downloaded
+  # benchmarks
+  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
+
+  END_TIME=$(date +%s)
+  ELAPSED_TIME=$((END_TIME - START_TIME))
+  echo "Total execution time: $ELAPSED_TIME seconds"
+
+}
+
+main "$@"