From d9fb027068c623df2f15f05f7863cd959e8808ce Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Fri, 30 May 2025 22:42:44 +0800
Subject: [PATCH] [CI] Add benchmark workflows (#1014)

### What this PR does / why we need it?

Add a nightly benchmark workflow that runs the latency, throughput, and serving test suites on NPU hardware and uploads the results via escli.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Run locally.

---------

Signed-off-by: wangli
---
 .github/workflows/nightly_benchmarks.yaml  | 160 ++++++++++++++++++
 .../scripts/run-performance-benchmarks.sh  |  32 +++-
 benchmarks/tests/latency-tests.json        |  11 +-
 benchmarks/tests/serving-tests.json        |  25 +--
 benchmarks/tests/throughput-tests.json     |  20 ++-
 5 files changed, 217 insertions(+), 31 deletions(-)
 create mode 100644 .github/workflows/nightly_benchmarks.yaml

diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml
new file mode 100644
index 0000000..fc2dc2c
--- /dev/null
+++ b/.github/workflows/nightly_benchmarks.yaml
@@ -0,0 +1,160 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+name: 'run benchmarks main'
+
+on:
+  schedule:
+    - cron: '00 16 * * *'
+  workflow_dispatch:
+
+  # after merge, secrets will be available
+  # pull_request:
+  #   branches:
+  #     - 'main'
+  #     - '*-dev'
+  #   paths:
+  #     - '.github/workflows/nightly_benchmarks.yaml'
+
+
+# Bash shells do not read ~/.profile or ~/.bashrc, so any step that needs the
+# ascend-toolkit environment variables must be declared with
+# "shell: bash -el {0}" to activate them.
+defaults:
+  run:
+    shell: bash -el {0}
+
+jobs:
+  test:
+    name: run benchmarks main
+    runs-on: 'linux-arm64-npu-static-8'
+    strategy:
+      matrix:
+        include:
+          - vllm_branch: v0.9.0
+            vllm_ascend_branch: main
+    container:
+      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+      volumes:
+        - /usr/local/dcmi:/usr/local/dcmi
+        - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
+        - /usr/local/Ascend/driver/:/usr/local/Ascend/driver/
+        # Use the self-hosted cache to speed up pip and model downloads
+        - /home/action/.cache:/github/home/.cache/
+      options: >-
+        --device /dev/davinci0
+        --device /dev/davinci1
+        --device /dev/davinci_manager
+        --device /dev/devmm_svm
+        --device /dev/hisi_hdc
+    env:
+      HF_ENDPOINT: https://hf-mirror.com
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
+      ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
+    steps:
+      - name: Check npu and CANN info
+        run: |
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+      - name: Config mirrors
+        run: |
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+      - name: Install system dependencies
+        run: |
+          apt-get update -y
+          apt-get -y install git jq wget curl lsof gcc g++ cmake libnuma-dev
+
+      - name: Config git
+        run: |
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ matrix.vllm_ascend_branch }}
+
+      - name: Checkout vllm-project/vllm repo
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: ./vllm-empty
+          ref: ${{ matrix.vllm_branch }}
+
+      - name: Install vllm-project/vllm from source
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_TARGET_DEVICE=empty pip install -e .
+
+      - name: Install vllm-project/vllm-ascend
+        run: |
+          pip install -e .
+          pip install -r benchmarks/requirements-bench.txt
+
+      - name: Checkout cosdt/elastic-tool
+        uses: actions/checkout@v4
+        with:
+          repository: cosdt/elastic-tool
+          path: ./elastic_tool
+          ref: 0.1.0-dev
+
+      - name: Install elastic_tool
+        working-directory: ./elastic_tool
+        run: |
+          pip install -e .
+
+      - name: Collect pr info from vllm-project/vllm-ascend
+        run: |
+          # Only keep the commits that may influence performance
+          git log --pretty=format:"%H %s" -- '**/*.py' ':!docs/*' ':!tests/*' ':!examples/*' > commit_log.txt
+          escli check commit_log.txt
+
+      - name: Run benchmark iteration
+        run: |
+          while IFS= read -r line || [[ -n "$line" ]]; do
+            commit_id=${line%% *}
+            commit_title=${line#* }
+            commit_time=$(git show -s --format=%cd "$commit_id" --date=iso-strict)
+            commit_time_no_tz=${commit_time::19}
+
+            git checkout "$commit_id"
+            pip install -e .
+
+            echo "------------------------"
+            echo "commit_id: $commit_id"
+            echo "commit_title: $commit_title"
+            echo "commit_time: $commit_time_no_tz"
+            echo "vllm branch: ${{ matrix.vllm_branch }}"
+            echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}"
+            echo "------------------------"
+
+            bash benchmarks/scripts/run-performance-benchmarks.sh
+            # send the result to es
+            if [[ "${{ github.event_name }}" != "pull_request" ]]; then
+              escli add --vllm_branch ${{ matrix.vllm_branch }} \
+                --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
+                --commit_id $commit_id \
+                --commit_title "$commit_title" \
+                --created_at "$commit_time_no_tz" \
+                --res_dir ./benchmarks/results
+              rm -rf ./benchmarks/results
+            fi
+          done < commit_log.txt
diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh
index b0e3121..9e7c85c 100644
--- a/benchmarks/scripts/run-performance-benchmarks.sh
+++ b/benchmarks/scripts/run-performance-benchmarks.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 
+set -e
 check_npus() {
   # shellcheck disable=SC2155
@@ -48,7 +49,7 @@ wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
   timeout 1200 bash -c '
-    until curl -X POST localhost:8000/v1/completions; do
+    until curl -s -X POST localhost:8000/v1/completions || curl -s -X POST localhost:8000/v1/chat/completions; do
       sleep 1
     done' && return 0 || return 1
 }
@@ -67,6 +68,16 @@ kill_npu_processes() {
 
 }
 
+update_json_field() {
+  local json_file="$1"
+  local field_name="$2"
+  local field_value="$3"
+
+  jq --arg value "$field_value" \
+    --arg key "$field_name" \
+    '.[$key] = $value' "$json_file" > "${json_file}.tmp" && \
+    mv "${json_file}.tmp" "$json_file"
+}
 
 run_latency_tests() {
   # run latency tests using `benchmark_latency.py`
@@ -103,7 +114,9 @@ run_latency_tests() {
 
     # run the benchmark
     eval "$latency_command"
-
+    # record model_name in the result file
+    model_name=$(echo "$latency_params" | jq -r '.model')
+    update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
     kill_npu_processes
 
   done
@@ -144,7 +157,9 @@ run_throughput_tests() {
 
     # run the benchmark
     eval "$throughput_command"
-
+    # record model_name in the result file
+    model_name=$(echo "$throughput_params" | jq -r '.model')
+    update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
     kill_npu_processes
 
   done
@@ -242,8 +257,13 @@ cleanup() {
   rm -rf ./vllm_benchmarks
 }
 
+cleanup_on_error() {
+  echo "An error occurred. Cleaning up results folder..."
+  rm -rf $RESULTS_FOLDER
+}
+
 get_benchmarks_scripts() {
-  git clone -b main --depth=1 git@github.com:vllm-project/vllm.git && \
+  git clone -b main --depth=1 https://github.com/vllm-project/vllm.git && \
   mv vllm/benchmarks vllm_benchmarks
   rm -rf ./vllm
 }
@@ -263,9 +283,8 @@ main() {
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn off the reporting of the status of each request, to clean up the terminal output
   export VLLM_LOG_LEVEL="WARNING"
-
+
   # set env
-  export VLLM_USE_MODELSCOPE="True"
   export HF_ENDPOINT="https://hf-mirror.com"
 
   # prepare for benchmarking
@@ -278,6 +297,7 @@ main() {
   declare -g RESULTS_FOLDER=results
   mkdir -p $RESULTS_FOLDER
 
+  trap cleanup_on_error ERR
   ensure_sharegpt_downloaded
   # benchmarks
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json
index dccfb5d..d7d8674 100644
--- a/benchmarks/tests/latency-tests.json
+++ b/benchmarks/tests/latency-tests.json
@@ -1,20 +1,21 @@
 [
     {
-        "test_name": "latency_llama8B_tp1",
+        "test_name": "latency_qwen2_5vl_7B_tp1",
         "parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
             "tensor_parallel_size": 1,
-            "load_format": "dummy",
+            "max_model_len": 16384,
             "num_iters_warmup": 5,
             "num_iters": 15
         }
     },
     {
-        "test_name": "latency_qwen2_5_7B_tp1",
+        "test_name": "latency_qwen3_8B_tp1",
         "parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
+            "max_model_len": 16384,
             "num_iters_warmup": 5,
             "num_iters": 15
         }
     }
diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
index 109e597..bf28799 100644
--- a/benchmarks/tests/serving-tests.json
+++ b/benchmarks/tests/serving-tests.json
@@ -1,6 +1,6 @@
 [
     {
-        "test_name": "serving_llama8B_tp1",
+        "test_name": "serving_qwen2_5vl_7B_tp1",
         "qps_list": [
             1,
             4,
@@ -8,23 +8,26 @@
             "inf"
         ],
         "server_parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
             "disable_log_requests": "",
-            "load_format": "dummy"
+            "trust_remote_code": "",
+            "max_model_len": 16384
         },
         "client_parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+            "backend": "openai-chat",
+            "dataset_name": "hf",
+            "hf_split": "train",
+            "endpoint": "/v1/chat/completions",
+            "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
             "num_prompts": 200
         }
     },
     {
-        "test_name": "serving_qwen2_5_7B_tp1",
+        "test_name": "serving_qwen3_8B_tp1",
         "qps_list": [
             1,
             4,
@@ -32,7 +35,7 @@
             "inf"
         ],
         "server_parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
@@ -40,10 +43,10 @@
             "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "backend": "vllm",
             "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_path": "/root/.cache/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
     }
 ]
diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json
index 57ecd52..58b0296 100644
--- a/benchmarks/tests/throughput-tests.json
+++ b/benchmarks/tests/throughput-tests.json
@@ -1,24 +1,26 @@
 [
     {
-        "test_name": "throughput_llama8B_tp1",
+        "test_name": "throughput_qwen3_8B_tp1",
         "parameters": {
-            "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
+            "model": "Qwen/Qwen3-8B",
             "tensor_parallel_size": 1,
             "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_path": "/root/.cache/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "backend": "vllm"
         }
     },
     {
-        "test_name": "throughput_qwen2_5_7B_tp1",
+        "test_name": "throughput_qwen2_5vl_7B_tp1",
         "parameters": {
-            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",
             "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
+            "backend": "vllm-chat",
+            "dataset_name": "hf",
+            "hf_split": "train",
+            "max_model_len": 16384,
+            "dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
+            "num_prompts": 200
        }
    }
 ]
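
For reference, the `update_json_field` helper added in `run-performance-benchmarks.sh` rewrites one key of a result file by writing through a temp file, since jq cannot edit a file in place. A minimal standalone sketch of the same pattern (the file path and field values below are hypothetical, not taken from the patch):

```bash
#!/bin/bash
# Sketch of the jq write-through-temp-file pattern used by update_json_field.
# /tmp/example.json and the values are made up for illustration.
json_file=/tmp/example.json
echo '{"avg_latency": 1.23}' > "$json_file"

# Add (or overwrite) a string field, then atomically replace the original file.
jq --arg value "Qwen/Qwen3-8B" \
   --arg key "model_name" \
   '.[$key] = $value' "$json_file" > "${json_file}.tmp" && \
  mv "${json_file}.tmp" "$json_file"

cat "$json_file"   # now contains both avg_latency and model_name
```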