From 76dacf3fa0ee5ccafc55c1106bd8f32076ba1959 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 3 Jun 2025 23:38:34 +0800 Subject: [PATCH] [CI][Benchmark] Optimize performance benchmark workflow (#1039) ### What this PR does / why we need it? This is a post patch of #1014, for some convenience optimization - Set cached dataset path for speed - Use pypi to install escli-tool - Add benchmark results convert script to have a developer-friendly result - Patch the `benchmark_dataset.py` to disable streaming load for internet - Add more trigger ways for different purpose, `pr` for debug, `schedule` for daily test, `dispatch` and `pr-labled` for manual testing of a single(current) commit - Disable latency test for `qwen-2.5-vl`, (This script does not support multi-modal yet) ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed --------- Signed-off-by: wangli --- .github/workflows/nightly_benchmarks.yaml | 60 ++++-- benchmarks/requirements-bench.txt | 4 +- .../scripts/convert_json_to_markdown.py | 183 ++++++++++++++++++ benchmarks/scripts/patch_benchmark_dataset.py | 68 +++++++ benchmarks/scripts/perf_result_template.md | 31 +++ .../scripts/run-performance-benchmarks.sh | 18 +- benchmarks/tests/latency-tests.json | 10 - benchmarks/tests/serving-tests.json | 2 +- benchmarks/tests/throughput-tests.json | 2 +- 9 files changed, 340 insertions(+), 38 deletions(-) create mode 100644 benchmarks/scripts/convert_json_to_markdown.py create mode 100644 benchmarks/scripts/patch_benchmark_dataset.py create mode 100644 benchmarks/scripts/perf_result_template.md diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index fc2dc2c..fb82c88 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -15,21 +15,17 @@ # limitations under the License. # -name: 'run benchmarks main' +name: 'Benchmarks / Performance' +# This workflow runs nightly benchmarks for vllm-ascend. on: schedule: + # Run at 24:00 everyday - cron: '00 16 * * *' workflow_dispatch: - - # after merged, secrets will be available - # pull_request: - # branches: - # - 'main' - # - '*-dev' - # paths: - # - '.github/workflows/nightly_benchmarks.yaml' + pull_request: + types: [ labeled ] # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. @@ -38,9 +34,15 @@ defaults: run: shell: bash -el {0} +concurrency: + group: pr-${{ github.event.pull_request.number }} + cancel-in-progress: true + jobs: test: - name: run benchmarks main + if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} + + name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} runs-on: 'linux-arm64-npu-static-8' strategy: matrix: @@ -85,13 +87,10 @@ jobs: run: | git config --global --add safe.directory "$GITHUB_WORKSPACE" git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - - name: Checkout vllm-project/vllm-ascend repo uses: actions/checkout@v4 - with: - ref: ${{ matrix.vllm_ascend_branch }} - + - name: Checkout vllm-project/vllm repo uses: actions/checkout@v4 with: @@ -109,25 +108,44 @@ jobs: pip install -e . 
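# Install the extra benchmark-only dependencies (pandas, datasets, modelscope, libcst, tabulate)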
pip install -r benchmarks/requirements-bench.txt - - name: Checkout cosdt/elastic-tool - uses: actions/checkout@v4 + - name: Run current commit benchmarks + if: github.event_name != 'schedule' + run: | + # Sometimes we only want to run benchmarks on the current commit + # This is useful for debugging or a release benchmark + bash benchmarks/scripts/run-performance-benchmarks.sh + # Convert the benchmark results to markdown format + python3 benchmarks/scripts/convert_json_to_markdown.py + + - name: Generate step summary + if: github.event_name != 'schedule' + run: | + cat ./benchmarks/results/benchmark_results.md >> $GITHUB_STEP_SUMMARY + + - name: Upload benchmark artifacts + if: github.event_name != 'schedule' + uses: actions/upload-artifact@v4 with: - repository: cosdt/elastic-tool - path: ./elastic_tool - ref: 0.1.0-dev + name: "benchmark-performance-${{ matrix.vllm_branch }}-${{ matrix.vllm_ascend_branch }}-report" + path: ./benchmarks/results/benchmark_results.md + if-no-files-found: warn + retention-days: 90 + overwrite: true - name: Install elastic_tool - working-directory: ./elastic_tool + if: github.event_name == 'schedule' run: | - pip install -e . + pip install escli-tool==0.2.0 - name: Collect pr info from vllm-project/vllm-ascend + if: github.event_name == 'schedule' run: | # Only get the pull request which may influences performance git log --pretty=format:"%H %s" -- '**/*.py' ':!docs/*' ':!tests/*' ':!examples/*' > commit_log.txt escli check commit_log.txt - name: Run benchmark iteration + if: github.event_name == 'schedule' run: | while IFS= read -r line || [[ -n "$line" ]]; do commit_id=${line%% *} diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt index b3f3c06..54c28c8 100644 --- a/benchmarks/requirements-bench.txt +++ b/benchmarks/requirements-bench.txt @@ -1,3 +1,5 @@ pandas datasets -modelscope \ No newline at end of file +modelscope +libcst +tabulate \ No newline at end of file diff --git a/benchmarks/scripts/convert_json_to_markdown.py b/benchmarks/scripts/convert_json_to_markdown.py new file mode 100644 index 0000000..7a1c5d9 --- /dev/null +++ b/benchmarks/scripts/convert_json_to_markdown.py @@ -0,0 +1,183 @@ +import argparse +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +CUR_PATH = Path(__file__).parent.resolve() +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "avg_latency": "Mean latency (ms)", + "P50": "Median latency (ms)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "num_requests": "Num of reqs", + "total_num_tokens": "Total num of tokens", + "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "request_rate": "Request rate (req/s)", + "request_throughput": "Tput (req/s)", + "output_throughput": "Output Tput (tok/s)", + "median_ttft_ms": "TTFT (ms)", + "median_tpot_ms": "TPOT (ms)", + "median_itl_ms": "ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file) as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, 
serving): + return json.dumps({ + 'latency': latency.to_dict(), + 'throughput': throughput.to_dict(), + 'serving': serving.to_dict() + }) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Process the results of the benchmark tests.") + parser.add_argument( + "--results_folder", + type=str, + default="../results/", + help="The folder where the benchmark results are stored.") + parser.add_argument( + "--output_folder", + type=str, + default="../results/", + help="The folder where the benchmark results are stored.") + parser.add_argument("--markdown_template", + type=str, + default="./perf_result_template.md", + help="The template file for the markdown report.") + parser.add_argument("--tag", + default="main", + help="Tag to be used for release message.") + parser.add_argument("--commit_id", + default="", + help="Commit ID to be used for release message.") + + args = parser.parse_args() + results_folder = (CUR_PATH / args.results_folder).resolve() + output_folder = (CUR_PATH / args.output_folder).resolve() + markdown_template = (CUR_PATH / args.markdown_template).resolve() + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file) as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + serving_results.sort(key=lambda x: (len(x['test_name']), x['test_name'])) + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json(latency_results, throughput_results, + serving_results) + + # remapping the key, for visualization purpose + if not latency_results.empty: + latency_results = latency_results[list( + latency_column_mapping.keys())].rename( + columns=latency_column_mapping) + if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + + processed_results_json = results_to_json(latency_results, + throughput_results, + serving_results) + + # get markdown tables + latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) + serving_md_table = tabulate(serving_results, + 
headers='keys', + tablefmt='pipe', + showindex=False) + throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + + # document the result + print(output_folder) + with open(output_folder / "benchmark_results.md", "w") as f: + + results = read_markdown(markdown_template) + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json) + f.write(results) diff --git a/benchmarks/scripts/patch_benchmark_dataset.py b/benchmarks/scripts/patch_benchmark_dataset.py new file mode 100644 index 0000000..d114a65 --- /dev/null +++ b/benchmarks/scripts/patch_benchmark_dataset.py @@ -0,0 +1,68 @@ +from argparse import ArgumentParser + +import libcst as cst +import libcst.matchers as m + +# Patch the benchmark_dataset.py file to set streaming=False in load_dataset calls + + +# TDOO(Potabk): Remove this patch when the issue is fixed in the upstream +class StreamingFalseTransformer(cst.CSTTransformer): + + def __init__(self): + self.in_target_class = False + self.in_target_func = False + + def visit_ClassDef(self, node): + if node.name.value == "HuggingFaceDataset": + self.in_target_class = True + + def leave_ClassDef(self, original_node, updated_node): + self.in_target_class = False + return updated_node + + def visit_FunctionDef(self, node): + if self.in_target_class and node.name.value == "load_data": + self.in_target_func = True + + def leave_FunctionDef(self, original_node, updated_node): + self.in_target_func = False + return updated_node + + def leave_Call(self, original_node, updated_node): + if self.in_target_class and self.in_target_func: + if m.matches(updated_node.func, m.Name("load_dataset")): + new_args = [] + for arg in updated_node.args: + if arg.keyword and arg.keyword.value == "streaming": + new_arg = arg.with_changes(value=cst.Name("False")) + new_args.append(new_arg) + else: + new_args.append(arg) + return updated_node.with_changes(args=new_args) + return updated_node + + +def patch_file(path): + with open(path, "r", encoding="utf-8") as f: + source = f.read() + + module = cst.parse_module(source) + modified = module.visit(StreamingFalseTransformer()) + + with open(path, "w", encoding="utf-8") as f: + f.write(modified.code) + + print(f"Patched: {path}") + + +if __name__ == '__main__': + parser = ArgumentParser( + description= + "Patch benchmark_dataset.py to set streaming=False in load_dataset calls" + ) + parser.add_argument("--path", + type=str, + help="Path to the benchmark_dataset.py file") + args = parser.parse_args() + patch_file(args.path) diff --git a/benchmarks/scripts/perf_result_template.md b/benchmarks/scripts/perf_result_template.md new file mode 100644 index 0000000..2bf857a --- /dev/null +++ b/benchmarks/scripts/perf_result_template.md @@ -0,0 +1,31 @@ +## Online serving tests + +- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- **Average QPS (query per second)**: 1, 4, 16 and inf. 
QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct +- Evaluation metrics: throughput, TTFT (median time to the first token ), ITL (median inter-token latency) TPOT(median time per output token). + +{serving_tests_markdown_table} + +## Offline tests +### Latency tests + +- Input length: 32 tokens. +- Output length: 128 tokens. +- Batch size: fixed (8). +- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct +- Evaluation metrics: end-to-end latency. + +{latency_tests_markdown_table} + +### Throughput tests + +- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm to achieve maximum throughput. +- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct +- Evaluation metrics: throughput. + +{throughput_tests_markdown_table} \ No newline at end of file diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh index 9e7c85c..8997fc4 100644 --- a/benchmarks/scripts/run-performance-benchmarks.sh +++ b/benchmarks/scripts/run-performance-benchmarks.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -e check_npus() { # shellcheck disable=SC2155 @@ -19,10 +18,19 @@ check_npus() { } ensure_sharegpt_downloaded() { - local FILE=ShareGPT_V3_unfiltered_cleaned_split.json + local FILE="/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json" + local DIR + DIR=$(dirname "$FILE") + if [ ! -f "$FILE" ]; then echo "$FILE not found, downloading from hf-mirror ..." - wget https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE + mkdir -p "$DIR" + wget -O "$FILE" https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + if [ $? -ne 0 ]; then + echo "Download failed!" >&2 + return 1 + fi + echo "Download completed and saved to $FILE" else echo "$FILE already exists." fi @@ -49,7 +57,8 @@ wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes timeout 1200 bash -c ' - until curl -s -X POST localhost:8000/v1/completions || curl -s -X POST localhost:8000/v1/chat/completions; do + until curl -s -X GET localhost:8000/health; do + echo "Waiting for vllm server to start..." 
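# NOTE: GET /health replaces the old POST probes to /v1/completions and
# /v1/chat/completions; it returns HTTP 200 once the server is up, so one probe per second is enough.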
sleep 1 done' && return 0 || return 1 } @@ -290,6 +299,7 @@ main() { # prepare for benchmarking cd benchmarks || exit 1 get_benchmarks_scripts + python3 scripts/patch_benchmark_dataset.py --path vllm_benchmarks/benchmark_dataset.py trap cleanup EXIT QUICK_BENCHMARK_ROOT=./ diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index d7d8674..576ced2 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -1,14 +1,4 @@ [ - { - "test_name": "latency_qwen2_5vl_7B_tp1", - "parameters": { - "model": "Qwen/Qwen2.5-VL-7B-Instruct", - "tensor_parallel_size": 1, - "max_model_len": 16384, - "num_iters_warmup": 5, - "num_iters": 15 - } - }, { "test_name": "latency_qwen3_8B_tp1", "parameters": { diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index bf28799..d8ad2be 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -46,7 +46,7 @@ "model": "Qwen/Qwen3-8B", "backend": "vllm", "dataset_name": "sharegpt", - "dataset_path": "/root/.cache/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } } diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 58b0296..551d238 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -5,7 +5,7 @@ "model": "Qwen/Qwen3-8B", "tensor_parallel_size": 1, "load_format": "dummy", - "dataset_path": "/root/.cache/datasets/sharegpt/ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm" }
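A note on the test definitions above: every entry in `benchmarks/tests/*-tests.json` is a flat `test_name` plus `parameters` record, so a runner can expand the `parameters` object directly into CLI flags for the matching benchmark script. The sketch below only illustrates that idea; the `json2args` helper, the `vllm_benchmarks/benchmark_throughput.py` location and the results path are assumptions made for this example, not code from this PR.

```bash
#!/bin/bash
# Illustrative sketch only: expand one throughput-tests.json entry into a command line.
# json2args, the script location and the output path are assumed, not part of this PR.
json2args() {
  # {"model": "Qwen/Qwen3-8B", "num_prompts": 200} -> "--model Qwen/Qwen3-8B --num-prompts 200"
  jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
}

jq -c '.[]' benchmarks/tests/throughput-tests.json | while read -r test; do
  name=$(echo "$test" | jq -r '.test_name')
  args=$(echo "$test" | jq '.parameters' | json2args)
  echo "Running $name with: $args"
  # shellcheck disable=SC2086
  python3 vllm_benchmarks/benchmark_throughput.py $args \
    --output-json "./results/${name}.json"
done
```

Upstream vLLM's benchmark CI uses a similar jq-based expansion, which is why the JSON keys in these files mirror the benchmark scripts' flag names (underscores map to dashes).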