[CI][Benchmark] Optimize performance benchmark workflow (#1039)
### What this PR does / why we need it?

This is a follow-up patch to #1014 with several convenience optimizations:

- Set a cached dataset path for speed.
- Use PyPI to install escli-tool.
- Add a benchmark-results conversion script to produce a developer-friendly report.
- Patch `benchmark_dataset.py` to disable streaming load over the internet.
- Add more trigger modes for different purposes: `pr` for debugging, `schedule` for the daily test, and `dispatch` and `pr-labled` for manually testing a single (current) commit.
- Disable the latency test for `qwen-2.5-vl` (this script does not support multi-modal models yet).

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
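For context, a minimal sketch of how the new conversion script could be driven; the working directory and argument values below are assumptions for illustration, not taken from this PR's workflow definition:

```python
# Hypothetical driver for benchmarks/scripts/convert_json_to_markdown.py.
# Paths are placeholders; the script's own defaults already point at ../results/.
import subprocess

subprocess.run(
    [
        "python3", "convert_json_to_markdown.py",
        "--results_folder", "../results/",  # raw *.json benchmark results
        "--output_folder", "../results/",   # benchmark_results.md is written here
    ],
    cwd="benchmarks/scripts",  # assumption: run from the scripts directory
    check=True,
)
```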
benchmarks/scripts/convert_json_to_markdown.py (new file, 183 lines)
@@ -0,0 +1,183 @@
import argparse
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

CUR_PATH = Path(__file__).parent.resolve()
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "avg_latency": "Mean latency (ms)",
    "P50": "Median latency (ms)",
    "P99": "P99 latency (ms)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "num_requests": "Num of reqs",
    "total_num_tokens": "Total num of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "request_rate": "Request rate (req/s)",
    "request_throughput": "Tput (req/s)",
    "output_throughput": "Output Tput (tok/s)",
    "median_ttft_ms": "TTFT (ms)",
    "median_tpot_ms": "TPOT (ms)",
    "median_itl_ms": "ITL (ms)",
}
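These `*_column_mapping` dicts later drive both column selection and header renaming. A minimal sketch of that pandas idiom, with invented data:

```python
import pandas as pd

# Illustration only: the values below are made up.
df = pd.DataFrame([{"test_name": "latency_qwen", "avg_latency": 123.4,
                    "P50": 120.0, "P99": 180.0, "extra_key": "dropped"}])
mapping = {"test_name": "Test name", "avg_latency": "Mean latency (ms)",
           "P50": "Median latency (ms)", "P99": "P99 latency (ms)"}
# Select only the mapped columns, then rename them to display headers;
# any key not in the mapping (e.g. "extra_key") is dropped.
df = df[list(mapping.keys())].rename(columns=mapping)
print(df.columns.tolist())  # ['Test name', 'Mean latency (ms)', ...]
```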


def read_markdown(file):
    if os.path.exists(file):
        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })
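`DataFrame.to_dict()` nests values as column → row-index → value, so the JSON produced here is keyed by column names. A quick illustration with invented data:

```python
import json

import pandas as pd

lat = pd.DataFrame([{"test_name": "t1", "avg_latency": 100.0}])
out = json.dumps({
    'latency': lat.to_dict(),
    'throughput': pd.DataFrame().to_dict(),
    'serving': pd.DataFrame().to_dict()
})
# {"latency": {"test_name": {"0": "t1"}, "avg_latency": {"0": 100.0}},
#  "throughput": {}, "serving": {}}
print(out)
```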


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process the results of the benchmark tests.")
    parser.add_argument(
        "--results_folder",
        type=str,
        default="../results/",
        help="The folder where the benchmark results are stored.")
    parser.add_argument(
        "--output_folder",
        type=str,
        default="../results/",
        help="The folder where the markdown report will be written.")
    parser.add_argument("--markdown_template",
                        type=str,
                        default="./perf_result_template.md",
                        help="The template file for the markdown report.")
    parser.add_argument("--tag",
                        default="main",
                        help="Tag to be used for release message.")
    parser.add_argument("--commit_id",
                        default="",
                        help="Commit ID to be used for release message.")

    args = parser.parse_args()
    results_folder = (CUR_PATH / args.results_folder).resolve()
    output_folder = (CUR_PATH / args.output_folder).resolve()
    markdown_template = (CUR_PATH / args.markdown_template).resolve()

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to the serving results
            serving_results.append(raw_result)
            continue

        elif "latency" in str(test_file):
            # this result is generated via `benchmark_latency.py`

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to the latency results
            latency_results.append(raw_result)
            continue

        elif "throughput" in str(test_file):
            # this result is generated via `benchmark_throughput.py`

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to the throughput results
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")
    serving_results.sort(key=lambda x: (len(x['test_name']), x['test_name']))
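For reference, a sketch of the minimal fields the collection loop above expects in each raw result JSON. The key names follow the column mappings and the percentile handling in this script; the values and variable names are invented placeholders:

```python
# Minimal latency result the loop can consume (values are placeholders).
latency_example = {
    "avg_latency": 0.123,  # seconds; converted to ms above
    "percentiles": {"10": 0.10, "25": 0.11, "50": 0.12,
                    "75": 0.13, "90": 0.15, "99": 0.20},
}

# Minimal throughput result (keys from throughput_results_column_mapping).
throughput_example = {
    "num_requests": 200,
    "total_num_tokens": 42000,
    "elapsed_time": 60.0,
    "requests_per_second": 3.3,
    "tokens_per_second": 700.0,
}

# Minimal serving result (keys from serving_column_mapping).
serving_example = {
    "request_rate": 1.0,
    "request_throughput": 0.98,
    "output_throughput": 250.0,
    "median_ttft_ms": 80.0,
    "median_tpot_ms": 12.0,
    "median_itl_ms": 11.0,
}
```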

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remap the keys to display headers for visualization purposes
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    print(output_folder)
    with open(output_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(markdown_template)
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)
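With `tablefmt='pipe'`, tabulate renders GitHub-flavored markdown tables. A small sketch of the kind of row the latency table ends up with; the numbers are invented and the output is shown approximately:

```python
from tabulate import tabulate

# Invented row, using the display headers produced by the column remapping.
rows = [{"Test name": "latency_qwen_2_5", "Mean latency (ms)": 123.4,
         "P99 latency (ms)": 180.0}]
print(tabulate(rows, headers="keys", tablefmt="pipe", showindex=False))
# Roughly:
# | Test name        |   Mean latency (ms) |   P99 latency (ms) |
# |:-----------------|--------------------:|-------------------:|
# | latency_qwen_2_5 |               123.4 |                180 |
```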