[1/N][CI] Move linting system to pre-commit hooks (#1256)

### What this PR does / why we need it?

Follow the vllm-project/vllm linting approach: https://github.com/vllm-project/vllm/blob/main/.pre-commit-config.yaml

Enable pre-commit to catch low-level errors as much as possible. This PR is one step of #1241, whose purpose is to make the linting system clearer and more convenient. This step mainly enables the following hooks: yapf, actionlint, ruff, typos, isort, mypy, png-lint, signoff-commit, enforce-import-regex-instead-of-re.

TODO:
- clang-format (checks csrc against Google style): the code needs to be cleaned up first; disabled for now
- pymarkdown: the code needs to be cleaned up first; disabled for now
- shellcheck: the code needs to be cleaned up first; disabled for now

### Does this PR introduce _any_ user-facing change?

Only a developer UX change: https://vllm-ascend--1256.org.readthedocs.build/en/1256/developer_guide/contributing.html#run-lint-locally

```
pip install -r requirements-lint.txt && pre-commit install
bash format.sh
```

### How was this patch tested?

CI passed with newly added and existing tests.

Co-authored-by: Yikun [yikunkero@gmail.com](mailto:yikunkero@gmail.com)
Co-authored-by: wangli [wangli858794774@gmail.com](mailto:wangli858794774@gmail.com)

- vLLM version: v0.9.1
- vLLM main: 5358cce5ff

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-10 14:17:15 +08:00
parent 643e6f5486
commit c7446438a9
28 changed files with 753 additions and 667 deletions
--- a/benchmarks/scripts/convert_json_to_markdown.py
+++ b/benchmarks/scripts/convert_json_to_markdown.py
@@ -49,36 +49,43 @@ def read_markdown(file):


 def results_to_json(latency, throughput, serving):
-    return json.dumps({
-        'latency': latency.to_dict(),
-        'throughput': throughput.to_dict(),
-        'serving': serving.to_dict()
-    })
+    return json.dumps(
+        {
+            "latency": latency.to_dict(),
+            "throughput": throughput.to_dict(),
+            "serving": serving.to_dict(),
+        }
+    )


 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Process the results of the benchmark tests.")
+        description="Process the results of the benchmark tests."
+    )
    parser.add_argument(
        "--results_folder",
        type=str,
        default="../results/",
-        help="The folder where the benchmark results are stored.")
+        help="The folder where the benchmark results are stored.",
+    )
    parser.add_argument(
        "--output_folder",
        type=str,
        default="../results/",
-        help="The folder where the benchmark results are stored.")
-    parser.add_argument("--markdown_template",
-                        type=str,
-                        default="./perf_result_template.md",
-                        help="The template file for the markdown report.")
-    parser.add_argument("--tag",
-                        default="main",
-                        help="Tag to be used for release message.")
-    parser.add_argument("--commit_id",
-                        default="",
-                        help="Commit ID to be used for release message.")
+        help="The folder where the benchmark results are stored.",
+    )
+    parser.add_argument(
+        "--markdown_template",
+        type=str,
+        default="./perf_result_template.md",
+        help="The template file for the markdown report.",
+    )
+    parser.add_argument(
+        "--tag", default="main", help="Tag to be used for release message."
+    )
+    parser.add_argument(
+        "--commit_id", default="", help="Commit ID to be used for release message."
+    )

    args = parser.parse_args()
    results_folder = (CUR_PATH / args.results_folder).resolve()
@@ -87,7 +94,6 @@ if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):
-
        with open(test_file) as f:
            raw_result = json.loads(f.read())

@@ -111,7 +117,8 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
@@ -129,55 +136,53 @@ if __name__ == "__main__":
            continue

        print(f"Skipping {test_file}")
-    serving_results.sort(key=lambda x: (len(x['test_name']), x['test_name']))
+    serving_results.sort(key=lambda x: (len(x["test_name"]), x["test_name"]))

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
+    raw_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

    # remapping the key, for visualization purpose
    if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
+        )
    if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
    if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[
+            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)

-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )

    # get markdown tables
-    latency_md_table = tabulate(latency_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    throughput_md_table = tabulate(throughput_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
+    latency_md_table = tabulate(
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    serving_md_table = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    throughput_md_table = tabulate(
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+    )

    # document the result
    print(output_folder)
    with open(output_folder / "benchmark_results.md", "w") as f:
-
        results = read_markdown(markdown_template)
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
+            benchmarking_results_in_json_string=processed_results_json,
+        )
        f.write(results)
--- a/benchmarks/scripts/patch_benchmark_dataset.py
+++ b/benchmarks/scripts/patch_benchmark_dataset.py
@@ -7,9 +7,8 @@ import libcst.matchers as m
 # Patch the benchmark_dataset.py file to set streaming=False in load_dataset calls


-# TDOO(Potabk): Remove this patch when the issue is fixed in the upstream
+# TODO(Potabk): Remove this patch when the issue is fixed in the upstream
 class StreamingFalseTransformer(cst.CSTTransformer):
-
    def __init__(self):
        self.in_target_class = False
        self.in_target_func = False
@@ -63,15 +62,18 @@ def patch_file(path):
    print(f"Patched: {abs_path}")


-if __name__ == '__main__':
+if __name__ == "__main__":
    parser = ArgumentParser(
-        description=
-        "Patch benchmark_dataset.py to set streaming=False in load_dataset calls"
+        description="Patch benchmark_dataset.py to set streaming=False in load_dataset calls"
+    )
+    parser.add_argument(
+        "--path", type=str, help="Path to the benchmark_dataset.py file"
    )
    parser.add_argument(
        "--path",
        type=str,
        default="/vllm-workspace/vllm/vllm/benchmarks/datasets.py",
-        help="Path to the benchmark_dataset.py file")
+        help="Path to the benchmark_dataset.py file",
+    )
    args = parser.parse_args()
    patch_file(args.path)
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -44,82 +44,72 @@ BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
 MODEL_TYPE = {
    "Qwen/Qwen3-8B-Base": "vllm",
    "Qwen/Qwen3-30B-A3B": "vllm",
-    "Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm"
+    "Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm",
 }

 # Command templates for running evaluations
 MODEL_RUN_INFO = {
-    "Qwen/Qwen3-30B-A3B":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
-     "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
-     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
-     ),
-    "Qwen/Qwen3-8B-Base":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
-     "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
-     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
-     ),
-    "Qwen/Qwen2.5-VL-7B-Instruct":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
-     "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
-     "--apply_chat_template --fewshot_as_multiturn  --batch_size 1"),
+    "Qwen/Qwen3-30B-A3B": (
+        "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
+        "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
+        "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
+    ),
+    "Qwen/Qwen3-8B-Base": (
+        "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
+        "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
+        "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
+    ),
+    "Qwen/Qwen2.5-VL-7B-Instruct": (
+        "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
+        "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
+        "--apply_chat_template --fewshot_as_multiturn  --batch_size 1"
+    ),
 }

 # Evaluation metric filters per task
 FILTER = {
    "gsm8k": "exact_match,flexible-extract",
    "ceval-valid": "acc,none",
-    "mmmu_val": "acc,none"
+    "mmmu_val": "acc,none",
 }

 # Expected accuracy values for models
 EXPECTED_VALUE = {
-    "Qwen/Qwen3-30B-A3B": {
-        "ceval-valid": 0.83,
-        "gsm8k": 0.85
-    },
-    "Qwen/Qwen3-8B-Base": {
-        "ceval-valid": 0.82,
-        "gsm8k": 0.83
-    },
-    "Qwen/Qwen2.5-VL-7B-Instruct": {
-        "mmmu_val": 0.51
-    }
+    "Qwen/Qwen3-30B-A3B": {"ceval-valid": 0.83, "gsm8k": 0.85},
+    "Qwen/Qwen3-8B-Base": {"ceval-valid": 0.82, "gsm8k": 0.83},
+    "Qwen/Qwen2.5-VL-7B-Instruct": {"mmmu_val": 0.51},
 }
 PARALLEL_MODE = {
    "Qwen/Qwen3-8B-Base": "TP",
    "Qwen/Qwen2.5-VL-7B-Instruct": "TP",
-    "Qwen/Qwen3-30B-A3B": "EP"
+    "Qwen/Qwen3-30B-A3B": "EP",
 }

 # Execution backend configuration
 EXECUTION_MODE = {
    "Qwen/Qwen3-8B-Base": "ACLGraph",
    "Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph",
-    "Qwen/Qwen3-30B-A3B": "ACLGraph"
+    "Qwen/Qwen3-30B-A3B": "ACLGraph",
 }

 # Model arguments for evaluation
 MODEL_ARGS = {
-    "Qwen/Qwen3-8B-Base":
-    "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6",
-    "Qwen/Qwen2.5-VL-7B-Instruct":
-    "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2",
-    "Qwen/Qwen3-30B-A3B":
-    "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True"
+    "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6",
+    "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2",
+    "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True",
 }

 # Whether to apply chat template formatting
 APPLY_CHAT_TEMPLATE = {
    "Qwen/Qwen3-8B-Base": True,
    "Qwen/Qwen2.5-VL-7B-Instruct": True,
-    "Qwen/Qwen3-30B-A3B": False
+    "Qwen/Qwen3-30B-A3B": False,
 }
 # Few-shot examples handling as multi-turn dialogues.
 FEWSHOT_AS_MULTITURN = {
    "Qwen/Qwen3-8B-Base": True,
    "Qwen/Qwen2.5-VL-7B-Instruct": True,
-    "Qwen/Qwen3-30B-A3B": False
+    "Qwen/Qwen3-30B-A3B": False,
 }

 # Relative tolerance for accuracy checks
@@ -136,7 +126,7 @@ def run_accuracy_test(queue, model, dataset):
            "tasks": dataset,
            "apply_chat_template": APPLY_CHAT_TEMPLATE[model],
            "fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model],
-            "batch_size": BATCH_SIZE[dataset]
+            "batch_size": BATCH_SIZE[dataset],
        }

        if MODEL_TYPE[model] == "vllm":
@@ -151,7 +141,7 @@ def run_accuracy_test(queue, model, dataset):
        queue.put(e)
        sys.exit(1)
    finally:
-        if 'results' in locals():
+        if "results" in locals():
            del results
        gc.collect()
        torch.npu.empty_cache()
@@ -161,16 +151,15 @@ def run_accuracy_test(queue, model, dataset):
 def generate_md(model_name, tasks_list, args, datasets):
    """Generate Markdown report with evaluation results"""
    # Format the run command
-    run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name,
-                                                datasets=datasets)
+    run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets)
    model = model_name.split("/")[1]

    # Version information section
    version_info = (
        f"**vLLM Version**: vLLM: {args.vllm_version} "
-        f"([{args.vllm_commit}]({VLLM_URL+args.vllm_commit})), "
+        f"([{args.vllm_commit}]({VLLM_URL + args.vllm_commit})), "
        f"vLLM Ascend: {args.vllm_ascend_version} "
-        f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL+args.vllm_ascend_commit}))  "
+        f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL + args.vllm_ascend_commit}))  "
    )

    # Report header with system info
@@ -218,21 +207,39 @@ def generate_md(model_name, tasks_list, args, datasets):
            else:
                n_shot = "0"
            flag = ACCURACY_FLAG.get(task_name, "")
-            row = (f"| {task_name:<37} "
-                   f"| {flt:<6} "
-                   f"| {n_shot:6} "
-                   f"| {metric:<6} "
-                   f"| {flag}{value:>5.4f} "
-                   f"| ± {stderr:>5.4f} |")
+            row = (
+                f"| {task_name:<37} "
+                f"| {flt:<6} "
+                f"| {n_shot:6} "
+                f"| {metric:<6} "
+                f"| {flag}{value:>5.4f} "
+                f"| ± {stderr:>5.4f} |"
+            )
            if not task_name.startswith("-"):
                rows.append(row)
-                rows_sub.append("<details>" + "\n" + "<summary>" + task_name +
-                                " details" + "</summary>" + "\n" * 2 + header)
+                rows_sub.append(
+                    "<details>"
+                    + "\n"
+                    + "<summary>"
+                    + task_name
+                    + " details"
+                    + "</summary>"
+                    + "\n" * 2
+                    + header
+                )
            rows_sub.append(row)
        rows_sub.append("</details>")
    # Combine all Markdown sections
-    md = preamble + "\n" + header + "\n" + "\n".join(rows) + "\n" + "\n".join(
-        rows_sub) + "\n"
+    md = (
+        preamble
+        + "\n"
+        + header
+        + "\n"
+        + "\n".join(rows)
+        + "\n"
+        + "\n".join(rows_sub)
+        + "\n"
+    )
    print(md)
    return md

@@ -262,8 +269,9 @@ def main(args):
    # Evaluate model on each dataset
    for dataset in datasets:
        accuracy_expected = EXPECTED_VALUE[args.model][dataset]
-        p = multiprocessing.Process(target=run_accuracy_test,
-                                    args=(result_queue, args.model, dataset))
+        p = multiprocessing.Process(
+            target=run_accuracy_test, args=(result_queue, args.model, dataset)
+        )
        p.start()
        p.join()
        if p.is_alive():
@@ -274,8 +282,11 @@ def main(args):
        time.sleep(10)
        result = result_queue.get()
        print(result)
-        if accuracy_expected - RTOL < result[dataset][
-                FILTER[dataset]] < accuracy_expected + RTOL:
+        if (
+            accuracy_expected - RTOL
+            < result[dataset][FILTER[dataset]]
+            < accuracy_expected + RTOL
+        ):
            ACCURACY_FLAG[dataset] = "✅"
        else:
            ACCURACY_FLAG[dataset] = "❌"
@@ -285,10 +296,11 @@ def main(args):


 if __name__ == "__main__":
-    multiprocessing.set_start_method('spawn', force=True)
+    multiprocessing.set_start_method("spawn", force=True)
    # Initialize argument parser
    parser = argparse.ArgumentParser(
-        description="Run model accuracy evaluation and generate report")
+        description="Run model accuracy evaluation and generate report"
+    )
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--vllm_ascend_version", type=str, required=False)