Enable accuracy test for PR labeled with "*accuracy-test" (#1040)

### What this PR does / why we need it? This PR enables the accuracy test for PRs labeled with "*accuracy-test" and for workflow_dispatch. Only one model is run for each test type to reduce execution time. - The dense test takes about `25mins` to complete (gsm8k 7mins, ~mmlu 3h24mins,~ cEval 18mins) - The vl test takes about `40mins` to complete In the future, we might consider enabling all test jobs as a nightly scheduled job. Below are the main changes: - the dense/vl accuracy test will be triggered by labeling `accuracy-test` and `ready-for-test` - the dense accuracy test will be triggered by labeling `dense-accuracy-test` and `ready-for-test` - the vl accuracy test will be triggered by labeling `vl-accuracy-test` and `ready-for-test` - the accuracy test will also be triggered by workflow_dispatch - Support V1 and V0 for qwen and V0 for VL For PR tests we also generate a summary in the test summary. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - CI passed with accuracy-test label - Preview: https://github.com/vllm-project/vllm-ascend/actions/runs/15407628722?pr=1040 Closes: https://github.com/vllm-project/vllm-ascend/pull/953 --------- Signed-off-by: hfadzxy <starmoon_zhang@163.com> Signed-off-by: Yikun Jiang <yikunkero@gmail.com> Co-authored-by: hfadzxy <starmoon_zhang@163.com>
2025-06-03 15:38:13 +08:00
parent 068c3a0167
commit f24375f318
3 changed files with 133 additions and 110 deletions
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -26,11 +26,8 @@ from multiprocessing import Queue
 import lm_eval
 import torch

-UNIMODAL_MODEL_NAME = [
-    "Qwen/Qwen2.5-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
-    "Qwen/Qwen3-8B"
-]
-UNIMODAL_TASK = ["ceval-valid", "mmlu", "gsm8k"]
+UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"]
+UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
 MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
 MULTIMODAL_TASK = ["mmmu_val"]

@@ -38,22 +35,17 @@ batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}

 MODEL_RUN_INFO = {
    "Qwen/Qwen2.5-7B-Instruct":
-    ("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
+    ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
     ),
-    "LLM-Research/Meta-Llama-3.1-8B-Instruct":
-    ("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
-     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
-     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
-     ),
-    "Qwen/Qwen3-8B":
-    ("export MODEL_AEGS='{model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
+    "Qwen/Qwen3-8B-Base":
+    ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
     ),
    "Qwen/Qwen2.5-VL-7B-Instruct":
-    ("export MODEL_AEGS='{model}, max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
+    ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n"
     "lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
     "--apply_chat_template --fewshot_as_multiturn  --batch_size 1"),
 }
@@ -85,7 +77,7 @@ def run_accuracy_unimodal(queue, model, dataset):

 def run_accuracy_multimodal(queue, model, dataset):
    try:
-        model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2"
+        model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2"
        results = lm_eval.simple_evaluate(
            model="vllm-vlm",
            model_args=model_args,
@@ -110,7 +102,7 @@ def generate_md(model_name, tasks_list, args, datasets):
    run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name,
                                                datasets=datasets)
    model = model_name.split("/")[1]
-    preamble = f"""# {model} Accuracy Test
+    preamble = f"""# 🎯 {model} Accuracy Test
  <div>
    <strong>vLLM version:</strong> vLLM: {args.vllm_version}, vLLM Ascend: {args.vllm_ascend_version} <br>
  </div>
@@ -228,4 +220,7 @@ if __name__ == "__main__":
    parser.add_argument("--vllm_version", type=str, required=False)
    parser.add_argument("--cann_version", type=str, required=False)
    args = parser.parse_args()
+    # TODO(yikun):
+    # 1. add a exit 1 if accuracy is not as expected
+    # 2. Add ✅, ❌ to markdown if accuracy is not as expected
    main(args)