From 14373f65d72533d84fc3f9975775ac03338a5517 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Sun, 6 Jul 2025 11:10:19 +0800 Subject: [PATCH] [Test] Remove V0 accuracy test and enable MoE and VL test on V1 (#1574) ### What this PR does / why we need it? Update accuracy test 1. remove accuracy report on V0 2. add parallel and execution mode 3. add Qwen/Qwen3-30B-A3B and remove Qwen/Qwen2.5-7B-Instruct ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed Signed-off-by: hfadzxy --- .github/workflows/accuracy_test.yaml | 48 +++--- benchmarks/scripts/run_accuracy.py | 220 ++++++++++++++++----------- 2 files changed, 153 insertions(+), 115 deletions(-) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 7dadc87..4acb9f6 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -53,9 +53,9 @@ on: type: choice options: - all - - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B default: 'all' # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly @@ -77,48 +77,48 @@ jobs: ${{ (contains(github.event.pull_request.labels.*.name, 'accuracy-test') || contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') || + contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') || contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }} runs-on: >- ${{ - (matrix.model_name == 'Qwen/Qwen2.5-VL-7B-Instruct' && 'linux-arm64-npu-4') || + (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arm64-npu-4') || 'linux-arm64-npu-2' }} strategy: matrix: - vllm_use_version: [0, 1] + vllm_use_version: [1] # the accuracy test will run: # 1. 
workflow_dispatch with models input - # - all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base - # - specified but not all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base + # - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base + # - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base # 2. PR labeled with "*-accuracy-test" - # - accuracy-test: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct - # - dense-accuracy-test: Qwen/Qwen2.5-7B-Instruct + # - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B + # - dense-accuracy-test: Qwen/Qwen3-8B-Base # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct + # - moe-accuracy-test: Qwen/Qwen3-30B-A3B model_name: ${{ fromJSON( (github.event_name == 'schedule' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + '["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || (github.event.inputs.models == 'all' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || - (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && - '["Qwen/Qwen2.5-7B-Instruct"]') || + '["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' && + '["Qwen/Qwen3-30B-A3B"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && '["Qwen/Qwen2.5-VL-7B-Instruct"]') || (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && '["Qwen/Qwen3-8B-Base"]') || contains(github.event.pull_request.labels.*.name, 'accuracy-test') && - '["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct"]' || + '["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' || contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') && '["Qwen/Qwen3-8B-Base"]' || contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && 
- '["Qwen/Qwen2.5-VL-7B-Instruct"]' + '["Qwen/Qwen2.5-VL-7B-Instruct"]' || + contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') && + '["Qwen/Qwen3-30B-A3B"]' ) }} - # Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved - exclude: - - model_name: Qwen/Qwen2.5-VL-7B-Instruct - vllm_use_version: 1 fail-fast: false name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }} @@ -187,23 +187,19 @@ jobs: - name: Get vLLM commit hash and URL working-directory: ./vllm-empty run: | - VLLM_COMMIT=$(git rev-parse HEAD) + VLLM_COMMIT=$(git rev-parse --short=7 HEAD) echo "VLLM_COMMIT=$VLLM_COMMIT" >> $GITHUB_ENV - echo "VLLM_COMMIT_URL=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Get vLLM-Ascend commit hash and URL working-directory: ./vllm-ascend run: | - VLLM_ASCEND_COMMIT=$(git rev-parse HEAD) + VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD) echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV - echo "VLLM_ASCEND_COMMIT_URL=https://github.com/vllm-project/vllm-ascend/commit/$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV - - name: Print resolved hashes and URLs + - name: Print resolved hashes run: | echo "vLLM : ${{ env.VLLM_COMMIT }}" - echo "vLLM link : ${{ env.VLLM_COMMIT_URL }}" echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}" - echo "Ascend link: ${{ env.VLLM_ASCEND_COMMIT_URL }}" - name: Install lm-eval, ray, and datasets run: | @@ -262,8 +258,6 @@ jobs: --vllm_version "${{ env.GHA_VLLM_VERSION }}" \ --vllm_commit "${{ env.VLLM_COMMIT }}" \ --vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \ - --vllm_commit_url "${{ env.VLLM_COMMIT_URL }}" \ - --vllm_ascend_commit_url "${{ env.VLLM_ASCEND_COMMIT_URL }}" \ --vllm_use_v1 "$VLLM_USE_V1" - name: Generate step summary @@ -385,7 +379,7 @@ jobs: body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: ${{ github.event.inputs.models == 'all' - && 'All models (Qwen2.5-7B-Instruct, 
Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' + && 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' || github.event.inputs.models }} diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index 6b320ff..2922c52 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -21,21 +21,36 @@ import gc import json import multiprocessing import sys +import time from multiprocessing import Queue import lm_eval import torch -UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"] +# URLs for version information in Markdown report +VLLM_URL = "https://github.com/vllm-project/vllm/commit/" +VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/" + +# Model and task configurations +UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"] UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] +# Batch size configurations per task BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1} +# Model type mapping (vllm for text, vllm-vlm for vision-language) +MODEL_TYPE = { + "Qwen/Qwen3-8B-Base": "vllm", + "Qwen/Qwen3-30B-A3B": "vllm", + "Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm" +} + +# Command templates for running evaluations MODEL_RUN_INFO = { - "Qwen/Qwen2.5-7B-Instruct": - ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" + "Qwen/Qwen3-30B-A3B": + ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), @@ -45,19 +60,23 @@ MODEL_RUN_INFO = { "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": - 
("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" + ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n" "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } + +# Evaluation metric filters per task FILTER = { "gsm8k": "exact_match,flexible-extract", "ceval-valid": "acc,none", "mmmu_val": "acc,none" } + +# Expected accuracy values for models EXPECTED_VALUE = { - "Qwen/Qwen2.5-7B-Instruct": { - "ceval-valid": 0.80, - "gsm8k": 0.72 + "Qwen/Qwen3-30B-A3B": { + "ceval-valid": 0.83, + "gsm8k": 0.85 }, "Qwen/Qwen3-8B-Base": { "ceval-valid": 0.82, @@ -67,73 +86,102 @@ EXPECTED_VALUE = { "mmmu_val": 0.51 } } +PARALLEL_MODE = { + "Qwen/Qwen3-8B-Base": "TP", + "Qwen/Qwen2.5-VL-7B-Instruct": "TP", + "Qwen/Qwen3-30B-A3B": "EP" +} + +# Execution backend configuration +EXECUTION_MODE = { + "Qwen/Qwen3-8B-Base": "ACLGraph", + "Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph", + "Qwen/Qwen3-30B-A3B": "ACLGraph" +} + +# Model arguments for evaluation +MODEL_ARGS = { + "Qwen/Qwen3-8B-Base": + "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6", + "Qwen/Qwen2.5-VL-7B-Instruct": + "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2", + "Qwen/Qwen3-30B-A3B": + "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True" +} + +# Whether to apply chat template formatting +APPLY_CHAT_TEMPLATE = { + "Qwen/Qwen3-8B-Base": True, + "Qwen/Qwen2.5-VL-7B-Instruct": True, + "Qwen/Qwen3-30B-A3B": False +} +# Few-shot examples handling as multi-turn dialogues. 
+FEWSHOT_AS_MULTITURN = { + "Qwen/Qwen3-8B-Base": True, + "Qwen/Qwen2.5-VL-7B-Instruct": True, + "Qwen/Qwen3-30B-A3B": False +} + +# Relative tolerance for accuracy checks RTOL = 0.03 ACCURACY_FLAG = {} -def run_accuracy_unimodal(queue, model, dataset): +def run_accuracy_test(queue, model, dataset): + """Run accuracy evaluation for a model on a dataset in separate process""" try: - model_args = f"pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6" - results = lm_eval.simple_evaluate( - model="vllm", - model_args=model_args, - tasks=dataset, - apply_chat_template=True, - fewshot_as_multiturn=True, - batch_size=BATCH_SIZE[dataset], - num_fewshot=5, - ) - print(f"Success: {model} on {dataset}") + eval_params = { + "model": MODEL_TYPE[model], + "model_args": MODEL_ARGS[model], + "tasks": dataset, + "apply_chat_template": APPLY_CHAT_TEMPLATE[model], + "fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model], + "batch_size": BATCH_SIZE[dataset] + } + + if MODEL_TYPE[model] == "vllm": + eval_params["num_fewshot"] = 5 + + results = lm_eval.simple_evaluate(**eval_params) + print(f"Success: {model} on {dataset} ") measured_value = results["results"] queue.put(measured_value) except Exception as e: - print(f"Error in run_accuracy_unimodal: {e}") + print(f"Error in run_accuracy_test: {e}") queue.put(e) sys.exit(1) finally: - torch.npu.empty_cache() + if 'results' in locals(): + del results gc.collect() - - -def run_accuracy_multimodal(queue, model, dataset): - try: - model_args = f"pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2" - results = lm_eval.simple_evaluate( - model="vllm-vlm", - model_args=model_args, - tasks=dataset, - apply_chat_template=True, - fewshot_as_multiturn=True, - batch_size=BATCH_SIZE[dataset], - ) - print(f"Success: {model} on {dataset}") - measured_value = results["results"] - queue.put(measured_value) - except Exception as e: - print(f"Error in run_accuracy_multimodal: 
{e}") - queue.put(e) - sys.exit(1) - finally: torch.npu.empty_cache() - gc.collect() + time.sleep(5) def generate_md(model_name, tasks_list, args, datasets): + """Generate Markdown report with evaluation results""" + # Format the run command run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets) model = model_name.split("/")[1] + + # Version information section version_info = ( f"**vLLM Version**: vLLM: {args.vllm_version} " - f"([{args.vllm_commit}]({args.vllm_commit_url})), " - f"**vLLM Ascend**: {args.vllm_ascend_version} " - f"([{args.vllm_ascend_commit}]({args.vllm_ascend_commit_url}))") + f"([{args.vllm_commit}]({VLLM_URL+args.vllm_commit})), " + f"vLLM Ascend: {args.vllm_ascend_version} " + f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL+args.vllm_ascend_commit})) " + ) - preamble = f"""# 🎯 {model} + # Report header with system info + preamble = f"""# {model} {version_info} -**vLLM Engine**: V{args.vllm_use_v1} **Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version} **Hardware Environment**: Atlas A2 Series **Datasets**: {datasets} +**vLLM Engine**: V{args.vllm_use_v1} +**Parallel Mode**: {PARALLEL_MODE[model_name]} +**Execution Mode**: {EXECUTION_MODE[model_name]} **Command**: ```bash {run_cmd} @@ -146,6 +194,7 @@ def generate_md(model_name, tasks_list, args, datasets): ) rows = [] rows_sub = [] + # Process results for each task for task_dict in tasks_list: for key, stats in task_dict.items(): alias = stats.get("alias", key) @@ -181,6 +230,7 @@ def generate_md(model_name, tasks_list, args, datasets): " details" + "" + "\n" * 2 + header) rows_sub.append(row) rows_sub.append("") + # Combine all Markdown sections md = preamble + "\n" + header + "\n" + "\n".join(rows) + "\n" + "\n".join( rows_sub) + "\n" print(md) @@ -188,6 +238,9 @@ def generate_md(model_name, tasks_list, args, datasets): def safe_md(args, accuracy, datasets): + """ + Safely generate and save Markdown 
report from accuracy results. + """ data = json.loads(json.dumps(accuracy)) for model_key, tasks_list in data.items(): md_content = generate_md(model_key, tasks_list, args, datasets) @@ -197,50 +250,45 @@ def safe_md(args, accuracy, datasets): def main(args): + """Main evaluation workflow""" accuracy = {} accuracy[args.model] = [] result_queue: Queue[float] = multiprocessing.Queue() if args.model in UNIMODAL_MODEL_NAME: - datasets = ",".join(UNIMODAL_TASK) - for dataset in UNIMODAL_TASK: - accuracy_expected = EXPECTED_VALUE[args.model][dataset] - p = multiprocessing.Process(target=run_accuracy_unimodal, - args=(result_queue, args.model, - dataset)) - p.start() + datasets = UNIMODAL_TASK + else: + datasets = MULTIMODAL_TASK + datasets_str = ",".join(datasets) + # Evaluate model on each dataset + for dataset in datasets: + accuracy_expected = EXPECTED_VALUE[args.model][dataset] + p = multiprocessing.Process(target=run_accuracy_test, + args=(result_queue, args.model, dataset)) + p.start() + p.join() + if p.is_alive(): + p.terminate() p.join() - result = result_queue.get() - print(result) - if accuracy_expected - RTOL < result[dataset][ - FILTER[dataset]] < accuracy_expected + RTOL: - ACCURACY_FLAG[dataset] = "✅" - else: - ACCURACY_FLAG[dataset] = "❌" - accuracy[args.model].append(result) - if args.model in MULTIMODAL_NAME: - datasets = ",".join(MULTIMODAL_TASK) - for dataset in MULTIMODAL_TASK: - accuracy_expected = EXPECTED_VALUE[args.model][dataset] - p = multiprocessing.Process(target=run_accuracy_multimodal, - args=(result_queue, args.model, - dataset)) - p.start() - p.join() - result = result_queue.get() - print(result) - if accuracy_expected - RTOL < result[dataset][ - FILTER[dataset]] < accuracy_expected + RTOL: - ACCURACY_FLAG[dataset] = "✅" - else: - ACCURACY_FLAG[dataset] = "❌" - accuracy[args.model].append(result) + gc.collect() + torch.npu.empty_cache() + time.sleep(10) + result = result_queue.get() + print(result) + if accuracy_expected - RTOL < 
result[dataset][ + FILTER[dataset]] < accuracy_expected + RTOL: + ACCURACY_FLAG[dataset] = "✅" + else: + ACCURACY_FLAG[dataset] = "❌" + accuracy[args.model].append(result) print(accuracy) - safe_md(args, accuracy, datasets) + safe_md(args, accuracy, datasets_str) if __name__ == "__main__": multiprocessing.set_start_method('spawn', force=True) - parser = argparse.ArgumentParser() + # Initialize argument parser + parser = argparse.ArgumentParser( + description="Run model accuracy evaluation and generate report") parser.add_argument("--output", type=str, required=True) parser.add_argument("--model", type=str, required=True) parser.add_argument("--vllm_ascend_version", type=str, required=False) @@ -248,12 +296,8 @@ if __name__ == "__main__": parser.add_argument("--torch_npu_version", type=str, required=False) parser.add_argument("--vllm_version", type=str, required=False) parser.add_argument("--cann_version", type=str, required=False) - parser.add_argument("--vllm_commit", type=lambda s: s[:7], required=False) - parser.add_argument("--vllm_commit_url", type=str, required=False) - parser.add_argument("--vllm_ascend_commit", - type=lambda s: s[:7], - required=False) - parser.add_argument("--vllm_ascend_commit_url", type=str, required=False) + parser.add_argument("--vllm_commit", type=str, required=False) + parser.add_argument("--vllm_ascend_commit", type=str, required=False) parser.add_argument("--vllm_use_v1", type=str, required=False) args = parser.parse_args() main(args)