diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 0a98feb..7140f26 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -29,35 +29,15 @@ on: types: [ labeled ] workflow_dispatch: inputs: - vllm-version: - description: 'vllm version:' + vllm-ascend-version: + description: 'vllm-ascend:' required: true type: choice - # Please also update this when bump matched version # Current supported vLLM versions options: + - latest - main - - v0.10.0 - - v0.9.1 - - v0.7.3 - vllm-ascend-version: - description: 'vllm-ascend version:' - required: true - type: choice - options: - - main - - v0.9.1-dev - - v0.7.3-dev - models: - description: 'model:' - required: true - type: choice - options: - - all - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen3-8B-Base - - Qwen/Qwen3-30B-A3B - default: 'all' + default: main # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. @@ -76,58 +56,27 @@ jobs: # test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job if: >- ${{ - (contains(github.event.pull_request.labels.*.name, 'accuracy-test') || - contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') || - contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') || - contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) && + contains(github.event.pull_request.labels.*.name, 'accuracy-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' }} - runs-on: >- - ${{ - (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') || - 'linux-aarch64-a2-1' - }} + runs-on: ${{ matrix.runner }} strategy: matrix: - # the accuracy test will run: - # 1. 
workflow_dispatch with models input - # - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base - # - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base - # 2. PR labeled with "*-accuracy-test" - # - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B - # - dense-accuracy-test: Qwen/Qwen3-8B-Base - # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct - # - moe-accuracy-test: Qwen/Qwen3-30B-A3B - model_name: ${{ fromJSON( - (github.event_name == 'schedule' && - '["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || - (github.event.inputs.models == 'all' && - '["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || - (github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' && - '["Qwen/Qwen3-30B-A3B"]') || - (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && - '["Qwen/Qwen2.5-VL-7B-Instruct"]') || - (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && - '["Qwen/Qwen3-8B-Base"]') || - contains(github.event.pull_request.labels.*.name, 'accuracy-test') && - '["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' || - contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') && - '["Qwen/Qwen3-8B-Base"]' || - contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && - '["Qwen/Qwen2.5-VL-7B-Instruct"]' || - contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') && - '["Qwen/Qwen3-30B-A3B"]' - ) }} - + include: + - model_name: Qwen3-8B-Base + runner: linux-aarch64-a2-1 + - model_name: Qwen2.5-VL-7B-Instruct + runner: linux-aarch64-a2-1 + - model_name: Qwen3-30B-A3B + runner: linux-aarch64-a2-2 fail-fast: false + name: ${{ matrix.model_name }} accuracy container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 env: - DATASET_SOURCE: ModelScope VLLM_USE_MODELSCOPE: True - USE_MODELSCOPE_HUB: 1 # 1. 
If version specified (work_dispatch), do specified branch accuracy test # 2. If no version (labeled PR), do accuracy test by default ref: # The branch, tag or SHA to checkout. When checking out the repository that @@ -139,10 +88,10 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Check npu and CANN info + - name: Set model name as output + id: set_output run: | - npu-smi info - cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + echo "model_name=${{ matrix.model_name }}" >> $GITHUB_OUTPUT - name: Config mirrors run: | @@ -161,19 +110,19 @@ jobs: uses: actions/checkout@v4 with: repository: vllm-project/vllm + ref: v0.10.0 path: ./vllm-empty - # Please also update this when bump matched version - ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty - run: VLLM_TARGET_DEVICE=empty pip install -e . + run: | + VLLM_TARGET_DEVICE=empty pip install -e . - name: Resolve vllm-ascend version run: | VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}" - if [[ "$VERSION_INPUT" == "main" ]]; then + if [[ "$VERSION_INPUT" == "latest" ]]; then TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||') LATEST_TAG=$(echo "$TAGS" | head -n1) if [[ -z "$LATEST_TAG" ]]; then @@ -199,8 +148,8 @@ jobs: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt - pip install -v -e . - + pip install -v -e . 
+ - name: Get vLLM commit hash and URL working-directory: ./vllm-empty run: | @@ -213,15 +162,6 @@ jobs: VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD) echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV - - name: Print resolved hashes - run: | - echo "vLLM : ${{ env.VLLM_COMMIT }}" - echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}" - - - name: Install lm-eval, ray, and datasets - run: | - pip install lm-eval==0.4.8 - - name: Collect version info run: | for dir in /usr/local/Ascend/ascend-toolkit/*; do @@ -242,37 +182,27 @@ jobs: pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}' pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//' } >> "$GITHUB_ENV" - - - name: Print versions - run: | - echo "CANN: ${{ env.GHA_CANN_VERSION }}" - echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}" - echo "Torch: ${{ env.GHA_TORCH_VERSION }}" - echo "vLLM: ${{ env.GHA_VLLM_VERSION }}" - echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}" - - name: Run Accuracy Test + - name: Run accuracy test id: report - working-directory: ./benchmarks env: - PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 + VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_USE_MODELSCOPE: True + VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }} + VLLM_COMMIT: ${{ env.VLLM_COMMIT }} + VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }} + VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }} + CANN_VERSION: ${{ env.GHA_CANN_VERSION }} + TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }} + TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }} run: | model_base_name=$(basename ${{ matrix.model_name }}) markdown_name="${model_base_name}" - echo "markdown_name=$markdown_name" echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT - mkdir -p ./accuracy - - python ./scripts/run_accuracy.py \ - --model "${{ matrix.model_name }}" \ - --output "./accuracy/${markdown_name}.md" \ - --vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \ - 
--cann_version "${{ env.GHA_CANN_VERSION }}" \ - --torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \ - --torch_version "${{ env.GHA_TORCH_VERSION }}" \ - --vllm_version "${{ env.GHA_VLLM_VERSION }}" \ - --vllm_commit "${{ env.VLLM_COMMIT }}" \ - --vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \ + mkdir -p ./benchmarks/accuracy + pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \ + --config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \ + --report_output ./benchmarks/accuracy/${model_base_name}.md - name: Generate step summary if: ${{ always() }} @@ -284,19 +214,7 @@ jobs: SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}" echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV" - - name: Check report first line for failure - id: check_report - run: | - REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md" - echo "Scanning $REPORT_PATH for ❌ …" - if grep -q '❌' "$REPORT_PATH"; then - echo "contains_fail=true" >> $GITHUB_OUTPUT - else - echo "contains_fail=false" >> $GITHUB_OUTPUT - fi - - name: Upload Report - if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }} uses: actions/upload-artifact@v4 with: name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}" @@ -305,12 +223,16 @@ jobs: retention-days: 90 overwrite: true + outputs: + model_name: ${{ steps.set_output.outputs.model_name }} + create_pr: runs-on: ubuntu-latest needs: accuracy_tests - if: ${{ github.event_name == 'workflow_dispatch' }} + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }} env: UPSTREAM_REPO: vllm-project/vllm-ascend + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -318,7 +240,7 @@ jobs: repository: vllm-ascend-ci/vllm-ascend token: ${{ secrets.PAT_TOKEN }} ref: main - + - name: Add upstream remote run: | git remote add 
upstream https://github.com/${{ env.UPSTREAM_REPO }}.git @@ -350,7 +272,7 @@ jobs: find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - + - name: Update accuracy_report/index.md run: | REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" @@ -390,16 +312,10 @@ head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`, base: '${{ github.event.inputs.vllm-ascend-version }}', title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`, - body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: - ${{ - github.event.inputs.models == 'all' - && 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' - || github.event.inputs.models - }} + body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base) - - [Workflow run][1] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}` + - [Workflow run][1] + + [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}` }); core.info(`Created PR #${pr.data.number}`); - diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py deleted file mode 100644 index cc2f4e2..0000000 --- a/benchmarks/scripts/run_accuracy.py +++ /dev/null @@ -1,313 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -import argparse -import gc -import json -import multiprocessing -import sys -import time -from multiprocessing import Queue - -import lm_eval -import torch - -# URLs for version information in Markdown report -VLLM_URL = "https://github.com/vllm-project/vllm/commit/" -VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/" - -# Model and task configurations -UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"] -UNIMODAL_TASK = ["ceval-valid", "gsm8k"] -MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] -MULTIMODAL_TASK = ["mmmu_val"] - -# Batch size configurations per task -BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1} - -# Model type mapping (vllm for text, vllm-vlm for vision-language) -MODEL_TYPE = { - "Qwen/Qwen3-8B-Base": "vllm", - "Qwen/Qwen3-30B-A3B": "vllm", - "Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm", -} - -# Command templates for running evaluations -MODEL_RUN_INFO = { - "Qwen/Qwen3-30B-A3B": ( - "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n" - "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" - "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" - ), - "Qwen/Qwen3-8B-Base": ( - "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" - 
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" - ), - "Qwen/Qwen2.5-VL-7B-Instruct": ( - "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n" - "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" - "--apply_chat_template --fewshot_as_multiturn --batch_size 1" - ), -} - -# Evaluation metric filters per task -FILTER = { - "gsm8k": "exact_match,flexible-extract", - "ceval-valid": "acc,none", - "mmmu_val": "acc,none", -} - -# Expected accuracy values for models -EXPECTED_VALUE = { - "Qwen/Qwen3-30B-A3B": {"ceval-valid": 0.83, "gsm8k": 0.85}, - "Qwen/Qwen3-8B-Base": {"ceval-valid": 0.82, "gsm8k": 0.83}, - "Qwen/Qwen2.5-VL-7B-Instruct": {"mmmu_val": 0.51}, -} -PARALLEL_MODE = { - "Qwen/Qwen3-8B-Base": "TP", - "Qwen/Qwen2.5-VL-7B-Instruct": "TP", - "Qwen/Qwen3-30B-A3B": "EP", -} - -# Execution backend configuration -EXECUTION_MODE = { - "Qwen/Qwen3-8B-Base": "ACLGraph", - "Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph", - "Qwen/Qwen3-30B-A3B": "ACLGraph", -} - -# Model arguments for evaluation -MODEL_ARGS = { - "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6", - "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2", - "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True", -} - -# Whether to apply chat template formatting -APPLY_CHAT_TEMPLATE = { - "Qwen/Qwen3-8B-Base": True, - "Qwen/Qwen2.5-VL-7B-Instruct": True, - "Qwen/Qwen3-30B-A3B": False, -} -# Few-shot examples handling as multi-turn dialogues. 
-FEWSHOT_AS_MULTITURN = { - "Qwen/Qwen3-8B-Base": True, - "Qwen/Qwen2.5-VL-7B-Instruct": True, - "Qwen/Qwen3-30B-A3B": False, -} - -# Relative tolerance for accuracy checks -RTOL = 0.03 -ACCURACY_FLAG = {} - - -def run_accuracy_test(queue, model, dataset): - """Run accuracy evaluation for a model on a dataset in separate process""" - try: - eval_params = { - "model": MODEL_TYPE[model], - "model_args": MODEL_ARGS[model], - "tasks": dataset, - "apply_chat_template": APPLY_CHAT_TEMPLATE[model], - "fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model], - "batch_size": BATCH_SIZE[dataset], - } - - if MODEL_TYPE[model] == "vllm": - eval_params["num_fewshot"] = 5 - - results = lm_eval.simple_evaluate(**eval_params) - print(f"Success: {model} on {dataset} ") - measured_value = results["results"] - queue.put(measured_value) - except Exception as e: - print(f"Error in run_accuracy_test: {e}") - queue.put(e) - sys.exit(1) - finally: - if "results" in locals(): - del results - gc.collect() - torch.npu.empty_cache() - time.sleep(5) - - -def generate_md(model_name, tasks_list, args, datasets): - """Generate Markdown report with evaluation results""" - # Format the run command - run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets) - model = model_name.split("/")[1] - - # Version information section - version_info = ( - f"**vLLM Version**: vLLM: {args.vllm_version} " - f"([{args.vllm_commit}]({VLLM_URL + args.vllm_commit})), " - f"vLLM Ascend: {args.vllm_ascend_version} " - f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL + args.vllm_ascend_commit})) " - ) - - # Report header with system info - preamble = f"""# {model} -{version_info} -**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version} -**Hardware Environment**: Atlas A2 Series -**Datasets**: {datasets} -**Parallel Mode**: {PARALLEL_MODE[model_name]} -**Execution Mode**: {EXECUTION_MODE[model_name]} -**Command**: -```bash -{run_cmd} -``` - 
""" - - header = ( - "| Task | Filter | n-shot | Metric | Value | Stderr |\n" - "|-----------------------|-------:|-------:|----------|--------:|-------:|" - ) - rows = [] - rows_sub = [] - # Process results for each task - for task_dict in tasks_list: - for key, stats in task_dict.items(): - alias = stats.get("alias", key) - task_name = alias.strip() - if "exact_match,flexible-extract" in stats: - metric_key = "exact_match,flexible-extract" - else: - metric_key = None - for k in stats: - if "," in k and not k.startswith("acc_stderr"): - metric_key = k - break - if metric_key is None: - continue - metric, flt = metric_key.split(",", 1) - - value = stats[metric_key] - stderr = stats.get(f"{metric}_stderr,{flt}", 0) - if model_name in UNIMODAL_MODEL_NAME: - n_shot = "5" - else: - n_shot = "0" - flag = ACCURACY_FLAG.get(task_name, "") - row = ( - f"| {task_name:<37} " - f"| {flt:<6} " - f"| {n_shot:6} " - f"| {metric:<6} " - f"| {flag}{value:>5.4f} " - f"| ± {stderr:>5.4f} |" - ) - if not task_name.startswith("-"): - rows.append(row) - rows_sub.append( - "
" - + "\n" - + "" - + task_name - + " details" - + "" - + "\n" * 2 - + header - ) - rows_sub.append(row) - rows_sub.append("
") - # Combine all Markdown sections - md = ( - preamble - + "\n" - + header - + "\n" - + "\n".join(rows) - + "\n" - + "\n".join(rows_sub) - + "\n" - ) - print(md) - return md - - -def safe_md(args, accuracy, datasets): - """ - Safely generate and save Markdown report from accuracy results. - """ - data = json.loads(json.dumps(accuracy)) - for model_key, tasks_list in data.items(): - md_content = generate_md(model_key, tasks_list, args, datasets) - with open(args.output, "w", encoding="utf-8") as f: - f.write(md_content) - print(f"create Markdown file:{args.output}") - - -def main(args): - """Main evaluation workflow""" - accuracy = {} - accuracy[args.model] = [] - result_queue: Queue[float] = multiprocessing.Queue() - if args.model in UNIMODAL_MODEL_NAME: - datasets = UNIMODAL_TASK - else: - datasets = MULTIMODAL_TASK - datasets_str = ",".join(datasets) - # Evaluate model on each dataset - for dataset in datasets: - accuracy_expected = EXPECTED_VALUE[args.model][dataset] - p = multiprocessing.Process( - target=run_accuracy_test, args=(result_queue, args.model, dataset) - ) - p.start() - p.join() - if p.is_alive(): - p.terminate() - p.join() - gc.collect() - torch.npu.empty_cache() - time.sleep(10) - result = result_queue.get() - print(result) - if ( - accuracy_expected - RTOL - < result[dataset][FILTER[dataset]] - < accuracy_expected + RTOL - ): - ACCURACY_FLAG[dataset] = "✅" - else: - ACCURACY_FLAG[dataset] = "❌" - accuracy[args.model].append(result) - print(accuracy) - safe_md(args, accuracy, datasets_str) - - -if __name__ == "__main__": - multiprocessing.set_start_method("spawn", force=True) - # Initialize argument parser - parser = argparse.ArgumentParser( - description="Run model accuracy evaluation and generate report" - ) - parser.add_argument("--output", type=str, required=True) - parser.add_argument("--model", type=str, required=True) - parser.add_argument("--vllm_ascend_version", type=str, required=False) - parser.add_argument("--torch_version", 
type=str, required=False) - parser.add_argument("--torch_npu_version", type=str, required=False) - parser.add_argument("--vllm_version", type=str, required=False) - parser.add_argument("--cann_version", type=str, required=False) - parser.add_argument("--vllm_commit", type=str, required=False) - parser.add_argument("--vllm_ascend_commit", type=str, required=False) - args = parser.parse_args() - main(args) diff --git a/requirements-dev.txt b/requirements-dev.txt index 787120d..ed71dfe 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ openai pytest >= 6.0 pytest-asyncio pytest-mock -lm-eval +lm-eval==0.4.8 types-jsonschema xgrammar zmq diff --git a/tests/e2e/singlecard/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/singlecard/models/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 0000000..eb7196a --- /dev/null +++ b/tests/e2e/singlecard/models/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,8 @@ +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.51 +max_model_len: 8192 \ No newline at end of file diff --git a/tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml b/tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml new file mode 100644 index 0000000..be1bbb0 --- /dev/null +++ b/tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml @@ -0,0 +1,18 @@ +model_name: "Qwen/Qwen3-30B-A3B" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.89 + - name: "exact_match,flexible-extract" + value: 0.85 +- name: "ceval-valid" + metrics: + - name: "acc,none" + value: 0.84 +num_fewshot: 5 +gpu_memory_utilization: 0.6 +enable_expert_parallel: True +tensor_parallel_size: 2 +apply_chat_template: False +fewshot_as_multiturn: False \ No newline at end of file diff --git a/tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml b/tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml new file mode 100644 index 0000000..e60cc9a 
--- /dev/null +++ b/tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml @@ -0,0 +1,13 @@ +model_name: "Qwen/Qwen3-8B-Base" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.82 + - name: "exact_match,flexible-extract" + value: 0.83 +- name: "ceval-valid" + metrics: + - name: "acc,none" + value: 0.82 +num_fewshot: 5 diff --git a/tests/e2e/singlecard/models/configs/accuracy.txt b/tests/e2e/singlecard/models/configs/accuracy.txt new file mode 100644 index 0000000..e29ff1a --- /dev/null +++ b/tests/e2e/singlecard/models/configs/accuracy.txt @@ -0,0 +1,3 @@ +Qwen3-8B-Base.yaml +Qwen2.5-VL-7B-Instruct.yaml +Qwen3-30B-A3B.yaml \ No newline at end of file diff --git a/tests/e2e/singlecard/models/conftest.py b/tests/e2e/singlecard/models/conftest.py new file mode 100644 index 0000000..2b25c1a --- /dev/null +++ b/tests/e2e/singlecard/models/conftest.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from pathlib import Path + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--config-list-file", + action="store", + default=None, + help="Path to the file listing model config YAMLs (one per line)", + ) + parser.addoption( + "--tp-size", + action="store", + default="1", + help="Tensor parallel size to use for evaluation", + ) + parser.addoption( + "--config", + action="store", + default="./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml", + help="Path to the model config YAML file", + ) + parser.addoption( + "--report_output", + action="store", + default="./benchmarks/accuracy/Qwen3-8B-Base.md", + help="Path to the report output file", + ) + + +@pytest.fixture(scope="session") +def config_list_file(pytestconfig, config_dir): + rel_path = pytestconfig.getoption("--config-list-file") + return config_dir / rel_path + + +@pytest.fixture(scope="session") +def tp_size(pytestconfig): + return pytestconfig.getoption("--tp-size") + + 
+@pytest.fixture(scope="session") +def config(pytestconfig): + return pytestconfig.getoption("--config") + + +@pytest.fixture(scope="session") +def report_output(pytestconfig): + return pytestconfig.getoption("--report_output") + + +def pytest_generate_tests(metafunc): + if "config_filename" in metafunc.fixturenames: + # If config specified, use the --config directly + single_config = metafunc.config.getoption("--config") + if single_config: + metafunc.parametrize("config_filename", + [Path(single_config).resolve()]) + return + # Otherwise, check --config-list-file + rel_path = metafunc.config.getoption("--config-list-file") + config_list_file = Path(rel_path).resolve() + config_dir = config_list_file.parent + with open(config_list_file, encoding="utf-8") as f: + configs = [ + config_dir / line.strip() for line in f + if line.strip() and not line.startswith("#") + ] + metafunc.parametrize("config_filename", configs) diff --git a/tests/e2e/singlecard/models/report_template.md b/tests/e2e/singlecard/models/report_template.md new file mode 100644 index 0000000..ddaa9c7 --- /dev/null +++ b/tests/e2e/singlecard/models/report_template.md @@ -0,0 +1,24 @@ +# {{ model_name }} + +**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), +**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) +**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }} +**Hardware Environment**: Atlas A2 Series +**Datasets**: {{ datasets }} +**Parallel Mode**: TP +**Execution Mode**: ACLGraph + +**Command**: + +```bash +export MODEL_ARGS={{ model_args }} +lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \ +--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is 
defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \ +--limit {{ limit }} --batch_size {{ batch_size}} +``` + +| Task | Metric | Value | Stderr | +|-----------------------|-------------|----------:|-------:| +{% for row in rows -%} +| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} | +{% endfor %} diff --git a/tests/e2e/singlecard/models/test_lm_eval_correctness.py b/tests/e2e/singlecard/models/test_lm_eval_correctness.py new file mode 100644 index 0000000..3453a05 --- /dev/null +++ b/tests/e2e/singlecard/models/test_lm_eval_correctness.py @@ -0,0 +1,148 @@ +import os +from dataclasses import dataclass + +import lm_eval +import numpy as np +import pytest +import yaml +from jinja2 import Environment, FileSystemLoader + +RTOL = 0.03 +TEST_DIR = os.path.dirname(__file__) + + +@dataclass +class EnvConfig: + vllm_version: str + vllm_commit: str + vllm_ascend_version: str + vllm_ascend_commit: str + cann_version: str + torch_version: str + torch_npu_version: str + + +@pytest.fixture +def env_config() -> EnvConfig: + return EnvConfig(vllm_version=os.getenv('VLLM_VERSION', 'unknown'), + vllm_commit=os.getenv('VLLM_COMMIT', 'unknown'), + vllm_ascend_version=os.getenv('VLLM_ASCEND_VERSION', + 'unknown'), + vllm_ascend_commit=os.getenv('VLLM_ASCEND_COMMIT', + 'unknown'), + cann_version=os.getenv('CANN_VERSION', 'unknown'), + torch_version=os.getenv('TORCH_VERSION', 'unknown'), + torch_npu_version=os.getenv('TORCH_NPU_VERSION', + 'unknown')) + + +def build_model_args(eval_config, tp_size): + trust_remote_code = eval_config.get("trust_remote_code", False) + max_model_len = eval_config.get("max_model_len", 4096) + model_args = { + "pretrained": eval_config["model_name"], + "tensor_parallel_size": tp_size, + "dtype": "auto", + "trust_remote_code": trust_remote_code, + "max_model_len": max_model_len, + } + for s in [ + "max_images", "gpu_memory_utilization", "enable_expert_parallel", 
+ "tensor_parallel_size" + ]: + val = eval_config.get(s, None) + if val is not None: + model_args[s] = val + + print("Model Parameters:") + print(model_args) + + return model_args + + +def generate_report(tp_size, eval_config, report_data, report_output, + env_config): + env = Environment(loader=FileSystemLoader(TEST_DIR)) + template = env.get_template("report_template.md") + model_args = build_model_args(eval_config, tp_size) + + report_content = template.render( + vllm_version=env_config.vllm_version, + vllm_commit=env_config.vllm_commit, + vllm_ascend_version=env_config.vllm_ascend_version, + vllm_ascend_commit=env_config.vllm_ascend_commit, + cann_version=env_config.cann_version, + torch_version=env_config.torch_version, + torch_npu_version=env_config.torch_npu_version, + model_name=eval_config["model_name"], + model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'", + model_type=eval_config.get("model", "vllm"), + datasets=",".join([task["name"] for task in eval_config["tasks"]]), + apply_chat_template=eval_config.get("apply_chat_template", True), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True), + limit=eval_config.get("limit", None), + batch_size="auto", + num_fewshot=eval_config.get("num_fewshot", "N/A"), + rows=report_data["rows"]) + + os.makedirs(os.path.dirname(report_output), exist_ok=True) + with open(report_output, 'w', encoding='utf-8') as f: + f.write(report_content) + + +def test_lm_eval_correctness_param(config_filename, tp_size, report_output, + env_config): + eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) + model_args = build_model_args(eval_config, tp_size) + success = True + report_data: dict[str, list[dict]] = {"rows": []} + + eval_params = { + "model": eval_config.get("model", "vllm"), + "model_args": model_args, + "tasks": [task["name"] for task in eval_config["tasks"]], + "apply_chat_template": eval_config.get("apply_chat_template", True), + "fewshot_as_multiturn": 
eval_config.get("fewshot_as_multiturn", True), + "limit": eval_config.get("limit", None), + "batch_size": "auto", + } + for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]: + val = eval_config.get(s, None) + if val is not None: + eval_params[s] = val + + print("Eval Parameters:") + print(eval_params) + + results = lm_eval.simple_evaluate(**eval_params) + + for task in eval_config["tasks"]: + task_name = task["name"] + task_result = results["results"][task_name] + for metric in task["metrics"]: + metric_name = metric["name"] + ground_truth = metric["value"] + measured_value = task_result[metric_name] + task_success = bool( + np.isclose(ground_truth, measured_value, rtol=RTOL)) + success = success and task_success + + print(f"{task_name} | {metric_name}: " + f"ground_truth={ground_truth} | measured={measured_value} | " + f"success={'✅' if task_success else '❌'}") + + report_data["rows"].append({ + "task": + task_name, + "metric": + metric_name, + "value": + f"✅{measured_value}" if success else f"❌{measured_value}", + "stderr": + task_result[ + metric_name.replace(',', '_stderr,') if metric_name == + "acc,none" else metric_name.replace(',', '_stderr,')] + }) + generate_report(tp_size, eval_config, report_data, report_output, + env_config) + assert success