Enable pytest and yaml style accuracy test (#2073)
### What this PR does / why we need it?
This PR enabled pytest and yaml style accuracy test, users now can
enable accuracy test by running:
```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
--report_output ./benchmarks/accuracy/Qwen3-8B-Base.md
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```
Closes: https://github.com/vllm-project/vllm-ascend/issues/1970
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.10.0
- vLLM main:
2836dd73f1
---------
Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
180
.github/workflows/accuracy_test.yaml
vendored
180
.github/workflows/accuracy_test.yaml
vendored
@@ -29,35 +29,15 @@ on:
|
|||||||
types: [ labeled ]
|
types: [ labeled ]
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
vllm-version:
|
vllm-ascend-version:
|
||||||
description: 'vllm version:'
|
description: 'vllm-ascend:'
|
||||||
required: true
|
required: true
|
||||||
type: choice
|
type: choice
|
||||||
# Please also update this when bump matched version
|
|
||||||
# Current supported vLLM versions
|
# Current supported vLLM versions
|
||||||
options:
|
options:
|
||||||
|
- latest
|
||||||
- main
|
- main
|
||||||
- v0.10.0
|
default: main
|
||||||
- v0.9.1
|
|
||||||
- v0.7.3
|
|
||||||
vllm-ascend-version:
|
|
||||||
description: 'vllm-ascend version:'
|
|
||||||
required: true
|
|
||||||
type: choice
|
|
||||||
options:
|
|
||||||
- main
|
|
||||||
- v0.9.1-dev
|
|
||||||
- v0.7.3-dev
|
|
||||||
models:
|
|
||||||
description: 'model:'
|
|
||||||
required: true
|
|
||||||
type: choice
|
|
||||||
options:
|
|
||||||
- all
|
|
||||||
- Qwen/Qwen2.5-VL-7B-Instruct
|
|
||||||
- Qwen/Qwen3-8B-Base
|
|
||||||
- Qwen/Qwen3-30B-A3B
|
|
||||||
default: 'all'
|
|
||||||
|
|
||||||
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
||||||
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
||||||
@@ -76,58 +56,27 @@ jobs:
|
|||||||
# test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
|
# test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
|
||||||
if: >-
|
if: >-
|
||||||
${{
|
${{
|
||||||
(contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
|
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
|
||||||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
|
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
|
||||||
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
||||||
}}
|
}}
|
||||||
runs-on: >-
|
runs-on: ${{ matrix.runner }}
|
||||||
${{
|
|
||||||
(matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') ||
|
|
||||||
'linux-aarch64-a2-1'
|
|
||||||
}}
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
# the accuracy test will run:
|
include:
|
||||||
# 1. workflow_dispatch with models input
|
- model_name: Qwen3-8B-Base
|
||||||
# - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
runner: linux-aarch64-a2-1
|
||||||
# - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
- model_name: Qwen2.5-VL-7B-Instruct
|
||||||
# 2. PR labeled with "*-accuracy-test"
|
runner: linux-aarch64-a2-1
|
||||||
# - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B
|
- model_name: Qwen3-30B-A3B
|
||||||
# - dense-accuracy-test: Qwen/Qwen3-8B-Base
|
runner: linux-aarch64-a2-2
|
||||||
# - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
|
|
||||||
# - moe-accuracy-test: Qwen/Qwen3-30B-A3B
|
|
||||||
model_name: ${{ fromJSON(
|
|
||||||
(github.event_name == 'schedule' &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
|
||||||
(github.event.inputs.models == 'all' &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
|
||||||
(github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B"]') ||
|
|
||||||
(github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
|
|
||||||
'["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
|
|
||||||
(github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
|
|
||||||
'["Qwen/Qwen3-8B-Base"]') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
|
|
||||||
'["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
|
|
||||||
'["Qwen/Qwen3-8B-Base"]' ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
|
|
||||||
'["Qwen/Qwen2.5-VL-7B-Instruct"]' ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B"]'
|
|
||||||
) }}
|
|
||||||
|
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
|
|
||||||
name: ${{ matrix.model_name }} accuracy
|
name: ${{ matrix.model_name }} accuracy
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
DATASET_SOURCE: ModelScope
|
|
||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
USE_MODELSCOPE_HUB: 1
|
|
||||||
# 1. If version specified (work_dispatch), do specified branch accuracy test
|
# 1. If version specified (work_dispatch), do specified branch accuracy test
|
||||||
# 2. If no version (labeled PR), do accuracy test by default ref:
|
# 2. If no version (labeled PR), do accuracy test by default ref:
|
||||||
# The branch, tag or SHA to checkout. When checking out the repository that
|
# The branch, tag or SHA to checkout. When checking out the repository that
|
||||||
@@ -139,10 +88,10 @@ jobs:
|
|||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Check npu and CANN info
|
- name: Set model name as output
|
||||||
|
id: set_output
|
||||||
run: |
|
run: |
|
||||||
npu-smi info
|
echo "model_name=${{ matrix.model_name }}" >> $GITHUB_OUTPUT
|
||||||
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
|
|
||||||
|
|
||||||
- name: Config mirrors
|
- name: Config mirrors
|
||||||
run: |
|
run: |
|
||||||
@@ -161,19 +110,19 @@ jobs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
|
ref: v0.10.0
|
||||||
path: ./vllm-empty
|
path: ./vllm-empty
|
||||||
# Please also update this when bump matched version
|
|
||||||
ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}
|
|
||||||
|
|
||||||
- name: Install vllm-project/vllm from source
|
- name: Install vllm-project/vllm from source
|
||||||
working-directory: ./vllm-empty
|
working-directory: ./vllm-empty
|
||||||
run: VLLM_TARGET_DEVICE=empty pip install -e .
|
run: |
|
||||||
|
VLLM_TARGET_DEVICE=empty pip install -e .
|
||||||
|
|
||||||
- name: Resolve vllm-ascend version
|
- name: Resolve vllm-ascend version
|
||||||
run: |
|
run: |
|
||||||
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"
|
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"
|
||||||
|
|
||||||
if [[ "$VERSION_INPUT" == "main" ]]; then
|
if [[ "$VERSION_INPUT" == "latest" ]]; then
|
||||||
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
|
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
|
||||||
LATEST_TAG=$(echo "$TAGS" | head -n1)
|
LATEST_TAG=$(echo "$TAGS" | head -n1)
|
||||||
if [[ -z "$LATEST_TAG" ]]; then
|
if [[ -z "$LATEST_TAG" ]]; then
|
||||||
@@ -199,8 +148,8 @@ jobs:
|
|||||||
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
|
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||||
run: |
|
run: |
|
||||||
pip install -r requirements-dev.txt
|
pip install -r requirements-dev.txt
|
||||||
pip install -v -e .
|
pip install -v -e .
|
||||||
|
|
||||||
- name: Get vLLM commit hash and URL
|
- name: Get vLLM commit hash and URL
|
||||||
working-directory: ./vllm-empty
|
working-directory: ./vllm-empty
|
||||||
run: |
|
run: |
|
||||||
@@ -213,15 +162,6 @@ jobs:
|
|||||||
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
|
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
|
||||||
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Print resolved hashes
|
|
||||||
run: |
|
|
||||||
echo "vLLM : ${{ env.VLLM_COMMIT }}"
|
|
||||||
echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"
|
|
||||||
|
|
||||||
- name: Install lm-eval, ray, and datasets
|
|
||||||
run: |
|
|
||||||
pip install lm-eval==0.4.8
|
|
||||||
|
|
||||||
- name: Collect version info
|
- name: Collect version info
|
||||||
run: |
|
run: |
|
||||||
for dir in /usr/local/Ascend/ascend-toolkit/*; do
|
for dir in /usr/local/Ascend/ascend-toolkit/*; do
|
||||||
@@ -242,37 +182,27 @@ jobs:
|
|||||||
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
|
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
|
||||||
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
|
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
|
||||||
} >> "$GITHUB_ENV"
|
} >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Print versions
|
|
||||||
run: |
|
|
||||||
echo "CANN: ${{ env.GHA_CANN_VERSION }}"
|
|
||||||
echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}"
|
|
||||||
echo "Torch: ${{ env.GHA_TORCH_VERSION }}"
|
|
||||||
echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
|
|
||||||
echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}"
|
|
||||||
|
|
||||||
- name: Run Accuracy Test
|
- name: Run accuracy test
|
||||||
id: report
|
id: report
|
||||||
working-directory: ./benchmarks
|
|
||||||
env:
|
env:
|
||||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||||
|
VLLM_USE_MODELSCOPE: True
|
||||||
|
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
|
||||||
|
VLLM_COMMIT: ${{ env.VLLM_COMMIT }}
|
||||||
|
VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}
|
||||||
|
VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }}
|
||||||
|
CANN_VERSION: ${{ env.GHA_CANN_VERSION }}
|
||||||
|
TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }}
|
||||||
|
TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }}
|
||||||
run: |
|
run: |
|
||||||
model_base_name=$(basename ${{ matrix.model_name }})
|
model_base_name=$(basename ${{ matrix.model_name }})
|
||||||
markdown_name="${model_base_name}"
|
markdown_name="${model_base_name}"
|
||||||
echo "markdown_name=$markdown_name"
|
|
||||||
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
|
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
|
||||||
mkdir -p ./accuracy
|
mkdir -p ./benchmarks/accuracy
|
||||||
|
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
|
||||||
python ./scripts/run_accuracy.py \
|
--config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \
|
||||||
--model "${{ matrix.model_name }}" \
|
--report_output ./benchmarks/accuracy/${model_base_name}.md
|
||||||
--output "./accuracy/${markdown_name}.md" \
|
|
||||||
--vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
|
|
||||||
--cann_version "${{ env.GHA_CANN_VERSION }}" \
|
|
||||||
--torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
|
|
||||||
--torch_version "${{ env.GHA_TORCH_VERSION }}" \
|
|
||||||
--vllm_version "${{ env.GHA_VLLM_VERSION }}" \
|
|
||||||
--vllm_commit "${{ env.VLLM_COMMIT }}" \
|
|
||||||
--vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \
|
|
||||||
|
|
||||||
- name: Generate step summary
|
- name: Generate step summary
|
||||||
if: ${{ always() }}
|
if: ${{ always() }}
|
||||||
@@ -284,19 +214,7 @@ jobs:
|
|||||||
SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
|
SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
|
||||||
echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
|
echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Check report first line for failure
|
|
||||||
id: check_report
|
|
||||||
run: |
|
|
||||||
REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md"
|
|
||||||
echo "Scanning $REPORT_PATH for ❌ …"
|
|
||||||
if grep -q '❌' "$REPORT_PATH"; then
|
|
||||||
echo "contains_fail=true" >> $GITHUB_OUTPUT
|
|
||||||
else
|
|
||||||
echo "contains_fail=false" >> $GITHUB_OUTPUT
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Upload Report
|
- name: Upload Report
|
||||||
if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }}
|
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
|
name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
|
||||||
@@ -305,12 +223,16 @@ jobs:
|
|||||||
retention-days: 90
|
retention-days: 90
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
|
outputs:
|
||||||
|
model_name: ${{ steps.set_output.outputs.model_name }}
|
||||||
|
|
||||||
create_pr:
|
create_pr:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: accuracy_tests
|
needs: accuracy_tests
|
||||||
if: ${{ github.event_name == 'workflow_dispatch' }}
|
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
|
||||||
env:
|
env:
|
||||||
UPSTREAM_REPO: vllm-project/vllm-ascend
|
UPSTREAM_REPO: vllm-project/vllm-ascend
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -318,7 +240,7 @@ jobs:
|
|||||||
repository: vllm-ascend-ci/vllm-ascend
|
repository: vllm-ascend-ci/vllm-ascend
|
||||||
token: ${{ secrets.PAT_TOKEN }}
|
token: ${{ secrets.PAT_TOKEN }}
|
||||||
ref: main
|
ref: main
|
||||||
|
|
||||||
- name: Add upstream remote
|
- name: Add upstream remote
|
||||||
run: |
|
run: |
|
||||||
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
|
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
|
||||||
@@ -350,7 +272,7 @@ jobs:
|
|||||||
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
|
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
|
||||||
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
|
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
|
||||||
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
|
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
|
||||||
|
|
||||||
- name: Update accuracy_report/index.md
|
- name: Update accuracy_report/index.md
|
||||||
run: |
|
run: |
|
||||||
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
|
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
|
||||||
@@ -390,16 +312,10 @@ jobs:
|
|||||||
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
|
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
|
||||||
base: '${{ github.event.inputs.vllm-ascend-version }}',
|
base: '${{ github.event.inputs.vllm-ascend-version }}',
|
||||||
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
|
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
|
||||||
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for:
|
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)
|
||||||
${{
|
|
||||||
github.event.inputs.models == 'all'
|
|
||||||
&& 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
|
|
||||||
|| github.event.inputs.models
|
|
||||||
}}
|
|
||||||
|
|
||||||
- [Workflow run][1]
|
- [Workflow run][1]
|
||||||
|
|
||||||
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
|
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
|
||||||
});
|
});
|
||||||
core.info(`Created PR #${pr.data.number}`);
|
core.info(`Created PR #${pr.data.number}`);
|
||||||
|
|
||||||
|
|||||||
@@ -1,313 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
# Copyright 2023 The vLLM team.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
# This file is a part of the vllm-ascend project.
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import gc
|
|
||||||
import json
|
|
||||||
import multiprocessing
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from multiprocessing import Queue
|
|
||||||
|
|
||||||
import lm_eval
|
|
||||||
import torch
|
|
||||||
|
|
||||||
# URLs for version information in Markdown report
|
|
||||||
VLLM_URL = "https://github.com/vllm-project/vllm/commit/"
|
|
||||||
VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/"
|
|
||||||
|
|
||||||
# Model and task configurations
|
|
||||||
UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"]
|
|
||||||
UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
|
|
||||||
MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
|
|
||||||
MULTIMODAL_TASK = ["mmmu_val"]
|
|
||||||
|
|
||||||
# Batch size configurations per task
|
|
||||||
BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
|
|
||||||
|
|
||||||
# Model type mapping (vllm for text, vllm-vlm for vision-language)
|
|
||||||
MODEL_TYPE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "vllm",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "vllm",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Command templates for running evaluations
|
|
||||||
MODEL_RUN_INFO = {
|
|
||||||
"Qwen/Qwen3-30B-A3B": (
|
|
||||||
"export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
|
|
||||||
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
|
||||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
|
||||||
),
|
|
||||||
"Qwen/Qwen3-8B-Base": (
|
|
||||||
"export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
|
|
||||||
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
|
||||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
|
||||||
),
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": (
|
|
||||||
"export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n"
|
|
||||||
"lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
|
||||||
"--apply_chat_template --fewshot_as_multiturn --batch_size 1"
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Evaluation metric filters per task
|
|
||||||
FILTER = {
|
|
||||||
"gsm8k": "exact_match,flexible-extract",
|
|
||||||
"ceval-valid": "acc,none",
|
|
||||||
"mmmu_val": "acc,none",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Expected accuracy values for models
|
|
||||||
EXPECTED_VALUE = {
|
|
||||||
"Qwen/Qwen3-30B-A3B": {"ceval-valid": 0.83, "gsm8k": 0.85},
|
|
||||||
"Qwen/Qwen3-8B-Base": {"ceval-valid": 0.82, "gsm8k": 0.83},
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": {"mmmu_val": 0.51},
|
|
||||||
}
|
|
||||||
PARALLEL_MODE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "TP",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "TP",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "EP",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Execution backend configuration
|
|
||||||
EXECUTION_MODE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "ACLGraph",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "ACLGraph",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Model arguments for evaluation
|
|
||||||
MODEL_ARGS = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Whether to apply chat template formatting
|
|
||||||
APPLY_CHAT_TEMPLATE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": True,
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": True,
|
|
||||||
"Qwen/Qwen3-30B-A3B": False,
|
|
||||||
}
|
|
||||||
# Few-shot examples handling as multi-turn dialogues.
|
|
||||||
FEWSHOT_AS_MULTITURN = {
|
|
||||||
"Qwen/Qwen3-8B-Base": True,
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": True,
|
|
||||||
"Qwen/Qwen3-30B-A3B": False,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Relative tolerance for accuracy checks
|
|
||||||
RTOL = 0.03
|
|
||||||
ACCURACY_FLAG = {}
|
|
||||||
|
|
||||||
|
|
||||||
def run_accuracy_test(queue, model, dataset):
|
|
||||||
"""Run accuracy evaluation for a model on a dataset in separate process"""
|
|
||||||
try:
|
|
||||||
eval_params = {
|
|
||||||
"model": MODEL_TYPE[model],
|
|
||||||
"model_args": MODEL_ARGS[model],
|
|
||||||
"tasks": dataset,
|
|
||||||
"apply_chat_template": APPLY_CHAT_TEMPLATE[model],
|
|
||||||
"fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model],
|
|
||||||
"batch_size": BATCH_SIZE[dataset],
|
|
||||||
}
|
|
||||||
|
|
||||||
if MODEL_TYPE[model] == "vllm":
|
|
||||||
eval_params["num_fewshot"] = 5
|
|
||||||
|
|
||||||
results = lm_eval.simple_evaluate(**eval_params)
|
|
||||||
print(f"Success: {model} on {dataset} ")
|
|
||||||
measured_value = results["results"]
|
|
||||||
queue.put(measured_value)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error in run_accuracy_test: {e}")
|
|
||||||
queue.put(e)
|
|
||||||
sys.exit(1)
|
|
||||||
finally:
|
|
||||||
if "results" in locals():
|
|
||||||
del results
|
|
||||||
gc.collect()
|
|
||||||
torch.npu.empty_cache()
|
|
||||||
time.sleep(5)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_md(model_name, tasks_list, args, datasets):
|
|
||||||
"""Generate Markdown report with evaluation results"""
|
|
||||||
# Format the run command
|
|
||||||
run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets)
|
|
||||||
model = model_name.split("/")[1]
|
|
||||||
|
|
||||||
# Version information section
|
|
||||||
version_info = (
|
|
||||||
f"**vLLM Version**: vLLM: {args.vllm_version} "
|
|
||||||
f"([{args.vllm_commit}]({VLLM_URL + args.vllm_commit})), "
|
|
||||||
f"vLLM Ascend: {args.vllm_ascend_version} "
|
|
||||||
f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL + args.vllm_ascend_commit})) "
|
|
||||||
)
|
|
||||||
|
|
||||||
# Report header with system info
|
|
||||||
preamble = f"""# {model}
|
|
||||||
{version_info}
|
|
||||||
**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version}
|
|
||||||
**Hardware Environment**: Atlas A2 Series
|
|
||||||
**Datasets**: {datasets}
|
|
||||||
**Parallel Mode**: {PARALLEL_MODE[model_name]}
|
|
||||||
**Execution Mode**: {EXECUTION_MODE[model_name]}
|
|
||||||
**Command**:
|
|
||||||
```bash
|
|
||||||
{run_cmd}
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
|
|
||||||
header = (
|
|
||||||
"| Task | Filter | n-shot | Metric | Value | Stderr |\n"
|
|
||||||
"|-----------------------|-------:|-------:|----------|--------:|-------:|"
|
|
||||||
)
|
|
||||||
rows = []
|
|
||||||
rows_sub = []
|
|
||||||
# Process results for each task
|
|
||||||
for task_dict in tasks_list:
|
|
||||||
for key, stats in task_dict.items():
|
|
||||||
alias = stats.get("alias", key)
|
|
||||||
task_name = alias.strip()
|
|
||||||
if "exact_match,flexible-extract" in stats:
|
|
||||||
metric_key = "exact_match,flexible-extract"
|
|
||||||
else:
|
|
||||||
metric_key = None
|
|
||||||
for k in stats:
|
|
||||||
if "," in k and not k.startswith("acc_stderr"):
|
|
||||||
metric_key = k
|
|
||||||
break
|
|
||||||
if metric_key is None:
|
|
||||||
continue
|
|
||||||
metric, flt = metric_key.split(",", 1)
|
|
||||||
|
|
||||||
value = stats[metric_key]
|
|
||||||
stderr = stats.get(f"{metric}_stderr,{flt}", 0)
|
|
||||||
if model_name in UNIMODAL_MODEL_NAME:
|
|
||||||
n_shot = "5"
|
|
||||||
else:
|
|
||||||
n_shot = "0"
|
|
||||||
flag = ACCURACY_FLAG.get(task_name, "")
|
|
||||||
row = (
|
|
||||||
f"| {task_name:<37} "
|
|
||||||
f"| {flt:<6} "
|
|
||||||
f"| {n_shot:6} "
|
|
||||||
f"| {metric:<6} "
|
|
||||||
f"| {flag}{value:>5.4f} "
|
|
||||||
f"| ± {stderr:>5.4f} |"
|
|
||||||
)
|
|
||||||
if not task_name.startswith("-"):
|
|
||||||
rows.append(row)
|
|
||||||
rows_sub.append(
|
|
||||||
"<details>"
|
|
||||||
+ "\n"
|
|
||||||
+ "<summary>"
|
|
||||||
+ task_name
|
|
||||||
+ " details"
|
|
||||||
+ "</summary>"
|
|
||||||
+ "\n" * 2
|
|
||||||
+ header
|
|
||||||
)
|
|
||||||
rows_sub.append(row)
|
|
||||||
rows_sub.append("</details>")
|
|
||||||
# Combine all Markdown sections
|
|
||||||
md = (
|
|
||||||
preamble
|
|
||||||
+ "\n"
|
|
||||||
+ header
|
|
||||||
+ "\n"
|
|
||||||
+ "\n".join(rows)
|
|
||||||
+ "\n"
|
|
||||||
+ "\n".join(rows_sub)
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
print(md)
|
|
||||||
return md
|
|
||||||
|
|
||||||
|
|
||||||
def safe_md(args, accuracy, datasets):
|
|
||||||
"""
|
|
||||||
Safely generate and save Markdown report from accuracy results.
|
|
||||||
"""
|
|
||||||
data = json.loads(json.dumps(accuracy))
|
|
||||||
for model_key, tasks_list in data.items():
|
|
||||||
md_content = generate_md(model_key, tasks_list, args, datasets)
|
|
||||||
with open(args.output, "w", encoding="utf-8") as f:
|
|
||||||
f.write(md_content)
|
|
||||||
print(f"create Markdown file:{args.output}")
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
"""Main evaluation workflow"""
|
|
||||||
accuracy = {}
|
|
||||||
accuracy[args.model] = []
|
|
||||||
result_queue: Queue[float] = multiprocessing.Queue()
|
|
||||||
if args.model in UNIMODAL_MODEL_NAME:
|
|
||||||
datasets = UNIMODAL_TASK
|
|
||||||
else:
|
|
||||||
datasets = MULTIMODAL_TASK
|
|
||||||
datasets_str = ",".join(datasets)
|
|
||||||
# Evaluate model on each dataset
|
|
||||||
for dataset in datasets:
|
|
||||||
accuracy_expected = EXPECTED_VALUE[args.model][dataset]
|
|
||||||
p = multiprocessing.Process(
|
|
||||||
target=run_accuracy_test, args=(result_queue, args.model, dataset)
|
|
||||||
)
|
|
||||||
p.start()
|
|
||||||
p.join()
|
|
||||||
if p.is_alive():
|
|
||||||
p.terminate()
|
|
||||||
p.join()
|
|
||||||
gc.collect()
|
|
||||||
torch.npu.empty_cache()
|
|
||||||
time.sleep(10)
|
|
||||||
result = result_queue.get()
|
|
||||||
print(result)
|
|
||||||
if (
|
|
||||||
accuracy_expected - RTOL
|
|
||||||
< result[dataset][FILTER[dataset]]
|
|
||||||
< accuracy_expected + RTOL
|
|
||||||
):
|
|
||||||
ACCURACY_FLAG[dataset] = "✅"
|
|
||||||
else:
|
|
||||||
ACCURACY_FLAG[dataset] = "❌"
|
|
||||||
accuracy[args.model].append(result)
|
|
||||||
print(accuracy)
|
|
||||||
safe_md(args, accuracy, datasets_str)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
multiprocessing.set_start_method("spawn", force=True)
|
|
||||||
# Initialize argument parser
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Run model accuracy evaluation and generate report"
|
|
||||||
)
|
|
||||||
parser.add_argument("--output", type=str, required=True)
|
|
||||||
parser.add_argument("--model", type=str, required=True)
|
|
||||||
parser.add_argument("--vllm_ascend_version", type=str, required=False)
|
|
||||||
parser.add_argument("--torch_version", type=str, required=False)
|
|
||||||
parser.add_argument("--torch_npu_version", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_version", type=str, required=False)
|
|
||||||
parser.add_argument("--cann_version", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_commit", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_ascend_commit", type=str, required=False)
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
|
||||||
@@ -5,7 +5,7 @@ openai
|
|||||||
pytest >= 6.0
|
pytest >= 6.0
|
||||||
pytest-asyncio
|
pytest-asyncio
|
||||||
pytest-mock
|
pytest-mock
|
||||||
lm-eval
|
lm-eval==0.4.8
|
||||||
types-jsonschema
|
types-jsonschema
|
||||||
xgrammar
|
xgrammar
|
||||||
zmq
|
zmq
|
||||||
|
|||||||
@@ -0,0 +1,8 @@
|
|||||||
|
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "mmmu_val"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.51
|
||||||
|
max_model_len: 8192
|
||||||
18
tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml
Normal file
18
tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
model_name: "Qwen/Qwen3-30B-A3B"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.89
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.85
|
||||||
|
- name: "ceval-valid"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.84
|
||||||
|
num_fewshot: 5
|
||||||
|
gpu_memory_utilization: 0.6
|
||||||
|
enable_expert_parallel: True
|
||||||
|
tensor_parallel_size: 2
|
||||||
|
apply_chat_template: False
|
||||||
|
fewshot_as_multiturn: False
|
||||||
# Accuracy reference config for Qwen3-8B-Base.
# Expected scores for gsm8k (both extraction modes) and ceval-valid,
# evaluated with 5-shot prompting.
model_name: "Qwen/Qwen3-8B-Base"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.82
  - name: "exact_match,flexible-extract"
    value: 0.83
- name: "ceval-valid"
  metrics:
  - name: "acc,none"
    value: 0.82
num_fewshot: 5
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml
73
tests/e2e/singlecard/models/conftest.py
Normal file
73
tests/e2e/singlecard/models/conftest.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_addoption(parser):
    """Register the CLI options used by the accuracy-test suite."""
    # (name, default, help) — all options use action="store".
    option_specs = (
        ("--config-list-file", None,
         "Path to the file listing model config YAMLs (one per line)"),
        ("--tp-size", "1", "Tensor parallel size to use for evaluation"),
        ("--config",
         "./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml",
         "Path to the model config YAML file"),
        ("--report_output", "./benchmarks/accuracy/Qwen3-8B-Base.md",
         "Path to the report output file"),
    )
    for name, default, help_text in option_specs:
        parser.addoption(name, action="store", default=default, help=help_text)
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Resolved path of ``--config-list-file``, or ``None`` if not given.

    The previous version required a ``config_dir`` fixture that is not
    defined anywhere, so requesting this fixture always errored; resolve
    the option relative to the current working directory instead.
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    return Path(rel_path).resolve() if rel_path else None
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Tensor-parallel size from ``--tp-size``, converted to ``int``.

    CLI option values arrive as strings; downstream code feeds this into
    ``tensor_parallel_size``, which must be an integer.
    """
    return int(pytestconfig.getoption("--tp-size"))
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def config(pytestconfig):
    """Path of the model config YAML supplied via ``--config``."""
    config_path = pytestconfig.getoption("--config")
    return config_path
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def report_output(pytestconfig):
    """Destination path for the markdown report (``--report_output``)."""
    output_path = pytestconfig.getoption("--report_output")
    return output_path
|
|
||||||
|
|
||||||
|
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from --config-list-file or --config.

    ``--config`` has a non-empty default, so checking it first made the
    ``--config-list-file`` branch unreachable; the list file must take
    precedence when it is explicitly supplied.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    list_file_opt = metafunc.config.getoption("--config-list-file")
    if list_file_opt:
        list_file = Path(list_file_opt).resolve()
        # Entries in the list file are relative to the list file itself.
        config_dir = list_file.parent
        with open(list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.lstrip().startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
        return

    # Fall back to the single --config option (always has a default).
    single_config = metafunc.config.getoption("--config")
    metafunc.parametrize("config_filename", [Path(single_config).resolve()])
# {{ model_name }}

**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
**Hardware Environment**: Atlas A2 Series
**Datasets**: {{ datasets }}
**Parallel Mode**: TP
**Execution Mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
--limit {{ limit }} --batch_size {{ batch_size }}
```

| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}
148
tests/e2e/singlecard/models/test_lm_eval_correctness.py
Normal file
148
tests/e2e/singlecard/models/test_lm_eval_correctness.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import lm_eval
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
|
||||||
|
# Relative tolerance when comparing measured metric values to references.
RTOL = 0.03
# Directory holding this test file and the Jinja2 report template.
TEST_DIR = os.path.dirname(__file__)
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EnvConfig:
    """Tool-chain version info rendered into the accuracy-report header.

    All values are read from environment variables set by the CI workflow;
    see the ``env_config`` fixture below.
    """

    # vLLM release/branch and the exact commit it was built from.
    vllm_version: str
    vllm_commit: str
    # vllm-ascend release/branch and commit.
    vllm_ascend_version: str
    vllm_ascend_commit: str
    # Ascend CANN toolkit version.
    cann_version: str
    # PyTorch and torch-npu versions.
    torch_version: str
    torch_npu_version: str
|
|
||||||
|
|
||||||
|
@pytest.fixture
def env_config() -> EnvConfig:
    """Collect tool-chain version strings from the environment.

    Each value falls back to ``'unknown'`` when the corresponding variable
    is not set (e.g. when running outside CI).
    """

    def _env(name: str) -> str:
        return os.getenv(name, 'unknown')

    return EnvConfig(
        vllm_version=_env('VLLM_VERSION'),
        vllm_commit=_env('VLLM_COMMIT'),
        vllm_ascend_version=_env('VLLM_ASCEND_VERSION'),
        vllm_ascend_commit=_env('VLLM_ASCEND_COMMIT'),
        cann_version=_env('CANN_VERSION'),
        torch_version=_env('TORCH_VERSION'),
        torch_npu_version=_env('TORCH_NPU_VERSION'),
    )
|
|
||||||
|
|
||||||
|
def build_model_args(eval_config, tp_size):
    """Assemble the lm-eval ``model_args`` dict for one model config.

    Args:
        eval_config: Parsed YAML config; must contain ``model_name``.
        tp_size: Tensor-parallel size from the CLI (string or int).

    Returns:
        Dict of keyword arguments passed to the lm-eval vLLM backend.
    """
    model_args = {
        "pretrained": eval_config["model_name"],
        # CLI option values arrive as strings; vLLM expects an int here.
        "tensor_parallel_size": int(tp_size),
        "dtype": "auto",
        "trust_remote_code": eval_config.get("trust_remote_code", False),
        "max_model_len": eval_config.get("max_model_len", 4096),
    }
    # Optional per-model overrides; a tensor_parallel_size in the YAML
    # deliberately wins over the CLI value.
    for key in ("max_images", "gpu_memory_utilization",
                "enable_expert_parallel", "tensor_parallel_size"):
        val = eval_config.get(key)
        if val is not None:
            model_args[key] = val

    print("Model Parameters:")
    print(model_args)

    return model_args
|
|
||||||
|
|
||||||
|
def generate_report(tp_size, eval_config, report_data, report_output,
                    env_config):
    """Render the markdown accuracy report and write it to *report_output*.

    Args:
        tp_size: Tensor-parallel size used for the run.
        eval_config: Parsed model YAML config.
        report_data: ``{"rows": [...]}`` with one dict per task/metric.
        report_output: Destination path of the markdown report.
        env_config: ``EnvConfig`` with tool-chain version strings.
    """
    env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = env.get_template("report_template.md")
    model_args = build_model_args(eval_config, tp_size)

    report_content = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        # Rendered as a single quoted k=v,k=v string for the lm_eval CLI.
        model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
        model_type=eval_config.get("model", "vllm"),
        datasets=",".join([task["name"] for task in eval_config["tasks"]]),
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", None),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"])

    # dirname() is "" for a bare filename; os.makedirs("") would raise,
    # so only create the directory when the path actually has one.
    out_dir = os.path.dirname(report_output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
        f.write(report_content)
|
|
||||||
|
|
||||||
|
def test_lm_eval_correctness_param(config_filename, tp_size, report_output,
                                   env_config):
    """Run lm-eval for one model config and compare metrics to references.

    Every task/metric pair from the YAML config must match its reference
    value within ``RTOL``. Results are also rendered to a markdown report
    at ``report_output`` before the final assertion.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)

    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    # Only pass num_fewshot when the config sets it; the chat-template
    # flags above already honour YAML overrides via .get().
    num_fewshot = eval_config.get("num_fewshot")
    if num_fewshot is not None:
        eval_params["num_fewshot"] = num_fewshot

    print("Eval Parameters:")
    print(eval_params)

    results = lm_eval.simple_evaluate(**eval_params)

    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = task_result[metric_name]
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success

            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={'✅' if task_success else '❌'}")

            # Mark each row with its own pass/fail (previously the
            # cumulative flag was used, mislabelling every row after the
            # first failure). lm-eval stores the stderr under e.g.
            # "acc_stderr,none" / "exact_match_stderr,strict-match".
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"✅{measured_value}"
                if task_success else f"❌{measured_value}",
                "stderr": task_result[metric_name.replace(",", "_stderr,")],
            })

    generate_report(tp_size, eval_config, report_data, report_output,
                    env_config)
    assert success
||||||
Reference in New Issue
Block a user