Enable pytest and yaml style accuracy test (#2073)

### What this PR does / why we need it?

This PR enables pytest- and YAML-based accuracy tests; users can now
run an accuracy test with:

```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
          --report_output ./benchmarks/accuracy/Qwen3-8B-Base.md

pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/1970

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

---------

Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
Icey
2025-07-31 21:39:13 +08:00
committed by GitHub
parent 9c9a7cd90b
commit 86bdde1ca8
10 changed files with 336 additions and 446 deletions

View File

@@ -29,35 +29,15 @@ on:
types: [ labeled ]
workflow_dispatch:
inputs:
vllm-version:
description: 'vllm version:'
vllm-ascend-version:
description: 'vllm-ascend:'
required: true
type: choice
# Please also update this when bump matched version
# Current supported vLLM versions
options:
- latest
- main
- v0.10.0
- v0.9.1
- v0.7.3
vllm-ascend-version:
description: 'vllm-ascend version:'
required: true
type: choice
options:
- main
- v0.9.1-dev
- v0.7.3-dev
models:
description: 'model:'
required: true
type: choice
options:
- all
- Qwen/Qwen2.5-VL-7B-Instruct
- Qwen/Qwen3-8B-Base
- Qwen/Qwen3-30B-A3B
default: 'all'
default: main
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -76,58 +56,27 @@ jobs:
# test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
if: >-
${{
(contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') ||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
}}
runs-on: >-
${{
(matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') ||
'linux-aarch64-a2-1'
}}
runs-on: ${{ matrix.runner }}
strategy:
matrix:
# the accuracy test will run:
# 1. workflow_dispatch with models input
# - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
# - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
# 2. PR labeled with "*-accuracy-test"
# - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B
# - dense-accuracy-test: Qwen/Qwen3-8B-Base
# - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
# - moe-accuracy-test: Qwen/Qwen3-30B-A3B
model_name: ${{ fromJSON(
(github.event_name == 'schedule' &&
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
(github.event.inputs.models == 'all' &&
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
(github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' &&
'["Qwen/Qwen3-30B-A3B"]') ||
(github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
'["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
(github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
'["Qwen/Qwen3-8B-Base"]') ||
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
'["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' ||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
'["Qwen/Qwen3-8B-Base"]' ||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
'["Qwen/Qwen2.5-VL-7B-Instruct"]' ||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') &&
'["Qwen/Qwen3-30B-A3B"]'
) }}
include:
- model_name: Qwen3-8B-Base
runner: linux-aarch64-a2-1
- model_name: Qwen2.5-VL-7B-Instruct
runner: linux-aarch64-a2-1
- model_name: Qwen3-30B-A3B
runner: linux-aarch64-a2-2
fail-fast: false
name: ${{ matrix.model_name }} accuracy
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
env:
DATASET_SOURCE: ModelScope
VLLM_USE_MODELSCOPE: True
USE_MODELSCOPE_HUB: 1
# 1. If version specified (workflow_dispatch), do specified branch accuracy test
# 2. If no version (labeled PR), do accuracy test by default ref:
# The branch, tag or SHA to checkout. When checking out the repository that
@@ -139,10 +88,10 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
- name: Check npu and CANN info
- name: Set model name as output
id: set_output
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
echo "model_name=${{ matrix.model_name }}" >> $GITHUB_OUTPUT
- name: Config mirrors
run: |
@@ -161,19 +110,19 @@ jobs:
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: v0.10.0
path: ./vllm-empty
# Please also update this when bump matched version
ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: VLLM_TARGET_DEVICE=empty pip install -e .
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Resolve vllm-ascend version
run: |
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"
if [[ "$VERSION_INPUT" == "main" ]]; then
if [[ "$VERSION_INPUT" == "latest" ]]; then
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
LATEST_TAG=$(echo "$TAGS" | head -n1)
if [[ -z "$LATEST_TAG" ]]; then
@@ -199,8 +148,8 @@ jobs:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
pip install -v -e .
- name: Get vLLM commit hash and URL
working-directory: ./vllm-empty
run: |
@@ -213,15 +162,6 @@ jobs:
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
- name: Print resolved hashes
run: |
echo "vLLM : ${{ env.VLLM_COMMIT }}"
echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"
- name: Install lm-eval, ray, and datasets
run: |
pip install lm-eval==0.4.8
- name: Collect version info
run: |
for dir in /usr/local/Ascend/ascend-toolkit/*; do
@@ -242,37 +182,27 @@ jobs:
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
} >> "$GITHUB_ENV"
- name: Print versions
run: |
echo "CANN: ${{ env.GHA_CANN_VERSION }}"
echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}"
echo "Torch: ${{ env.GHA_TORCH_VERSION }}"
echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}"
- name: Run Accuracy Test
- name: Run accuracy test
id: report
working-directory: ./benchmarks
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
VLLM_COMMIT: ${{ env.VLLM_COMMIT }}
VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}
VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }}
CANN_VERSION: ${{ env.GHA_CANN_VERSION }}
TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }}
TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }}
run: |
model_base_name=$(basename ${{ matrix.model_name }})
markdown_name="${model_base_name}"
echo "markdown_name=$markdown_name"
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
mkdir -p ./accuracy
python ./scripts/run_accuracy.py \
--model "${{ matrix.model_name }}" \
--output "./accuracy/${markdown_name}.md" \
--vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
--cann_version "${{ env.GHA_CANN_VERSION }}" \
--torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
--torch_version "${{ env.GHA_TORCH_VERSION }}" \
--vllm_version "${{ env.GHA_VLLM_VERSION }}" \
--vllm_commit "${{ env.VLLM_COMMIT }}" \
--vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \
mkdir -p ./benchmarks/accuracy
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \
--report_output ./benchmarks/accuracy/${model_base_name}.md
- name: Generate step summary
if: ${{ always() }}
@@ -284,19 +214,7 @@ jobs:
SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
- name: Check report first line for failure
id: check_report
run: |
REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md"
echo "Scanning $REPORT_PATH for ❌ …"
if grep -q '❌' "$REPORT_PATH"; then
echo "contains_fail=true" >> $GITHUB_OUTPUT
else
echo "contains_fail=false" >> $GITHUB_OUTPUT
fi
- name: Upload Report
if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }}
uses: actions/upload-artifact@v4
with:
name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
@@ -305,12 +223,16 @@ jobs:
retention-days: 90
overwrite: true
outputs:
model_name: ${{ steps.set_output.outputs.model_name }}
create_pr:
runs-on: ubuntu-latest
needs: accuracy_tests
if: ${{ github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
env:
UPSTREAM_REPO: vllm-project/vllm-ascend
steps:
- name: Checkout repository
uses: actions/checkout@v4
@@ -318,7 +240,7 @@ jobs:
repository: vllm-ascend-ci/vllm-ascend
token: ${{ secrets.PAT_TOKEN }}
ref: main
- name: Add upstream remote
run: |
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
@@ -350,7 +272,7 @@ jobs:
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
- name: Update accuracy_report/index.md
run: |
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
@@ -390,16 +312,10 @@ jobs:
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
base: '${{ github.event.inputs.vllm-ascend-version }}',
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for:
${{
github.event.inputs.models == 'all'
&& 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
|| github.event.inputs.models
}}
body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)
- [Workflow run][1]
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
- [Workflow run][1]
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
});
core.info(`Created PR #${pr.data.number}`);

View File

@@ -1,313 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import argparse
import gc
import json
import multiprocessing
import sys
import time
from multiprocessing import Queue
import lm_eval
import torch
# URLs for version information in Markdown report
VLLM_URL = "https://github.com/vllm-project/vllm/commit/"
VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/"
# Model and task configurations
UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"]
UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
MULTIMODAL_TASK = ["mmmu_val"]
# Batch size configurations per task
BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
# Model type mapping (vllm for text, vllm-vlm for vision-language)
MODEL_TYPE = {
"Qwen/Qwen3-8B-Base": "vllm",
"Qwen/Qwen3-30B-A3B": "vllm",
"Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm",
}
# Command templates for running evaluations
MODEL_RUN_INFO = {
"Qwen/Qwen3-30B-A3B": (
"export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
),
"Qwen/Qwen3-8B-Base": (
"export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
),
"Qwen/Qwen2.5-VL-7B-Instruct": (
"export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n"
"lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
"--apply_chat_template --fewshot_as_multiturn --batch_size 1"
),
}
# Evaluation metric filters per task
FILTER = {
"gsm8k": "exact_match,flexible-extract",
"ceval-valid": "acc,none",
"mmmu_val": "acc,none",
}
# Expected accuracy values for models
EXPECTED_VALUE = {
"Qwen/Qwen3-30B-A3B": {"ceval-valid": 0.83, "gsm8k": 0.85},
"Qwen/Qwen3-8B-Base": {"ceval-valid": 0.82, "gsm8k": 0.83},
"Qwen/Qwen2.5-VL-7B-Instruct": {"mmmu_val": 0.51},
}
PARALLEL_MODE = {
"Qwen/Qwen3-8B-Base": "TP",
"Qwen/Qwen2.5-VL-7B-Instruct": "TP",
"Qwen/Qwen3-30B-A3B": "EP",
}
# Execution backend configuration
EXECUTION_MODE = {
"Qwen/Qwen3-8B-Base": "ACLGraph",
"Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph",
"Qwen/Qwen3-30B-A3B": "ACLGraph",
}
# Model arguments for evaluation
MODEL_ARGS = {
"Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6",
"Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2",
"Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True",
}
# Whether to apply chat template formatting
APPLY_CHAT_TEMPLATE = {
"Qwen/Qwen3-8B-Base": True,
"Qwen/Qwen2.5-VL-7B-Instruct": True,
"Qwen/Qwen3-30B-A3B": False,
}
# Few-shot examples handling as multi-turn dialogues.
FEWSHOT_AS_MULTITURN = {
"Qwen/Qwen3-8B-Base": True,
"Qwen/Qwen2.5-VL-7B-Instruct": True,
"Qwen/Qwen3-30B-A3B": False,
}
# Relative tolerance for accuracy checks
RTOL = 0.03
ACCURACY_FLAG = {}
def run_accuracy_test(queue, model, dataset):
    """Evaluate `model` on `dataset` with lm_eval inside a child process.

    Pushes the lm_eval "results" dict onto `queue` on success, or the raised
    exception on failure (then exits non-zero so the parent can detect it).
    """
    try:
        # Assemble lm_eval keyword arguments from the module-level tables.
        kwargs = dict(
            model=MODEL_TYPE[model],
            model_args=MODEL_ARGS[model],
            tasks=dataset,
            apply_chat_template=APPLY_CHAT_TEMPLATE[model],
            fewshot_as_multiturn=FEWSHOT_AS_MULTITURN[model],
            batch_size=BATCH_SIZE[dataset],
        )
        # Text-only ("vllm") models are evaluated 5-shot; VLM models use the
        # task's default shot count.
        if MODEL_TYPE[model] == "vllm":
            kwargs["num_fewshot"] = 5
        results = lm_eval.simple_evaluate(**kwargs)
        print(f"Success: {model} on {dataset} ")
        queue.put(results["results"])
    except Exception as e:
        print(f"Error in run_accuracy_test: {e}")
        # Hand the exception object to the parent before exiting non-zero.
        queue.put(e)
        sys.exit(1)
    finally:
        # Release NPU memory before the child process terminates.
        if "results" in locals():
            del results
        gc.collect()
        torch.npu.empty_cache()
        time.sleep(5)
def generate_md(model_name, tasks_list, args, datasets):
    """Generate a Markdown report with evaluation results.

    Args:
        model_name: full model id, e.g. "Qwen/Qwen3-8B-Base".
        tasks_list: list of lm_eval "results" dicts (one per dataset run).
        args: parsed CLI namespace carrying version/commit strings.
        datasets: comma-joined dataset names used for the run.

    Returns:
        The complete Markdown report as a single string (also printed).
    """
    # Format the run command
    run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets)
    model = model_name.split("/")[1]
    # Version information section
    version_info = (
        f"**vLLM Version**: vLLM: {args.vllm_version} "
        f"([{args.vllm_commit}]({VLLM_URL + args.vllm_commit})), "
        f"vLLM Ascend: {args.vllm_ascend_version} "
        f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL + args.vllm_ascend_commit})) "
    )
    # Report header with system info
    preamble = f"""# {model}
{version_info}
**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version}
**Hardware Environment**: Atlas A2 Series
**Datasets**: {datasets}
**Parallel Mode**: {PARALLEL_MODE[model_name]}
**Execution Mode**: {EXECUTION_MODE[model_name]}
**Command**:
```bash
{run_cmd}
```
"""
    header = (
        "| Task | Filter | n-shot | Metric | Value | Stderr |\n"
        "|-----------------------|-------:|-------:|----------|--------:|-------:|"
    )
    rows = []
    rows_sub = []
    # Process results for each task
    for task_dict in tasks_list:
        for key, stats in task_dict.items():
            alias = stats.get("alias", key)
            task_name = alias.strip()
            # Prefer the flexible-extract metric when present; otherwise take
            # the first "<metric>,<filter>" key that is not a stderr entry.
            if "exact_match,flexible-extract" in stats:
                metric_key = "exact_match,flexible-extract"
            else:
                metric_key = None
                for k in stats:
                    if "," in k and not k.startswith("acc_stderr"):
                        metric_key = k
                        break
            if metric_key is None:
                continue
            metric, flt = metric_key.split(",", 1)
            value = stats[metric_key]
            stderr = stats.get(f"{metric}_stderr,{flt}", 0)
            # Unimodal (text) models run 5-shot; multimodal models run 0-shot.
            if model_name in UNIMODAL_MODEL_NAME:
                n_shot = "5"
            else:
                n_shot = "0"
            # Pass/fail marker set by main(); empty string when not recorded.
            flag = ACCURACY_FLAG.get(task_name, "")
            row = (
                f"| {task_name:<37} "
                f"| {flt:<6} "
                f"| {n_shot:6} "
                f"| {metric:<6} "
                f"| {flag}{value:>5.4f} "
                f"| ± {stderr:>5.4f} |"
            )
            # Top-level tasks (alias not starting with "-") go into the main
            # table and open a collapsible <details> section; sub-task rows
            # only appear inside the details section.
            if not task_name.startswith("-"):
                rows.append(row)
                rows_sub.append(
                    "<details>"
                    + "\n"
                    + "<summary>"
                    + task_name
                    + " details"
                    + "</summary>"
                    + "\n" * 2
                    + header
                )
            rows_sub.append(row)
            rows_sub.append("</details>")
    # Combine all Markdown sections
    md = (
        preamble
        + "\n"
        + header
        + "\n"
        + "\n".join(rows)
        + "\n"
        + "\n".join(rows_sub)
        + "\n"
    )
    print(md)
    return md
def safe_md(args, accuracy, datasets):
    """Render the accuracy results as Markdown and save them to args.output.

    Results are round-tripped through JSON first so only serializable data
    reaches the renderer.
    """
    serializable = json.loads(json.dumps(accuracy))
    # NOTE(review): when several model keys are present, only the report for
    # the last one is written, because the file is opened after the loop.
    for model_key, tasks_list in serializable.items():
        md_content = generate_md(model_key, tasks_list, args, datasets)
    with open(args.output, "w", encoding="utf-8") as f:
        f.write(md_content)
    print(f"create Markdown file:{args.output}")
def main(args):
    """Main evaluation workflow.

    Runs each dataset for ``args.model`` in a fresh child process (so NPU
    memory is fully released between runs), checks the measured score against
    the expected value within tolerance RTOL, records a pass/fail marker for
    the report, and finally writes the Markdown report.
    """
    accuracy = {args.model: []}
    result_queue: Queue[float] = multiprocessing.Queue()
    # Choose the task list based on model modality.
    if args.model in UNIMODAL_MODEL_NAME:
        datasets = UNIMODAL_TASK
    else:
        datasets = MULTIMODAL_TASK
    datasets_str = ",".join(datasets)
    # Evaluate model on each dataset
    for dataset in datasets:
        accuracy_expected = EXPECTED_VALUE[args.model][dataset]
        p = multiprocessing.Process(
            target=run_accuracy_test, args=(result_queue, args.model, dataset)
        )
        p.start()
        p.join()
        if p.is_alive():
            p.terminate()
            p.join()
        gc.collect()
        torch.npu.empty_cache()
        time.sleep(10)
        result = result_queue.get()
        print(result)
        # The child pushes the exception object on failure; surface it
        # clearly instead of crashing on the subscripts below.
        if isinstance(result, Exception):
            raise RuntimeError(
                f"accuracy test failed for {args.model} on {dataset}"
            ) from result
        # Fix: record an explicit pass/fail marker — both branches previously
        # assigned the same empty string, and the CI step greps the generated
        # report for "❌" to decide whether to upload it.
        if (
            accuracy_expected - RTOL
            < result[dataset][FILTER[dataset]]
            < accuracy_expected + RTOL
        ):
            ACCURACY_FLAG[dataset] = "✅"
        else:
            ACCURACY_FLAG[dataset] = "❌"
        accuracy[args.model].append(result)
    print(accuracy)
    safe_md(args, accuracy, datasets_str)
if __name__ == "__main__":
    # Spawn (not fork) so each evaluation child gets a clean NPU context.
    multiprocessing.set_start_method("spawn", force=True)
    # Initialize argument parser
    parser = argparse.ArgumentParser(
        description="Run model accuracy evaluation and generate report"
    )
    # Mandatory arguments: report destination and the model to evaluate.
    for required_flag in ("--output", "--model"):
        parser.add_argument(required_flag, type=str, required=True)
    # Optional version/commit metadata embedded in the report header.
    for optional_flag in (
        "--vllm_ascend_version",
        "--torch_version",
        "--torch_npu_version",
        "--vllm_version",
        "--cann_version",
        "--vllm_commit",
        "--vllm_ascend_commit",
    ):
        parser.add_argument(optional_flag, type=str, required=False)
    main(parser.parse_args())

View File

@@ -5,7 +5,7 @@ openai
pytest >= 6.0
pytest-asyncio
pytest-mock
lm-eval
lm-eval==0.4.8
types-jsonschema
xgrammar
zmq

View File

@@ -0,0 +1,8 @@
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"
metrics:
- name: "acc,none"
value: 0.51
max_model_len: 8192

View File

@@ -0,0 +1,18 @@
model_name: "Qwen/Qwen3-30B-A3B"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.89
- name: "exact_match,flexible-extract"
value: 0.85
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.84
num_fewshot: 5
gpu_memory_utilization: 0.6
enable_expert_parallel: true
tensor_parallel_size: 2
apply_chat_template: false
fewshot_as_multiturn: false

View File

@@ -0,0 +1,13 @@
model_name: "Qwen/Qwen3-8B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.82
- name: "exact_match,flexible-extract"
value: 0.83
- name: "ceval-valid"
metrics:
- name: "acc,none"
value: 0.82
num_fewshot: 5

View File

@@ -0,0 +1,3 @@
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest
def pytest_addoption(parser):
    """Register the CLI options used by the accuracy-test suite."""
    # (flag, default, help) — registration order matters for --help output.
    option_specs = (
        ("--config-list-file", None,
         "Path to the file listing model config YAMLs (one per line)"),
        ("--tp-size", "1",
         "Tensor parallel size to use for evaluation"),
        ("--config", "./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml",
         "Path to the model config YAML file"),
        ("--report_output", "./benchmarks/accuracy/Qwen3-8B-Base.md",
         "Path to the report output file"),
    )
    for flag, default, help_text in option_specs:
        parser.addoption(flag, action="store", default=default, help=help_text)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Resolved path of --config-list-file, or None when the option is unset.

    Fix: the original depended on an undefined `config_dir` fixture, so any
    test requesting this fixture failed with a fixture-lookup error (and the
    default None would have raised a TypeError anyway). Resolve the
    user-supplied path directly instead.
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    if rel_path is None:
        return None
    return Path(rel_path).resolve()
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    # Tensor-parallel size from --tp-size (a string; default "1").
    return pytestconfig.getoption("--tp-size")


@pytest.fixture(scope="session")
def config(pytestconfig):
    # Path of the single model-config YAML from --config.
    return pytestconfig.getoption("--config")


@pytest.fixture(scope="session")
def report_output(pytestconfig):
    # Destination path for the generated Markdown report.
    return pytestconfig.getoption("--report_output")
def pytest_generate_tests(metafunc):
    """Parametrize `config_filename` with one or more model-config YAMLs.

    Fix: --config has a non-empty default, so the original
    ``if single_config:`` branch always won and --config-list-file was dead
    code. The explicitly-opted-in list file now takes precedence; the single
    --config path is the fallback.
    """
    if "config_filename" not in metafunc.fixturenames:
        return
    rel_path = metafunc.config.getoption("--config-list-file")
    if rel_path:
        # One config per test: read the list file, skipping blanks and
        # '#'-comments; entries are resolved relative to the list file.
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
        return
    single_config = metafunc.config.getoption("--config")
    if single_config:
        metafunc.parametrize("config_filename",
                             [Path(single_config).resolve()])

View File

@@ -0,0 +1,24 @@
# {{ model_name }}
**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
**Hardware Environment**: Atlas A2 Series
**Datasets**: {{ datasets }}
**Parallel Mode**: TP
**Execution Mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
--limit {{ limit }} --batch_size {{ batch_size }}
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}

View File

@@ -0,0 +1,148 @@
import os
from dataclasses import dataclass
import lm_eval
import numpy as np
import pytest
import yaml
from jinja2 import Environment, FileSystemLoader
RTOL = 0.03
TEST_DIR = os.path.dirname(__file__)
@dataclass
class EnvConfig:
    """Version and commit identifiers shown in the report header.

    Populated from environment variables by the `env_config` fixture; each
    field falls back to the literal string "unknown" when its variable is
    unset.
    """

    vllm_version: str  # vLLM release/version string
    vllm_commit: str  # vLLM git commit hash
    vllm_ascend_version: str  # vllm-ascend version/branch name
    vllm_ascend_commit: str  # vllm-ascend git commit hash
    cann_version: str  # CANN toolkit version
    torch_version: str  # PyTorch version
    torch_npu_version: str  # torch-npu version
@pytest.fixture
def env_config() -> EnvConfig:
    """Collect version/commit metadata from CI-injected environment variables."""

    def _env(name: str) -> str:
        # Missing variables degrade to the literal string "unknown".
        return os.getenv(name, 'unknown')

    return EnvConfig(
        vllm_version=_env('VLLM_VERSION'),
        vllm_commit=_env('VLLM_COMMIT'),
        vllm_ascend_version=_env('VLLM_ASCEND_VERSION'),
        vllm_ascend_commit=_env('VLLM_ASCEND_COMMIT'),
        cann_version=_env('CANN_VERSION'),
        torch_version=_env('TORCH_VERSION'),
        torch_npu_version=_env('TORCH_NPU_VERSION'),
    )
def build_model_args(eval_config, tp_size):
    """Build the lm_eval `model_args` dict from a YAML eval config.

    Starts from the required defaults (pretrained / tensor_parallel_size /
    dtype / trust_remote_code / max_model_len), then copies any optional keys
    present in the config; a config-level `tensor_parallel_size` overrides
    the CLI-provided `tp_size`.
    """
    model_args = {
        "pretrained": eval_config["model_name"],
        "tensor_parallel_size": tp_size,
        "dtype": "auto",
        "trust_remote_code": eval_config.get("trust_remote_code", False),
        "max_model_len": eval_config.get("max_model_len", 4096),
    }
    optional_keys = ("max_images", "gpu_memory_utilization",
                     "enable_expert_parallel", "tensor_parallel_size")
    model_args.update({
        key: eval_config[key]
        for key in optional_keys if eval_config.get(key) is not None
    })
    print("Model Parameters:")
    print(model_args)
    return model_args
def generate_report(tp_size, eval_config, report_data, report_output,
                    env_config):
    """Render report_template.md with the evaluation results and write it to
    `report_output`, creating parent directories as needed."""
    model_args = build_model_args(eval_config, tp_size)
    # Gather every template variable in one place before rendering.
    context = {
        "vllm_version": env_config.vllm_version,
        "vllm_commit": env_config.vllm_commit,
        "vllm_ascend_version": env_config.vllm_ascend_version,
        "vllm_ascend_commit": env_config.vllm_ascend_commit,
        "cann_version": env_config.cann_version,
        "torch_version": env_config.torch_version,
        "torch_npu_version": env_config.torch_npu_version,
        "model_name": eval_config["model_name"],
        "model_args": f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
        "model_type": eval_config.get("model", "vllm"),
        "datasets": ",".join(task["name"] for task in eval_config["tasks"]),
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
        "num_fewshot": eval_config.get("num_fewshot", "N/A"),
        "rows": report_data["rows"],
    }
    template = Environment(
        loader=FileSystemLoader(TEST_DIR)).get_template("report_template.md")
    rendered = template.render(**context)
    os.makedirs(os.path.dirname(report_output), exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
        f.write(rendered)
def test_lm_eval_correctness_param(config_filename, tp_size, report_output,
                                   env_config):
    """Run lm_eval for the model described by `config_filename`, compare each
    metric against its expected value (relative tolerance RTOL), write a
    Markdown report, and fail if any metric is out of tolerance.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)
    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    # Config-level few-shot settings override the defaults above.
    for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
        val = eval_config.get(s, None)
        if val is not None:
            eval_params[s] = val
    print("Eval Parameters:")
    print(eval_params)
    results = lm_eval.simple_evaluate(**eval_params)
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = task_result[metric_name]
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success
            # ✅/❌ markers are what the CI workflow greps for in the report.
            marker = "✅" if task_success else "❌"
            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={marker}")
            # Fix: the row marker now reflects THIS metric's result — it was
            # keyed off the cumulative `success` and both branches produced
            # identical strings. The stderr lookup also collapsed a
            # conditional whose branches were identical, and tolerates a
            # missing stderr entry instead of raising KeyError.
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"{marker}{measured_value}",
                "stderr": task_result.get(
                    metric_name.replace(',', '_stderr,'), 0),
            })
    generate_report(tp_size, eval_config, report_data, report_output,
                    env_config)
    assert success