Enable pytest and yaml style accuracy test (#2073)
### What this PR does / why we need it?
This PR enabled pytest and yaml style accuracy test, users now can
enable accuracy test by running:
```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
--report_output ./benchmarks/accuracy/Qwen3-8B-Base.md
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
--config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```
Closes: https://github.com/vllm-project/vllm-ascend/issues/1970
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.10.0
- vLLM main:
2836dd73f1
---------
Signed-off-by: Icey <1790571317@qq.com>
This commit is contained in:
180
.github/workflows/accuracy_test.yaml
vendored
180
.github/workflows/accuracy_test.yaml
vendored
@@ -29,35 +29,15 @@ on:
|
|||||||
types: [ labeled ]
|
types: [ labeled ]
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
vllm-version:
|
vllm-ascend-version:
|
||||||
description: 'vllm version:'
|
description: 'vllm-ascend:'
|
||||||
required: true
|
required: true
|
||||||
type: choice
|
type: choice
|
||||||
# Please also update this when bump matched version
|
|
||||||
# Current supported vLLM versions
|
# Current supported vLLM versions
|
||||||
options:
|
options:
|
||||||
|
- latest
|
||||||
- main
|
- main
|
||||||
- v0.10.0
|
default: main
|
||||||
- v0.9.1
|
|
||||||
- v0.7.3
|
|
||||||
vllm-ascend-version:
|
|
||||||
description: 'vllm-ascend version:'
|
|
||||||
required: true
|
|
||||||
type: choice
|
|
||||||
options:
|
|
||||||
- main
|
|
||||||
- v0.9.1-dev
|
|
||||||
- v0.7.3-dev
|
|
||||||
models:
|
|
||||||
description: 'model:'
|
|
||||||
required: true
|
|
||||||
type: choice
|
|
||||||
options:
|
|
||||||
- all
|
|
||||||
- Qwen/Qwen2.5-VL-7B-Instruct
|
|
||||||
- Qwen/Qwen3-8B-Base
|
|
||||||
- Qwen/Qwen3-30B-A3B
|
|
||||||
default: 'all'
|
|
||||||
|
|
||||||
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
||||||
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
||||||
@@ -76,58 +56,27 @@ jobs:
|
|||||||
# test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
|
# test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
|
||||||
if: >-
|
if: >-
|
||||||
${{
|
${{
|
||||||
(contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
|
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
|
||||||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
|
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
|
||||||
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
||||||
}}
|
}}
|
||||||
runs-on: >-
|
runs-on: ${{ matrix.runner }}
|
||||||
${{
|
|
||||||
(matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') ||
|
|
||||||
'linux-aarch64-a2-1'
|
|
||||||
}}
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
# the accuracy test will run:
|
include:
|
||||||
# 1. workflow_dispatch with models input
|
- model_name: Qwen3-8B-Base
|
||||||
# - all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
runner: linux-aarch64-a2-1
|
||||||
# - specified but not all: Qwen/Qwen3-30B-A3B, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
|
- model_name: Qwen2.5-VL-7B-Instruct
|
||||||
# 2. PR labeled with "*-accuracy-test"
|
runner: linux-aarch64-a2-1
|
||||||
# - accuracy-test: Qwen/Qwen3-8B-Base, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-30B-A3B
|
- model_name: Qwen3-30B-A3B
|
||||||
# - dense-accuracy-test: Qwen/Qwen3-8B-Base
|
runner: linux-aarch64-a2-2
|
||||||
# - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
|
|
||||||
# - moe-accuracy-test: Qwen/Qwen3-30B-A3B
|
|
||||||
model_name: ${{ fromJSON(
|
|
||||||
(github.event_name == 'schedule' &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
|
||||||
(github.event.inputs.models == 'all' &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
|
|
||||||
(github.event.inputs.models == 'Qwen/Qwen3-30B-A3B' &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B"]') ||
|
|
||||||
(github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
|
|
||||||
'["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
|
|
||||||
(github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
|
|
||||||
'["Qwen/Qwen3-8B-Base"]') ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
|
|
||||||
'["Qwen/Qwen3-8B-Base","Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen3-30B-A3B"]' ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
|
|
||||||
'["Qwen/Qwen3-8B-Base"]' ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
|
|
||||||
'["Qwen/Qwen2.5-VL-7B-Instruct"]' ||
|
|
||||||
contains(github.event.pull_request.labels.*.name, 'moe-accuracy-test') &&
|
|
||||||
'["Qwen/Qwen3-30B-A3B"]'
|
|
||||||
) }}
|
|
||||||
|
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
|
|
||||||
name: ${{ matrix.model_name }} accuracy
|
name: ${{ matrix.model_name }} accuracy
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
DATASET_SOURCE: ModelScope
|
|
||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
USE_MODELSCOPE_HUB: 1
|
|
||||||
# 1. If version specified (work_dispatch), do specified branch accuracy test
|
# 1. If version specified (work_dispatch), do specified branch accuracy test
|
||||||
# 2. If no version (labeled PR), do accuracy test by default ref:
|
# 2. If no version (labeled PR), do accuracy test by default ref:
|
||||||
# The branch, tag or SHA to checkout. When checking out the repository that
|
# The branch, tag or SHA to checkout. When checking out the repository that
|
||||||
@@ -139,10 +88,10 @@ jobs:
|
|||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Check npu and CANN info
|
- name: Set model name as output
|
||||||
|
id: set_output
|
||||||
run: |
|
run: |
|
||||||
npu-smi info
|
echo "model_name=${{ matrix.model_name }}" >> $GITHUB_OUTPUT
|
||||||
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
|
|
||||||
|
|
||||||
- name: Config mirrors
|
- name: Config mirrors
|
||||||
run: |
|
run: |
|
||||||
@@ -161,19 +110,19 @@ jobs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
|
ref: v0.10.0
|
||||||
path: ./vllm-empty
|
path: ./vllm-empty
|
||||||
# Please also update this when bump matched version
|
|
||||||
ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}
|
|
||||||
|
|
||||||
- name: Install vllm-project/vllm from source
|
- name: Install vllm-project/vllm from source
|
||||||
working-directory: ./vllm-empty
|
working-directory: ./vllm-empty
|
||||||
run: VLLM_TARGET_DEVICE=empty pip install -e .
|
run: |
|
||||||
|
VLLM_TARGET_DEVICE=empty pip install -e .
|
||||||
|
|
||||||
- name: Resolve vllm-ascend version
|
- name: Resolve vllm-ascend version
|
||||||
run: |
|
run: |
|
||||||
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"
|
VERSION_INPUT="${{ github.event.inputs.vllm-ascend-version }}"
|
||||||
|
|
||||||
if [[ "$VERSION_INPUT" == "main" ]]; then
|
if [[ "$VERSION_INPUT" == "latest" ]]; then
|
||||||
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
|
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
|
||||||
LATEST_TAG=$(echo "$TAGS" | head -n1)
|
LATEST_TAG=$(echo "$TAGS" | head -n1)
|
||||||
if [[ -z "$LATEST_TAG" ]]; then
|
if [[ -z "$LATEST_TAG" ]]; then
|
||||||
@@ -199,8 +148,8 @@ jobs:
|
|||||||
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
|
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||||
run: |
|
run: |
|
||||||
pip install -r requirements-dev.txt
|
pip install -r requirements-dev.txt
|
||||||
pip install -v -e .
|
pip install -v -e .
|
||||||
|
|
||||||
- name: Get vLLM commit hash and URL
|
- name: Get vLLM commit hash and URL
|
||||||
working-directory: ./vllm-empty
|
working-directory: ./vllm-empty
|
||||||
run: |
|
run: |
|
||||||
@@ -213,15 +162,6 @@ jobs:
|
|||||||
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
|
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
|
||||||
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Print resolved hashes
|
|
||||||
run: |
|
|
||||||
echo "vLLM : ${{ env.VLLM_COMMIT }}"
|
|
||||||
echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"
|
|
||||||
|
|
||||||
- name: Install lm-eval, ray, and datasets
|
|
||||||
run: |
|
|
||||||
pip install lm-eval==0.4.8
|
|
||||||
|
|
||||||
- name: Collect version info
|
- name: Collect version info
|
||||||
run: |
|
run: |
|
||||||
for dir in /usr/local/Ascend/ascend-toolkit/*; do
|
for dir in /usr/local/Ascend/ascend-toolkit/*; do
|
||||||
@@ -242,37 +182,27 @@ jobs:
|
|||||||
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
|
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
|
||||||
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
|
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
|
||||||
} >> "$GITHUB_ENV"
|
} >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Print versions
|
|
||||||
run: |
|
|
||||||
echo "CANN: ${{ env.GHA_CANN_VERSION }}"
|
|
||||||
echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}"
|
|
||||||
echo "Torch: ${{ env.GHA_TORCH_VERSION }}"
|
|
||||||
echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
|
|
||||||
echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}"
|
|
||||||
|
|
||||||
- name: Run Accuracy Test
|
- name: Run accuracy test
|
||||||
id: report
|
id: report
|
||||||
working-directory: ./benchmarks
|
|
||||||
env:
|
env:
|
||||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||||
|
VLLM_USE_MODELSCOPE: True
|
||||||
|
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
|
||||||
|
VLLM_COMMIT: ${{ env.VLLM_COMMIT }}
|
||||||
|
VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}
|
||||||
|
VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }}
|
||||||
|
CANN_VERSION: ${{ env.GHA_CANN_VERSION }}
|
||||||
|
TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }}
|
||||||
|
TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }}
|
||||||
run: |
|
run: |
|
||||||
model_base_name=$(basename ${{ matrix.model_name }})
|
model_base_name=$(basename ${{ matrix.model_name }})
|
||||||
markdown_name="${model_base_name}"
|
markdown_name="${model_base_name}"
|
||||||
echo "markdown_name=$markdown_name"
|
|
||||||
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
|
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
|
||||||
mkdir -p ./accuracy
|
mkdir -p ./benchmarks/accuracy
|
||||||
|
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
|
||||||
python ./scripts/run_accuracy.py \
|
--config ./tests/e2e/singlecard/models/configs/${{ matrix.model_name }}.yaml \
|
||||||
--model "${{ matrix.model_name }}" \
|
--report_output ./benchmarks/accuracy/${model_base_name}.md
|
||||||
--output "./accuracy/${markdown_name}.md" \
|
|
||||||
--vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
|
|
||||||
--cann_version "${{ env.GHA_CANN_VERSION }}" \
|
|
||||||
--torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
|
|
||||||
--torch_version "${{ env.GHA_TORCH_VERSION }}" \
|
|
||||||
--vllm_version "${{ env.GHA_VLLM_VERSION }}" \
|
|
||||||
--vllm_commit "${{ env.VLLM_COMMIT }}" \
|
|
||||||
--vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \
|
|
||||||
|
|
||||||
- name: Generate step summary
|
- name: Generate step summary
|
||||||
if: ${{ always() }}
|
if: ${{ always() }}
|
||||||
@@ -284,19 +214,7 @@ jobs:
|
|||||||
SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
|
SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
|
||||||
echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
|
echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
|
||||||
|
|
||||||
- name: Check report first line for failure
|
|
||||||
id: check_report
|
|
||||||
run: |
|
|
||||||
REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md"
|
|
||||||
echo "Scanning $REPORT_PATH for ❌ …"
|
|
||||||
if grep -q '❌' "$REPORT_PATH"; then
|
|
||||||
echo "contains_fail=true" >> $GITHUB_OUTPUT
|
|
||||||
else
|
|
||||||
echo "contains_fail=false" >> $GITHUB_OUTPUT
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Upload Report
|
- name: Upload Report
|
||||||
if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }}
|
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
|
name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
|
||||||
@@ -305,12 +223,16 @@ jobs:
|
|||||||
retention-days: 90
|
retention-days: 90
|
||||||
overwrite: true
|
overwrite: true
|
||||||
|
|
||||||
|
outputs:
|
||||||
|
model_name: ${{ steps.set_output.outputs.model_name }}
|
||||||
|
|
||||||
create_pr:
|
create_pr:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: accuracy_tests
|
needs: accuracy_tests
|
||||||
if: ${{ github.event_name == 'workflow_dispatch' }}
|
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
|
||||||
env:
|
env:
|
||||||
UPSTREAM_REPO: vllm-project/vllm-ascend
|
UPSTREAM_REPO: vllm-project/vllm-ascend
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -318,7 +240,7 @@ jobs:
|
|||||||
repository: vllm-ascend-ci/vllm-ascend
|
repository: vllm-ascend-ci/vllm-ascend
|
||||||
token: ${{ secrets.PAT_TOKEN }}
|
token: ${{ secrets.PAT_TOKEN }}
|
||||||
ref: main
|
ref: main
|
||||||
|
|
||||||
- name: Add upstream remote
|
- name: Add upstream remote
|
||||||
run: |
|
run: |
|
||||||
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
|
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
|
||||||
@@ -350,7 +272,7 @@ jobs:
|
|||||||
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
|
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
|
||||||
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
|
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
|
||||||
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
|
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
|
||||||
|
|
||||||
- name: Update accuracy_report/index.md
|
- name: Update accuracy_report/index.md
|
||||||
run: |
|
run: |
|
||||||
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
|
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
|
||||||
@@ -390,16 +312,10 @@ jobs:
|
|||||||
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
|
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
|
||||||
base: '${{ github.event.inputs.vllm-ascend-version }}',
|
base: '${{ github.event.inputs.vllm-ascend-version }}',
|
||||||
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
|
title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
|
||||||
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for:
|
body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)
|
||||||
${{
|
|
||||||
github.event.inputs.models == 'all'
|
|
||||||
&& 'All models (Qwen/Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
|
|
||||||
|| github.event.inputs.models
|
|
||||||
}}
|
|
||||||
|
|
||||||
- [Workflow run][1]
|
- [Workflow run][1]
|
||||||
|
|
||||||
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
|
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
|
||||||
});
|
});
|
||||||
core.info(`Created PR #${pr.data.number}`);
|
core.info(`Created PR #${pr.data.number}`);
|
||||||
|
|
||||||
|
|||||||
@@ -1,313 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
||||||
# Copyright 2023 The vLLM team.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
# This file is a part of the vllm-ascend project.
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import gc
|
|
||||||
import json
|
|
||||||
import multiprocessing
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from multiprocessing import Queue
|
|
||||||
|
|
||||||
import lm_eval
|
|
||||||
import torch
|
|
||||||
|
|
||||||
# URLs for version information in Markdown report
|
|
||||||
VLLM_URL = "https://github.com/vllm-project/vllm/commit/"
|
|
||||||
VLLM_ASCEND_URL = "https://github.com/vllm-project/vllm-ascend/commit/"
|
|
||||||
|
|
||||||
# Model and task configurations
|
|
||||||
UNIMODAL_MODEL_NAME = ["Qwen/Qwen3-8B-Base", "Qwen/Qwen3-30B-A3B"]
|
|
||||||
UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
|
|
||||||
MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
|
|
||||||
MULTIMODAL_TASK = ["mmmu_val"]
|
|
||||||
|
|
||||||
# Batch size configurations per task
|
|
||||||
BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
|
|
||||||
|
|
||||||
# Model type mapping (vllm for text, vllm-vlm for vision-language)
|
|
||||||
MODEL_TYPE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "vllm",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "vllm",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "vllm-vlm",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Command templates for running evaluations
|
|
||||||
MODEL_RUN_INFO = {
|
|
||||||
"Qwen/Qwen3-30B-A3B": (
|
|
||||||
"export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
|
|
||||||
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
|
||||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
|
||||||
),
|
|
||||||
"Qwen/Qwen3-8B-Base": (
|
|
||||||
"export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
|
|
||||||
"lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
|
||||||
"--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
|
|
||||||
),
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": (
|
|
||||||
"export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n"
|
|
||||||
"lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
|
|
||||||
"--apply_chat_template --fewshot_as_multiturn --batch_size 1"
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Evaluation metric filters per task
|
|
||||||
FILTER = {
|
|
||||||
"gsm8k": "exact_match,flexible-extract",
|
|
||||||
"ceval-valid": "acc,none",
|
|
||||||
"mmmu_val": "acc,none",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Expected accuracy values for models
|
|
||||||
EXPECTED_VALUE = {
|
|
||||||
"Qwen/Qwen3-30B-A3B": {"ceval-valid": 0.83, "gsm8k": 0.85},
|
|
||||||
"Qwen/Qwen3-8B-Base": {"ceval-valid": 0.82, "gsm8k": 0.83},
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": {"mmmu_val": 0.51},
|
|
||||||
}
|
|
||||||
PARALLEL_MODE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "TP",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "TP",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "EP",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Execution backend configuration
|
|
||||||
EXECUTION_MODE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "ACLGraph",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "ACLGraph",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "ACLGraph",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Model arguments for evaluation
|
|
||||||
MODEL_ARGS = {
|
|
||||||
"Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6",
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2",
|
|
||||||
"Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Whether to apply chat template formatting
|
|
||||||
APPLY_CHAT_TEMPLATE = {
|
|
||||||
"Qwen/Qwen3-8B-Base": True,
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": True,
|
|
||||||
"Qwen/Qwen3-30B-A3B": False,
|
|
||||||
}
|
|
||||||
# Few-shot examples handling as multi-turn dialogues.
|
|
||||||
FEWSHOT_AS_MULTITURN = {
|
|
||||||
"Qwen/Qwen3-8B-Base": True,
|
|
||||||
"Qwen/Qwen2.5-VL-7B-Instruct": True,
|
|
||||||
"Qwen/Qwen3-30B-A3B": False,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Relative tolerance for accuracy checks
|
|
||||||
RTOL = 0.03
|
|
||||||
ACCURACY_FLAG = {}
|
|
||||||
|
|
||||||
|
|
||||||
def run_accuracy_test(queue, model, dataset):
|
|
||||||
"""Run accuracy evaluation for a model on a dataset in separate process"""
|
|
||||||
try:
|
|
||||||
eval_params = {
|
|
||||||
"model": MODEL_TYPE[model],
|
|
||||||
"model_args": MODEL_ARGS[model],
|
|
||||||
"tasks": dataset,
|
|
||||||
"apply_chat_template": APPLY_CHAT_TEMPLATE[model],
|
|
||||||
"fewshot_as_multiturn": FEWSHOT_AS_MULTITURN[model],
|
|
||||||
"batch_size": BATCH_SIZE[dataset],
|
|
||||||
}
|
|
||||||
|
|
||||||
if MODEL_TYPE[model] == "vllm":
|
|
||||||
eval_params["num_fewshot"] = 5
|
|
||||||
|
|
||||||
results = lm_eval.simple_evaluate(**eval_params)
|
|
||||||
print(f"Success: {model} on {dataset} ")
|
|
||||||
measured_value = results["results"]
|
|
||||||
queue.put(measured_value)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error in run_accuracy_test: {e}")
|
|
||||||
queue.put(e)
|
|
||||||
sys.exit(1)
|
|
||||||
finally:
|
|
||||||
if "results" in locals():
|
|
||||||
del results
|
|
||||||
gc.collect()
|
|
||||||
torch.npu.empty_cache()
|
|
||||||
time.sleep(5)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_md(model_name, tasks_list, args, datasets):
|
|
||||||
"""Generate Markdown report with evaluation results"""
|
|
||||||
# Format the run command
|
|
||||||
run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets)
|
|
||||||
model = model_name.split("/")[1]
|
|
||||||
|
|
||||||
# Version information section
|
|
||||||
version_info = (
|
|
||||||
f"**vLLM Version**: vLLM: {args.vllm_version} "
|
|
||||||
f"([{args.vllm_commit}]({VLLM_URL + args.vllm_commit})), "
|
|
||||||
f"vLLM Ascend: {args.vllm_ascend_version} "
|
|
||||||
f"([{args.vllm_ascend_commit}]({VLLM_ASCEND_URL + args.vllm_ascend_commit})) "
|
|
||||||
)
|
|
||||||
|
|
||||||
# Report header with system info
|
|
||||||
preamble = f"""# {model}
|
|
||||||
{version_info}
|
|
||||||
**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version}
|
|
||||||
**Hardware Environment**: Atlas A2 Series
|
|
||||||
**Datasets**: {datasets}
|
|
||||||
**Parallel Mode**: {PARALLEL_MODE[model_name]}
|
|
||||||
**Execution Mode**: {EXECUTION_MODE[model_name]}
|
|
||||||
**Command**:
|
|
||||||
```bash
|
|
||||||
{run_cmd}
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
|
|
||||||
header = (
|
|
||||||
"| Task | Filter | n-shot | Metric | Value | Stderr |\n"
|
|
||||||
"|-----------------------|-------:|-------:|----------|--------:|-------:|"
|
|
||||||
)
|
|
||||||
rows = []
|
|
||||||
rows_sub = []
|
|
||||||
# Process results for each task
|
|
||||||
for task_dict in tasks_list:
|
|
||||||
for key, stats in task_dict.items():
|
|
||||||
alias = stats.get("alias", key)
|
|
||||||
task_name = alias.strip()
|
|
||||||
if "exact_match,flexible-extract" in stats:
|
|
||||||
metric_key = "exact_match,flexible-extract"
|
|
||||||
else:
|
|
||||||
metric_key = None
|
|
||||||
for k in stats:
|
|
||||||
if "," in k and not k.startswith("acc_stderr"):
|
|
||||||
metric_key = k
|
|
||||||
break
|
|
||||||
if metric_key is None:
|
|
||||||
continue
|
|
||||||
metric, flt = metric_key.split(",", 1)
|
|
||||||
|
|
||||||
value = stats[metric_key]
|
|
||||||
stderr = stats.get(f"{metric}_stderr,{flt}", 0)
|
|
||||||
if model_name in UNIMODAL_MODEL_NAME:
|
|
||||||
n_shot = "5"
|
|
||||||
else:
|
|
||||||
n_shot = "0"
|
|
||||||
flag = ACCURACY_FLAG.get(task_name, "")
|
|
||||||
row = (
|
|
||||||
f"| {task_name:<37} "
|
|
||||||
f"| {flt:<6} "
|
|
||||||
f"| {n_shot:6} "
|
|
||||||
f"| {metric:<6} "
|
|
||||||
f"| {flag}{value:>5.4f} "
|
|
||||||
f"| ± {stderr:>5.4f} |"
|
|
||||||
)
|
|
||||||
if not task_name.startswith("-"):
|
|
||||||
rows.append(row)
|
|
||||||
rows_sub.append(
|
|
||||||
"<details>"
|
|
||||||
+ "\n"
|
|
||||||
+ "<summary>"
|
|
||||||
+ task_name
|
|
||||||
+ " details"
|
|
||||||
+ "</summary>"
|
|
||||||
+ "\n" * 2
|
|
||||||
+ header
|
|
||||||
)
|
|
||||||
rows_sub.append(row)
|
|
||||||
rows_sub.append("</details>")
|
|
||||||
# Combine all Markdown sections
|
|
||||||
md = (
|
|
||||||
preamble
|
|
||||||
+ "\n"
|
|
||||||
+ header
|
|
||||||
+ "\n"
|
|
||||||
+ "\n".join(rows)
|
|
||||||
+ "\n"
|
|
||||||
+ "\n".join(rows_sub)
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
print(md)
|
|
||||||
return md
|
|
||||||
|
|
||||||
|
|
||||||
def safe_md(args, accuracy, datasets):
|
|
||||||
"""
|
|
||||||
Safely generate and save Markdown report from accuracy results.
|
|
||||||
"""
|
|
||||||
data = json.loads(json.dumps(accuracy))
|
|
||||||
for model_key, tasks_list in data.items():
|
|
||||||
md_content = generate_md(model_key, tasks_list, args, datasets)
|
|
||||||
with open(args.output, "w", encoding="utf-8") as f:
|
|
||||||
f.write(md_content)
|
|
||||||
print(f"create Markdown file:{args.output}")
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
"""Main evaluation workflow"""
|
|
||||||
accuracy = {}
|
|
||||||
accuracy[args.model] = []
|
|
||||||
result_queue: Queue[float] = multiprocessing.Queue()
|
|
||||||
if args.model in UNIMODAL_MODEL_NAME:
|
|
||||||
datasets = UNIMODAL_TASK
|
|
||||||
else:
|
|
||||||
datasets = MULTIMODAL_TASK
|
|
||||||
datasets_str = ",".join(datasets)
|
|
||||||
# Evaluate model on each dataset
|
|
||||||
for dataset in datasets:
|
|
||||||
accuracy_expected = EXPECTED_VALUE[args.model][dataset]
|
|
||||||
p = multiprocessing.Process(
|
|
||||||
target=run_accuracy_test, args=(result_queue, args.model, dataset)
|
|
||||||
)
|
|
||||||
p.start()
|
|
||||||
p.join()
|
|
||||||
if p.is_alive():
|
|
||||||
p.terminate()
|
|
||||||
p.join()
|
|
||||||
gc.collect()
|
|
||||||
torch.npu.empty_cache()
|
|
||||||
time.sleep(10)
|
|
||||||
result = result_queue.get()
|
|
||||||
print(result)
|
|
||||||
if (
|
|
||||||
accuracy_expected - RTOL
|
|
||||||
< result[dataset][FILTER[dataset]]
|
|
||||||
< accuracy_expected + RTOL
|
|
||||||
):
|
|
||||||
ACCURACY_FLAG[dataset] = "✅"
|
|
||||||
else:
|
|
||||||
ACCURACY_FLAG[dataset] = "❌"
|
|
||||||
accuracy[args.model].append(result)
|
|
||||||
print(accuracy)
|
|
||||||
safe_md(args, accuracy, datasets_str)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
multiprocessing.set_start_method("spawn", force=True)
|
|
||||||
# Initialize argument parser
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Run model accuracy evaluation and generate report"
|
|
||||||
)
|
|
||||||
parser.add_argument("--output", type=str, required=True)
|
|
||||||
parser.add_argument("--model", type=str, required=True)
|
|
||||||
parser.add_argument("--vllm_ascend_version", type=str, required=False)
|
|
||||||
parser.add_argument("--torch_version", type=str, required=False)
|
|
||||||
parser.add_argument("--torch_npu_version", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_version", type=str, required=False)
|
|
||||||
parser.add_argument("--cann_version", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_commit", type=str, required=False)
|
|
||||||
parser.add_argument("--vllm_ascend_commit", type=str, required=False)
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
|
||||||
@@ -5,7 +5,7 @@ openai
|
|||||||
pytest >= 6.0
|
pytest >= 6.0
|
||||||
pytest-asyncio
|
pytest-asyncio
|
||||||
pytest-mock
|
pytest-mock
|
||||||
lm-eval
|
lm-eval==0.4.8
|
||||||
types-jsonschema
|
types-jsonschema
|
||||||
xgrammar
|
xgrammar
|
||||||
zmq
|
zmq
|
||||||
|
|||||||
@@ -0,0 +1,8 @@
|
|||||||
|
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||||
|
model: "vllm-vlm"
|
||||||
|
tasks:
|
||||||
|
- name: "mmmu_val"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.51
|
||||||
|
max_model_len: 8192
|
||||||
18
tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml
Normal file
18
tests/e2e/singlecard/models/configs/Qwen3-30B-A3B.yaml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
model_name: "Qwen/Qwen3-30B-A3B"
|
||||||
|
tasks:
|
||||||
|
- name: "gsm8k"
|
||||||
|
metrics:
|
||||||
|
- name: "exact_match,strict-match"
|
||||||
|
value: 0.89
|
||||||
|
- name: "exact_match,flexible-extract"
|
||||||
|
value: 0.85
|
||||||
|
- name: "ceval-valid"
|
||||||
|
metrics:
|
||||||
|
- name: "acc,none"
|
||||||
|
value: 0.84
|
||||||
|
num_fewshot: 5
|
||||||
|
gpu_memory_utilization: 0.6
|
||||||
|
enable_expert_parallel: True
|
||||||
|
tensor_parallel_size: 2
|
||||||
|
apply_chat_template: False
|
||||||
|
fewshot_as_multiturn: False
|
||||||
# Accuracy reference config for Qwen3-8B-Base.
# Expected scores for gsm8k (both extraction modes) and ceval-valid,
# evaluated with 5-shot prompting.
model_name: "Qwen/Qwen3-8B-Base"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.82
  - name: "exact_match,flexible-extract"
    value: 0.83
- name: "ceval-valid"
  metrics:
  - name: "acc,none"
    value: 0.82
num_fewshot: 5
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml
73
tests/e2e/singlecard/models/conftest.py
Normal file
73
tests/e2e/singlecard/models/conftest.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_addoption(parser):
    """Register the CLI options used by the accuracy-test suite."""
    # (name, default, help) — all options use action="store".
    option_specs = (
        ("--config-list-file", None,
         "Path to the file listing model config YAMLs (one per line)"),
        ("--tp-size", "1", "Tensor parallel size to use for evaluation"),
        ("--config",
         "./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml",
         "Path to the model config YAML file"),
        ("--report_output", "./benchmarks/accuracy/Qwen3-8B-Base.md",
         "Path to the report output file"),
    )
    for name, default, help_text in option_specs:
        parser.addoption(name, action="store", default=default, help=help_text)
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def config_list_file(pytestconfig):
    """Resolved path of ``--config-list-file``, or ``None`` if not given.

    The previous version required a ``config_dir`` fixture that is not
    defined anywhere, so requesting this fixture always errored; resolve
    the option relative to the current working directory instead.
    """
    rel_path = pytestconfig.getoption("--config-list-file")
    return Path(rel_path).resolve() if rel_path else None
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Tensor-parallel size from ``--tp-size``, converted to ``int``.

    CLI option values arrive as strings; downstream code feeds this into
    ``tensor_parallel_size``, which must be an integer.
    """
    return int(pytestconfig.getoption("--tp-size"))
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def config(pytestconfig):
    """Path of the model config YAML supplied via ``--config``."""
    config_path = pytestconfig.getoption("--config")
    return config_path
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def report_output(pytestconfig):
    """Destination path for the markdown report (``--report_output``)."""
    output_path = pytestconfig.getoption("--report_output")
    return output_path
|
|
||||||
|
|
||||||
|
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from --config-list-file or --config.

    ``--config`` has a non-empty default, so checking it first made the
    ``--config-list-file`` branch unreachable; the list file must take
    precedence when it is explicitly supplied.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    list_file_opt = metafunc.config.getoption("--config-list-file")
    if list_file_opt:
        list_file = Path(list_file_opt).resolve()
        # Entries in the list file are relative to the list file itself.
        config_dir = list_file.parent
        with open(list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip() for line in f
                if line.strip() and not line.lstrip().startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
        return

    # Fall back to the single --config option (always has a default).
    single_config = metafunc.config.getoption("--config")
    metafunc.parametrize("config_filename", [Path(single_config).resolve()])
# {{ model_name }}

**vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})),
**vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
**Software Environment**: CANN: {{ cann_version }}, PyTorch: {{ torch_version }}, torch-npu: {{ torch_npu_version }}
**Hardware Environment**: Atlas A2 Series
**Datasets**: {{ datasets }}
**Parallel Mode**: TP
**Execution Mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
--apply_chat_template {{ apply_chat_template }} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} \
--limit {{ limit }} --batch_size {{ batch_size }}
```

| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
{% for row in rows -%}
| {{ row.task.rjust(23) }} | {{ row.metric.rjust(15) }} |{{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} |
{% endfor %}
148
tests/e2e/singlecard/models/test_lm_eval_correctness.py
Normal file
148
tests/e2e/singlecard/models/test_lm_eval_correctness.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import lm_eval
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
|
||||||
|
# Relative tolerance when comparing measured metric values to references.
RTOL = 0.03
# Directory holding this test file and the Jinja2 report template.
TEST_DIR = os.path.dirname(__file__)
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EnvConfig:
    """Tool-chain version info rendered into the accuracy-report header.

    All values are read from environment variables set by the CI workflow;
    see the ``env_config`` fixture below.
    """

    # vLLM release/branch and the exact commit it was built from.
    vllm_version: str
    vllm_commit: str
    # vllm-ascend release/branch and commit.
    vllm_ascend_version: str
    vllm_ascend_commit: str
    # Ascend CANN toolkit version.
    cann_version: str
    # PyTorch and torch-npu versions.
    torch_version: str
    torch_npu_version: str
|
|
||||||
|
|
||||||
|
@pytest.fixture
def env_config() -> EnvConfig:
    """Collect tool-chain version strings from the environment.

    Each value falls back to ``'unknown'`` when the corresponding variable
    is not set (e.g. when running outside CI).
    """

    def _env(name: str) -> str:
        return os.getenv(name, 'unknown')

    return EnvConfig(
        vllm_version=_env('VLLM_VERSION'),
        vllm_commit=_env('VLLM_COMMIT'),
        vllm_ascend_version=_env('VLLM_ASCEND_VERSION'),
        vllm_ascend_commit=_env('VLLM_ASCEND_COMMIT'),
        cann_version=_env('CANN_VERSION'),
        torch_version=_env('TORCH_VERSION'),
        torch_npu_version=_env('TORCH_NPU_VERSION'),
    )
|
|
||||||
|
|
||||||
|
def build_model_args(eval_config, tp_size):
    """Assemble the lm-eval ``model_args`` dict for one model config.

    Args:
        eval_config: Parsed YAML config; must contain ``model_name``.
        tp_size: Tensor-parallel size from the CLI (string or int).

    Returns:
        Dict of keyword arguments passed to the lm-eval vLLM backend.
    """
    model_args = {
        "pretrained": eval_config["model_name"],
        # CLI option values arrive as strings; vLLM expects an int here.
        "tensor_parallel_size": int(tp_size),
        "dtype": "auto",
        "trust_remote_code": eval_config.get("trust_remote_code", False),
        "max_model_len": eval_config.get("max_model_len", 4096),
    }
    # Optional per-model overrides; a tensor_parallel_size in the YAML
    # deliberately wins over the CLI value.
    for key in ("max_images", "gpu_memory_utilization",
                "enable_expert_parallel", "tensor_parallel_size"):
        val = eval_config.get(key)
        if val is not None:
            model_args[key] = val

    print("Model Parameters:")
    print(model_args)

    return model_args
|
|
||||||
|
|
||||||
|
def generate_report(tp_size, eval_config, report_data, report_output,
                    env_config):
    """Render the markdown accuracy report and write it to *report_output*.

    Args:
        tp_size: Tensor-parallel size used for the run.
        eval_config: Parsed model YAML config.
        report_data: ``{"rows": [...]}`` with one dict per task/metric.
        report_output: Destination path of the markdown report.
        env_config: ``EnvConfig`` with tool-chain version strings.
    """
    env = Environment(loader=FileSystemLoader(TEST_DIR))
    template = env.get_template("report_template.md")
    model_args = build_model_args(eval_config, tp_size)

    report_content = template.render(
        vllm_version=env_config.vllm_version,
        vllm_commit=env_config.vllm_commit,
        vllm_ascend_version=env_config.vllm_ascend_version,
        vllm_ascend_commit=env_config.vllm_ascend_commit,
        cann_version=env_config.cann_version,
        torch_version=env_config.torch_version,
        torch_npu_version=env_config.torch_npu_version,
        model_name=eval_config["model_name"],
        # Rendered as a single quoted k=v,k=v string for the lm_eval CLI.
        model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
        model_type=eval_config.get("model", "vllm"),
        datasets=",".join([task["name"] for task in eval_config["tasks"]]),
        apply_chat_template=eval_config.get("apply_chat_template", True),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
        limit=eval_config.get("limit", None),
        batch_size="auto",
        num_fewshot=eval_config.get("num_fewshot", "N/A"),
        rows=report_data["rows"])

    # dirname() is "" for a bare filename; os.makedirs("") would raise,
    # so only create the directory when the path actually has one.
    out_dir = os.path.dirname(report_output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(report_output, 'w', encoding='utf-8') as f:
        f.write(report_content)
|
|
||||||
|
|
||||||
|
def test_lm_eval_correctness_param(config_filename, tp_size, report_output,
                                   env_config):
    """Run lm-eval for one model config and compare metrics to references.

    Every task/metric pair from the YAML config must match its reference
    value within ``RTOL``. Results are also rendered to a markdown report
    at ``report_output`` before the final assertion.
    """
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
    model_args = build_model_args(eval_config, tp_size)

    eval_params = {
        "model": eval_config.get("model", "vllm"),
        "model_args": model_args,
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "apply_chat_template": eval_config.get("apply_chat_template", True),
        "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
        "limit": eval_config.get("limit", None),
        "batch_size": "auto",
    }
    # Only pass num_fewshot when the config sets it; the chat-template
    # flags above already honour YAML overrides via .get().
    num_fewshot = eval_config.get("num_fewshot")
    if num_fewshot is not None:
        eval_params["num_fewshot"] = num_fewshot

    print("Eval Parameters:")
    print(eval_params)

    results = lm_eval.simple_evaluate(**eval_params)

    success = True
    report_data: dict[str, list[dict]] = {"rows": []}
    for task in eval_config["tasks"]:
        task_name = task["name"]
        task_result = results["results"][task_name]
        for metric in task["metrics"]:
            metric_name = metric["name"]
            ground_truth = metric["value"]
            measured_value = task_result[metric_name]
            task_success = bool(
                np.isclose(ground_truth, measured_value, rtol=RTOL))
            success = success and task_success

            print(f"{task_name} | {metric_name}: "
                  f"ground_truth={ground_truth} | measured={measured_value} | "
                  f"success={'✅' if task_success else '❌'}")

            # Mark each row with its own pass/fail (previously the
            # cumulative flag was used, mislabelling every row after the
            # first failure). lm-eval stores the stderr under e.g.
            # "acc_stderr,none" / "exact_match_stderr,strict-match".
            report_data["rows"].append({
                "task": task_name,
                "metric": metric_name,
                "value": f"✅{measured_value}"
                if task_success else f"❌{measured_value}",
                "stderr": task_result[metric_name.replace(",", "_stderr,")],
            })

    generate_report(tp_size, eval_config, report_data, report_output,
                    env_config)
    assert success
||||||
Reference in New Issue
Block a user