diff --git a/.github/workflows/accuracy_report.yaml b/.github/workflows/accuracy_report.yaml deleted file mode 100644 index fe1dbd1..0000000 --- a/.github/workflows/accuracy_report.yaml +++ /dev/null @@ -1,202 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -name: Accuracy Report -on: - workflow_dispatch: - inputs: - vllm-ascend-branch: - description: 'vllm-ascend branch:' - required: true - type: choice - options: - - main - - v0.7.3-dev - models: - description: 'models:' - required: true - type: choice - options: - - all - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen3-8B-Base - default: 'all' - -jobs: - download_reports: - runs-on: ubuntu-latest - strategy: - matrix: - model: ${{ fromJSON( - (github.event.inputs.models == 'all' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || - (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && - '["Qwen/Qwen2.5-7B-Instruct"]') || - (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && - '["Qwen/Qwen2.5-VL-7B-Instruct"]') || - (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && - '["Qwen/Qwen3-8B-Base"]') - ) }} - - version: [0, 1] - exclude: - - model: 'Qwen/Qwen2.5-VL-7B-Instruct' - version: 1 - fail-fast: false - - name: Download ${{ matrix.model }} V${{ matrix.version }} - steps: - - name: Checkout 
repository - uses: actions/checkout@v4 - with: - ref: ${{ github.event.inputs.vllm-ascend-branch }} - - - name: Get base model name - id: get_basename - run: | - model_base_name=$(basename "${{ matrix.model }}") - echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT - shell: bash - - - name: Query artifact run id - id: get_run_id - run: | - ARTIFACT_PATTERN="${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report" - echo "Querying artifacts with pattern: $ARTIFACT_PATTERN" - - ARTIFACT_JSON=$(gh api --paginate /repos/${{ github.repository }}/actions/artifacts || echo "{}") - - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -s -r --arg pattern "$ARTIFACT_PATTERN" \ - '[.[].artifacts[]] | map(select(.name | test($pattern))) | sort_by(.created_at) | last | .workflow_run.id // empty') - - if [ -z "$RUN_ID" ]; then - echo "::warning::No artifact found matching pattern $ARTIFACT_PATTERN. Skipping download." - echo "runid=" >> $GITHUB_OUTPUT - else - echo "Found matching artifact with run ID: $RUN_ID" - echo "runid=$RUN_ID" >> $GITHUB_OUTPUT - fi - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Download Artifact - if: ${{ steps.get_run_id.outputs.runid != '' }} - uses: actions/download-artifact@v4 - with: - name: ${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report - path: ./docs/source/developer_guide/evaluation/accuracy_report_bak - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: ${{ github.repository }} - run-id: ${{ steps.get_run_id.outputs.runid }} - - - name: Upload reports artifact - if: ${{ steps.get_run_id.outputs.runid != '' }} - uses: actions/upload-artifact@v4 - with: - name: report-${{ steps.get_basename.outputs.model_base_name }}-v${{ matrix.version }} - path: ./docs/source/developer_guide/evaluation/accuracy_report_bak/*.md - retention-days: 90 - - create_pr: - runs-on: ubuntu-latest - needs: 
download_reports - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.event.inputs.vllm-ascend-branch }} - - - name: Setup workspace - run: mkdir -p ./accuracy/accuracy_report - - - name: Download only current run reports - uses: actions/download-artifact@v4 - with: - path: ./docs/source/developer_guide/evaluation/accuracy_report - pattern: report-* - github-token: ${{ secrets.GITHUB_TOKEN }} - run-id: ${{ github.run_id }} - - - name: Delete old report - run: | - find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete - find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; - find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - - - name: Generate step summary - if: ${{ always() }} - run: | - for report in ./docs/source/developer_guide/evaluation/accuracy_report/*.md; do - filename=$(basename "$report") - # skip index.md - if [ "$filename" = "index.md" ]; then - continue - fi - - if [ -f "$report" ]; then - { - echo -e "\n\n---\n" - echo "## 📄 Report File: $(basename $report)" - cat "$report" - } >> "$GITHUB_STEP_SUMMARY" - fi - done - - - name: Update accuracy_report/index.md - run: | - REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" - INDEX_MD="$REPORT_DIR/index.md" - - { - echo "# Accuracy Report" - echo "" - echo "::: {toctree}" - echo ":caption: Accuracy Report" - echo ":maxdepth: 1" - - for report in "$REPORT_DIR"/*.md; do - filename="$(basename "$report" .md)" - if [ "$filename" != "index" ]; then - echo "$filename" - fi - done - - echo ":::" - } > "$INDEX_MD" - - - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 - with: - token: ${{ secrets.PR_TOKEN }} - base: ${{ github.event.inputs.vllm-ascend-branch }} - branch: auto-pr/accuracy-report - 
commit-message: "Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" - add-paths: ./docs/source/developer_guide/evaluation/accuracy_report/*.md - title: "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" - body: | - The accuracy results running on NPU Altlas A2 have changed, updating reports for: - ${{ - github.event.inputs.models == 'all' - && 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' - || github.event.inputs.models - }} - - - [Workflow run][1] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} \ No newline at end of file diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index e14cd39..5e08f8d 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -45,8 +45,8 @@ on: type: choice options: - main - - v0.7.3-dev - v0.9.1-dev + - v0.7.3-dev models: description: 'model:' required: true @@ -183,7 +183,28 @@ jobs: PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt - pip install -e . + pip install -v -e . 
+
+      - name: Get vLLM commit hash and URL
+        working-directory: ./vllm-empty
+        run: |
+          VLLM_COMMIT=$(git rev-parse HEAD)
+          echo "VLLM_COMMIT=$VLLM_COMMIT" >> "$GITHUB_ENV"
+          echo "VLLM_COMMIT_URL=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
+
+      - name: Get vLLM-Ascend commit hash and URL
+        working-directory: ./vllm-ascend
+        run: |
+          VLLM_ASCEND_COMMIT=$(git rev-parse HEAD)
+          echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> "$GITHUB_ENV"
+          echo "VLLM_ASCEND_COMMIT_URL=https://github.com/vllm-project/vllm-ascend/commit/$VLLM_ASCEND_COMMIT" >> "$GITHUB_ENV"
+
+      - name: Print resolved hashes and URLs
+        run: |
+          echo "vLLM : ${{ env.VLLM_COMMIT }}"
+          echo "vLLM link : ${{ env.VLLM_COMMIT_URL }}"
+          echo "vLLM-Ascend: ${{ env.VLLM_ASCEND_COMMIT }}"
+          echo "Ascend link: ${{ env.VLLM_ASCEND_COMMIT_URL }}"
 
       - name: Install lm-eval, ray, and datasets
         run: |
@@ -239,7 +260,12 @@ jobs:
             --cann_version "${{ env.GHA_CANN_VERSION }}" \
             --torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
             --torch_version "${{ env.GHA_TORCH_VERSION }}" \
-            --vllm_version "${{ env.GHA_VLLM_VERSION }}"
+            --vllm_version "${{ env.GHA_VLLM_VERSION }}" \
+            --vllm_commit "${{ env.VLLM_COMMIT }}" \
+            --vllm_ascend_commit "${{ env.VLLM_ASCEND_COMMIT }}" \
+            --vllm_commit_url "${{ env.VLLM_COMMIT_URL }}" \
+            --vllm_ascend_commit_url "${{ env.VLLM_ASCEND_COMMIT_URL }}" \
+            --vllm_use_v1 "$VLLM_USE_V1"
 
       - name: Generate step summary
         if: ${{ always() }}
@@ -251,12 +277,125 @@ jobs:
           SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
           echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
 
+      # The whole report is scanned; a report containing ❌ is not uploaded by the next step.
+      - name: Check report for failed accuracy checks
+        id: check_report
+        run: |
+          REPORT_PATH="./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md"
+          echo "Scanning $REPORT_PATH for ❌ …"
+          if grep -q '❌' "$REPORT_PATH"; then
+            echo "contains_fail=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "contains_fail=false" >> "$GITHUB_OUTPUT"
+          fi
+
       - name: Upload Report for V${{ matrix.vllm_use_version }}
-        if: ${{ github.event_name == 'workflow_dispatch' }}
+        if: ${{ github.event_name == 'workflow_dispatch' && steps.check_report.outputs.contains_fail == 'false' }}
         uses: actions/upload-artifact@v4
         with:
-          name: "${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report"
+          name: "report-${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
           path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md
           if-no-files-found: warn
           retention-days: 90
           overwrite: true
+
+  # Collects the report-* artifacts of this run and proposes them upstream as a docs PR.
+  create_pr:
+    runs-on: ubuntu-latest
+    needs: accuracy_tests
+    if: ${{ github.event_name == 'workflow_dispatch' }}
+    env:
+      UPSTREAM_REPO: vllm-project/vllm-ascend
+    steps:
+      # The branch is committed in and pushed to this fork (origin), not to upstream.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-ascend-ci/vllm-ascend
+          token: ${{ secrets.PAT_TOKEN }}
+          ref: main
+
+      - name: Add upstream remote
+        run: |
+          git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
+          git fetch upstream
+          git remote -v
+
+      - name: Set Git user info dynamically
+        run: |
+          git config user.name "${{ github.actor }}"
+          git config user.email "${{ github.actor }}@users.noreply.github.com"
+
+      - name: Create or switch to branch
+        run: |
+          TIMESTAMP=$(date +%Y%m%d%H%M%S)
+          BRANCH_NAME="auto-pr/accuracy-report-${TIMESTAMP}"
+          echo "BRANCH_NAME=${BRANCH_NAME}" >> "$GITHUB_ENV"
+          git checkout -B "${BRANCH_NAME}" upstream/${{ github.event.inputs.vllm-ascend-version }}
+
+      - name: Download only current run reports
+        uses: actions/download-artifact@v4
+        with:
+          path: ./docs/source/developer_guide/evaluation/accuracy_report
+          pattern: report-*
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.run_id }}
+
+      - name: Delete old report
+        run: |
+          find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
+          find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
+          find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
+
+      - name: Update accuracy_report/index.md
+        run: |
+          REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
+          INDEX_MD="$REPORT_DIR/index.md"
+          {
+            echo "# Accuracy Report"
+            echo ""
+            echo ":::{toctree}"
+            echo ":caption: Accuracy Report"
+            echo ":maxdepth: 1"
+
+            for report in "$REPORT_DIR"/*.md; do
+              filename="$(basename "$report" .md)"
+              if [ "$filename" != "index" ]; then
+                echo "$filename"
+              fi
+            done
+            echo ":::"
+          } > "$INDEX_MD"
+
+      - name: push accuracy report
+        env:
+          GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
+        run: |
+          git add ./docs/source/developer_guide/evaluation/accuracy_report/*.md
+          git commit -s -m "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}"
+          git push -f origin "${{ env.BRANCH_NAME }}"
+
+      - name: Create PR in upstream via API
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{ secrets.PAT_TOKEN }}
+          script: |
+            const pr = await github.rest.pulls.create({
+              owner: 'vllm-project',
+              repo: 'vllm-ascend',
+              // Head owner is the fork the branch was pushed to (origin), not the dispatching user.
+              head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
+              base: '${{ github.event.inputs.vllm-ascend-version }}',
+              title: `[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-version }}`,
+              body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for:
+              ${{
+                  github.event.inputs.models == 'all'
+                  && 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)'
+                  || github.event.inputs.models
+              }}
+
+              - [Workflow run][1]
+
+              [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
+            });
+            core.info(`Created PR #${pr.data.number}`);
diff --git a/benchmarks/scripts/run_accuracy.py
b/benchmarks/scripts/run_accuracy.py index 79c58bc..5bdc370 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -31,24 +31,44 @@ UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] -batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1} +BATCH_SIZE = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1} MODEL_RUN_INFO = { "Qwen/Qwen2.5-7B-Instruct": - ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" + ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": - ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" + ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": - ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" + ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } +FILTER = { + "gsm8k": "exact_match,flexible-extract", + "ceval-valid": "acc,none", + "mmmu_val": "acc,none" +} +EXPECTED_VALUE = { + "Qwen/Qwen2.5-7B-Instruct": { + "ceval-valid": 0.80, + "gsm8k": 0.72 + }, + "Qwen/Qwen3-8B-Base": { + "ceval-valid": 0.82, + "gsm8k": 0.83 
+ }, + "Qwen/Qwen2.5-VL-7B-Instruct": { + "mmmu_val": 0.51 + } +} +RTOL = 0.03 +ACCURACY_FLAG = {} def run_accuracy_unimodal(queue, model, dataset): @@ -60,7 +80,7 @@ def run_accuracy_unimodal(queue, model, dataset): tasks=dataset, apply_chat_template=True, fewshot_as_multiturn=True, - batch_size=batch_size_dict[dataset], + batch_size=BATCH_SIZE[dataset], num_fewshot=5, ) print(f"Success: {model} on {dataset}") @@ -84,7 +104,7 @@ def run_accuracy_multimodal(queue, model, dataset): tasks=dataset, apply_chat_template=True, fewshot_as_multiturn=True, - batch_size=batch_size_dict[dataset], + batch_size=BATCH_SIZE[dataset], ) print(f"Success: {model} on {dataset}") measured_value = results["results"] @@ -102,25 +122,22 @@ def generate_md(model_name, tasks_list, args, datasets): run_cmd = MODEL_RUN_INFO[model_name].format(model=model_name, datasets=datasets) model = model_name.split("/")[1] - preamble = f"""# 🎯 {model} Accuracy Test -
- vLLM version: vLLM: {args.vllm_version}, vLLM Ascend: {args.vllm_ascend_version}
-
-
- Software Environment: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version}
-
-
- Hardware Environment: Atlas A2 Series
-
-
- Datasets: {datasets}
-
-
- Command: + version_info = ( + f"**vLLM Version**: vLLM: {args.vllm_version} " + f"([{args.vllm_commit}]({args.vllm_commit_url})), " + f"**vLLM Ascend**: {args.vllm_ascend_version} " + f"([{args.vllm_ascend_commit}]({args.vllm_ascend_commit_url}))") - ```bash - {run_cmd} - ``` + preamble = f"""# 🎯 {model} +{version_info} +**vLLM Engine**: V{args.vllm_use_v1} +**Software Environment**: CANN: {args.cann_version}, PyTorch: {args.torch_version}, torch-npu: {args.torch_npu_version} +**Hardware Environment**: Atlas A2 Series +**Datasets**: {datasets} +**Command**: +```bash +{run_cmd} +```
 
""" @@ -153,11 +170,12 @@ def generate_md(model_name, tasks_list, args, datasets): n_shot = "5" else: n_shot = "0" + flag = ACCURACY_FLAG.get(task_name, "") row = (f"| {task_name:<37} " f"| {flt:<6} " f"| {n_shot:6} " f"| {metric:<6} " - f"| ↑ {value:>5.4f} " + f"| {flag}{value:>5.4f} " f"| ± {stderr:>5.4f} |") if not task_name.startswith("-"): rows.append(row) @@ -187,6 +205,7 @@ def main(args): if args.model in UNIMODAL_MODEL_NAME: datasets = ",".join(UNIMODAL_TASK) for dataset in UNIMODAL_TASK: + accuracy_expected = EXPECTED_VALUE[args.model][dataset] p = multiprocessing.Process(target=run_accuracy_unimodal, args=(result_queue, args.model, dataset)) @@ -194,10 +213,16 @@ def main(args): p.join() result = result_queue.get() print(result) + if accuracy_expected - RTOL < result[dataset][ + FILTER[dataset]] < accuracy_expected + RTOL: + ACCURACY_FLAG[dataset] = "✅" + else: + ACCURACY_FLAG[dataset] = "❌" accuracy[args.model].append(result) if args.model in MULTIMODAL_NAME: datasets = ",".join(MULTIMODAL_TASK) for dataset in MULTIMODAL_TASK: + accuracy_expected = EXPECTED_VALUE[args.model][dataset] p = multiprocessing.Process(target=run_accuracy_multimodal, args=(result_queue, args.model, dataset)) @@ -205,12 +230,18 @@ def main(args): p.join() result = result_queue.get() print(result) + if accuracy_expected - RTOL < result[dataset][ + FILTER[dataset]] < accuracy_expected + RTOL: + ACCURACY_FLAG[dataset] = "✅" + else: + ACCURACY_FLAG[dataset] = "❌" accuracy[args.model].append(result) print(accuracy) safe_md(args, accuracy, datasets) if __name__ == "__main__": + multiprocessing.set_start_method('spawn', force=True) parser = argparse.ArgumentParser() parser.add_argument("--output", type=str, required=True) parser.add_argument("--model", type=str, required=True) @@ -219,8 +250,12 @@ if __name__ == "__main__": parser.add_argument("--torch_npu_version", type=str, required=False) parser.add_argument("--vllm_version", type=str, required=False) 
parser.add_argument("--cann_version", type=str, required=False) + parser.add_argument("--vllm_commit", type=lambda s: s[:7], required=False) + parser.add_argument("--vllm_commit_url", type=str, required=False) + parser.add_argument("--vllm_ascend_commit", + type=lambda s: s[:7], + required=False) + parser.add_argument("--vllm_ascend_commit_url", type=str, required=False) + parser.add_argument("--vllm_use_v1", type=str, required=False) args = parser.parse_args() - # TODO(yikun): - # 1. add a exit 1 if accuracy is not as expected - # 2. Add ✅, ❌ to markdown if accuracy is not as expected main(args)