[Test] Refactor accuracy test to nightly test (#3814)

### What this PR does / why we need it?

Refactor the accuracy test into the nightly test.

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-11-06 09:06:59 +08:00
parent b1488ecdb1
commit 737cad2b6b
4 changed files with 142 additions and 146 deletions
--- a/.github/workflows/_e2e_nightly_single_node_models.yaml
+++ b/.github/workflows/_e2e_nightly_single_node_models.yaml
@@ -1,4 +1,21 @@
-name: 'accuracy test'
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+name: 'e2e nightly models test'

 on:
  workflow_call:
@@ -16,7 +33,7 @@ on:
      image:
        required: true
        type: string
-      model_name:
+      model_list:
        required: true
        type: string
      upload:
@@ -24,38 +41,44 @@ on:
        type: boolean
        default: false

-jobs:
-  accuracy_tests:
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}

+# Only cancel in-progress runs that share the same workflow, ref, runner and model list,
+# so runs for a different runner (1 card / 2 cards / 4 cards) or model list are not cancelled.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ inputs.runner }}-${{inputs.model_list}}
+  cancel-in-progress: true
+
+jobs:
+  e2e-nightly:
+    name: ${{inputs.model_list}} accuracy test
    runs-on: ${{ inputs.runner }}
-    name: ${{ inputs.model_name }} accuracy
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
      env:
        VLLM_USE_MODELSCOPE: True
-        # 1. If version specified (work_dispatch), do specified branch accuracy test
-        # 2. If no version (labeled PR), do accuracy test by default ref:
-        # The branch, tag or SHA to checkout. When checking out the repository that
-        # triggered a workflow, this defaults to the reference or SHA for that event.
-        # Otherwise, uses the default branch.
        GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
-
    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set model name as output
-        id: set_output
+      - name: Check npu and CANN info
        run: |
-          echo "model_name=${{ inputs.model_name }}" >> $GITHUB_OUTPUT
+          npu-smi info
+          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
-          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
-          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
-          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y
+          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
@@ -73,9 +96,16 @@ jobs:
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .
-        
+
+      - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
+        run: |
+          pip install -r requirements-dev.txt
+          pip install -v -e .
+
      - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
-        if: ${{ inputs.model_name == 'Qwen3-Next-80B-A3B-Instruct' }}
+        if: ${{ inputs.runner == 'linux-aarch64-a2-4' && contains(inputs.model_list, 'Qwen3-Next-80B-A3B-Instruct') }}
        shell: bash -l {0}
        run: |
          wget -q https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/Ascend-BiSheng-toolkit_aarch64.run -O /tmp/Ascend-BiSheng-toolkit_aarch64.run
@@ -108,14 +138,6 @@ jobs:
          path: ./vllm-ascend
          ref: ${{ env.GHA_VLLM_ASCEND_VERSION }}

-      - name: Install vllm-project/vllm-ascend
-        working-directory: ./vllm-ascend
-        env:
-          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -v -e .
-
      - name: Get vLLM commit hash and URL
        working-directory: ./vllm-empty
        run: |
@@ -149,11 +171,12 @@ jobs:
            pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
          } >> "$GITHUB_ENV"

-      - name: Run accuracy test
+      - name: Run vllm-project/vllm-ascend accuracy test
        id: report
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
+          VLLM_CI_RUNNER: ${{ inputs.runner }}
          VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
          VLLM_COMMIT: ${{ env.VLLM_COMMIT }}
          VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}
@@ -162,24 +185,44 @@ jobs:
          TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }}
          TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }}
        run: |
-          model_base_name=$(basename ${{ inputs.model_name }})
-          markdown_name="${model_base_name}"
-          echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
          mkdir -p ./benchmarks/accuracy
-          pytest -sv ./tests/e2e/models/test_lm_eval_correctness.py \
-          --config ./tests/e2e/models/configs/${{ inputs.model_name }}.yaml
+          echo "Received model_list: ${{ inputs.model_list }}"
+          models=$(echo '${{ inputs.model_list }}' | jq -r '.[]')
+          any_failure=0
+          for model in $models; do
+            echo "Running test for model: $model"
+            pytest -sv ./tests/e2e/models/test_lm_eval_correctness.py \
+              --config "./tests/e2e/models/configs/${model}.yaml" || {
+              echo "Test failed for model: $model"
+              any_failure=1
+            }
+          done
+
+          if [ $any_failure -ne 0 ]; then
+            exit 1
+          fi

      - name: Generate step summary
        if: ${{ always() }}
        run: |
-          cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY
+          models=$(echo '${{ inputs.model_list }}' | jq -r '.[]')  # model_list is a JSON array string
+          for model in $models; do
+            echo "Processing model: $model"
+            report="./benchmarks/accuracy/$(basename "$model").md"  # may be missing, presumably when that model's test failed early
+            if [ -f "$report" ]; then cat "$report" >> "$GITHUB_STEP_SUMMARY"; else echo "No accuracy report found for model: $model"; fi
+          done
+
+      - name: Set artifact timestamp
+        id: ts
+        run: |
+          echo "artifact_ts=$(date -u +%Y%m%dT%H%M%SZ)" >> $GITHUB_OUTPUT

      - name: Upload Report
        if: ${{ inputs.upload == true }}
        uses: actions/upload-artifact@v5
        with:
-          name: "report-${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
-          path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md
+          name: report-${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.ts.outputs.artifact_ts }}
+          path: ./benchmarks/accuracy/
          if-no-files-found: warn
          retention-days: 90
-          overwrite: true
+          overwrite: true
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -1,85 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-# This test will be triggered:
-# - PR labeled with: 'accuracy-test' & 'ready-for-test'
-name: ascend test / accuracy
-
-on:
-  pull_request:
-    branches:
-      - 'main'
-      - '*-dev'
-    types: [ labeled, synchronize ]
-
-# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
-# declared as "shell: bash -el {0}" on steps that need to be properly activated.
-# It's used to activate ascend-toolkit environment variables.
-defaults:
-  run:
-    shell: bash -el {0}
-
-# only cancel in-progress runs of the same workflow
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  run:
-    name: ""
-    strategy:
-      matrix:
-        # Only top series models should be listed in here
-        include:
-          - runner: a2-1
-            model_name: Qwen3-8B
-          - runner: a2-1
-            model_name: Qwen2.5-VL-7B-Instruct
-          # To do: This model has a bug that needs to be fixed and readded
-          # - runner: a2-1
-          #   model_name: Qwen2-Audio-7B-Instruct
-          - runner: a2-2
-            model_name: Qwen3-30B-A3B
-          - runner: a2-2
-            model_name: Qwen3-VL-30B-A3B-Instruct
-          - runner: a2-2
-            model_name: DeepSeek-V2-Lite
-          - runner: a2-4
-            model_name: Qwen3-Next-80B-A3B-Instruct
-          - runner: a2-1  
-            model_name: Qwen3-8B-W8A8
-          - runner: a2-1
-            model_name: Qwen3-VL-8B-Instruct
-          - runner: a2-1
-            model_name: Qwen2.5-Omni-7B
-          - runner: a2-1
-            model_name: Meta-Llama-3.1-8B-Instruct
-          - runner: a2-4
-            model_name: Qwen3-30B-A3B-W8A8
-      fail-fast: false
-    # test will be triggered when tag 'accuracy-test' & 'ready-for-test'
-    if:  >-
-      ${{
-      contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
-      contains(github.event.pull_request.labels.*.name, 'ready-for-test')
-      }}
-    uses: ./.github/workflows/_accuracy_test.yaml
-    with:
-      vllm: v0.11.0
-      runner:  linux-aarch64-${{ matrix.runner }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
-      model_name: ${{ matrix.model_name }}
--- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -27,6 +27,7 @@ on:
  pull_request: 
    branches:
      - 'main'
+    types: [ labeled, synchronize ]

 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -88,3 +89,44 @@ jobs:
      config_file_path: ${{ matrix.test_config.config_file_path }}
    secrets:
      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
+
+  single-node-accuracy-tests:
+    if: >-
+      ${{
+        github.event_name == 'schedule' ||
+        github.event_name == 'workflow_dispatch' ||
+        (
+          contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
+          contains(github.event.pull_request.labels.*.name, 'ready-for-test')
+        )
+      }}
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          - os: linux-aarch64-a2-1
+            model_list:
+              - Qwen3-8B
+              - Qwen2.5-VL-7B-Instruct
+              # TODO: This model has a bug that needs to be fixed and readded
+              # - Qwen2-Audio-7B-Instruct
+              - Qwen3-8B-W8A8
+              - Qwen3-VL-8B-Instruct
+              - Qwen2.5-Omni-7B
+              - Meta-Llama-3.1-8B-Instruct
+          - os: linux-aarch64-a2-2
+            model_list:
+              - Qwen3-30B-A3B
+              - Qwen3-VL-30B-A3B-Instruct
+              - DeepSeek-V2-Lite
+              - Qwen3-30B-A3B-W8A8
+          - os: linux-aarch64-a2-4
+            model_list:
+              - Qwen3-Next-80B-A3B-Instruct
+    uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
+    with:
+      vllm: v0.11.0
+      runner: ${{ matrix.test_config.os }}
+      model_list: ${{ toJson(matrix.test_config.model_list) }}  # passed as a JSON array string
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11  # same CANN tag as the reusable workflow's container
+      upload: false
--- a/.github/workflows/vllm_ascend_test_report.yaml
+++ b/.github/workflows/vllm_ascend_test_report.yaml
@@ -20,18 +20,15 @@
 # 2. pull_request change the related files
 # 3. workflow_dispatch with models input

-name: ascend test / models
+name: ascend test / accuracy report

 on:
-  schedule:
-    # Runs every 6 hours
-    - cron:  '0 */6 * * *'
  pull_request:
    branches:
      - 'main'
      - '*-dev'
    paths:
-      - '.github/workflows/vllm_ascend_test_models.yaml'
+      - '.github/workflows/vllm_ascend_test_report.yaml'
      - 'tests/e2e/models/test_lm_eval_correctness.py'
  workflow_dispatch:
    inputs:
@@ -60,27 +57,26 @@ concurrency:
 jobs:
  run:
    strategy:
+      fail-fast: false
      matrix:
        include:
-          - model_name: Qwen3-8B
-            runner: a2-1
-          - model_name: Qwen2.5-VL-7B-Instruct
-            runner: a2-1
-          - model_name: Qwen2-Audio-7B-Instruct
-            runner: a2-1
-          - model_name: Qwen3-30B-A3B
-            runner: a2-2
-          - model_name: Qwen3-VL-30B-A3B-Instruct
-            runner: a2-2
-          - model_name: DeepSeek-V2-Lite
-            runner: a2-2
-      fail-fast: false
-    uses: ./.github/workflows/_accuracy_test.yaml
+          - runner: linux-aarch64-a2-1
+            model_list:
+              - Qwen3-8B
+              - Qwen2.5-VL-7B-Instruct
+              # TODO: This model has a bug that needs to be fixed and readded
+              # - Qwen2-Audio-7B-Instruct
+          - runner: linux-aarch64-a2-2
+            model_list:
+              - Qwen3-30B-A3B
+              - Qwen3-VL-30B-A3B-Instruct
+              - DeepSeek-V2-Lite
+    uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
    with:
      vllm: v0.11.0
-      runner:  linux-aarch64-${{ matrix.runner }}
+      runner: ${{ matrix.runner }}
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
-      model_name: ${{ matrix.model_name }}
+      model_list: ${{ toJson(matrix.model_list) }}
      upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}

  create_pr: