[CI] CI refactor (#4928)
1. Rename the workflow to a better name.
2. Fix a lint error.
3. Remove the accuracy report doc and test.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -17,9 +17,9 @@
 name: 'ascend test / vllm main'
 
 on:
-  # Run full e2e tests per 2h
+  # Run full e2e tests per 4h
   schedule:
-    - cron: '0 */2 * * *'
+    - cron: '0 */4 * * *'
   workflow_dispatch:
 
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
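The cadence change above halves how often the full e2e suite is scheduled: `'0 */4 * * *'` fires at minute 0 of every fourth hour, i.e. 6 runs per day instead of 12. A quick illustrative sketch of the resulting UTC fire times (not part of the change itself):

```python
# Illustrative only: the UTC hours at which the new cron '0 */4 * * *' fires.
fire_hours = [f"{h:02d}:00 UTC" for h in range(0, 24, 4)]
print(fire_hours)
# ['00:00 UTC', '04:00 UTC', '08:00 UTC', '12:00 UTC', '16:00 UTC', '20:00 UTC']
```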
.github/workflows/vllm_ascend_test_report.yaml (vendored, 172 lines deleted)
@@ -1,172 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

# This test will be triggered:
# 1. schedule
# 2. pull_request change the related files
# 3. workflow_dispatch with models input

name: ascend test / accuracy report

on:
  pull_request:
    branches:
      - 'main'
      - '*-dev'
    paths:
      - '.github/workflows/vllm_ascend_test_report.yaml'
      - 'tests/e2e/models/test_lm_eval_correctness.py'
  workflow_dispatch:
    inputs:
      vllm-ascend-version:
        description: 'vllm-ascend:'
        required: true
        type: choice
        # Current supported vLLM versions
        options:
          - latest
          - main
        default: main

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  run:
    strategy:
      fail-fast: false
      matrix:
        include:
          - runner: linux-aarch64-a2-1
            model_list:
              - Qwen3-8B
              - Qwen2.5-VL-7B-Instruct
              - Qwen2-Audio-7B-Instruct
          - runner: linux-aarch64-a2-2
            model_list:
              - Qwen3-30B-A3B
              - Qwen3-VL-30B-A3B-Instruct
              - DeepSeek-V2-Lite
    uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
    with:
      vllm: v0.12.0
      runner: ${{ matrix.runner }}
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
      model_list: ${{ toJson(matrix.model_list) }}
      upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}

  create_pr:
    runs-on: ubuntu-latest
    needs: run
    if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
    env:
      UPSTREAM_REPO: vllm-project/vllm-ascend
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6.0.1
        with:
          repository: vllm-ascend-ci/vllm-ascend
          token: ${{ secrets.PAT_TOKEN }}
          ref: main

      - name: Add upstream remote
        run: |
          git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
          git fetch upstream
          git remote -v

      - name: Set Git user info dynamically
        run: |
          git config user.name "${{ github.actor }}"
          git config user.email "${{ github.actor }}@users.noreply.github.com"

      - name: Create or switch to branch
        run: |
          TIMESTAMP=$(date +%Y%m%d%H%M%S)
          BRANCH_NAME="auto-pr/accuracy-report-${TIMESTAMP}"
          echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV
          git checkout -B "${BRANCH_NAME}" upstream/main

      - name: Download only current run reports
        uses: actions/download-artifact@v6
        with:
          path: ./docs/source/developer_guide/evaluation/accuracy_report
          pattern: report-*
          github-token: ${{ secrets.GITHUB_TOKEN }}
          run-id: ${{ github.run_id }}

      - name: Delete old report
        run: |
          find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
          find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
          find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete

      - name: Update accuracy_report/index.md
        run: |
          REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
          INDEX_MD="$REPORT_DIR/index.md"
          {
            echo "# Accuracy Report"
            echo ""
            echo ":::{toctree}"
            echo ":caption: Accuracy Report"
            echo ":maxdepth: 1"

            for report in "$REPORT_DIR"/*.md; do
              filename="$(basename "$report" .md)"
              if [ "$filename" != "index" ]; then
                echo "$filename"
              fi
            done
            echo ":::"
          } > "$INDEX_MD"

      - name: push accuracy report
        env:
          GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
        run: |
          git add ./docs/source/developer_guide/evaluation/accuracy_report/*.md
          git commit -s -m "[Doc] Update accuracy reports for ${{ env.BRANCH_NAME }}"
          git push -f origin "${{ env.BRANCH_NAME }}"

      - name: Create PR in upstream via API
        uses: actions/github-script@v8
        with:
          github-token: ${{ secrets.PAT_TOKEN }}
          script: |
            const pr = await github.rest.pulls.create({
              owner: 'vllm-project',
              repo: 'vllm-ascend',
              head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
              base: 'main',
              title: `[Doc] Update accuracy reports for ${{ env.BRANCH_NAME }}`,
              body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models

              - [Workflow run][1]

              [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
            });
            core.info(`Created PR #${pr.data.number}`);
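For anyone who still needs to rebuild the accuracy-report index locally now that this workflow is gone, a rough Python equivalent of the deleted "Update accuracy_report/index.md" shell step is sketched below. It assumes the same docs layout as above and is illustrative only; nothing here is added to the repo:

```python
# Rough local equivalent of the removed "Update accuracy_report/index.md" step.
# Assumes the docs layout used above; illustrative only.
from pathlib import Path

report_dir = Path("docs/source/developer_guide/evaluation/accuracy_report")
lines = ["# Accuracy Report", "", ":::{toctree}", ":caption: Accuracy Report", ":maxdepth: 1", ""]
# One toctree entry per report page, skipping the index itself.
lines += sorted(p.stem for p in report_dir.glob("*.md") if p.stem != "index")
lines += [":::", ""]
(report_dir / "index.md").write_text("\n".join(lines))
```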
@@ -1,20 +0,0 @@
# deepseek-ai/DeepSeek-V2-Lite

- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP2
- **Execution mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS='pretrained=deepseek-ai/DeepSeek-V2-Lite,tensor_parallel_size=2,dtype=auto,trust_remote_code=True,max_model_len=4096,enforce_eager=True'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k \
  --batch_size auto
```

| Task  | Metric                       |    Value | Stderr   |
|-------|------------------------------|---------:|---------:|
| gsm8k | exact_match,strict-match     | ✅0.3813 | ± 0.0134 |
| gsm8k | exact_match,flexible-extract | ✅0.3836 | ± 0.0134 |
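The ✅ markers in the deleted reports indicate that a measured score was accepted by the (also deleted) tests/e2e/models/test_lm_eval_correctness.py check. The sketch below only illustrates the general idea of such a tolerance check; the function name and the 3% relative tolerance are hypothetical, not the removed test's actual logic:

```python
# Hypothetical tolerance check illustrating how a measured lm_eval score can be
# validated against a recorded baseline. Not the removed test's real implementation.
def score_accepted(measured: float, baseline: float, rel_tol: float = 0.03) -> bool:
    """Accept the run if the measured score is within rel_tol of the baseline."""
    return measured >= baseline * (1.0 - rel_tol)

# e.g. the gsm8k strict-match value above versus an assumed baseline of 0.38
print(score_accepted(0.3813, 0.38))  # True
```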
@@ -1,19 +0,0 @@
# Qwen/Qwen2.5-VL-7B-Instruct

- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP1
- **Execution mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS='pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=8192'
lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks mmmu_val \
  --apply_chat_template True --fewshot_as_multiturn True --batch_size auto
```

| Task     | Metric   |  Value | Stderr   |
|----------|----------|-------:|---------:|
| mmmu_val | acc,none | ✅0.52 | ± 0.0162 |
@@ -1,21 +0,0 @@
# Qwen/Qwen3-30B-A3B

- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP2 + EP
- **Execution mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS='pretrained=Qwen/Qwen3-30B-A3B,tensor_parallel_size=2,dtype=auto,trust_remote_code=False,max_model_len=4096,gpu_memory_utilization=0.6,enable_expert_parallel=True'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \
  --num_fewshot 5 --batch_size auto
```

| Task        | Metric                       |    Value | Stderr   |
|-------------|------------------------------|---------:|---------:|
| gsm8k       | exact_match,strict-match     | ✅0.8923 | ± 0.0085 |
| gsm8k       | exact_match,flexible-extract | ✅0.8506 | ± 0.0098 |
| ceval-valid | acc,none                     | ✅0.8358 | ± 0.0099 |
@@ -1,21 +0,0 @@
# Qwen/Qwen3-8B-Base

- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP1
- **Execution mode**: ACLGraph

**Command**:

```bash
export MODEL_ARGS='pretrained=Qwen/Qwen3-8B-Base,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=4096'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \
  --apply_chat_template True --fewshot_as_multiturn True --num_fewshot 5 --batch_size auto
```

| Task        | Metric                       |    Value | Stderr   |
|-------------|------------------------------|---------:|---------:|
| gsm8k       | exact_match,strict-match     | ✅0.8271 | ± 0.0104 |
| gsm8k       | exact_match,flexible-extract | ✅0.8294 | ± 0.0104 |
| ceval-valid | acc,none                     | ✅0.815  | ± 0.0103 |
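If the removed report pages ever need to be compared against a fresh lm_eval run, a small parser for result tables like the ones above could look like this. The helper is illustrative only; it is not existing project code:

```python
# Illustrative helper: pull (task, metric) -> value pairs out of a report table
# like the ones above, e.g. to diff two accuracy reports after a rerun.
def parse_report_table(markdown: str) -> dict[tuple[str, str], float]:
    rows: dict[tuple[str, str], float] = {}
    for line in markdown.splitlines():
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        # Skip non-table lines, the header row, and the |---| separator row.
        if len(cells) != 4 or cells[0] == "Task" or set(cells[1]) <= {"-", ":"}:
            continue
        task, metric, value, _stderr = cells
        rows[(task, metric)] = float(value.lstrip("✅❌"))
    return rows
```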
@@ -1,10 +0,0 @@
# Accuracy Report

:::{toctree}
:caption: Accuracy Report
:maxdepth: 1
DeepSeek-V2-Lite
Qwen2.5-VL-7B-Instruct
Qwen3-30B-A3B
Qwen3-8B-Base
:::
@@ -7,5 +7,4 @@ using_evalscope
 using_lm_eval
 using_ais_bench
 using_opencompass
-accuracy_report/index
 :::
@@ -200,7 +200,6 @@ class AscendW8A8DynamicFusedMoEMethod:
         assert router_logits.shape[
             1] == global_num_experts - global_redundant_expert_num, "Number of global experts mismatch (excluding redundancy)"
 
-        topk_weights, topk_ids = None, None
         if self.multistream_overlap_gate:
             fc3_context = get_flash_common3_context()
             assert fc3_context is not None
@@ -219,7 +218,8 @@ class AscendW8A8DynamicFusedMoEMethod:
                 scoring_func=scoring_func,
                 e_score_correction_bias=e_score_correction_bias,
                 global_num_experts=global_num_experts)
-
+        assert topk_ids is not None
+        assert topk_weights is not None
         # this is a naive implementation for experts load balance so as
         # to avoid accumulating too much tokens on a single rank.
         # currently it is only activated when doing profile runs.
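The last two hunks replace the `topk_weights, topk_ids = None, None` pre-initialization with explicit non-None assertions after the expert-selection branches, stating the invariant once and letting type checkers narrow the Optional results before use. A minimal standalone sketch of the pattern (the `select_experts` helper below is illustrative, not the module's real API):

```python
# Minimal sketch of the pattern applied above: rather than pre-initializing the
# results to None, assert they were produced by one of the selection branches.
# `select_experts` is an illustrative stand-in, not vllm-ascend's real helper.
from typing import Optional
import random

def select_experts(top_k: int) -> tuple[Optional[list[float]], Optional[list[int]]]:
    ids = random.sample(range(8), top_k)
    return [1.0 / top_k] * top_k, ids

topk_weights, topk_ids = select_experts(2)
assert topk_ids is not None
assert topk_weights is not None
# Past the asserts, readers and type checkers can rely on both being populated.
print(topk_ids, topk_weights)
```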