diff --git a/.github/workflows/label_merge_conflict.yml b/.github/workflows/bot_merge_conflict.yml similarity index 100% rename from .github/workflows/label_merge_conflict.yml rename to .github/workflows/bot_merge_conflict.yml diff --git a/.github/workflows/pr_create.yaml b/.github/workflows/bot_pr_create.yaml similarity index 100% rename from .github/workflows/pr_create.yaml rename to .github/workflows/bot_pr_create.yaml diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/labeled_doctest.yaml similarity index 100% rename from .github/workflows/vllm_ascend_doctest.yaml rename to .github/workflows/labeled_doctest.yaml diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/labeled_test_310.yaml similarity index 100% rename from .github/workflows/vllm_ascend_test_310p.yaml rename to .github/workflows/labeled_test_310.yaml diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/nightly_test_a2.yaml similarity index 100% rename from .github/workflows/vllm_ascend_test_nightly_a2.yaml rename to .github/workflows/nightly_test_a2.yaml diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/nightly_test_a3.yaml similarity index 100% rename from .github/workflows/vllm_ascend_test_nightly_a3.yaml rename to .github/workflows/nightly_test_a3.yaml diff --git a/.github/workflows/image_build_and_push.yaml b/.github/workflows/pr_tag_image_build_and_push.yaml similarity index 100% rename from .github/workflows/image_build_and_push.yaml rename to .github/workflows/pr_tag_image_build_and_push.yaml diff --git a/.github/workflows/release_code_and_wheel.yml b/.github/workflows/pr_tag_release_code_and_wheel.yml similarity index 100% rename from .github/workflows/release_code_and_wheel.yml rename to .github/workflows/pr_tag_release_code_and_wheel.yml diff --git a/.github/workflows/vllm_ascend_test_pr_full.yaml b/.github/workflows/pr_test_full.yaml similarity index 100% rename from 
.github/workflows/vllm_ascend_test_pr_full.yaml rename to .github/workflows/pr_test_full.yaml diff --git a/.github/workflows/vllm_ascend_test_pr_light.yaml b/.github/workflows/pr_test_light.yaml similarity index 100% rename from .github/workflows/vllm_ascend_test_pr_light.yaml rename to .github/workflows/pr_test_light.yaml diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml similarity index 100% rename from .github/workflows/nightly_benchmarks.yaml rename to .github/workflows/schedule_test_benchmarks.yaml diff --git a/.github/workflows/vllm_ascend_test_full_vllm_main.yaml b/.github/workflows/schedule_test_vllm_main.yaml similarity index 95% rename from .github/workflows/vllm_ascend_test_full_vllm_main.yaml rename to .github/workflows/schedule_test_vllm_main.yaml index b7253a4a..b5a39c87 100644 --- a/.github/workflows/vllm_ascend_test_full_vllm_main.yaml +++ b/.github/workflows/schedule_test_vllm_main.yaml @@ -17,9 +17,9 @@ name: 'ascend test / vllm main' on: - # Run full e2e tests per 2h + # Run full e2e tests per 4h schedule: - - cron: '0 */2 * * *' + - cron: '0 */4 * * *' workflow_dispatch: # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly diff --git a/.github/workflows/vllm_ascend_test_report.yaml b/.github/workflows/vllm_ascend_test_report.yaml deleted file mode 100644 index 0c548929..00000000 --- a/.github/workflows/vllm_ascend_test_report.yaml +++ /dev/null @@ -1,172 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -# This test will be triggered: -# 1. schedule -# 2. pull_request change the related files -# 3. workflow_dispatch with models input - -name: ascend test / accuracy report - -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - '.github/workflows/vllm_ascend_test_report.yaml' - - 'tests/e2e/models/test_lm_eval_correctness.py' - workflow_dispatch: - inputs: - vllm-ascend-version: - description: 'vllm-ascend:' - required: true - type: choice - # Current supported vLLM versions - options: - - latest - - main - default: main - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. 
-defaults: - run: - shell: bash -el {0} - -# only cancel in-progress runs of the same workflow -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - run: - strategy: - fail-fast: false - matrix: - include: - - runner: linux-aarch64-a2-1 - model_list: - - Qwen3-8B - - Qwen2.5-VL-7B-Instruct - - Qwen2-Audio-7B-Instruct - - runner: linux-aarch64-a2-2 - model_list: - - Qwen3-30B-A3B - - Qwen3-VL-30B-A3B-Instruct - - DeepSeek-V2-Lite - uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml - with: - vllm: v0.12.0 - runner: ${{ matrix.runner }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11 - model_list: ${{ toJson(matrix.model_list) }} - upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }} - - create_pr: - runs-on: ubuntu-latest - needs: run - if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }} - env: - UPSTREAM_REPO: vllm-project/vllm-ascend - steps: - - name: Checkout repository - uses: actions/checkout@v6.0.1 - with: - repository: vllm-ascend-ci/vllm-ascend - token: ${{ secrets.PAT_TOKEN }} - ref: main - - - name: Add upstream remote - run: | - git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git - git fetch upstream - git remote -v - - - name: Set Git user info dynamically - run: | - git config user.name "${{ github.actor }}" - git config user.email "${{ github.actor }}@users.noreply.github.com" - - - name: Create or switch to branch - run: | - TIMESTAMP=$(date +%Y%m%d%H%M%S) - BRANCH_NAME="auto-pr/accuracy-report-${TIMESTAMP}" - echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV - git checkout -B "${BRANCH_NAME}" upstream/main - - - name: Download only current run reports - uses: actions/download-artifact@v6 - with: - path: ./docs/source/developer_guide/evaluation/accuracy_report - pattern: report-* - github-token: ${{ 
secrets.GITHUB_TOKEN }} - run-id: ${{ github.run_id }} - - - name: Delete old report - run: | - find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete - find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; - find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - - - name: Update accuracy_report/index.md - run: | - REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" - INDEX_MD="$REPORT_DIR/index.md" - { - echo "# Accuracy Report" - echo "" - echo ":::{toctree}" - echo ":caption: Accuracy Report" - echo ":maxdepth: 1" - - for report in "$REPORT_DIR"/*.md; do - filename="$(basename "$report" .md)" - if [ "$filename" != "index" ]; then - echo "$filename" - fi - done - echo ":::" - } > "$INDEX_MD" - - - name: push accuracy report - env: - GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }} - run: | - git add ./docs/source/developer_guide/evaluation/accuracy_report/*.md - git commit -s -m "[Doc] Update accuracy reports for ${{ env.BRANCH_NAME }}" - git push -f origin "${{ env.BRANCH_NAME }}" - - - name: Create PR in upstream via API - uses: actions/github-script@v8 - with: - github-token: ${{ secrets.PAT_TOKEN }} - script: | - const pr = await github.rest.pulls.create({ - owner: 'vllm-project', - repo: 'vllm-ascend', - head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`, - base: 'main', - title: `[Doc] Update accuracy reports for ${{ env.BRANCH_NAME }}`, - body: `The accuracy results running on NPU Altlas A2 have changed, updating reports for: All models - - - [Workflow run][1] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}` - }); - core.info(`Created PR #${pr.data.number}`); diff --git a/docs/source/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.md 
b/docs/source/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.md deleted file mode 100644 index 68d43695..00000000 --- a/docs/source/developer_guide/evaluation/accuracy_report/DeepSeek-V2-Lite.md +++ /dev/null @@ -1,20 +0,0 @@ -# deepseek-ai/DeepSeek-V2-Lite - -- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a)) -- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724 -- **Hardware Environment**: Atlas A2 Series -- **Parallel mode**: TP2 -- **Execution mode**: ACLGraph - -**Command**: - -```bash -export MODEL_ARGS='pretrained=deepseek-ai/DeepSeek-V2-Lite,tensor_parallel_size=2,dtype=auto,trust_remote_code=True,max_model_len=4096,enforce_eager=True' -lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k \ - --batch_size auto -``` - -| Task | Metric | Value | Stderr | -|-----------------------|-------------|----------:|-------:| -| gsm8k | exact_match,strict-match | ✅0.3813 | ± 0.0134 | -| gsm8k | exact_match,flexible-extract | ✅0.3836 | ± 0.0134 | diff --git a/docs/source/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.md b/docs/source/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.md deleted file mode 100644 index 6ceff536..00000000 --- a/docs/source/developer_guide/evaluation/accuracy_report/Qwen2.5-VL-7B-Instruct.md +++ /dev/null @@ -1,19 +0,0 @@ -# Qwen/Qwen2.5-VL-7B-Instruct - -- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a)) -- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724 -- **Hardware Environment**: Atlas A2 Series -- **Parallel mode**: TP1 -- **Execution mode**: ACLGraph - -**Command**: - -```bash 
-export MODEL_ARGS='pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=8192' -lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks mmmu_val \ - --apply_chat_template True --fewshot_as_multiturn True --batch_size auto -``` - -| Task | Metric | Value | Stderr | -|-----------------------|-------------|----------:|-------:| -| mmmu_val | acc,none | ✅0.52 | ± 0.0162 | diff --git a/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.md b/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.md deleted file mode 100644 index d170936e..00000000 --- a/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-30B-A3B.md +++ /dev/null @@ -1,21 +0,0 @@ -# Qwen/Qwen3-30B-A3B - -- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a)) -- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724 -- **Hardware Environment**: Atlas A2 Series -- **Parallel mode**: TP2 + EP -- **Execution mode**: ACLGraph - -**Command**: - -```bash -export MODEL_ARGS='pretrained=Qwen/Qwen3-30B-A3B,tensor_parallel_size=2,dtype=auto,trust_remote_code=False,max_model_len=4096,gpu_memory_utilization=0.6,enable_expert_parallel=True' -lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \ - --num_fewshot 5 --batch_size auto -``` - -| Task | Metric | Value | Stderr | -|-----------------------|-------------|----------:|-------:| -| gsm8k | exact_match,strict-match | ✅0.8923 | ± 0.0085 | -| gsm8k | exact_match,flexible-extract | ✅0.8506 | ± 0.0098 | -| ceval-valid | acc,none | ✅0.8358 | ± 0.0099 | diff --git a/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.md b/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.md deleted file mode 100644 index 0649ee60..00000000 
--- a/docs/source/developer_guide/evaluation/accuracy_report/Qwen3-8B-Base.md +++ /dev/null @@ -1,21 +0,0 @@ -# Qwen/Qwen3-8B-Base - -- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a)) -- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724 -- **Hardware Environment**: Atlas A2 Series -- **Parallel mode**: TP1 -- **Execution mode**: ACLGraph - -**Command**: - -```bash -export MODEL_ARGS='pretrained=Qwen/Qwen3-8B-Base,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=4096' -lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \ - --apply_chat_template True --fewshot_as_multiturn True --num_fewshot 5 --batch_size auto -``` - -| Task | Metric | Value | Stderr | -|-----------------------|-------------|----------:|-------:| -| gsm8k | exact_match,strict-match | ✅0.8271 | ± 0.0104 | -| gsm8k | exact_match,flexible-extract | ✅0.8294 | ± 0.0104 | -| ceval-valid | acc,none | ✅0.815 | ± 0.0103 | diff --git a/docs/source/developer_guide/evaluation/accuracy_report/index.md b/docs/source/developer_guide/evaluation/accuracy_report/index.md deleted file mode 100644 index 59f7f23e..00000000 --- a/docs/source/developer_guide/evaluation/accuracy_report/index.md +++ /dev/null @@ -1,10 +0,0 @@ -# Accuracy Report - -:::{toctree} -:caption: Accuracy Report -:maxdepth: 1 -DeepSeek-V2-Lite -Qwen2.5-VL-7B-Instruct -Qwen3-30B-A3B -Qwen3-8B-Base -::: diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md index 8bc6894b..02bd9cd3 100644 --- a/docs/source/developer_guide/evaluation/index.md +++ b/docs/source/developer_guide/evaluation/index.md @@ -7,5 +7,4 @@ using_evalscope using_lm_eval using_ais_bench using_opencompass -accuracy_report/index ::: diff --git 
a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 72a3570e..cce8750b 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -200,7 +200,6 @@ class AscendW8A8DynamicFusedMoEMethod: assert router_logits.shape[ 1] == global_num_experts - global_redundant_expert_num, "Number of global experts mismatch (excluding redundancy)" - topk_weights, topk_ids = None, None if self.multistream_overlap_gate: fc3_context = get_flash_common3_context() assert fc3_context is not None @@ -219,7 +218,8 @@ class AscendW8A8DynamicFusedMoEMethod: scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, global_num_experts=global_num_experts) - + assert topk_ids is not None + assert topk_weights is not None # this is a naive implementation for experts load balance so as # to avoid accumulating too much tokens on a single rank. # currently it is only activated when doing profile runs.