From 1e4017e3fa8dc72abe20cf4014174f54bc9a2d07 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:46:37 +0800 Subject: [PATCH] [CI] support nightly ci for per pr by labels (#6483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? This PR refactors the nightly CI workflows (A2 and A3) to support running tests against a specific PR's code, in addition to the existing scheduled/dispatch runs using pre-built images. #### Motivation: Previously, nightly tests could only be triggered by schedule or workflow_dispatch, always using the pre-built nightly image. This change allows developers to trigger nightly tests against their own PR's source code, enabling early validation without waiting for a nightly build. #### Changes Trigger logic (parse-trigger job) A new parse-trigger job is introduced in both schedule_nightly_test_a2.yaml and schedule_nightly_test_a3.yaml to centralize trigger evaluation: `schedule / workflow_dispatch`: runs all tests with the pre-built image (existing behavior preserved) `pull_request (labeled + synchronize)`: runs only when the PR has the nightly-test label and a /nightly [test-names] comment exists (the latest comment wins): 1. /nightly or /nightly all — runs all tests 2. /nightly test1 test2 — runs only named tests (comma-wrapped for exact matching) #### How to trigger 1. Add the nightly-test label to your PR 2. Comment /nightly (all tests) or /nightly test1 test2 (specific tests) 3. Re-triggering: add another /nightly comment and push a new commit (synchronize event) ### Does this PR introduce _any_ user-facing change? None ### How was this patch tested? 
- vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd --------- Signed-off-by: hfadzxy --- .../workflows/_e2e_nightly_multi_node.yaml | 34 +++- .../workflows/_e2e_nightly_single_node.yaml | 65 +++++++ .../_e2e_nightly_single_node_models.yaml | 4 + .../workflows/schedule_nightly_test_a2.yaml | 165 ++++++++++++++++-- .../workflows/schedule_nightly_test_a3.yaml | 157 +++++++++++++++-- .../multi_node/scripts/lws-a2.yaml.jinja2 | 16 ++ .../multi_node/scripts/lws.yaml.jinja2 | 16 ++ tests/e2e/nightly/multi_node/scripts/run.sh | 65 ++++++- 8 files changed, 490 insertions(+), 32 deletions(-) diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index e080141c..1777af17 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -45,6 +45,12 @@ on: default: main type: string description: used for pr level tests + is_pr_test: + required: true + type: boolean + is_run: + required: true + type: boolean secrets: KUBECONFIG_B64: required: true @@ -68,6 +74,7 @@ jobs: name: ${{ inputs.config_file_path }} # This is the runner with no NPU for k8s controller runs-on: ${{ inputs.runner }} + if: ${{ inputs.is_run }} container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu env: @@ -78,8 +85,19 @@ jobs: - name: Decode kubeconfig from secrets run: | # Decode and save kubeconfig - echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG" - + if [ "${{ inputs.is_pr_test }}" = "true" ]; then + echo "PR test mode" + if [ "${{ inputs.soc_version }}" = "a3" ]; then + echo "Using A3 cached kubeconfig" + cp /root/.cache/.kube/kubeconfig.yaml "$KUBECONFIG" + else + echo "Using A2 cached kubeconfig" + cp /root/.cache/.kube/hk_001_kb.yaml "$KUBECONFIG" + fi + else + echo "Decoding kubeconfig from secrets" + echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG" + 
fi - name: Checkout code uses: actions/checkout@v6 @@ -133,9 +151,13 @@ jobs: image="${{ inputs.image }}" config_file_path="${{ inputs.config_file_path }}" fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}" - echo "FAIL_TAG=${fail_tag}" >> "$GITHUB_ENV" + is_pr_test="${{ inputs.is_pr_test }}" + vllm_version="${{ inputs.vllm_version }}" + vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}" + vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}" + echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV - required_params=("size" "replicas" "image" "config_file_path") + required_params=("size" "replicas" "image" "config_file_path" "is_pr_test" "vllm_version" "vllm_ascend_ref" "vllm_ascend_remote_url") for param in "${required_params[@]}"; do if [ -z "${!param}" ]; then echo "Error: Parameter '$param' is required but empty" @@ -158,6 +180,10 @@ jobs: -D config_file_path="$config_file_path" \ -D npu_per_node="$npu_per_node" \ -D fail_tag="$fail_tag" \ + -D is_pr_test="$is_pr_test" \ + -D vllm_version="$vllm_version" \ + -D vllm_ascend_ref="$vllm_ascend_ref" \ + -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \ --outfile lws.yaml kubectl apply -f ./lws.yaml diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index bae2043c..4e99e8e2 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -36,6 +36,16 @@ on: name: required: false type: string + vllm_version: + required: false + type: string + default: "v0.16.0" + is_pr_test: + required: true + type: boolean + is_run: + required: true + type: boolean # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. 
@@ -54,6 +64,7 @@ jobs: e2e-nightly: name: ${{ inputs.name || inputs.config_file_path || inputs.tests }} runs-on: ${{ inputs.runner }} + if: ${{ inputs.is_run }} timeout-minutes: 600 container: image: ${{ inputs.image }} @@ -65,6 +76,60 @@ jobs: run: | npu-smi info cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + + - name: uninstall vlm vllm-ascend and remove code (if pr test) + if: ${{ inputs.is_pr_test }} + run: | + pip uninstall -y vllm vllm-ascend || true + rm -rf /vllm-workspace/vllm /vllm-workspace/vllm-ascend + + - name: Checkout vllm-project/vllm repo + if: ${{ inputs.is_pr_test }} + uses: actions/checkout@v6 + with: + repository: vllm-project/vllm + ref: ${{ inputs.vllm_version }} + path: ./temp-vllm + fetch-depth: 1 + + - name: Checkout vllm-project/vllm-ascend repo + if: ${{ inputs.is_pr_test }} + uses: actions/checkout@v6 + with: + path: ./temp-vllm-ascend + fetch-depth: 1 + + - name: Move code to /vllm-workspace + if: ${{ inputs.is_pr_test }} + run: | + mv ./temp-vllm /vllm-workspace/vllm + mv ./temp-vllm-ascend /vllm-workspace/vllm-ascend + ls -R /vllm-workspace + + - name: Install vllm-project/vllm from source + if: ${{ inputs.is_pr_test }} + working-directory: /vllm-workspace/vllm + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + if: ${{ inputs.is_pr_test }} + working-directory: /vllm-workspace/vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi + run: | + pip install -r requirements-dev.txt + pip install -v -e . + + - name: Install aisbench + if: ${{ inputs.is_pr_test }} + shell: bash -l {0} + run: | + git clone -b v3.0-20250930-master --depth 1 https://gitee.com/aisbench/benchmark.git /vllm-workspace/vllm-ascend/benchmark + cd /vllm-workspace/vllm-ascend/benchmark + pip install pytest asyncio pytest-asyncio + pip install -e . 
-r requirements/api.txt -r requirements/extra.txt + python3 -m pip cache purge - name: Show vLLM and vLLM-Ascend version working-directory: /vllm-workspace diff --git a/.github/workflows/_e2e_nightly_single_node_models.yaml b/.github/workflows/_e2e_nightly_single_node_models.yaml index c3b4199f..f7672f62 100644 --- a/.github/workflows/_e2e_nightly_single_node_models.yaml +++ b/.github/workflows/_e2e_nightly_single_node_models.yaml @@ -40,6 +40,9 @@ on: required: false type: boolean default: false + is_run: + required: true + type: boolean # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. @@ -58,6 +61,7 @@ jobs: e2e-nightly: name: ${{inputs.model_list}} accuracy test runs-on: ${{ inputs.runner }} + if: ${{ inputs.is_run }} container: image: "${{ inputs.image }}" env: diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index 7eed1414..347d4a36 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -24,10 +24,15 @@ on: # Run test at 24:00 Beijing time (UTC+8) - cron: "0 16 * * *" workflow_dispatch: - pull_request: + pull_request: branches: - 'main' - types: [ labeled ] + types: [labeled, synchronize] + +permissions: + contents: read + pull-requests: read + issues: read # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. 
@@ -42,9 +47,84 @@ concurrency: cancel-in-progress: true jobs: + parse-trigger: + name: Parse trigger and determine test scope + runs-on: linux-aarch64-a2b3-0 + outputs: + should_run: ${{ steps.parse.outputs.should_run }} + test_filter: ${{ steps.parse.outputs.test_filter }} + is_pr_event: ${{ steps.parse.outputs.is_pr_event }} + steps: + - name: Parse trigger + id: parse + uses: actions/github-script@v7 + with: + script: | + const eventName = context.eventName; + + function parseNightlyComment(body) { + if (!body) return null; + const match = body.trim().match(/^\/nightly(?:\s+(.+))?$/m); + if (!match) return null; + const args = (match[1] || '').trim(); + if (!args || args === 'all') return 'all'; + // Wrap with commas for exact-name matching: ",name1,name2," + return ',' + args.split(/\s+/).join(',') + ','; + } + + // schedule / workflow_dispatch: run all tests with pre-built image + if (eventName === 'schedule' || eventName === 'workflow_dispatch') { + core.setOutput('should_run', 'true'); + core.setOutput('test_filter', 'all'); + core.setOutput('is_pr_event', 'false'); + return; + } + + // pull_request (labeled / synchronize) + if (eventName === 'pull_request') { + const labels = context.payload.pull_request.labels.map(l => l.name); + if (!labels.includes('nightly-test')) { + core.setOutput('should_run', 'false'); + core.setOutput('test_filter', ''); + core.setOutput('is_pr_event', 'true'); + return; + } + // Search comments for latest /nightly command + const prNumber = context.payload.pull_request.number; + const comments = await github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + per_page: 100, + }); + let testFilter = null; + for (let i = comments.length - 1; i >= 0; i--) { + const result = parseNightlyComment(comments[i].body); + if (result !== null) { testFilter = result; break; } + } + // No /nightly comment found: do not run any tests + if (testFilter === null) { + 
core.info('nightly-test label present but no /nightly comment found; skipping.'); + core.setOutput('should_run', 'false'); + core.setOutput('test_filter', ''); + core.setOutput('is_pr_event', 'true'); + return; + } + core.setOutput('should_run', 'true'); + core.setOutput('test_filter', testFilter); + core.setOutput('is_pr_event', 'true'); + return; + } + + // Fallback + core.setOutput('should_run', 'false'); + core.setOutput('test_filter', ''); + core.setOutput('is_pr_event', 'false'); + single-node-tests: name: single-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + needs: [parse-trigger] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false matrix: @@ -61,10 +141,25 @@ jobs: tests: ${{ matrix.test_config.tests }} name: ${{ matrix.test_config.name }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} single-node-yaml-tests: name: single-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + needs: [parse-trigger] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false matrix: @@ -84,11 +179,26 @@ jobs: image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' config_file_path: ${{ matrix.test_config.config_file_path }} name: ${{ matrix.test_config.name }} + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + 
needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + multi-node-tests: name: multi-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: [single-node-tests, single-node-yaml-tests] + needs: [parse-trigger, single-node-tests, single-node-yaml-tests] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false max-parallel: 1 @@ -108,40 +218,55 @@ jobs: replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} + vllm_ascend_ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.ref_name }} + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} secrets: KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }} single-node-accuracy-tests: - if: >- - ${{ - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - contains(github.event.pull_request.labels.*.name, 'accuracy-test') - }} + needs: [parse-trigger] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false matrix: test_config: - - os: linux-aarch64-a2b3-1 + - name: accuracy-group-1 + os: linux-aarch64-a2b3-1 model_list: - Qwen3-8B - 
Qwen2-Audio-7B-Instruct - Qwen3-8B-W8A8 - Qwen3-VL-8B-Instruct - Qwen2.5-Omni-7B - - os: linux-aarch64-a2b3-1 + - name: accuracy-group-2 + os: linux-aarch64-a2b3-1 model_list: - ERNIE-4.5-21B-A3B-PT - InternVL3_5-8B-hf - Molmo-7B-D-0924 - Llama-3.2-3B-Instruct - llava-onevision-qwen2-0.5b-ov-hf - - os: linux-aarch64-a2b3-2 + - name: accuracy-group-3 + os: linux-aarch64-a2b3-2 model_list: - Qwen3-30B-A3B - Qwen3-VL-30B-A3B-Instruct - Qwen3-30B-A3B-W8A8 - - os: linux-aarch64-a2b3-4 + - name: accuracy-group-4 + os: linux-aarch64-a2b3-4 model_list: - Qwen3-Next-80B-A3B-Instruct - Qwen3-Omni-30B-A3B-Instruct @@ -151,10 +276,18 @@ jobs: runner: ${{ matrix.test_config.os }} model_list: ${{ toJson(matrix.test_config.model_list) }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11' + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} upload: false doc-test: name: doc-test + needs: [parse-trigger] if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') strategy: # Each version should be tested diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml index 4c58aa39..66528caa 100644 --- a/.github/workflows/schedule_nightly_test_a3.yaml +++ b/.github/workflows/schedule_nightly_test_a3.yaml @@ -25,10 +25,15 @@ on: # Run test at 24:00 Beijing time (UTC+8) - cron: "0 16 * * *" workflow_dispatch: - pull_request: + pull_request: branches: - 'main' - types: [ labeled ] + types: [ labeled, synchronize ] + +permissions: + contents: read + pull-requests: read + issues: read # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. 
@@ -42,9 +47,84 @@ concurrency: cancel-in-progress: true jobs: + parse-trigger: + name: Parse trigger and determine test scope + runs-on: linux-aarch64-a2b3-0 + outputs: + should_run: ${{ steps.parse.outputs.should_run }} + test_filter: ${{ steps.parse.outputs.test_filter }} + is_pr_event: ${{ steps.parse.outputs.is_pr_event }} + steps: + - name: Parse trigger + id: parse + uses: actions/github-script@v7 + with: + script: | + const eventName = context.eventName; + + function parseNightlyComment(body) { + if (!body) return null; + const match = body.trim().match(/^\/nightly(?:\s+(.+))?$/m); + if (!match) return null; + const args = (match[1] || '').trim(); + if (!args || args === 'all') return 'all'; + // Wrap with commas for exact-name matching: ",name1,name2," + return ',' + args.split(/\s+/).join(',') + ','; + } + + // schedule / workflow_dispatch: run all tests with pre-built image + if (eventName === 'schedule' || eventName === 'workflow_dispatch') { + core.setOutput('should_run', 'true'); + core.setOutput('test_filter', 'all'); + core.setOutput('is_pr_event', 'false'); + return; + } + + // pull_request (labeled / synchronize) + if (eventName === 'pull_request') { + const labels = context.payload.pull_request.labels.map(l => l.name); + if (!labels.includes('nightly-test')) { + core.setOutput('should_run', 'false'); + core.setOutput('test_filter', ''); + core.setOutput('is_pr_event', 'true'); + return; + } + // Search comments for latest /nightly command + const prNumber = context.payload.pull_request.number; + const comments = await github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + per_page: 100, + }); + let testFilter = null; + for (let i = comments.length - 1; i >= 0; i--) { + const result = parseNightlyComment(comments[i].body); + if (result !== null) { testFilter = result; break; } + } + // No /nightly comment found: do not run any tests + if (testFilter === null) { + 
core.info('nightly-test label present but no /nightly comment found; skipping.'); + core.setOutput('should_run', 'false'); + core.setOutput('test_filter', ''); + core.setOutput('is_pr_event', 'true'); + return; + } + core.setOutput('should_run', 'true'); + core.setOutput('test_filter', testFilter); + core.setOutput('is_pr_event', 'true'); + return; + } + + // Fallback + core.setOutput('should_run', 'false'); + core.setOutput('test_filter', ''); + core.setOutput('is_pr_event', 'false'); + multi-node-tests: name: multi-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + needs: [parse-trigger] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false max-parallel: 1 @@ -106,13 +186,28 @@ jobs: replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} + vllm_ascend_ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.ref_name }} + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} secrets: KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} - + single-node-tests: name: single-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: [multi-node-tests] + needs: [parse-trigger, multi-node-tests] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false matrix: @@ -126,11 +221,25 @@ jobs: image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' tests: ${{ 
matrix.test_config.tests }} name: ${{ matrix.test_config.name }} + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} single-node-yaml-tests: name: single-node - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: [multi-node-tests] + if: always() && needs.parse-trigger.outputs.should_run == 'true' + needs: [parse-trigger, multi-node-tests] strategy: fail-fast: false matrix: @@ -190,11 +299,25 @@ jobs: image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' config_file_path: ${{ matrix.test_config.config_file_path }} name: ${{ matrix.test_config.name }} + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} custom-ops-tests: name: test ops - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: multi-node-tests + needs: [parse-trigger, multi-node-tests] + if: always() && needs.parse-trigger.outputs.should_run == 'true' strategy: fail-fast: false matrix: @@ -208,3 +331,17 @@ jobs: image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' tests: ${{ matrix.test_config.tests }} name: ${{ 
matrix.test_config.name }} + is_pr_test: >- + ${{ + needs.parse-trigger.outputs.is_pr_event == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} + is_run: >- + ${{ + needs.parse-trigger.outputs.should_run == 'true' && ( + needs.parse-trigger.outputs.test_filter == 'all' || + contains(needs.parse-trigger.outputs.test_filter, format(',{0},', matrix.test_config.name)) + ) + }} diff --git a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 index 80f07643..b6048604 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 @@ -24,6 +24,14 @@ spec: value: "/vllm-workspace" - name: FAIL_TAG value: {{ fail_tag | default("FAIL_TAG") }} + - name: IS_PR_TEST + value: "{{ is_pr_test | default("false") }}" + - name: VLLM_VERSION + value: {{ vllm_version | default("latest") }} + - name: VLLM_ASCEND_REF + value: {{ vllm_ascend_ref | default("main") }} + - name: VLLM_ASCEND_REMOTE_URL + value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} command: - sh - -c @@ -76,6 +84,14 @@ spec: value: "/vllm-workspace" - name: FAIL_TAG value: {{ fail_tag | default("FAIL_TAG") }} + - name: IS_PR_TEST + value: "{{ is_pr_test | default("false") }}" + - name: VLLM_VERSION + value: {{ vllm_version | default("latest") }} + - name: VLLM_ASCEND_REF + value: {{ vllm_ascend_ref | default("main") }} + - name: VLLM_ASCEND_REMOTE_URL + value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} command: - sh - -c diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index dffa0ea2..7e2de7b6 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ 
b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -24,6 +24,14 @@ spec: value: "/vllm-workspace" - name: FAIL_TAG value: {{ fail_tag | default("FAIL_TAG") }} + - name: IS_PR_TEST + value: "{{ is_pr_test | default("false") }}" + - name: VLLM_VERSION + value: {{ vllm_version | default("latest") }} + - name: VLLM_ASCEND_REF + value: {{ vllm_ascend_ref | default("main") }} + - name: VLLM_ASCEND_REMOTE_URL + value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} command: - sh - -c @@ -76,6 +84,14 @@ spec: value: "/vllm-workspace" - name: FAIL_TAG value: {{ fail_tag | default("FAIL_TAG") }} + - name: IS_PR_TEST + value: "{{ is_pr_test | default("false") }}" + - name: VLLM_VERSION + value: {{ vllm_version | default("latest") }} + - name: VLLM_ASCEND_REF + value: {{ vllm_ascend_ref | default("main") }} + - name: VLLM_ASCEND_REMOTE_URL + value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} command: - sh - -c diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index 2456d9a5..efb4f6d0 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -104,8 +104,6 @@ check_npu_info() { check_and_config() { echo "====> Configure mirrors and git proxy" - # Fix me(Potabk): Currently, there have some issues with accessing GitHub via https://gh-proxy.test.osinfra.cn in certain regions. - # We should switch to a more stable proxy for now until the network proxy is stable enough. 
git config --global url."https://ghfast.top/https://github.com/".insteadOf "https://github.com/" pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi @@ -136,6 +134,64 @@ install_extra_components() { echo "====> Extra components installation completed" } +checkout_src() { + echo "====> Checkout source code" + mkdir -p "$WORKSPACE" + cd "$WORKSPACE" + pip uninstall -y vllm vllm-ascend || true + rm -rf "$WORKSPACE/vllm" "$WORKSPACE/vllm-ascend" + + if [ ! -d "$WORKSPACE/vllm-ascend" ]; then + echo "Cloning vllm-ascend from $VLLM_ASCEND_REMOTE_URL" + git clone --depth 1 "$VLLM_ASCEND_REMOTE_URL" "$WORKSPACE/vllm-ascend" + cd "$WORKSPACE/vllm-ascend" + PR_REF=$(git ls-remote origin 'refs/pull/*/head' | grep "^${VLLM_ASCEND_REF}" | awk '{print $2}' | head -1) + if [ -n "$PR_REF" ]; then + git fetch --depth 1 origin "$PR_REF" + git checkout FETCH_HEAD + else + git fetch origin '+refs/pull/*/head:refs/remotes/pull/*' 2>/dev/null || true + git checkout "$VLLM_ASCEND_REF" + fi + fi + + if [ ! -d "$WORKSPACE/vllm" ]; then + echo "Cloning vllm version/ref: $VLLM_VERSION" + git clone --depth 1 --branch "$VLLM_VERSION" https://github.com/vllm-project/vllm.git "$WORKSPACE/vllm" + fi +} + +install_vllm() { + echo "====> Install vllm and vllm-ascend" + VLLM_TARGET_DEVICE=empty pip install -e "$WORKSPACE/vllm" + pip install -r "$WORKSPACE/vllm-ascend/requirements-dev.txt" + pip install -e "$WORKSPACE/vllm-ascend" +} + +install_aisbench() { + echo "====> Install AISBench benchmark" + + export AIS_BENCH_URL="https://gitee.com/aisbench/benchmark.git" + : "${AIS_BENCH_TAG:=v3.0-20250930-master}" + + BENCH_DIR="$WORKSPACE/vllm-ascend/benchmark" + + if [ -d "$BENCH_DIR" ]; then + echo "Removing existing benchmark directory..." + rm -rf "$BENCH_DIR" + fi + + git clone -b "${AIS_BENCH_TAG}" --depth 1 \ + "${AIS_BENCH_URL}" "${BENCH_DIR}" + + cd "$BENCH_DIR" + pip install -e . 
\ + -r requirements/api.txt \ + -r requirements/extra.txt + + python3 -m pip cache purge || echo "WARNING: pip cache purge failed, but proceeding..." + +} show_triton_ascend_info() { echo "====> Check triton ascend info" @@ -170,6 +226,11 @@ If this is insufficient to pinpoint the error, please download and review the lo main() { check_npu_info check_and_config + if [[ "$IS_PR_TEST" == "true" ]]; then + checkout_src + install_vllm + install_aisbench + fi show_vllm_info show_triton_ascend_info if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then