[CI][Doc] Optimize multi-node CI (#3565)

### What this PR does / why we need it? This pull request mainly does the following: 1. Add a doc for multi-node CI; it covers how the mechanism works and how to contribute 2. Simplify the config YAML to make it more developer-friendly 3. Optimize the mooncake installation script to prevent accidental failures during installation 4. Fix the workflow to ensure the Kubernetes configuration can be applied correctly 5. Add Qwen3-235B-W8A8 disaggregated_prefill test 6. Add GLM-4.5 multi-DP test 7. Add 2p1d 4-node disaggregated_prefill test 8. Refactor nightly tests ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: 17c540a993 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-25 09:23:47 +08:00
parent 292cf339c3
commit 7f73c28a24
21 changed files with 1165 additions and 378 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -0,0 +1,190 @@
+# Reusable workflow (triggered only through workflow_call).
+# It renders tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 with the
+# inputs below, applies the result with kubectl, waits for the leader pod,
+# streams its logs and finally checks the exit code stored in the result file.
+name: 'e2e nightly test multi_node'
+
+on:
+  workflow_call:
+    inputs:
+      soc_version:
+        required: true
+        type: string
+        description: use a2 or a3
+      image:
+        required: false
+        type: string
+        description: base image for pods
+        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
+      config_file_path:
+        required: true
+        type: string
+        description: the model config for multi_node test
+      replicas:
+        required: false
+        default: "1"
+        type: string
+        description: replicas of the k8s cluster
+      size:
+        required: false
+        default: "2"
+        type: string
+        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
+      vllm_version:
+        required: false
+        default: "v0.11.0"
+        type: string
+        description: vllm version to use
+      vllm_ascend_remote_url:
+        required: false
+        default: https://github.com/vllm-project/vllm-ascend.git
+        type: string
+        description: used for pr level tests
+      vllm_ascend_ref:
+        required: false
+        default: main
+        type: string
+        description: used for pr level tests
+
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+# A new run cancels an in-progress run that belongs to the same group.
+# The group key is built from the workflow name and the git ref.
+# NOTE(review): this file is only invoked via workflow_call; `github.workflow`
+# is presumably the caller's workflow name here, so every invocation from one
+# caller on the same ref would share this group — confirm that is intended.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    # This is a runner with no NPU for k8s controller
+    runs-on: linux-aarch64-a3-0
+    # Container the controller steps run in, plus the environment shared by
+    # every step of this job.
+    container:
+      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      env:
+        # Where the decoded kubeconfig is written ("Setup kubeconfig" step)
+        KUBECONFIG: /tmp/kubeconfig
+        # Source path of the kubectl binary copied by the "Install kubectl" step
+        KUBECTL: /root/.cache/.kube/kubectl
+        # Namespace and pod name used when polling readiness and streaming logs
+        NAMESPACE: vllm-project
+        LEADER_POD: vllm-0
+        # File the "Determine is success" step reads the test exit code from
+        RESULT_FILE: /root/.cache/tests/ret/test_result.txt
+    steps:
+        # Point apt and pip at the Tsinghua mirrors, then install the tools the
+        # later steps rely on: jinja2-cli (renders the manifest template), git
+        # and curl.
+        - name: Install system dependencies
+          run: |
+            # configure apt and pip source
+            sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+            pip install jinja2-cli
+
+            apt-get update -y && apt-get install -y git curl
+
+        # kubectl is not downloaded here: the binary found at $KUBECTL is copied
+        # to /usr/local/bin so that later steps can simply call `kubectl`.
+        - name: Install kubectl
+          run: |
+            # Copy the binary, owned by root and with mode 0755
+            install --owner=root --group=root --mode=0755 "$KUBECTL" /usr/local/bin/kubectl
+
+            # Fail early if the installed client cannot run
+            kubectl version --client=true
+
+        # TODO: Add A2 tests
+        # Runs only when soc_version is 'a3'; for any other value this workflow
+        # never writes a kubeconfig to $KUBECONFIG.
+        # NOTE(review): in a workflow_call workflow the secret is presumably only
+        # populated when the caller passes it (explicitly or with
+        # `secrets: inherit`) — confirm on the calling side.
+        - name: Setup kubeconfig for A3
+          if: inputs.soc_version == 'a3'
+          run: |
+            # Decode and save kubeconfig
+            echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
+
+        - name: Checkout code
+          uses: actions/checkout@v4
+
+        # Copy the entrypoint script out of the checkout to
+        # /root/.cache/tests/run.sh (`install -D` creates missing parent dirs).
+        # NOTE(review): presumably /root/.cache is storage shared with the pods
+        # so they can execute the script — confirm.
+        - name: Prepare scripts
+          run: |
+            # prepare for lws entrypoint scripts
+            install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
+
+        # Remove any result file left at $RESULT_FILE so the "Determine is
+        # success" step only sees a value written after this point.
+        - name: Clear result ret
+          run: |
+            rm -f $RESULT_FILE
+
+        # Render the lws.yaml manifest from the Jinja2 template and apply it.
+        # Workflow inputs reach the script through `env` rather than being
+        # expanded inline with ${{ }}: inline expansion pastes the raw input text
+        # into the script, so a crafted value (e.g. a ref name containing quotes
+        # or `$(...)`) would be executed as shell code.
+        - name: Launch cluster
+          env:
+            INPUT_SIZE: ${{ inputs.size }}
+            INPUT_REPLICAS: ${{ inputs.replicas }}
+            INPUT_IMAGE: ${{ inputs.image }}
+            INPUT_CONFIG_FILE_PATH: ${{ inputs.config_file_path }}
+            INPUT_VLLM_VERSION: ${{ inputs.vllm_version }}
+            INPUT_VLLM_ASCEND_REF: ${{ inputs.vllm_ascend_ref }}
+            INPUT_VLLM_ASCEND_REMOTE_URL: ${{ inputs.vllm_ascend_remote_url }}
+          run: |
+            set -e
+
+            size="$INPUT_SIZE"
+            replicas="$INPUT_REPLICAS"
+            image="$INPUT_IMAGE"
+            config_file_path="$INPUT_CONFIG_FILE_PATH"
+            vllm_version="$INPUT_VLLM_VERSION"
+            vllm_ascend_ref="$INPUT_VLLM_ASCEND_REF"
+            vllm_ascend_remote_url="$INPUT_VLLM_ASCEND_REMOTE_URL"
+            result_file_path="$RESULT_FILE"
+
+            # ${!param} is bash indirect expansion: the value of the variable
+            # whose name is stored in $param
+            required_params=("size" "replicas" "image" "config_file_path")
+            for param in "${required_params[@]}"; do
+              if [ -z "${!param}" ]; then
+                echo "Error: Parameter '$param' is required but empty"
+                exit 1
+              fi
+            done
+
+            jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
+              -D size="$size" \
+              -D replicas="$replicas" \
+              -D image="$image" \
+              -D config_file_path="$config_file_path" \
+              -D vllm_version="$vllm_version" \
+              -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
+              -D vllm_ascend_ref="$vllm_ascend_ref" \
+              -D result_file_path="$result_file_path" \
+              --outfile lws.yaml
+
+            kubectl apply -f ./lws.yaml
+
+        # Poll the leader pod until every one of its containers reports ready.
+        # - A failing `kubectl get` (for example the pod object does not exist
+        #   yet right after `kubectl apply`) counts as "not ready" instead of
+        #   aborting the step under `bash -e`.
+        # - The wait is bounded so a pod that never starts cannot occupy the
+        #   runner forever; set POD_READY_TIMEOUT (seconds) to change the limit.
+        - name: Waiting for pod ready
+          run: |
+            echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
+
+            TIMEOUT="${POD_READY_TIMEOUT:-3600}"
+            INTERVAL=3
+            ELAPSED=0
+
+            while true; do
+              # get pod status: one true/false per container, space separated
+              READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" \
+                -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null || true)
+
+              # ready means: at least one status reported and none of them false
+              if [[ -n "$READY_STATUS" && "$READY_STATUS" != *false* ]]; then
+                echo "Pod [$LEADER_POD] is Ready!"
+                break
+              fi
+
+              if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
+                echo "Timeout: Pod [$LEADER_POD] not ready after ${TIMEOUT}s"
+                kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
+                exit 1
+              fi
+
+              echo "Pod [$LEADER_POD] not ready, waiting..."
+              sleep "$INTERVAL"
+              ELAPSED=$((ELAPSED + INTERVAL))
+            done
+
+        # Follow (-f) the leader pod's logs. The step blocks until the log
+        # stream ends and fails if `kubectl logs` exits with a non-zero status.
+        - name: Stream logs
+          run: |
+            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
+
+        # Wait up to 600s for the result file, then fail unless it holds 0.
+        # The content is validated as an integer first: with an empty or
+        # non-numeric value `[ "$RET" -ne 0 ]` itself errors out, the `if` sees
+        # that as "false" and the else branch would wrongly report success.
+        - name: Determine is success
+          run: |
+            TIMEOUT=600
+            ELAPSED=0
+            while [ ! -f "$RESULT_FILE" ]; do
+              sleep 5
+              ELAPSED=$((ELAPSED + 5))
+              if [ $ELAPSED -ge $TIMEOUT ]; then
+                echo "Timeout waiting for test result file"
+                exit 1
+              fi
+            done
+
+            # strip whitespace/newlines so the value can be validated strictly
+            RET=$(tr -d '[:space:]' < "$RESULT_FILE")
+            echo "Test result: $RET"
+
+            if ! [[ "$RET" =~ ^-?[0-9]+$ ]]; then
+              echo "Invalid test result: '$RET'"
+              exit 1
+            fi
+
+            if [ "$RET" -ne 0 ]; then
+              echo "Test failed"
+              exit 1
+            else
+              echo "Test succeeded"
+            fi
+
+        # Cleanup that runs even after a failed or cancelled earlier step.
+        # `get pods` is informational only, so its failure must not stop the
+        # delete under `bash -e`; lws.yaml is absent when rendering never ran,
+        # in which case nothing was applied and there is nothing to delete.
+        - name: Post process
+          if: always()
+          run: |
+            kubectl get pods -n "$NAMESPACE" || true
+            if [ -f ./lws.yaml ]; then
+              kubectl delete -f ./lws.yaml --ignore-not-found=true
+            fi
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
--- a/.github/workflows/multi_node_test.yaml
+++ b/.github/workflows/multi_node_test.yaml
@@ -1,125 +0,0 @@
-name: 'e2e test / multi-dp'
-
-on:
-    schedule:
-      - cron: "0 */4 * * *"
-    workflow_dispatch:
-
-# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
-# declared as "shell: bash -el {0}" on steps that need to be properly activated.
-# It's used to activate ascend-toolkit environment variables.
-defaults:
-  run:
-    shell: bash -el {0}
-
-# only cancel in-progress runs of the same workflow
-# and ignore the lint / 8 cards test type
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  e2e:
-    # This is a runner with no NPU for k8s controller
-    runs-on: linux-aarch64-a3-0
-    container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      env:
-        KUBECONFIG: /tmp/kubeconfig
-        KUBECTL: /root/.cache/.kube/kubectl
-        NAMESPACE: vllm-project
-        LEADER_POD: vllm-0
-    steps:
-        - name: Install system denpendencies
-          run: |
-           # configure apt and pip source
-           sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-           pip install jinja2-cli -y
-
-           apt-get update -y && apt-get install -y git curl
-
-           TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
-           git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
-
-        - name: Install kubectl
-          run: |
-            install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
-
-            # get kubeconfig from secret
-            echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
-
-        - name: Checkout code
-          uses: actions/checkout@v4
-
-        - name: Prepare scripts
-          run: |
-            # prepare for lws entrypoint scripts
-            install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh
-
-        - name: Launch cluster
-          run: |
-            jinja2 tests/e2e/multi_node/scripts/lws.yaml.jinja2 \
-              -D size=2 \
-              -D replicas=1 \
-              -D image="m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11" \
-              --outfile lws.yaml
-
-            kubectl apply -f ./lws.yaml
-          
-        - name: Waiting for pod ready
-          run: |
-            echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
-
-            while true; do
-              # get pod status
-              READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
-
-              if [[ "$READY_STATUS" == "true" ]]; then
-                echo "✅ Pod [$LEADER_POD] is Ready!"
-                break
-              else
-                echo "Pod [$LEADER_POD] not ready, waiting..."
-                sleep 3
-              fi
-            done
-
-        - name: Stream logs and monitor pod health
-          run: |
-            set -euo pipefail
-
-            echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
-            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
-            LOG_PID=$!
-
-            echo "Start monitoring Pod [$LEADER_POD] status ..."
-            while true; do
-              STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
-              if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
-                echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
-                kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
-                kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
-                kill $LOG_PID || true
-                exit 1
-              fi
-              sleep 5
-            done &
-
-            MONITOR_PID=$!
-            wait $LOG_PID || true
-            kill $MONITOR_PID || true
-
-        - name: Generate summary
-          if: always()
-          run: |
-            if [ -f "/root/.cache/test_summary.md" ]; then
-              cat /root/.cache/test_summary.md >> "$GITHUB_STEP_SUMMARY"
-            else
-              echo "No summary file found." >> "$GITHUB_STEP_SUMMARY"
-            fi
-
-        - name: Post process
-          if: always()
-          run: |
-            kubectl get pods -n $NAMESPACE
-            kubectl delete -f ./lws.yaml
--- a/.github/workflows/vllm_ascend_test_nightly.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly.yaml
@@ -1,133 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
-name: 'ascend test / nightly'
-
-on:
-  schedule:
-      # Run test at 24:00 Beijing time (UTC+8)
-      - cron: "0 16 * * *"
-  workflow_dispatch:
-  pull_request: 
-    branches:
-      - 'main'
-      - '*-dev'
-    types: [labeled,opened,synchronize]
-
-# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
-# declared as "shell: bash -el {0}" on steps that need to be properly activated.
-# It's used to activate ascend-toolkit environment variables.
-defaults:
-  run:
-    shell: bash -el {0}
-
-# only cancel in-progress runs of the same workflow
-# and ignore the lint / 1 card / 4 cards test type
-concurrency:
-  group: ascend-nightly-${{ github.ref }}
-  #cancel-in-progress: true
-
-jobs:
-  qwen3-32b:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        # should add A3 chip runner when available
-        os: [linux-aarch64-a2-4]
-    # Note (yikun): If CI resource are limited we can split job into two chain jobs
-    # only trigger e2e test after lint passed and the change is e2e related with pull request.
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      tests: tests/e2e/nightly/models/test_qwen3_32b.py
-  qwen3-32b-in8-a3:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a3-4 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
-  qwen3-32b-in8-a2:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a2-4 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
-  qwen3-235b-a22b-w8a8-eplb:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a3-16 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
-  deepseek-r1-w8a8-eplb:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a3-16 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
-  qwen3-32b-int8-a3-feature-stack3:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a3-4 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
-  qwen2-5-vl-7b:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a3-4 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
-  deepseek-r1-0528-w8a8:
-    if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
-    strategy:
-      matrix:
-        os: [ linux-aarch64-a3-16 ]
-    uses: ./.github/workflows/_e2e_nightly.yaml
-    with:
-      vllm: v0.11.0
-      runner: ${{ matrix.os }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
-      tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
--- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+# This workflow related to the resources atlas 800 A2
+# We will not limit the concurrency of jobs on A2
+name: 'ascend test / nightly-a2'
+
+# NOTE(review): the only job in this file is gated on the schedule and
+# workflow_dispatch events, so runs started by the pull_request trigger skip
+# it — confirm the pull_request trigger is still wanted.
+on:
+  schedule:
+      # Run test at 24:00 Beijing time (UTC+8)
+      - cron: "0 16 * * *"
+  workflow_dispatch:
+  pull_request: 
+    branches:
+      - 'main'
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+# only cancel in-progress runs of the same workflow
+concurrency:
+  group: ascend-nightly-${{ github.ref }}-a2
+  cancel-in-progress: true
+
+jobs:
+  # Single-node nightly tests on A2 runners, one matrix entry per test file.
+  # Only runs for scheduled or manually dispatched workflow runs.
+  single-node-tests:
+    # Use the matrix entry's `name` as the job title; without this the field
+    # is unused and there is no readable name for each matrix job.
+    name: ${{ matrix.test_config.name }}
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          - name: qwen3-32b
+            os: linux-aarch64-a2-4
+            tests: tests/e2e/nightly/models/test_qwen3_32b.py
+          - name: qwen3-32b-in8-a2
+            os: linux-aarch64-a2-4
+            tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
+    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
+    with:
+      vllm: v0.11.0
+      runner: ${{ matrix.test_config.os }}
+      tests: ${{ matrix.test_config.tests }}
--- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+# This workflow related to the resources atlas 800 A3
+# **Please note**: current A3 resource pool's maximum allowed concurrency is 5*16 NPUs
+# We will limit the concurrency of jobs on A3 to avoid the risk of insufficient resources
+name: 'ascend test / nightly-a3'
+
+# NOTE(review): both jobs in this file are gated on the schedule and
+# workflow_dispatch events, so runs started by the pull_request trigger skip
+# them — confirm the pull_request trigger is still wanted.
+on:
+  schedule:
+      # Run test at 24:00 Beijing time (UTC+8)
+      - cron: "0 16 * * *"
+  workflow_dispatch:
+  pull_request: 
+    branches:
+      - 'main'
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+concurrency:
+  group: ascend-nightly-${{ github.ref }}-a3
+  cancel-in-progress: true
+
+jobs:
+  # Single-node nightly tests on A3 runners, one matrix entry per test file.
+  # Only runs for scheduled or manually dispatched workflow runs.
+  single-node-tests:
+    # Use the matrix entry's `name` as the job title; without this the field
+    # is unused and there is no readable name for each matrix job.
+    name: ${{ matrix.test_config.name }}
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    strategy:
+      fail-fast: false
+      matrix:
+        test_config:
+          - name: qwen3-32b-in8-a3
+            os: linux-aarch64-a3-4
+            tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
+          - name: qwen3-32b-int8-a3-feature-stack3
+            os: linux-aarch64-a3-4
+            tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
+          - name: qwen3-235b-a22b-w8a8-eplb
+            os: linux-aarch64-a3-16
+            tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
+          - name: deepseek-r1-w8a8-eplb
+            os: linux-aarch64-a3-16
+            tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
+          - name: qwen2-5-vl-7b
+            os: linux-aarch64-a3-4
+            tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
+          - name: deepseek-r1-0528-w8a8
+            os: linux-aarch64-a3-16
+            tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
+    with:
+      vllm: v0.11.0
+      runner: ${{ matrix.test_config.os }}
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      tests: ${{ matrix.test_config.tests }}
+
+  # Multi-node nightly tests. They start after single-node-tests has finished
+  # (whatever its result, because of always()) and run one at a time
+  # (max-parallel: 1).
+  multi-node-tests:
+    # Use the matrix entry's `name` as the job title; without this the field
+    # is unused.
+    name: ${{ matrix.test_config.name }}
+    needs: single-node-tests
+    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        test_config:
+          # `size` is quoted because the called workflow declares it as a
+          # string input.
+          - name: multi-node-deepseek-pd
+            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
+            size: "2"
+          - name: multi-node-qwen3-dp
+            config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
+            size: "2"
+          - name: multi-node-dpsk-4node-pd
+            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
+            size: "4"
+    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
+    # The called workflow decodes secrets.KUBECONFIG_B64 into its kubeconfig.
+    # Secrets are not forwarded to a reusable workflow unless the caller passes
+    # them, so without this line the kubeconfig would be written empty.
+    secrets: inherit
+    with:
+      soc_version: a3
+      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      replicas: "1"
+      size: ${{ matrix.test_config.size }}
+      config_file_path: ${{ matrix.test_config.config_file_path }}