From 67d40f23fd8a0024c5be66899c5390ecdd929534 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Tue, 10 Mar 2026 16:25:51 +0800
Subject: [PATCH] [CI] Upgrade nightly multi-node-tests max-parallel to 2 (#7035)

### What this PR does / why we need it?
1. Increase the nightly multi-node test max-parallel from 1 to 2, and fix the resource conflicts that arise when tests run concurrently.
2. Fix the parse-trigger job: add an if condition so it only runs on schedule, workflow_dispatch, or PRs labeled nightly-test.
3. Adjust the nightly schedule: shift the trigger time from 24:00 to 23:45 (UTC+8).

### Does this PR introduce _any_ user-facing change?
No. This only changes CI workflows and test scripts.

### How was this patch tested?
- vLLM version: v0.16.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---------

Signed-off-by: hfadzxy
---
 .../workflows/_e2e_nightly_multi_node.yaml    | 62 +++++++++++++++----
 .../workflows/schedule_nightly_test_a2.yaml   | 10 ++-
 .../workflows/schedule_nightly_test_a3.yaml   | 10 ++-
 .../multi_node/scripts/lws-a2.yaml.jinja2     |  6 +-
 .../multi_node/scripts/lws.yaml.jinja2        |  6 +-
 5 files changed, 71 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 1777af17..cf9f38a7 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -66,7 +66,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
   cancel-in-progress: true
 
 jobs:
@@ -80,7 +80,6 @@ jobs:
     env:
       KUBECONFIG: /tmp/kubeconfig
       NAMESPACE: vllm-project
-      LEADER_POD: vllm-0
     steps:
      - name: Decode kubeconfig from secrets
        run: |
@@ -101,6 +100,17 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v6
 
+      - name: Set job variables
+        run: |
+          # Derive a unique, valid k8s resource name from config_file_path.
+          # Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
+          config_file="${{ inputs.config_file_path }}"
+          lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
+          LWS_NAME="vllm-${lws_suffix}"
+          echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
+          echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
+          echo "Computed LWS_NAME=${LWS_NAME}"
+
      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
@@ -110,14 +120,14 @@ jobs:
        run: |
          set -euo pipefail
 
-          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2
 
-          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
-          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+          echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
+          kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
+          kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
 
-          echo "Waiting for all pods starting with 'vllm' to be deleted..."
+          echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
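+          # Note (assumption): LeaderWorkerSet pods are named "<lws-name>-<group>" for the
+          # leader and "<lws-name>-<group>-<worker>" for workers (the steps below rely on
+          # "${LWS_NAME}-0" and "${LWS_NAME}-0-${i}"), so "^${LWS_NAME}-" matches exactly
+          # this run's pods and not those of a concurrently running job.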
          START_TIME=$(date +%s)
 
          while true; do
@@ -126,14 +136,14 @@ jobs:
 
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
-              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+              kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
              exit 1
            fi
 
-            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
 
            if [[ -z "$PODS_EXIST" ]]; then
-              echo "All vllm pods deleted."
+              echo "All pods for [$LWS_NAME] deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
@@ -174,6 +184,7 @@ jobs:
          fi
 
          jinja2 $TEMPLATE_FILE \
+            -D lws_name="$LWS_NAME" \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
@@ -190,7 +201,7 @@ jobs:
 
      - name: Waiting for pod ready
        run: |
-          POD_PREFIX="${POD_PREFIX:-vllm-0}"
+          POD_PREFIX="${LWS_NAME}-0"
          SIZE="${{ inputs.size }}"
 
          TIMEOUT=1200  # default timeout 20 minutes
@@ -260,7 +271,7 @@ jobs:
          trap cleanup EXIT
 
          for i in $(seq 1 $((size - 1))); do
-            POD="vllm-0-${i}"
+            POD="${LWS_NAME}-0-${i}"
            echo "==== Collecting logs from worker pod: $POD ===="
 
            kubectl logs -f "$POD" -n "$NAMESPACE" \
@@ -290,5 +301,34 @@ jobs:
      - name: Post process
        if: always()
        run: |
+          echo "Current pod status:"
          kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
+
+          echo "Deleting resources for [$LWS_NAME]..."
          kubectl delete -f ./lws.yaml --ignore-not-found=true || true
+
+          echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
+          TIMEOUT=300
+          SLEEP_INTERVAL=5
+          START_TIME=$(date +%s)
+
+          while true; do
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
+              kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
+              break
+            fi
+
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
+
+            if [[ -z "$PODS_EXIST" ]]; then
+              echo "All pods for [$LWS_NAME] have terminated."
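+              # This run's pods are fully gone; with max-parallel now 2, another nightly
+              # job may be waiting for these nodes, so exiting only here avoids device conflicts.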
+              break
+            else
+              echo "Waiting for pods to terminate: $PODS_EXIST"
+              sleep $SLEEP_INTERVAL
+            fi
+          done
diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml
index 347d4a36..011920cf 100644
--- a/.github/workflows/schedule_nightly_test_a2.yaml
+++ b/.github/workflows/schedule_nightly_test_a2.yaml
@@ -21,8 +21,8 @@ name: Nightly-A2
 
 on:
   schedule:
-    # Run test at 24:00 Beijing time (UTC+8)
-    - cron: "0 16 * * *"
+    # Run test at 23:45 Beijing time (UTC+8)
+    - cron: "45 15 * * *"
   workflow_dispatch:
   pull_request:
     branches:
@@ -50,6 +50,10 @@ jobs:
   parse-trigger:
     name: Parse trigger and determine test scope
     runs-on: linux-aarch64-a2b3-0
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'nightly-test')
     outputs:
       should_run: ${{ steps.parse.outputs.should_run }}
       test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -201,7 +205,7 @@ jobs:
     if: always() && needs.parse-trigger.outputs.should_run == 'true'
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 2
       matrix:
         test_config:
           - name: multi-node-deepseek-dp
diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml
index 66528caa..88c9b5eb 100644
--- a/.github/workflows/schedule_nightly_test_a3.yaml
+++ b/.github/workflows/schedule_nightly_test_a3.yaml
@@ -22,8 +22,8 @@ name: Nightly-A3
 
 on:
   schedule:
-    # Run test at 24:00 Beijing time (UTC+8)
-    - cron: "0 16 * * *"
+    # Run test at 23:45 Beijing time (UTC+8)
+    - cron: "45 15 * * *"
   workflow_dispatch:
   pull_request:
     branches:
@@ -50,6 +50,10 @@ jobs:
   parse-trigger:
     name: Parse trigger and determine test scope
     runs-on: linux-aarch64-a2b3-0
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'nightly-test')
     outputs:
       should_run: ${{ steps.parse.outputs.should_run }}
       test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -127,7 +131,7 @@ jobs:
     if: always() && needs.parse-trigger.outputs.should_run == 'true'
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 2
       matrix:
         test_config:
           - name: multi-node-deepseek-pd
diff --git a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
index b6048604..c1a2f75e 100644
--- a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
@@ -1,7 +1,7 @@
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
-  name: vllm
+  name: {{ lws_name | default("vllm") }}
   namespace: vllm-project
 spec:
   replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-leader
+  name: {{ lws_name | default("vllm") }}-leader
   namespace: vllm-project
 spec:
   ports:
@@ -137,6 +137,6 @@ spec:
     protocol: TCP
     targetPort: 8080
   selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
+    leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
     role: leader
   type: ClusterIP
diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
index 7e2de7b6..5b0aa94c 100644
--- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -1,7 +1,7 @@
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
-  name: vllm
+  name: {{ lws_name | default("vllm") }}
   namespace: vllm-project
 spec:
  replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-leader
+  name: {{ lws_name | default("vllm") }}-leader
   namespace: vllm-project
 spec:
   ports:
@@ -137,6 +137,6 @@ spec:
     protocol: TCP
     targetPort: 8080
   selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
+    leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
     role: leader
   type: ClusterIP
\ No newline at end of file
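
Note for reviewers: a minimal, runnable sketch of the LWS_NAME derivation added in the "Set job variables" step, to show how concurrent matrix jobs end up with distinct LeaderWorkerSet, Service, and pod names. The input file name "config_DeepSeek_V3.yaml" is a hypothetical example, not a config file from this PR:

    #!/usr/bin/env bash
    # Mirrors the step's derivation: strip ".yaml", lowercase, map "." and "_" to "-", cap at 50 chars.
    config_file="config_DeepSeek_V3.yaml"   # hypothetical example input
    lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
    LWS_NAME="vllm-${lws_suffix}"
    echo "$LWS_NAME"   # prints: vllm-config-deepseek-v3

Because every matrix entry passes a different config_file_path, each run also lands in its own concurrency group (the group key now appends inputs.config_file_path), which is what makes max-parallel: 2 safe.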