From 67d40f23fd8a0024c5be66899c5390ecdd929534 Mon Sep 17 00:00:00 2001
From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Date: Tue, 10 Mar 2026 16:25:51 +0800
Subject: [PATCH] [CI] Upgrade nightly multi-node-tests max-parallel to 2 (#7035)

### What this PR does / why we need it?
1. Increase the nightly multi-node test max-parallel from 1 to 2, and fix the resource conflicts that arise when tests run concurrently.
2. Fix the parse-trigger job: add an if condition so it only runs on schedule, workflow_dispatch, or PRs labeled nightly-test.
3. Adjust the nightly schedule: shift the trigger time from 24:00 to 23:45 (UTC+8).

### Does this PR introduce _any_ user-facing change?
No. This only changes CI workflows and test scripts.

### How was this patch tested?
- vLLM version: v0.16.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---------

Signed-off-by: hfadzxy
---
 .../workflows/_e2e_nightly_multi_node.yaml    | 62 +++++++++++++++----
 .../workflows/schedule_nightly_test_a2.yaml   | 10 ++-
 .../workflows/schedule_nightly_test_a3.yaml   | 10 ++-
 .../multi_node/scripts/lws-a2.yaml.jinja2     |  6 +-
 .../multi_node/scripts/lws.yaml.jinja2        |  6 +-
 5 files changed, 71 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 1777af17..cf9f38a7 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -66,7 +66,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
   cancel-in-progress: true
 
 jobs:
@@ -80,7 +80,6 @@ jobs:
     env:
       KUBECONFIG: /tmp/kubeconfig
       NAMESPACE: vllm-project
-      LEADER_POD: vllm-0
     steps:
      - name: Decode kubeconfig from secrets
        run: |
@@ -101,6 +100,17 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v6
 
+      - name: Set job variables
+        run: |
+          # Derive a unique, valid k8s resource name from config_file_path.
+          # Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
+          config_file="${{ inputs.config_file_path }}"
+          lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
+          LWS_NAME="vllm-${lws_suffix}"
+          echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
+          echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
+          echo "Computed LWS_NAME=${LWS_NAME}"
+
      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
@@ -110,14 +120,14 @@ jobs:
        run: |
          set -euo pipefail
 
-          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2
 
-          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
-          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+          echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
+          kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
+          kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
 
-          echo "Waiting for all pods starting with 'vllm' to be deleted..."
+          echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
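+          # Note (assumption): LeaderWorkerSet pods are named "<lws-name>-<group>" for the
+          # leader and "<lws-name>-<group>-<worker>" for workers (the steps below rely on
+          # "${LWS_NAME}-0" and "${LWS_NAME}-0-${i}"), so "^${LWS_NAME}-" matches exactly
+          # this run's pods and not those of a concurrently running job.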
          START_TIME=$(date +%s)
 
          while true; do
@@ -126,14 +136,14 @@ jobs:
 
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
-              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+              kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
              exit 1
            fi
 
-            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
 
            if [[ -z "$PODS_EXIST" ]]; then
-              echo "All vllm pods deleted."
+              echo "All pods for [$LWS_NAME] deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
@@ -174,6 +184,7 @@ jobs:
          fi
 
          jinja2 $TEMPLATE_FILE \
+            -D lws_name="$LWS_NAME" \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
@@ -190,7 +201,7 @@ jobs:
 
      - name: Waiting for pod ready
        run: |
-          POD_PREFIX="${POD_PREFIX:-vllm-0}"
+          POD_PREFIX="${LWS_NAME}-0"
          SIZE="${{ inputs.size }}"
 
          TIMEOUT=1200  # default timeout 20 minutes
@@ -260,7 +271,7 @@ jobs:
          trap cleanup EXIT
 
          for i in $(seq 1 $((size - 1))); do
-            POD="vllm-0-${i}"
+            POD="${LWS_NAME}-0-${i}"
            echo "==== Collecting logs from worker pod: $POD ===="
 
            kubectl logs -f "$POD" -n "$NAMESPACE" \
@@ -290,5 +301,34 @@ jobs:
      - name: Post process
        if: always()
        run: |
+          echo "Current pod status:"
          kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
+
+          echo "Deleting resources for [$LWS_NAME]..."
          kubectl delete -f ./lws.yaml --ignore-not-found=true || true
+
+          echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
+          TIMEOUT=300
+          SLEEP_INTERVAL=5
+          START_TIME=$(date +%s)
+
+          while true; do
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
+              kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
+              break
+            fi
+
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
+
+            if [[ -z "$PODS_EXIST" ]]; then
+              echo "All pods for [$LWS_NAME] have terminated."
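+              # This run's pods are fully gone; with max-parallel now 2, another nightly
+              # job may be waiting for these nodes, so exiting only here avoids device conflicts.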
+              break
+            else
+              echo "Waiting for pods to terminate: $PODS_EXIST"
+              sleep $SLEEP_INTERVAL
+            fi
+          done
diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml
index 347d4a36..011920cf 100644
--- a/.github/workflows/schedule_nightly_test_a2.yaml
+++ b/.github/workflows/schedule_nightly_test_a2.yaml
@@ -21,8 +21,8 @@ name: Nightly-A2
 
 on:
   schedule:
-    # Run test at 24:00 Beijing time (UTC+8)
-    - cron: "0 16 * * *"
+    # Run test at 23:45 Beijing time (UTC+8)
+    - cron: "45 15 * * *"
   workflow_dispatch:
   pull_request:
     branches:
@@ -50,6 +50,10 @@ jobs:
   parse-trigger:
     name: Parse trigger and determine test scope
     runs-on: linux-aarch64-a2b3-0
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'nightly-test')
     outputs:
       should_run: ${{ steps.parse.outputs.should_run }}
       test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -201,7 +205,7 @@ jobs:
     if: always() && needs.parse-trigger.outputs.should_run == 'true'
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 2
       matrix:
         test_config:
           - name: multi-node-deepseek-dp
diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml
index 66528caa..88c9b5eb 100644
--- a/.github/workflows/schedule_nightly_test_a3.yaml
+++ b/.github/workflows/schedule_nightly_test_a3.yaml
@@ -22,8 +22,8 @@ name: Nightly-A3
 
 on:
   schedule:
-    # Run test at 24:00 Beijing time (UTC+8)
-    - cron: "0 16 * * *"
+    # Run test at 23:45 Beijing time (UTC+8)
+    - cron: "45 15 * * *"
   workflow_dispatch:
   pull_request:
     branches:
@@ -50,6 +50,10 @@ jobs:
   parse-trigger:
     name: Parse trigger and determine test scope
     runs-on: linux-aarch64-a2b3-0
+    if: >-
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      contains(github.event.pull_request.labels.*.name, 'nightly-test')
     outputs:
       should_run: ${{ steps.parse.outputs.should_run }}
       test_filter: ${{ steps.parse.outputs.test_filter }}
@@ -127,7 +131,7 @@ jobs:
     if: always() && needs.parse-trigger.outputs.should_run == 'true'
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 2
       matrix:
         test_config:
           - name: multi-node-deepseek-pd
diff --git a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
index b6048604..c1a2f75e 100644
--- a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
@@ -1,7 +1,7 @@
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
-  name: vllm
+  name: {{ lws_name | default("vllm") }}
   namespace: vllm-project
 spec:
   replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-leader
+  name: {{ lws_name | default("vllm") }}-leader
   namespace: vllm-project
 spec:
   ports:
@@ -137,6 +137,6 @@ spec:
     protocol: TCP
     targetPort: 8080
   selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
+    leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
     role: leader
   type: ClusterIP
diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
index 7e2de7b6..5b0aa94c 100644
--- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -1,7 +1,7 @@
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
-  name: vllm
+  name: {{ lws_name | default("vllm") }}
   namespace: vllm-project
 spec:
  replicas: {{ replicas | default(1) }}
@@ -128,7 +128,7 @@ spec:
 apiVersion: v1
 kind: Service
 metadata:
-  name: vllm-leader
+  name: {{ lws_name | default("vllm") }}-leader
   namespace: vllm-project
 spec:
   ports:
@@ -137,6 +137,6 @@ spec:
     protocol: TCP
     targetPort: 8080
   selector:
-    leaderworkerset.sigs.k8s.io/name: vllm
+    leaderworkerset.sigs.k8s.io/name: {{ lws_name | default("vllm") }}
     role: leader
   type: ClusterIP
\ No newline at end of file
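
Note for reviewers: a minimal, runnable sketch of the LWS_NAME derivation added in the "Set job variables" step, to show how concurrent matrix jobs end up with distinct LeaderWorkerSet, Service, and pod names. The input file name "config_DeepSeek_V3.yaml" is a hypothetical example, not a config file from this PR:

    #!/usr/bin/env bash
    # Mirrors the step's derivation: strip ".yaml", lowercase, map "." and "_" to "-", cap at 50 chars.
    config_file="config_DeepSeek_V3.yaml"   # hypothetical example input
    lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
    LWS_NAME="vllm-${lws_suffix}"
    echo "$LWS_NAME"   # prints: vllm-config-deepseek-v3

Because every matrix entry passes a different config_file_path, each run also lands in its own concurrency group (the group key now appends inputs.config_file_path), which is what makes max-parallel: 2 safe.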