From b34f195cc8760ead9a704662cd3f6211054bb553 Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Sun, 23 Nov 2025 23:05:33 +0800
Subject: [PATCH] [CI] Fix nightly CI for A2 series (#3825)

### What this PR does / why we need it?
In a multi-node CI system, we need to ensure that cluster resources meet the expected specifications before running multi-node interoperability tests. Otherwise, unexpected errors may occur: for example, we might mistakenly assume all nodes are ready and perform a cluster-wide IP acquisition, which raises an exception in Python because some nodes are not actually ready at that point. Therefore, we wait at the workflow level until all resources meet the expected specifications.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379

---------

Signed-off-by: wangli
---
 .../workflows/_e2e_nightly_multi_node.yaml | 96 ++++++++++++++++---
 1 file changed, 83 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 3fa400e8..ae3bb6be 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -60,7 +60,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
   cancel-in-progress: true
 
 jobs:
@@ -115,8 +115,39 @@ jobs:
 
       - name: Clear resources
         run: |
-          # pre clear the crd resources created by lws
-          kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
+          set -euo pipefail
+
+          CRD_NAME="${CRD_NAME:-vllm}"
+          TIMEOUT=${TIMEOUT:-120}
+          SLEEP_INTERVAL=2
+
+          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
+          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+
+          echo "Waiting for all pods starting with 'vllm' to be deleted..."
+          START_TIME=$(date +%s)
+
+          while true; do
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
+              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+              exit 1
+            fi
+
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+
+            if [[ -z "$PODS_EXIST" ]]; then
+              echo "All vllm pods deleted."
+              break
+            else
+              echo "Waiting for pods to be deleted: $PODS_EXIST"
+              sleep $SLEEP_INTERVAL
+            fi
+          done
+
       - name: Launch cluster
         id: launcher
         run: |
@@ -164,19 +195,58 @@
 
       - name: Waiting for pod ready
         run: |
-          echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
+          POD_PREFIX="${POD_PREFIX:-vllm-0}"
+          SIZE="${{ inputs.size }}"
+          TIMEOUT=1200  # default timeout 20 minutes
+
+          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
+
+          START_TIME=$(date +%s)
 
           while true; do
-            # get pod status
-            READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
-
-            if [[ "$READY_STATUS" == "true" ]]; then
-              echo "Pod [$LEADER_POD] is Ready!"
-              break
-            else
-              echo "Pod [$LEADER_POD] not ready, waiting..."
-              sleep 3
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+            if [[ $ELAPSED -ge $TIMEOUT ]]; then
+              echo "Timeout reached after ${ELAPSED}s"
+              echo "Dumping pod status for debugging:"
+              kubectl get pods -n "$NAMESPACE"
+              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
+              exit 1
             fi
+
+            # 1) check follower pods
+            ALL_FOLLOWERS_READY=true
+            for ((i=1; i<${SIZE}; i++)); do
+              POD="${POD_PREFIX}-${i}"
+              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
+
+              echo "Follower [$POD] phase=$PHASE ready=$READY"
+
+              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
+                echo "Follower [$POD] not Ready yet..."
+                ALL_FOLLOWERS_READY=false
+                break
+              fi
+            done
+
+            # 2) check leader pod
+            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
+
+            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
+
+            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
+              echo "Leader not Ready yet..."
+              ALL_FOLLOWERS_READY=false
+            fi
+
+            if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
+              echo "All follower pods and leader pod are Running and Ready — continuing."
+              break
+            fi
+
+            sleep 2
           done
 
       - name: Stream logs
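
Side note: when the pods carry a stable label, the per-pod readiness loop above can also be expressed as a single `kubectl wait` call. Below is a minimal sketch of that alternative; the label `leaderworkerset.sigs.k8s.io/name=vllm` is an assumption about how the LeaderWorkerSet controller tags its pods, not something taken from this patch.

```bash
#!/usr/bin/env bash
# Sketch only: gate on cluster readiness with `kubectl wait` instead of a
# hand-rolled poll loop. The pod label below is assumed, not taken from this
# patch; verify it with `kubectl get pods --show-labels` before relying on it.
set -euo pipefail

NAMESPACE="${NAMESPACE:?namespace must be set}"
TIMEOUT="${TIMEOUT:-1200}"

# Block until every matching pod reports the Ready condition,
# or fail once TIMEOUT seconds elapse.
kubectl wait pod \
  -n "$NAMESPACE" \
  -l leaderworkerset.sigs.k8s.io/name=vllm \
  --for=condition=Ready \
  --timeout="${TIMEOUT}s"
```

One caveat: `kubectl wait` errors out immediately when no pods match the selector yet, so while pods are still being created the explicit poll loop used in this patch is the more robust choice.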