diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 3fa400e8..ae3bb6be 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -60,7 +60,7 @@ defaults:
 # only cancel in-progress runs of the same workflow
 # and ignore the lint / 8 cards test type
 concurrency:
-  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }}
+  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
   cancel-in-progress: true
 
 jobs:
@@ -115,8 +115,39 @@ jobs:
 
       - name: Clear resources
         run: |
-          # pre clear the crd resources created by lws
-          kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
+          set -euo pipefail
+
+          # Pre-clear the CRD resources created by LWS, then wait until the
+          # matching pods are actually gone so the next launch starts clean.
+          CRD_NAME="${CRD_NAME:-vllm}"
+          TIMEOUT=${TIMEOUT:-120}
+          SLEEP_INTERVAL=2
+
+          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
+          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
+
+          echo "Waiting for all pods starting with 'vllm' to be deleted..."
+          START_TIME=$(date +%s)
+
+          while true; do
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+
+            if (( ELAPSED >= TIMEOUT )); then
+              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
+              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
+              exit 1
+            fi
+
+            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
+
+            if [[ -z "$PODS_EXIST" ]]; then
+              echo "All vllm pods deleted."
+              break
+            else
+              echo "Waiting for pods to be deleted: $PODS_EXIST"
+              sleep "$SLEEP_INTERVAL"
+            fi
+          done
+
       - name: Launch cluster
         id: launcher
         run: |
@@ -164,19 +195,60 @@ jobs:
 
       - name: Waiting for pod ready
         run: |
-          echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
+          set -euo pipefail
+
+          # Wait for the leader pod and all follower pods (vllm-0-1 .. vllm-0-<size-1>)
+          # to be Running with every container Ready, or fail after TIMEOUT seconds.
+          POD_PREFIX="${POD_PREFIX:-vllm-0}"
+          SIZE="${{ inputs.size }}"
+          TIMEOUT=${TIMEOUT:-1200}  # default timeout: 20 minutes
+
+          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
+
+          START_TIME=$(date +%s)
           while true; do
-            # get pod status
-            READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
-
-            if [[ "$READY_STATUS" == "true" ]]; then
-              echo "Pod [$LEADER_POD] is Ready!"
-              break
-            else
-              echo "Pod [$LEADER_POD] not ready, waiting..."
-              sleep 3
+            NOW=$(date +%s)
+            ELAPSED=$((NOW - START_TIME))
+            if (( ELAPSED >= TIMEOUT )); then
+              echo "Timeout reached after ${ELAPSED}s"
+              echo "Dumping pod status for debugging:"
+              kubectl get pods -n "$NAMESPACE"
+              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
+              exit 1
             fi
+
+            # 1) check follower pods; the jsonpath yields one token per container
+            ALL_PODS_READY=true
+            for ((i = 1; i < SIZE; i++)); do
+              POD="${POD_PREFIX}-${i}"
+              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
+
+              echo "Follower [$POD] phase=$PHASE ready=$READY"
+
+              # Ready means: Running, at least one container status reported, none false
+              if [[ "$PHASE" != "Running" || -z "$READY" || "$READY" == *"false"* ]]; then
+                echo "Follower [$POD] not Ready yet..."
+                ALL_PODS_READY=false
+                break
+              fi
+            done
+
+            # 2) check leader pod
+            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
+            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
+
+            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
+
+            if [[ "$LEADER_PHASE" != "Running" || -z "$LEADER_READY" || "$LEADER_READY" == *"false"* ]]; then
+              echo "Leader not Ready yet..."
+              ALL_PODS_READY=false
+            fi
+
+            if [[ "$ALL_PODS_READY" == "true" ]]; then
+              echo "All follower pods and leader pod are Running and Ready — continuing."
+              break
+            fi
+
+            sleep 2
           done
 
       - name: Stream logs
         run: |