Files
xc-llm-ascend/.github/workflows/_e2e_nightly_multi_node.yaml
Li Wang 8e3f8bab57 [Nightly] Nightly pre-build image (#7388)
### What this PR does / why we need it?
This pull request refactors the nightly image build and simplifies the logic of
the multi-node workflows.
1. The nightly image build becomes a prerequisite when the tests are
triggered by `schedule` or `workflow_dispatch`
2. Simplify the pull-request select-case logic
3. Next step: implement replaceable nightly tests. Specifically, if
nightly tests are manually triggered, they can accept any optional
Docker image to meet the needs of different commits (which means the
image is customizable).
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2026-03-25 09:24:01 +08:00

332 lines
12 KiB
YAML

name: 'e2e nightly test multi_node'

# Reusable workflow: callers supply the SoC flavor, base image, model config
# and vLLM/vllm-ascend versions; the job spins up a LeaderWorkerSet on k8s.
on:
  workflow_call:
    inputs:
      soc_version:
        required: true
        type: string
        description: use a2 or a3
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
      image:
        required: false
        type: string
        description: base image for pods
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11"
      config_file_path:
        required: true
        type: string
        description: the model config for multi_node test
      replicas:
        required: false
        default: "1"
        type: string
        description: replicas of the k8s cluster
      size:
        required: false
        default: "2"
        type: string
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
      vllm_version:
        required: false
        default: "v0.18.0"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
        required: false
        default: https://github.com/vllm-project/vllm-ascend.git
        type: string
        description: used for pr level tests
      vllm_ascend_ref:
        required: false
        default: main
        type: string
        description: used for pr level tests
      should_run:
        required: true
        type: boolean
    secrets:
      KUBECONFIG_B64:
        required: true

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
  cancel-in-progress: true
jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    if: ${{ inputs.should_run }}
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
    env:
      # Every kubectl invocation below relies on these two variables.
      KUBECONFIG: /tmp/kubeconfig
      NAMESPACE: vllm-project
steps:
- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
if [ "${{ github.event_name }}" = "pull_request" ]; then
echo "PR test mode"
if [ "${{ inputs.soc_version }}" = "a3" ]; then
echo "Using A3 cached kubeconfig"
cp /root/.cache/.kube/kubeconfig.yaml "$KUBECONFIG"
else
echo "Using A2 cached kubeconfig"
cp /root/.cache/.kube/hk_001_kb.yaml "$KUBECONFIG"
fi
else
echo "Decoding kubeconfig from secrets"
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG"
fi
- name: Checkout code
uses: actions/checkout@v6
- name: Set job variables
run: |
# Derive a unique, valid k8s resource name from config_file_path.
# Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
config_file="${{ inputs.config_file_path }}"
lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
LWS_NAME="vllm-${lws_suffix}"
echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
echo "Computed LWS_NAME=${LWS_NAME}"
- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Clear resources
run: |
set -euo pipefail
TIMEOUT=${TIMEOUT:-120}
SLEEP_INTERVAL=2
echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
exit 1
fi
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
if [[ -z "$PODS_EXIST" ]]; then
echo "All pods for [$LWS_NAME] deleted."
break
else
echo "Waiting for pods to be deleted: $PODS_EXIST"
sleep $SLEEP_INTERVAL
fi
done
- name: Launch cluster
id: launcher
run: |
set -e
size="${{ inputs.size }}"
replicas="${{ inputs.replicas }}"
image="${{ inputs.image }}"
config_file_path="${{ inputs.config_file_path }}"
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
is_pr_test="${{ github.event_name == 'pull_request' }}"
vllm_version="${{ inputs.vllm_version }}"
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
required_params=("size" "replicas" "image" "config_file_path" "is_pr_test" "vllm_version" "vllm_ascend_ref" "vllm_ascend_remote_url")
for param in "${required_params[@]}"; do
if [ -z "${!param}" ]; then
echo "Error: Parameter '$param' is required but empty"
exit 1
fi
done
if [ "${{ inputs.soc_version }}" = "a3" ]; then
npu_per_node=16
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
else
npu_per_node=8
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
fi
jinja2 $TEMPLATE_FILE \
-D lws_name="$LWS_NAME" \
-D size="$size" \
-D replicas="$replicas" \
-D image="$image" \
-D config_file_path="$config_file_path" \
-D npu_per_node="$npu_per_node" \
-D fail_tag="$fail_tag" \
-D is_pr_test="$is_pr_test" \
-D vllm_version="$vllm_version" \
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
- name: Waiting for pod ready
run: |
POD_PREFIX="${LWS_NAME}-0"
SIZE="${{ inputs.size }}"
TIMEOUT=1200 # default timeout 20 minutes
echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached after ${ELAPSED}s"
echo "Dumping pod status for debugging:"
kubectl get pods -n "$NAMESPACE"
kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
exit 1
fi
# 1) check follower pods
ALL_FOLLOWERS_READY=true
for ((i=1; i<SIZE; i++)); do
POD="${POD_PREFIX}-${i}"
PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
echo "Follower [$POD] phase=$PHASE ready=$READY"
if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
echo "Follower [$POD] not Ready yet..."
ALL_FOLLOWERS_READY=false
break
fi
done
# 2) check leader pod
LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
echo "Leader not Ready yet..."
ALL_FOLLOWERS_READY=false
fi
if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
echo "All follower pods and leader pod are Running and Ready — continuing."
break
fi
sleep 2
done
- name: Stream logs
run: |
set -euo pipefail
size="${{ inputs.size }}"
pids=()
cleanup() {
echo "Cleaning up background log streams..."
for pid in "${pids[@]}"; do
kill "$pid" 2>/dev/null || true
done
}
trap cleanup EXIT
for i in $(seq 1 $((size - 1))); do
POD="${LWS_NAME}-0-${i}"
echo "==== Collecting logs from worker pod: $POD ===="
kubectl logs -f "$POD" -n "$NAMESPACE" \
> "/tmp/${POD}_logs.txt" 2>&1 &
pids+=($!)
done
echo "==== Streaming logs from leader pod: $LEADER_POD ===="
echo "Looking for logs containing: $FAIL_TAG"
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
echo "$line"
if echo "$line" | grep -q "$FAIL_TAG"; then
exit 1
fi
done
- name: Upload logs
if: always()
uses: actions/upload-artifact@v7
with:
name: ${{ inputs.config_file_path }}-pod-logs
path: /tmp/vllm*_logs.txt
retention-days: 7
- name: Post process
if: always()
run: |
echo "Current pod status:"
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
echo "Deleting resources for [$LWS_NAME]..."
kubectl delete -f ./lws.yaml --ignore-not-found=true || true
echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
TIMEOUT=300
SLEEP_INTERVAL=5
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
break
fi
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
if [[ -z "$PODS_EXIST" ]]; then
echo "All pods for [$LWS_NAME] have terminated."
break
else
echo "Waiting for pods to terminate: $PODS_EXIST"
sleep $SLEEP_INTERVAL
fi
done