# xc-llm-ascend/.github/workflows/multi_node_test.yaml

# Multi-node end-to-end test workflow.
name: 'e2e test / multi-dp'

# Triggered on a schedule (minute 0 of every 4th hour) and by manual dispatch.
on:
  schedule:
    - cron: '0 */4 * * *'
  workflow_dispatch:

# Run every `run:` step in a login shell (-l) with errexit (-e). A plain bash
# step does not read ~/.profile or ~/.bashrc, and the login shell is what
# activates the ascend-toolkit environment variables.
defaults:
  run:
    shell: 'bash -el {0}'

# Allow one run per workflow and ref at a time: a newly triggered run cancels
# the run of this same workflow that is still in progress on that ref.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true

jobs:
  e2e:
    # Runner without an NPU; it only acts as the k8s controller and drives the
    # cluster through kubectl.
    runs-on: 'linux-aarch64-a3-0'
    container:
      image: 'm.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11'
      env:
        # Path the decoded kubeconfig is written to; kubectl reads this variable.
        KUBECONFIG: '/tmp/kubeconfig'
        # kubectl binary that is expected to already exist inside the container.
        KUBECTL: '/root/.cache/.kube/kubectl'
        # Namespace and name of the pod the steps poll and stream logs from.
        NAMESPACE: 'vllm-project'
        LEADER_POD: 'vllm-0'
    steps:
        - name: Install system dependencies
          env:
            # Hand the secret over through the environment instead of expanding
            # it into the script text, so it never becomes part of the shell
            # source that bash parses.
            ADMIN_PTA: ${{ secrets.ADMIN_PTA }}
          run: |
            # configure apt and pip mirrors
            sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

            apt-get update -y && apt-get install -y git curl

            # Build the HTTP basic-auth header that git sends to the proxy.
            # -w 0 turns off base64 line wrapping (76 columns by default); a
            # wrapped value would put a newline inside the header.
            TOKEN=$(printf '%s' "x-access-token:${ADMIN_PTA}" | base64 -w 0)
            # The encoded value is a new string the runner does not know to
            # redact, so register it as a secret before it can reach any log.
            echo "::add-mask::${TOKEN}"
            git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"

        - name: Install kubectl
          env:
            # Passed via the environment rather than expanded into the script.
            KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
          run: |
            # $KUBECTL (job-level env) points at a kubectl binary that must
            # already exist in the container; copy it into PATH as executable.
            install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl

            # get kubeconfig from secret
            # The file holds cluster credentials, so create it owner-only; the
            # chmod also covers the case where the file already existed.
            umask 077
            printf '%s' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG"
            chmod 600 "$KUBECONFIG"

        # Check out the repository so the later steps can read the files under
        # tests/e2e/multi_node/scripts/ from the workspace.
        - name: Checkout code
          uses: actions/checkout@v4

        - name: Prepare scripts
          run: |
            # Copy the LWS entrypoint script into /root/.cache/tests/;
            # `install -D` creates the missing parent directories.
            install -D \
              tests/e2e/multi_node/scripts/run.sh \
              /root/.cache/tests/run.sh

        - name: Launch cluster
          run: |
            # Create (or update) the resources declared in the LWS manifest.
            kubectl apply --filename tests/e2e/multi_node/scripts/lws.yaml

        - name: Waiting for pod ready
          env:
            # Upper bound for the wait, in seconds. Without one, a pod that can
            # never start keeps the job busy until the runner's own job limit.
            # Tune this to the slowest expected start-up.
            POD_READY_TIMEOUT_SECONDS: '3600'
          run: |
            echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."

            # SECONDS is bash's built-in count of seconds since the shell started.
            DEADLINE=$((SECONDS + POD_READY_TIMEOUT_SECONDS))

            while true; do
              # get pod status
              # The pod may not exist yet right after `kubectl apply`, and this
              # shell runs with -e, so a failed query must not abort the step.
              READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null || true)

              # The query prints one true/false word per container: the pod is
              # ready once there is at least one word and none of them is false.
              if [[ -n "$READY_STATUS" && "$READY_STATUS" != *false* ]]; then
                echo "✅ Pod [$LEADER_POD] is Ready!"
                break
              fi

              if (( SECONDS >= DEADLINE )); then
                echo "❌ Pod [$LEADER_POD] not ready after ${POD_READY_TIMEOUT_SECONDS}s"
                kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
                exit 1
              fi

              echo "Pod [$LEADER_POD] not ready, waiting..."
              sleep 3
            done

        - name: Stream logs and monitor pod health
          run: |
            set -euo pipefail

            # Print the pod phase, or nothing when the API query itself fails,
            # so a transient kubectl error is never mistaken for a pod failure.
            pod_phase() {
              kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || true
            }

            # The monitor below runs in a background subshell, so its `exit 1`
            # cannot end this step. It reports a failure through this marker
            # file, which the foreground checks once the log stream has ended.
            FAIL_FLAG="$(mktemp -d)/pod_failed"

            echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
            LOG_PID=$!

            echo "Start monitoring Pod [$LEADER_POD] status ..."
            (
              while true; do
                STATUS=$(pod_phase)
                if [[ -n "$STATUS" && "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
                  # Record the failure first, in case this subshell is stopped
                  # while it is still collecting diagnostics.
                  touch "$FAIL_FLAG"
                  echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
                  kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
                  kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
                  # Stop the log stream so the foreground `wait` returns.
                  kill "$LOG_PID" 2>/dev/null || true
                  exit 1
                fi
                sleep 5
              done
            ) &
            MONITOR_PID=$!

            # Block until the log stream ends: either the container exited or
            # the monitor killed the stream after detecting a failure.
            wait "$LOG_PID" || true
            kill "$MONITOR_PID" 2>/dev/null || true
            wait "$MONITOR_PID" 2>/dev/null || true

            # The monitor only polls every 5 seconds and may have been stopped
            # before it saw the final phase, so look at the pod once more.
            FINAL_STATUS=$(pod_phase)
            if [[ -e "$FAIL_FLAG" ]]; then
              exit 1
            fi
            if [[ -n "$FINAL_STATUS" && "$FINAL_STATUS" != "Running" && "$FINAL_STATUS" != "Succeeded" ]]; then
              echo "❌ Pod [$LEADER_POD] exited abnormally with status: $FINAL_STATUS"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
              exit 1
            fi

        - name: Generate summary
          if: always()
          run: |
            # Append the test summary file to the job summary when it exists;
            # otherwise leave a placeholder line so the summary is never empty.
            SUMMARY_FILE="/root/.cache/test_summary.md"
            if [[ ! -f "$SUMMARY_FILE" ]]; then
              echo "No summary file found." >> "$GITHUB_STEP_SUMMARY"
            else
              cat "$SUMMARY_FILE" >> "$GITHUB_STEP_SUMMARY"
            fi

        - name: Post process
          if: always()
          run: |
            # Listing the pods is diagnostic only. The shell runs with -e, so a
            # failure here must not abort the script before the cleanup below.
            kubectl get pods -n "$NAMESPACE" || true
            # Tear the cluster resources down on every run. --ignore-not-found
            # keeps the cleanup from failing when an earlier step stopped
            # before anything was created.
            kubectl delete -f tests/e2e/multi_node/scripts/lws.yaml --ignore-not-found