name: 'e2e nightly test multi_node'

on:
  workflow_call:
    inputs:
      soc_version:
        required: true
        type: string
        description: use a2 or a3
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
      image:
        required: false
        type: string
        description: base image for pods
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
      config_file_path:
        required: true
        type: string
        description: the model config for the multi_node test
      replicas:
        required: false
        default: "1"
        type: string
        description: replicas of the k8s cluster
      size:
        required: false
        default: "2"
        type: string
        description: how many pods will be pulled up via lws.yaml; indicates the number of nodes needed
      vllm_version:
        required: false
        default: "v0.13.0"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
        required: false
        default: https://github.com/vllm-project/vllm-ascend.git
        type: string
        description: used for PR-level tests
      vllm_ascend_ref:
        required: false
        default: main
        type: string
        description: used for PR-level tests
    secrets:
      KUBECONFIG_B64:
        required: true

# Bash shells do not read ~/.profile or ~/.bashrc, so steps that need the
# ascend-toolkit environment variables must explicitly declare
# "shell: bash -el {0}".
defaults:
  run:
    shell: bash -el {0}

# Only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
  cancel-in-progress: true

jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This runner has no NPU; it is only used as the k8s controller client
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
    env:
      KUBECONFIG: /tmp/kubeconfig
      KUBECTL: /root/.cache/.kube/kubectl
      NAMESPACE: vllm-project
      LEADER_POD: vllm-0
      RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
      - name: Install system dependencies
        run: |
          # Configure apt and pip sources
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip install jinja2-cli

      - name: Install kubectl
        run: |
          # Use the ARM build of kubectl on aarch64 runners
          arch=$(uname -m)
          if echo "$arch" | grep -qiE "arm|aarch64"; then
            echo "Detected ARM architecture: $arch"
            KUBECTL="$KUBECTL"_arm
          fi
          install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
          # Verify kubectl installation
          kubectl version --client=true

      - name: Decode kubeconfig from secrets
        run: |
          # Decode and save kubeconfig
          echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG

      - name: Checkout code
        uses: actions/checkout@v6

      - name: Prepare scripts
        run: |
          # Prepare the lws entrypoint script
          install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
          # Clear the result file from any previous run
          rm -fr $RESULT_FILE

      - name: Clear resources
        run: |
          set -euo pipefail
          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2

          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found

          echo "Waiting for all pods starting with 'vllm' to be deleted..."
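          # Poll until every pod whose name starts with 'vllm' is gone,
          # failing this step if $TIMEOUT seconds pass first.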
          START_TIME=$(date +%s)
          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
              exit 1
            fi

            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
            if [[ -z "$PODS_EXIST" ]]; then
              echo "All vllm pods deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
              sleep $SLEEP_INTERVAL
            fi
          done

      - name: Launch cluster
        id: launcher
        run: |
          set -e
          size="${{ inputs.size }}"
          replicas="${{ inputs.replicas }}"
          image="${{ inputs.image }}"
          config_file_path="${{ inputs.config_file_path }}"
          vllm_version="${{ inputs.vllm_version }}"
          vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
          vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
          result_file_path="$RESULT_FILE"
          fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
          echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV

          required_params=("size" "replicas" "image" "config_file_path")
          for param in "${required_params[@]}"; do
            if [ -z "${!param}" ]; then
              echo "Error: Parameter '$param' is required but empty"
              exit 1
            fi
          done

          if [ "${{ inputs.soc_version }}" = "a3" ]; then
            npu_per_node=16
          else
            npu_per_node=8
          fi

          # Render the LeaderWorkerSet manifest from the jinja2 template and apply it
          jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
            -D config_file_path="$config_file_path" \
            -D vllm_version="$vllm_version" \
            -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
            -D vllm_ascend_ref="$vllm_ascend_ref" \
            -D result_file_path="$result_file_path" \
            -D npu_per_node="$npu_per_node" \
            -D fail_tag="$fail_tag" \
            --outfile lws.yaml

          kubectl apply -f ./lws.yaml

      - name: Wait for pods ready
        run: |
          POD_PREFIX="${POD_PREFIX:-vllm-0}"
          SIZE="${{ inputs.size }}"
          TIMEOUT=1200 # default timeout 20 minutes

          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
          START_TIME=$(date +%s)
          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached after ${ELAPSED}s"
              echo "Dumping pod status for debugging:"
              kubectl get pods -n "$NAMESPACE"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
              exit 1
            fi

            # 1) check follower pods (vllm-0-1 .. vllm-0-(SIZE-1))
            ALL_FOLLOWERS_READY=true
            for ((i=1; i<SIZE; i++)); do
              POD="${POD_PREFIX}-${i}"
              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
              echo "Follower [$POD] phase=$PHASE ready=$READY"
              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
                echo "Follower [$POD] not Ready yet..."
                ALL_FOLLOWERS_READY=false
                break
              fi
            done

            # 2) check leader pod
            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
              echo "Leader not Ready yet..."
              ALL_FOLLOWERS_READY=false
            fi

            if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
              echo "All follower pods and leader pod are Running and Ready — continuing."
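              # Exit the readiness poll; the "Stream logs" step below tails the leader pod
              # and fails the job if a line containing $FAIL_TAG appears.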
              break
            fi

            sleep 2
          done

      - name: Stream logs
        run: |
          set -euo pipefail
          echo "Looking for logs containing: $FAIL_TAG"
          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
            echo "$line"
            if echo "$line" | grep -q "$FAIL_TAG"; then
              exit 1 # fail the workflow step
            fi
          done

      - name: Post process
        if: always()
        run: |
          kubectl get pods -n $NAMESPACE --ignore-not-found=true
          kubectl delete -f ./lws.yaml --ignore-not-found=true || true
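
# Example caller (a sketch only; the caller job name, workflow path, and config file
# below are assumptions for illustration, not values defined by this workflow):
#
#   jobs:
#     e2e-nightly-multi-node-a3:
#       uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
#       with:
#         soc_version: a3
#         config_file_path: tests/e2e/nightly/multi_node/config/models.yaml
#         size: "2"
#         replicas: "1"
#       secrets:
#         KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}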