Files
xc-llm-ascend/.github/workflows/_e2e_nightly_multi_node.yaml
Li Wang 8e3f8bab57 [Nightly] Nightly pre-build image (#7388)
### What this PR does / why we need it?
This pull request refactors the nightly image build and simplifies the logic of
the multi-node workflows.
1. The nightly image build becomes a prerequisite when the tests are
triggered by `schedule` or `workflow_dispatch`
2. Simplify the pull-request select-case logic
3. Next step: implement replaceable nightly tests. Specifically, if
nightly tests are manually triggered, they can accept any optional
Docker image to meet the needs of different commits (which means the
image is customizable).
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2026-03-25 09:24:01 +08:00

332 lines
12 KiB
YAML

name: 'e2e nightly test multi_node'

# Reusable workflow: callers supply the SoC flavor, base image, model config
# and vLLM/vllm-ascend versions; the job spins up a LeaderWorkerSet on k8s.
on:
  workflow_call:
    inputs:
      soc_version:
        required: true
        type: string
        description: use a2 or a3
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
      image:
        required: false
        type: string
        description: base image for pods
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11"
      config_file_path:
        required: true
        type: string
        description: the model config for multi_node test
      replicas:
        required: false
        default: "1"
        type: string
        description: replicas of the k8s cluster
      size:
        required: false
        default: "2"
        type: string
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
      vllm_version:
        required: false
        default: "v0.18.0"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
        required: false
        default: https://github.com/vllm-project/vllm-ascend.git
        type: string
        description: used for pr level tests
      vllm_ascend_ref:
        required: false
        default: main
        type: string
        description: used for pr level tests
      should_run:
        required: true
        type: boolean
    secrets:
      KUBECONFIG_B64:
        required: true

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
  cancel-in-progress: true
jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    if: ${{ inputs.should_run }}
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
    env:
      # Every kubectl invocation below relies on these two variables.
      KUBECONFIG: /tmp/kubeconfig
      NAMESPACE: vllm-project
steps:
- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
if [ "${{ github.event_name }}" = "pull_request" ]; then
echo "PR test mode"
if [ "${{ inputs.soc_version }}" = "a3" ]; then
echo "Using A3 cached kubeconfig"
cp /root/.cache/.kube/kubeconfig.yaml "$KUBECONFIG"
else
echo "Using A2 cached kubeconfig"
cp /root/.cache/.kube/hk_001_kb.yaml "$KUBECONFIG"
fi
else
echo "Decoding kubeconfig from secrets"
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG"
fi
- name: Checkout code
uses: actions/checkout@v6
- name: Set job variables
run: |
# Derive a unique, valid k8s resource name from config_file_path.
# Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
config_file="${{ inputs.config_file_path }}"
lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
LWS_NAME="vllm-${lws_suffix}"
echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
echo "Computed LWS_NAME=${LWS_NAME}"
- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Clear resources
run: |
set -euo pipefail
TIMEOUT=${TIMEOUT:-120}
SLEEP_INTERVAL=2
echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
exit 1
fi
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
if [[ -z "$PODS_EXIST" ]]; then
echo "All pods for [$LWS_NAME] deleted."
break
else
echo "Waiting for pods to be deleted: $PODS_EXIST"
sleep $SLEEP_INTERVAL
fi
done
- name: Launch cluster
id: launcher
run: |
set -e
size="${{ inputs.size }}"
replicas="${{ inputs.replicas }}"
image="${{ inputs.image }}"
config_file_path="${{ inputs.config_file_path }}"
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
is_pr_test="${{ github.event_name == 'pull_request' }}"
vllm_version="${{ inputs.vllm_version }}"
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
required_params=("size" "replicas" "image" "config_file_path" "is_pr_test" "vllm_version" "vllm_ascend_ref" "vllm_ascend_remote_url")
for param in "${required_params[@]}"; do
if [ -z "${!param}" ]; then
echo "Error: Parameter '$param' is required but empty"
exit 1
fi
done
if [ "${{ inputs.soc_version }}" = "a3" ]; then
npu_per_node=16
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
else
npu_per_node=8
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
fi
jinja2 $TEMPLATE_FILE \
-D lws_name="$LWS_NAME" \
-D size="$size" \
-D replicas="$replicas" \
-D image="$image" \
-D config_file_path="$config_file_path" \
-D npu_per_node="$npu_per_node" \
-D fail_tag="$fail_tag" \
-D is_pr_test="$is_pr_test" \
-D vllm_version="$vllm_version" \
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
- name: Waiting for pod ready
run: |
POD_PREFIX="${LWS_NAME}-0"
SIZE="${{ inputs.size }}"
TIMEOUT=1200 # default timeout 20 minutes
echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached after ${ELAPSED}s"
echo "Dumping pod status for debugging:"
kubectl get pods -n "$NAMESPACE"
kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
exit 1
fi
# 1) check follower pods
ALL_FOLLOWERS_READY=true
for ((i=1; i<SIZE; i++)); do
POD="${POD_PREFIX}-${i}"
PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
echo "Follower [$POD] phase=$PHASE ready=$READY"
if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
echo "Follower [$POD] not Ready yet..."
ALL_FOLLOWERS_READY=false
break
fi
done
# 2) check leader pod
LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
echo "Leader not Ready yet..."
ALL_FOLLOWERS_READY=false
fi
if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
echo "All follower pods and leader pod are Running and Ready — continuing."
break
fi
sleep 2
done
- name: Stream logs
run: |
set -euo pipefail
size="${{ inputs.size }}"
pids=()
cleanup() {
echo "Cleaning up background log streams..."
for pid in "${pids[@]}"; do
kill "$pid" 2>/dev/null || true
done
}
trap cleanup EXIT
for i in $(seq 1 $((size - 1))); do
POD="${LWS_NAME}-0-${i}"
echo "==== Collecting logs from worker pod: $POD ===="
kubectl logs -f "$POD" -n "$NAMESPACE" \
> "/tmp/${POD}_logs.txt" 2>&1 &
pids+=($!)
done
echo "==== Streaming logs from leader pod: $LEADER_POD ===="
echo "Looking for logs containing: $FAIL_TAG"
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
echo "$line"
if echo "$line" | grep -q "$FAIL_TAG"; then
exit 1
fi
done
- name: Upload logs
if: always()
uses: actions/upload-artifact@v7
with:
name: ${{ inputs.config_file_path }}-pod-logs
path: /tmp/vllm*_logs.txt
retention-days: 7
- name: Post process
if: always()
run: |
echo "Current pod status:"
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
echo "Deleting resources for [$LWS_NAME]..."
kubectl delete -f ./lws.yaml --ignore-not-found=true || true
echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
TIMEOUT=300
SLEEP_INTERVAL=5
START_TIME=$(date +%s)
while true; do
NOW=$(date +%s)
ELAPSED=$((NOW - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
break
fi
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
if [[ -z "$PODS_EXIST" ]]; then
echo "All pods for [$LWS_NAME] have terminated."
break
else
echo "Waiting for pods to terminate: $PODS_EXIST"
sleep $SLEEP_INTERVAL
fi
done