### What this PR does / why we need it?
This pull request refactors the nightly image build and simplifies the logic of
multiple workflows.
1. The nightly image build becomes a prerequisite when the tests are
triggered by `schedule` or `workflow_dispatch`
2. Simplify the pull request select case logic
3. Next step: implement replaceable nightly tests. Specifically, if
nightly tests are manually triggered, they can accept any optional
Docker image to meet the needs of different commits (which means the
image is customizable).
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
name: 'e2e nightly test multi_node'
|
|
|
|
on:
|
|
workflow_call:
|
|
inputs:
|
|
soc_version:
|
|
required: true
|
|
type: string
|
|
description: use a2 or a3
|
|
runner:
|
|
required: false
|
|
type: string
|
|
default: linux-aarch64-a3-0
|
|
image:
|
|
required: false
|
|
type: string
|
|
description: base image for pods
|
|
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11"
|
|
config_file_path:
|
|
required: true
|
|
type: string
|
|
description: the model config for multi_node test
|
|
replicas:
|
|
required: false
|
|
default: "1"
|
|
type: string
|
|
description: replicas of the k8s cluster
|
|
size:
|
|
required: false
|
|
default: "2"
|
|
type: string
|
|
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
|
vllm_version:
|
|
required: false
|
|
default: "v0.18.0"
|
|
type: string
|
|
description: vllm version to use
|
|
vllm_ascend_remote_url:
|
|
required: false
|
|
default: https://github.com/vllm-project/vllm-ascend.git
|
|
type: string
|
|
description: used for pr level tests
|
|
vllm_ascend_ref:
|
|
required: false
|
|
default: main
|
|
type: string
|
|
description: used for pr level tests
|
|
should_run:
|
|
required: true
|
|
type: boolean
|
|
secrets:
|
|
KUBECONFIG_B64:
|
|
required: true
|
|
|
|
|
|
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
|
|
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
|
|
# It's used to activate ascend-toolkit environment variables.
|
|
defaults:
|
|
run:
|
|
shell: bash -el {0}
|
|
|
|
# only cancel in-progress runs of the same workflow
|
|
# and ignore the lint / 8 cards test type
|
|
concurrency:
|
|
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
|
|
cancel-in-progress: true
|
|
|
|
jobs:
|
|
e2e:
|
|
name: ${{ inputs.config_file_path }}
|
|
# This is the runner with no NPU for k8s controller
|
|
runs-on: ${{ inputs.runner }}
|
|
if: ${{ inputs.should_run }}
|
|
container:
|
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
|
|
env:
|
|
KUBECONFIG: /tmp/kubeconfig
|
|
NAMESPACE: vllm-project
|
|
steps:
|
|
- name: Decode kubeconfig from secrets
|
|
run: |
|
|
# Decode and save kubeconfig
|
|
if [ "${{ github.event_name }}" = "pull_request" ]; then
|
|
echo "PR test mode"
|
|
if [ "${{ inputs.soc_version }}" = "a3" ]; then
|
|
echo "Using A3 cached kubeconfig"
|
|
cp /root/.cache/.kube/kubeconfig.yaml "$KUBECONFIG"
|
|
else
|
|
echo "Using A2 cached kubeconfig"
|
|
cp /root/.cache/.kube/hk_001_kb.yaml "$KUBECONFIG"
|
|
fi
|
|
else
|
|
echo "Decoding kubeconfig from secrets"
|
|
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG"
|
|
fi
|
|
- name: Checkout code
|
|
uses: actions/checkout@v6
|
|
|
|
- name: Set job variables
|
|
run: |
|
|
# Derive a unique, valid k8s resource name from config_file_path.
|
|
# Strip .yaml extension, lowercase, replace dots/underscores with hyphens, cap at 50 chars.
|
|
config_file="${{ inputs.config_file_path }}"
|
|
lws_suffix=$(echo "$config_file" | sed 's/\.yaml$//' | tr '[:upper:]' '[:lower:]' | tr '._' '-' | cut -c1-50)
|
|
LWS_NAME="vllm-${lws_suffix}"
|
|
echo "LWS_NAME=${LWS_NAME}" >> $GITHUB_ENV
|
|
echo "LEADER_POD=${LWS_NAME}-0" >> $GITHUB_ENV
|
|
echo "Computed LWS_NAME=${LWS_NAME}"
|
|
|
|
- name: Prepare scripts
|
|
run: |
|
|
# prepare for lws entrypoint scripts
|
|
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
|
|
|
- name: Clear resources
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
TIMEOUT=${TIMEOUT:-120}
|
|
SLEEP_INTERVAL=2
|
|
|
|
echo "Deleting leaderworkerset [$LWS_NAME] in namespace [$NAMESPACE]..."
|
|
kubectl delete leaderworkerset "$LWS_NAME" -n "$NAMESPACE" --ignore-not-found
|
|
kubectl delete service "${LWS_NAME}-leader" -n "$NAMESPACE" --ignore-not-found
|
|
|
|
echo "Waiting for pods of leaderworkerset [$LWS_NAME] to be deleted..."
|
|
START_TIME=$(date +%s)
|
|
|
|
while true; do
|
|
NOW=$(date +%s)
|
|
ELAPSED=$((NOW - START_TIME))
|
|
|
|
if [[ $ELAPSED -ge $TIMEOUT ]]; then
|
|
echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
|
|
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
|
|
exit 1
|
|
fi
|
|
|
|
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
|
|
|
|
if [[ -z "$PODS_EXIST" ]]; then
|
|
echo "All pods for [$LWS_NAME] deleted."
|
|
break
|
|
else
|
|
echo "Waiting for pods to be deleted: $PODS_EXIST"
|
|
sleep $SLEEP_INTERVAL
|
|
fi
|
|
done
|
|
|
|
- name: Launch cluster
|
|
id: launcher
|
|
run: |
|
|
set -e
|
|
|
|
size="${{ inputs.size }}"
|
|
replicas="${{ inputs.replicas }}"
|
|
image="${{ inputs.image }}"
|
|
config_file_path="${{ inputs.config_file_path }}"
|
|
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
|
|
is_pr_test="${{ github.event_name == 'pull_request' }}"
|
|
vllm_version="${{ inputs.vllm_version }}"
|
|
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
|
|
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
|
|
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
|
|
|
|
required_params=("size" "replicas" "image" "config_file_path" "is_pr_test" "vllm_version" "vllm_ascend_ref" "vllm_ascend_remote_url")
|
|
for param in "${required_params[@]}"; do
|
|
if [ -z "${!param}" ]; then
|
|
echo "Error: Parameter '$param' is required but empty"
|
|
exit 1
|
|
fi
|
|
done
|
|
|
|
if [ "${{ inputs.soc_version }}" = "a3" ]; then
|
|
npu_per_node=16
|
|
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
|
|
else
|
|
npu_per_node=8
|
|
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
|
|
fi
|
|
|
|
jinja2 $TEMPLATE_FILE \
|
|
-D lws_name="$LWS_NAME" \
|
|
-D size="$size" \
|
|
-D replicas="$replicas" \
|
|
-D image="$image" \
|
|
-D config_file_path="$config_file_path" \
|
|
-D npu_per_node="$npu_per_node" \
|
|
-D fail_tag="$fail_tag" \
|
|
-D is_pr_test="$is_pr_test" \
|
|
-D vllm_version="$vllm_version" \
|
|
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
|
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
|
|
--outfile lws.yaml
|
|
|
|
kubectl apply -f ./lws.yaml
|
|
|
|
- name: Waiting for pod ready
|
|
run: |
|
|
POD_PREFIX="${LWS_NAME}-0"
|
|
SIZE="${{ inputs.size }}"
|
|
TIMEOUT=1200 # default timeout 20 minutes
|
|
|
|
echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
|
|
|
|
START_TIME=$(date +%s)
|
|
|
|
while true; do
|
|
NOW=$(date +%s)
|
|
ELAPSED=$((NOW - START_TIME))
|
|
if [[ $ELAPSED -ge $TIMEOUT ]]; then
|
|
echo "Timeout reached after ${ELAPSED}s"
|
|
echo "Dumping pod status for debugging:"
|
|
kubectl get pods -n "$NAMESPACE"
|
|
kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
|
|
exit 1
|
|
fi
|
|
|
|
# 1) check follower pods
|
|
ALL_FOLLOWERS_READY=true
|
|
for ((i=1; i<SIZE; i++)); do
|
|
POD="${POD_PREFIX}-${i}"
|
|
PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
|
|
READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
|
|
|
|
echo "Follower [$POD] phase=$PHASE ready=$READY"
|
|
|
|
if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
|
|
echo "Follower [$POD] not Ready yet..."
|
|
ALL_FOLLOWERS_READY=false
|
|
break
|
|
fi
|
|
done
|
|
|
|
# 2) check leader pod
|
|
LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
|
|
LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)
|
|
|
|
echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
|
|
|
|
if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
|
|
echo "Leader not Ready yet..."
|
|
ALL_FOLLOWERS_READY=false
|
|
fi
|
|
|
|
if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
|
|
echo "All follower pods and leader pod are Running and Ready — continuing."
|
|
break
|
|
fi
|
|
|
|
sleep 2
|
|
done
|
|
|
|
- name: Stream logs
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
size="${{ inputs.size }}"
|
|
pids=()
|
|
|
|
cleanup() {
|
|
echo "Cleaning up background log streams..."
|
|
for pid in "${pids[@]}"; do
|
|
kill "$pid" 2>/dev/null || true
|
|
done
|
|
}
|
|
trap cleanup EXIT
|
|
|
|
for i in $(seq 1 $((size - 1))); do
|
|
POD="${LWS_NAME}-0-${i}"
|
|
|
|
echo "==== Collecting logs from worker pod: $POD ===="
|
|
kubectl logs -f "$POD" -n "$NAMESPACE" \
|
|
> "/tmp/${POD}_logs.txt" 2>&1 &
|
|
|
|
pids+=($!)
|
|
done
|
|
|
|
echo "==== Streaming logs from leader pod: $LEADER_POD ===="
|
|
echo "Looking for logs containing: $FAIL_TAG"
|
|
|
|
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
|
|
echo "$line"
|
|
if echo "$line" | grep -q "$FAIL_TAG"; then
|
|
exit 1
|
|
fi
|
|
done
|
|
|
|
- name: Upload logs
|
|
if: always()
|
|
uses: actions/upload-artifact@v7
|
|
with:
|
|
name: ${{ inputs.config_file_path }}-pod-logs
|
|
path: /tmp/vllm*_logs.txt
|
|
retention-days: 7
|
|
|
|
- name: Post process
|
|
if: always()
|
|
run: |
|
|
echo "Current pod status:"
|
|
kubectl get pods -n "$NAMESPACE" --ignore-not-found=true
|
|
|
|
echo "Deleting resources for [$LWS_NAME]..."
|
|
kubectl delete -f ./lws.yaml --ignore-not-found=true || true
|
|
|
|
echo "Waiting for pods of [$LWS_NAME] to fully terminate..."
|
|
TIMEOUT=300
|
|
SLEEP_INTERVAL=5
|
|
START_TIME=$(date +%s)
|
|
|
|
while true; do
|
|
NOW=$(date +%s)
|
|
ELAPSED=$((NOW - START_TIME))
|
|
|
|
if [[ $ELAPSED -ge $TIMEOUT ]]; then
|
|
echo "Timeout reached ($TIMEOUT seconds) waiting for termination, continuing anyway."
|
|
kubectl get pods -n "$NAMESPACE" | grep "^${LWS_NAME}-" || true
|
|
break
|
|
fi
|
|
|
|
PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep "^${LWS_NAME}-" || true)
|
|
|
|
if [[ -z "$PODS_EXIST" ]]; then
|
|
echo "All pods for [$LWS_NAME] have terminated."
|
|
break
|
|
else
|
|
echo "Waiting for pods to terminate: $PODS_EXIST"
|
|
sleep $SLEEP_INTERVAL
|
|
fi
|
|
done
|