Files
xc-llm-ascend/.github/workflows/_e2e_nightly_multi_node.yaml
wangxiyuan a1f142b7ad Drop 0.11.0 support (#4377)
There is a lot of hack code for v0.11.0, which makes it hard to upgrade the
code to a newer vLLM version. Since v0.11.0 will be released soon, let's
drop v0.11.0 support first. Then we'll upgrade to v0.11.2 soon.


- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-24 17:08:20 +08:00

268 lines
9.3 KiB
YAML

name: 'e2e nightly test multi_node'

# Reusable workflow: the only trigger is `workflow_call`, so every value below
# is supplied (or defaulted) by the calling workflow.
on:
  workflow_call:
    inputs:
      # Compared against "a3" by the job to pick the per-node NPU count.
      soc_version:
        description: use a2 or a3
        type: string
        required: true
      # Label handed to `runs-on` for the controller job.
      runner:
        type: string
        required: false
        default: 'linux-aarch64-a3-0'
      image:
        description: base image for pods
        type: string
        required: false
        default: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11'
      # Also used as the job's display name and as part of the failure tag.
      config_file_path:
        description: the model config for multi_node test
        type: string
        required: true
      # Numeric values are declared as strings and kept quoted so they are
      # forwarded verbatim to the template renderer.
      replicas:
        description: replicas of the k8s cluster
        type: string
        required: false
        default: '1'
      size:
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
        type: string
        required: false
        default: '2'
      # Commit SHA; quoted so it can never be re-typed as a number.
      vllm_version:
        description: vllm version to use
        type: string
        required: false
        default: '2918c1b49c88c29783c86f78d2c4221cb9622379'
      vllm_ascend_remote_url:
        description: used for pr level tests
        type: string
        required: false
        default: 'https://github.com/vllm-project/vllm-ascend.git'
      vllm_ascend_ref:
        description: used for pr level tests
        type: string
        required: false
        default: 'main'
    secrets:
      # Base64-encoded kubeconfig; decoded to $KUBECONFIG by the job.
      KUBECONFIG_B64:
        required: true
# Every `run` step uses a login shell (-l) with errexit (-e). A plain bash
# shell does not read ~/.profile or ~/.bashrc, so the login flag is what
# activates the ascend-toolkit environment variables for each step.
defaults:
  run:
    shell: 'bash -el {0}'
# Runs are grouped by workflow ref, git ref and soc_version; a newer run in
# the same group cancels the one already in progress.
# NOTE(review): config_file_path is not part of the key, so two calls that
# differ only in their config file land in the same group — confirm that is
# intended.
concurrency:
  cancel-in-progress: true
  group: 'ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}'
jobs:
  # Single job: from a controller container it (re)creates a LeaderWorkerSet
  # with kubectl, waits for the pods, then follows the leader pod's logs and
  # fails as soon as the failure tag shows up.
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
      # NOTE(review): nesting was reconstructed from a flattened copy; `env`
      # is assumed to belong to `container` — confirm against the repository.
      env:
        KUBECONFIG: /tmp/kubeconfig
        KUBECTL: /root/.cache/.kube/kubectl
        NAMESPACE: vllm-project
        LEADER_POD: vllm-0
        RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip install jinja2-cli
          #apt-get update -y && apt-get install -y git curl

      - name: Install kubectl
        run: |
          # Install kubectl from the pre-populated cache; ARM hosts use the
          # binary stored next to it with an "_arm" suffix.
          arch=$(uname -m)
          if echo "$arch" | grep -qiE "arm|aarch64"; then
            echo "Detected ARM architecture: $arch"
            KUBECTL="${KUBECTL}_arm"
          fi
          install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl
          # Verify kubectl installation
          kubectl version --client=true

      - name: Decode kubeconfig from secrets
        env:
          # Passed through the environment so the secret is never spliced
          # into the generated shell script text.
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          # Decode and save kubeconfig, readable by the current user only.
          umask 077
          printf '%s' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
          install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh

      - name: Clear resources
        run: |
          # Remove any LeaderWorkerSet left over from a previous run and wait
          # (up to $TIMEOUT seconds) until no pod named vllm* remains.
          set -euo pipefail
          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2
          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found
          echo "Waiting for all pods starting with 'vllm' to be deleted..."
          START_TIME=$(date +%s)
          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
              exit 1
            fi
            # `|| true` keeps errexit/pipefail from aborting when grep finds
            # nothing, which is the success case here.
            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)
            if [[ -z "$PODS_EXIST" ]]; then
              echo "All vllm pods deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
              sleep $SLEEP_INTERVAL
            fi
          done

      - name: Launch cluster
        id: launcher
        env:
          # Caller-supplied inputs reach the script as environment variables
          # instead of being expanded into the shell source, so their content
          # cannot be interpreted as shell syntax.
          WF_SIZE: ${{ inputs.size }}
          WF_REPLICAS: ${{ inputs.replicas }}
          WF_IMAGE: ${{ inputs.image }}
          WF_CONFIG_FILE_PATH: ${{ inputs.config_file_path }}
          WF_VLLM_VERSION: ${{ inputs.vllm_version }}
          WF_VLLM_ASCEND_REF: ${{ inputs.vllm_ascend_ref }}
          WF_VLLM_ASCEND_REMOTE_URL: ${{ inputs.vllm_ascend_remote_url }}
          WF_SOC_VERSION: ${{ inputs.soc_version }}
        run: |
          # Render lws.yaml from the Jinja2 template and apply it.
          set -e
          size="$WF_SIZE"
          replicas="$WF_REPLICAS"
          image="$WF_IMAGE"
          config_file_path="$WF_CONFIG_FILE_PATH"
          vllm_version="$WF_VLLM_VERSION"
          vllm_ascend_ref="$WF_VLLM_ASCEND_REF"
          vllm_ascend_remote_url="$WF_VLLM_ASCEND_REMOTE_URL"
          result_file_path="$RESULT_FILE"

          # Export the failure tag so the "Stream logs" step can look for it.
          fail_tag="FAIL_TAG_${config_file_path}"
          echo "FAIL_TAG=${fail_tag}" >> "$GITHUB_ENV"

          # ${!param} is indirect expansion: it reads the variable whose name
          # is stored in $param.
          required_params=("size" "replicas" "image" "config_file_path")
          for param in "${required_params[@]}"; do
            if [ -z "${!param}" ]; then
              echo "Error: Parameter '$param' is required but empty"
              exit 1
            fi
          done

          if [ "$WF_SOC_VERSION" = "a3" ]; then
            npu_per_node=16
          else
            npu_per_node=8
          fi

          jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
            -D config_file_path="$config_file_path" \
            -D vllm_version="$vllm_version" \
            -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
            -D vllm_ascend_ref="$vllm_ascend_ref" \
            -D result_file_path="$result_file_path" \
            -D npu_per_node="$npu_per_node" \
            -D fail_tag="$fail_tag" \
            --outfile lws.yaml
          kubectl apply -f ./lws.yaml

      - name: Waiting for pod ready
        env:
          WF_SIZE: ${{ inputs.size }}
        run: |
          # Poll every 2s until the leader pod and the follower pods
          # ${POD_PREFIX}-1 .. ${POD_PREFIX}-(SIZE-1) are Running and Ready.
          POD_PREFIX="${POD_PREFIX:-vllm-0}"
          SIZE="$WF_SIZE"
          TIMEOUT=1200  # default timeout 20 minutes
          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."
          START_TIME=$(date +%s)
          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached after ${ELAPSED}s"
              echo "Dumping pod status for debugging:"
              # Best-effort diagnostics; the step fails via `exit 1` either way.
              kubectl get pods -n "$NAMESPACE" || true
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
              exit 1
            fi

            ALL_READY=true

            # 1) check follower pods
            # The default shell runs with errexit, so every kubectl lookup
            # needs a fallback: a pod that does not exist yet must be polled
            # again, not abort the step.
            # The readiness string is compared with the single word "true",
            # which presumably assumes one container per pod — verify.
            for ((i=1; i<SIZE; i++)); do
              POD="${POD_PREFIX}-${i}"
              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null || true)
              echo "Follower [$POD] phase=$PHASE ready=$READY"
              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
                echo "Follower [$POD] not Ready yet..."
                ALL_READY=false
                break
              fi
            done

            # 2) check leader pod
            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null || true)
            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"
            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
              echo "Leader not Ready yet..."
              ALL_READY=false
            fi

            if [[ "$ALL_READY" == "true" ]]; then
              echo "All follower pods and leader pod are Running and Ready — continuing."
              break
            fi
            sleep 2
          done

      - name: Stream logs
        run: |
          # Follow the leader pod's log and fail the step on the first line
          # that contains the failure tag exported by "Launch cluster".
          set -euo pipefail
          # An empty tag would match every line, so refuse to run without it.
          : "${FAIL_TAG:?FAIL_TAG is not set; the Launch cluster step must run first}"
          echo "Looking for logs containing: $FAIL_TAG"
          # IFS= keeps leading whitespace; the quoted right-hand side makes
          # the match a literal substring test, so characters such as "." in
          # the config path are not treated as regex metacharacters.
          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
            printf '%s\n' "$line"
            if [[ "$line" == *"$FAIL_TAG"* ]]; then
              exit 1  # workflow step failed
            fi
          done

      - name: Post process
        if: always()
        run: |
          # Cleanup runs even after a failure, so every command is
          # best-effort: a failed listing must not prevent the delete, and a
          # manifest that was never rendered is not an error.
          kubectl get pods -n "$NAMESPACE" || true
          if [[ -f ./lws.yaml ]]; then
            kubectl delete -f ./lws.yaml --ignore-not-found
          else
            echo "lws.yaml was not rendered; nothing to delete."
          fi