There is a lot of hack code for v0.11.0, which makes the codebase hard to
upgrade to newer vLLM versions. Since v0.11.2 will be released soon, let's
drop v0.11.0 support first. Then we'll upgrade to v0.11.2 soon.
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
268 lines
9.3 KiB
YAML
name: 'e2e nightly test multi_node'

on:
  workflow_call:
    inputs:
      soc_version:
        required: true
        type: string
        description: use a2 or a3
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
      image:
        required: false
        type: string
        description: base image for pods
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11"
      config_file_path:
        required: true
        type: string
        description: the model config for multi_node test
      replicas:
        required: false
        default: "1"
        type: string
        description: replicas of the k8s cluster
      size:
        required: false
        default: "2"
        type: string
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
      vllm_version:
        required: false
        default: "2918c1b49c88c29783c86f78d2c4221cb9622379"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
        required: false
        default: https://github.com/vllm-project/vllm-ascend.git
        type: string
        description: used for pr level tests
      vllm_ascend_ref:
        required: false
        default: main
        type: string
        description: used for pr level tests
    secrets:
      KUBECONFIG_B64:
        required: true

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
  cancel-in-progress: true

jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    env:
      KUBECONFIG: /tmp/kubeconfig
      KUBECTL: /root/.cache/.kube/kubectl
      NAMESPACE: vllm-project
      LEADER_POD: vllm-0
      RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip install jinja2-cli

          #apt-get update -y && apt-get install -y git curl

      - name: Install kubectl
        run: |
          # Install kubectl; the cached binary has an "_arm" suffix for ARM hosts.
          arch=$(uname -m)

          if echo "$arch" | grep -qiE "arm|aarch64"; then
            echo "Detected ARM architecture: $arch"
            KUBECTL="$KUBECTL"_arm
          fi
          install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl

          # Verify kubectl installation
          kubectl version --client=true

      - name: Decode kubeconfig from secrets
        run: |
          # Decode and save kubeconfig
          echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
          install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh

      - name: Clear resources
        run: |
          set -euo pipefail

          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2

          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found

          echo "Waiting for all pods starting with 'vllm' to be deleted..."
          START_TIME=$(date +%s)

          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))

            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
              exit 1
            fi

            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)

            if [[ -z "$PODS_EXIST" ]]; then
              echo "All vllm pods deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
              sleep $SLEEP_INTERVAL
            fi
          done

      - name: Launch cluster
        id: launcher
        run: |
          set -e

          size="${{ inputs.size }}"
          replicas="${{ inputs.replicas }}"
          image="${{ inputs.image }}"
          config_file_path="${{ inputs.config_file_path }}"
          vllm_version="${{ inputs.vllm_version }}"
          vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
          vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
          result_file_path="$RESULT_FILE"
          fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
          # Export the tag so the "Stream logs" step can grep for it.
          echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV

          required_params=("size" "replicas" "image" "config_file_path")
          for param in "${required_params[@]}"; do
            if [ -z "${!param}" ]; then
              echo "Error: Parameter '$param' is required but empty"
              exit 1
            fi
          done

          if [ "${{ inputs.soc_version }}" = "a3" ]; then
            npu_per_node=16
          else
            npu_per_node=8
          fi

          jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
            -D config_file_path="$config_file_path" \
            -D vllm_version="$vllm_version" \
            -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
            -D vllm_ascend_ref="$vllm_ascend_ref" \
            -D result_file_path="$result_file_path" \
            -D npu_per_node="$npu_per_node" \
            -D fail_tag="$fail_tag" \
            --outfile lws.yaml

          kubectl apply -f ./lws.yaml

      - name: Waiting for pod ready
        run: |
          POD_PREFIX="${POD_PREFIX:-vllm-0}"
          SIZE="${{ inputs.size }}"
          TIMEOUT=1200 # default timeout 20 minutes

          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."

          START_TIME=$(date +%s)

          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached after ${ELAPSED}s"
              echo "Dumping pod status for debugging:"
              kubectl get pods -n "$NAMESPACE"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
              exit 1
            fi

            # 1) check follower pods
            ALL_FOLLOWERS_READY=true
            for ((i=1; i<SIZE; i++)); do
              POD="${POD_PREFIX}-${i}"
              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)

              echo "Follower [$POD] phase=$PHASE ready=$READY"

              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
                echo "Follower [$POD] not Ready yet..."
                ALL_FOLLOWERS_READY=false
                break
              fi
            done

            # 2) check leader pod
            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)

            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"

            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
              echo "Leader not Ready yet..."
              ALL_FOLLOWERS_READY=false
            fi

            if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
              echo "All follower pods and leader pod are Running and Ready — continuing."
              break
            fi

            sleep 2
          done

      - name: Stream logs
        run: |
          set -euo pipefail
          echo "Looking for logs containing: $FAIL_TAG"
          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
            echo "$line"
            if echo "$line" | grep -q "$FAIL_TAG"; then
              exit 1 # workflow step failed
            fi
          done

      - name: Post process
        if: always()
        run: |
          kubectl get pods -n "$NAMESPACE"
          kubectl delete -f ./lws.yaml