### What this PR does / why we need it?
This patch mainly does the following things:
1. Bugfix for the multi_node_tests logs: log names must be unique when
uploading them as artifacts.
2. Optimize the `get_cluster_ips` logic and increase the maximum number of
retries for robustness (see the sketch below).
3. Temporarily stop using the existing gh-proxy until it is stable enough.
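
To illustrate point 2, a retry wrapper of the kind described might look like the sketch below. This is a hypothetical bash sketch: the real `get_cluster_ips` lives in the multi-node test scripts and is not part of this file, so the function body, the `max_retries` parameter, and the kubectl query are assumptions (it also assumes `NAMESPACE` is set, as in the workflow below), not the actual implementation.

```bash
# Hypothetical sketch of the retry pattern described above; the real
# get_cluster_ips implementation is in the multi-node test scripts.
get_cluster_ips() {
  local max_retries="${1:-10}"  # raised retry cap for robustness (assumed value)
  local attempt ips
  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    # Query the pod IPs of the cluster; empty output means not ready yet.
    ips=$(kubectl get pods -n "$NAMESPACE" \
            -o jsonpath='{.items[*].status.podIP}' 2>/dev/null || true)
    if [[ -n "$ips" ]]; then
      echo "$ips"
      return 0
    fi
    echo "Attempt ${attempt}/${max_retries}: cluster IPs not ready, retrying..." >&2
    sleep 2
  done
  return 1
}
```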
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main: 81786c8774
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
```yaml
name: 'e2e nightly test multi_node'

on:
  workflow_call:
    inputs:
      soc_version:
        required: true
        type: string
        description: use a2 or a3
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
      image:
        required: false
        type: string
        description: base image for pods
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
      config_file_path:
        required: true
        type: string
        description: the model config for multi_node test
      replicas:
        required: false
        default: "1"
        type: string
        description: replicas of the k8s cluster
      size:
        required: false
        default: "2"
        type: string
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
      vllm_version:
        required: false
        default: "v0.13.0"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
        required: false
        default: https://github.com/vllm-project/vllm-ascend.git
        type: string
        description: used for pr level tests
      vllm_ascend_ref:
        required: false
        default: main
        type: string
        description: used for pr level tests
    secrets:
      KUBECONFIG_B64:
        required: true

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
  cancel-in-progress: true

jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
    env:
      KUBECONFIG: /tmp/kubeconfig
      KUBECTL: /root/.cache/.kube/kubectl
      NAMESPACE: vllm-project
      LEADER_POD: vllm-0
      RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip install jinja2-cli

      - name: Install kubectl
        run: |
          # Install kubectl
          arch=$(uname -m)

          if echo "$arch" | grep -qiE "arm|aarch64"; then
            echo "Detected ARM architecture: $arch"
            KUBECTL="$KUBECTL"_arm
          fi
          install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl

          # Verify kubectl installation
          kubectl version --client=true

      - name: Decode kubeconfig from secrets
        run: |
          # Decode and save kubeconfig
          echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG

      - name: Checkout code
        uses: actions/checkout@v6

      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
          install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
          # clear log directory
          rm -fr $RESULT_FILE

      - name: Clear resources
        run: |
          set -euo pipefail

          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2

          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found

          echo "Waiting for all pods starting with 'vllm' to be deleted..."
          START_TIME=$(date +%s)

          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))

            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
              exit 1
            fi

            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)

            if [[ -z "$PODS_EXIST" ]]; then
              echo "All vllm pods deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
              sleep $SLEEP_INTERVAL
            fi
          done

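      # Render the LeaderWorkerSet manifest from the jinja2 template with the
      # cluster inputs, then apply it to pull up the leader and worker pods.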
      - name: Launch cluster
        id: launcher
        run: |
          set -e

          size="${{ inputs.size }}"
          replicas="${{ inputs.replicas }}"
          image="${{ inputs.image }}"
          config_file_path="${{ inputs.config_file_path }}"
          vllm_version="${{ inputs.vllm_version }}"
          vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
          vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
          result_file_path="$RESULT_FILE"
          fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
          echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV

          required_params=("size" "replicas" "image" "config_file_path")
          for param in "${required_params[@]}"; do
            if [ -z "${!param}" ]; then
              echo "Error: Parameter '$param' is required but empty"
              exit 1
            fi
          done

          if [ "${{ inputs.soc_version }}" = "a3" ]; then
            npu_per_node=16
          else
            npu_per_node=8
          fi

          jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
            -D config_file_path="$config_file_path" \
            -D vllm_version="$vllm_version" \
            -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
            -D vllm_ascend_ref="$vllm_ascend_ref" \
            -D result_file_path="$result_file_path" \
            -D npu_per_node="$npu_per_node" \
            -D fail_tag="$fail_tag" \
            --outfile lws.yaml

          kubectl apply -f ./lws.yaml

      - name: Waiting for pod ready
        run: |
          POD_PREFIX="${POD_PREFIX:-vllm-0}"
          SIZE="${{ inputs.size }}"
          TIMEOUT=1200 # default timeout 20 minutes

          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."

          START_TIME=$(date +%s)

          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached after ${ELAPSED}s"
              echo "Dumping pod status for debugging:"
              kubectl get pods -n "$NAMESPACE"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
              exit 1
            fi

            # 1) check follower pods
            ALL_FOLLOWERS_READY=true
            for ((i=1; i<SIZE; i++)); do
              POD="${POD_PREFIX}-${i}"
              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)

              echo "Follower [$POD] phase=$PHASE ready=$READY"

              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
                echo "Follower [$POD] not Ready yet..."
                ALL_FOLLOWERS_READY=false
                break
              fi
            done

            # 2) check leader pod
            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)

            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"

            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
              echo "Leader not Ready yet..."
              ALL_FOLLOWERS_READY=false
            fi

            if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
              echo "All follower pods and leader pod are Running and Ready — continuing."
              break
            fi

            sleep 2
          done

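      # Worker logs are tailed in the background into per-pod files under /tmp;
      # the leader log is streamed in the foreground and scanned for FAIL_TAG,
      # so the job fails fast when the test script reports a failure.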
      - name: Stream logs
        run: |
          set -euo pipefail

          size="${{ inputs.size }}"
          pids=()

          cleanup() {
            echo "Cleaning up background log streams..."
            for pid in "${pids[@]}"; do
              kill "$pid" 2>/dev/null || true
            done
          }
          trap cleanup EXIT

          for i in $(seq 1 $((size - 1))); do
            POD="vllm-0-${i}"

            echo "==== Collecting logs from worker pod: $POD ===="
            kubectl logs -f "$POD" -n "$NAMESPACE" \
              > "/tmp/${POD}_logs.txt" 2>&1 &

            pids+=($!)
          done

          echo "==== Streaming logs from leader pod: $LEADER_POD ===="
          echo "Looking for logs containing: $FAIL_TAG"

          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
            echo "$line"
            if echo "$line" | grep -q "$FAIL_TAG"; then
              exit 1
            fi
          done

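      # Artifact names must be unique across the parallel multi-node jobs
      # (point 1 of the PR description), so the upload name is derived from
      # config_file_path.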
      - name: Upload logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ inputs.config_file_path }}-pod-logs
          path: /tmp/vllm*_logs.txt
          retention-days: 7

      - name: Post process
        if: always()
        run: |
          kubectl get pods -n $NAMESPACE --ignore-not-found=true
          kubectl delete -f ./lws.yaml --ignore-not-found=true || true
```