### What this PR does / why we need it?
Currently, the multi-node job only surfaces the master node's logs (streamed through the
Kubernetes API), which is not enough to localize problems when other
nodes misbehave. This pull request therefore collects the other nodes'
logs as well and uploads them as a workflow artifact.
Next plan: output the logs as a structured directory, including the logs from each
node and the plog.
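
As a usage sketch (not part of this change), the worker logs uploaded by the new `Upload logs` step land in a `pod-logs` artifact and can be pulled locally with the GitHub CLI; the run id below is a placeholder:

```bash
# Download the worker-node logs uploaded by the "Upload logs" step below.
# <run-id> is a placeholder for the actual workflow run id.
gh run download <run-id> --repo vllm-project/vllm-ascend --name pod-logs --dir ./pod-logs
ls ./pod-logs   # e.g. vllm-0-1_logs.txt, vllm-0-2_logs.txt
```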
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main: bc0a5a0c08
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
The full reusable workflow (YAML, ~300 lines) after this change:
```yaml
name: 'e2e nightly test multi_node'

on:
  workflow_call:
    inputs:
      soc_version:
        required: true
        type: string
        description: use a2 or a3
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
      image:
        required: false
        type: string
        description: base image for pods
        default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11"
      config_file_path:
        required: true
        type: string
        description: the model config for multi_node test
      replicas:
        required: false
        default: "1"
        type: string
        description: replicas of the k8s cluster
      size:
        required: false
        default: "2"
        type: string
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
      vllm_version:
        required: false
        default: "v0.13.0"
        type: string
        description: vllm version to use
      vllm_ascend_remote_url:
        required: false
        default: https://github.com/vllm-project/vllm-ascend.git
        type: string
        description: used for pr level tests
      vllm_ascend_ref:
        required: false
        default: main
        type: string
        description: used for pr level tests
    secrets:
      KUBECONFIG_B64:
        required: true

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.soc_version }}
  cancel-in-progress: true

jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is the runner with no NPU for the k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
    env:
      KUBECONFIG: /tmp/kubeconfig
      KUBECTL: /root/.cache/.kube/kubectl
      NAMESPACE: vllm-project
      LEADER_POD: vllm-0
      RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip install jinja2-cli

      - name: Install kubectl
        run: |
          # Install kubectl
          arch=$(uname -m)

          if echo "$arch" | grep -qiE "arm|aarch64"; then
            echo "Detected ARM architecture: $arch"
            KUBECTL="$KUBECTL"_arm
          fi
          install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl

          # Verify kubectl installation
          kubectl version --client=true

      - name: Decode kubeconfig from secrets
        run: |
          # Decode and save kubeconfig
          echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG

      - name: Checkout code
        uses: actions/checkout@v6

      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
          install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
          # clear log directory
          rm -fr $RESULT_FILE

      - name: Clear resources
        run: |
          set -euo pipefail

          CRD_NAME="${CRD_NAME:-vllm}"
          TIMEOUT=${TIMEOUT:-120}
          SLEEP_INTERVAL=2

          echo "Deleting leaderworkerset [$CRD_NAME] in namespace [$NAMESPACE]..."
          kubectl delete leaderworkerset "$CRD_NAME" -n "$NAMESPACE" --ignore-not-found

          echo "Waiting for all pods starting with 'vllm' to be deleted..."
          START_TIME=$(date +%s)

          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))

            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached ($TIMEOUT seconds), some pods still exist:"
              kubectl get pods -n "$NAMESPACE" | grep '^vllm' || true
              exit 1
            fi

            PODS_EXIST=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null | tr ' ' '\n' | grep '^vllm' || true)

            if [[ -z "$PODS_EXIST" ]]; then
              echo "All vllm pods deleted."
              break
            else
              echo "Waiting for pods to be deleted: $PODS_EXIST"
              sleep $SLEEP_INTERVAL
            fi
          done

      - name: Launch cluster
        id: launcher
        run: |
          set -e

          size="${{ inputs.size }}"
          replicas="${{ inputs.replicas }}"
          image="${{ inputs.image }}"
          config_file_path="${{ inputs.config_file_path }}"
          vllm_version="${{ inputs.vllm_version }}"
          vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
          vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
          result_file_path="$RESULT_FILE"
          fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
          echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV

          required_params=("size" "replicas" "image" "config_file_path")
          for param in "${required_params[@]}"; do
            if [ -z "${!param}" ]; then
              echo "Error: Parameter '$param' is required but empty"
              exit 1
            fi
          done

          if [ "${{ inputs.soc_version }}" = "a3" ]; then
            npu_per_node=16
          else
            npu_per_node=8
          fi

          jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
            -D config_file_path="$config_file_path" \
            -D vllm_version="$vllm_version" \
            -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
            -D vllm_ascend_ref="$vllm_ascend_ref" \
            -D result_file_path="$result_file_path" \
            -D npu_per_node="$npu_per_node" \
            -D fail_tag="$fail_tag" \
            --outfile lws.yaml

          kubectl apply -f ./lws.yaml

      - name: Waiting for pod ready
        run: |
          POD_PREFIX="${POD_PREFIX:-vllm-0}"
          SIZE="${{ inputs.size }}"
          TIMEOUT=1200 # default timeout 20 minutes

          echo "Waiting for Pods in namespace [$NAMESPACE] to become Running and Ready (timeout ${TIMEOUT}s)..."

          START_TIME=$(date +%s)

          while true; do
            NOW=$(date +%s)
            ELAPSED=$((NOW - START_TIME))
            if [[ $ELAPSED -ge $TIMEOUT ]]; then
              echo "Timeout reached after ${ELAPSED}s"
              echo "Dumping pod status for debugging:"
              kubectl get pods -n "$NAMESPACE"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE"
              exit 1
            fi

            # 1) check follower pods
            ALL_FOLLOWERS_READY=true
            for ((i=1; i<SIZE; i++)); do
              POD="${POD_PREFIX}-${i}"
              PHASE=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
              READY=$(kubectl get pod "$POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)

              echo "Follower [$POD] phase=$PHASE ready=$READY"

              if [[ "$PHASE" != "Running" || "$READY" != "true" ]]; then
                echo "Follower [$POD] not Ready yet..."
                ALL_FOLLOWERS_READY=false
                break
              fi
            done

            # 2) check leader pod
            LEADER_PHASE=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
            LEADER_READY=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null)

            echo "Leader [$LEADER_POD] phase=$LEADER_PHASE ready=$LEADER_READY"

            if [[ "$LEADER_PHASE" != "Running" || "$LEADER_READY" != "true" ]]; then
              echo "Leader not Ready yet..."
              ALL_FOLLOWERS_READY=false
            fi

            if [[ "$ALL_FOLLOWERS_READY" == "true" ]]; then
              echo "All follower pods and leader pod are Running and Ready — continuing."
              break
            fi

            sleep 2
          done

      - name: Stream logs
        run: |
          set -euo pipefail

          size="${{ inputs.size }}"
          pids=()

          cleanup() {
            echo "Cleaning up background log streams..."
            for pid in "${pids[@]}"; do
              kill "$pid" 2>/dev/null || true
            done
          }
          trap cleanup EXIT

          for i in $(seq 1 $((size - 1))); do
            POD="vllm-0-${i}"

            echo "==== Collecting logs from worker pod: $POD ===="
            kubectl logs -f "$POD" -n "$NAMESPACE" \
              > "/tmp/${POD}_logs.txt" 2>&1 &

            pids+=($!)
          done

          echo "==== Streaming logs from leader pod: $LEADER_POD ===="
          echo "Looking for logs containing: $FAIL_TAG"

          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
            echo "$line"
            if echo "$line" | grep -q "$FAIL_TAG"; then
              exit 1
            fi
          done

      - name: Upload logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: pod-logs
          path: /tmp/vllm*_logs.txt
          retention-days: 7

      - name: Post process
        if: always()
        run: |
          kubectl get pods -n $NAMESPACE --ignore-not-found=true
          kubectl delete -f ./lws.yaml --ignore-not-found=true || true
```