Files
xc-llm-ascend/.github/workflows/multi_node_test.yaml
Li Wang 4c4a8458a5 [CI] Refactor multi-node CI (#3487)
### What this PR does / why we need it?
Refactor the multi-machine CI use case. The purpose of this PR is to
increase the ease of adding multi-machine CI use cases, allowing
developers to add multi-machine cluster model testing use cases
(including PD separation) by simply adding a new YAML configuration
file.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-17 09:04:31 +08:00

119 lines
4.1 KiB
YAML

name: 'e2e test / multi-dp'

# Triggers: a cron schedule (minute 0 of every 4th hour) and manual dispatch.
on:
  schedule:
    - cron: "0 */4 * * *"
  workflow_dispatch:
# Bash shells do not read ~/.profile or ~/.bashrc by default, so every `run`
# step is executed with "bash -el {0}": `-l` starts a login shell (which
# sources the profile files) and `-e` aborts the step on the first failing
# command. This is used to activate the ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}
# Runs are grouped by workflow name and git ref; when a new run starts, any
# run of the same group that is still in progress is cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Drives a multi-node test from a controller runner: applies the
  # LeaderWorkerSet manifest, waits for the leader pod, streams its logs,
  # publishes the summary and finally tears the cluster down.
  e2e:
    # This is a runner with no NPU for k8s controller
    runs-on: linux-aarch64-a3-0
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
      # NOTE(review): nesting of `env` under `container` is reconstructed from a
      # flattened source; confirm it was not a job-level `env`.
      env:
        KUBECONFIG: /tmp/kubeconfig
        KUBECTL: /root/.cache/.kube/kubectl
        NAMESPACE: vllm-project
        LEADER_POD: vllm-0
    steps:
      - name: Install system denpendencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y && apt-get install -y git curl
          # base64 may wrap long output; strip newlines so the header stays on one line
          TOKEN=$(echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64 | tr -d '\n')
          git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"

      - name: Install kubectl
        run: |
          # the kubectl binary is expected to be pre-provisioned at $KUBECTL
          install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl
          # get kubeconfig from secret
          echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > "$KUBECONFIG"
          # cluster credentials: keep them readable by the owner only
          chmod 600 "$KUBECONFIG"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
          install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh

      - name: Launch cluster
        run: |
          kubectl apply -f tests/e2e/multi_node/scripts/lws.yaml

      - name: Waiting for pod ready
        run: |
          # Upper bound in seconds for the pod to become Ready; override with
          # POD_READY_TIMEOUT. Without a bound a stuck pod would block the
          # runner until the job-level timeout.
          TIMEOUT="${POD_READY_TIMEOUT:-3600}"
          DEADLINE=$((SECONDS + TIMEOUT))
          echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
          while true; do
            # get pod status; a failed lookup (e.g. pod not created yet) is
            # treated as "not ready" instead of aborting the step under `-e`
            READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" \
              -o jsonpath='{.status.containerStatuses[*].ready}' 2>/dev/null || true)
            # one value is printed per container ("true true ..."): require at
            # least one container and no "false" among them
            if [[ -n "$READY_STATUS" && "$READY_STATUS" != *false* ]]; then
              echo "✅ Pod [$LEADER_POD] is Ready!"
              break
            fi
            if (( SECONDS >= DEADLINE )); then
              echo "❌ Pod [$LEADER_POD] not ready after ${TIMEOUT}s, giving up"
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
              exit 1
            fi
            echo "Pod [$LEADER_POD] not ready, waiting..."
            sleep 3
          done

      - name: Stream logs and monitor pod health
        run: |
          set -euo pipefail

          # Print the pod phase (Pending/Running/Succeeded/Failed/Unknown).
          pod_phase() {
            kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}'
          }

          # Dump diagnostics for an abnormal pod; $1 is the observed phase.
          report_failure() {
            echo "❌ Pod [$LEADER_POD] exited abnormally with status: $1"
            kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
            kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
          }

          # The monitor runs in a background subshell, so its `exit 1` cannot
          # fail this step by itself; it signals failure through this flag file.
          FAIL_FLAG="$(mktemp -u)"

          echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
          LOG_PID=$!

          echo "Start monitoring Pod [$LEADER_POD] status ..."
          (
            while true; do
              if ! STATUS=$(pod_phase); then
                # do not let a failed API call silently kill the monitor
                echo "⚠️ Failed to query Pod [$LEADER_POD] status, retrying..."
              elif [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
                report_failure "$STATUS"
                touch "$FAIL_FLAG"
                kill "$LOG_PID" 2>/dev/null || true
                exit 1
              fi
              sleep 5
            done
          ) &
          MONITOR_PID=$!

          # Block until the log stream ends (container exit, or killed by the monitor).
          wait "$LOG_PID" || true
          kill "$MONITOR_PID" 2>/dev/null || true
          wait "$MONITOR_PID" 2>/dev/null || true

          if [[ -e "$FAIL_FLAG" ]]; then
            rm -f "$FAIL_FLAG"
            exit 1
          fi

          # The stream may end before the monitor's next poll; check once more.
          # NOTE(review): a container that exits non-zero but is restarted by
          # the pod's restart policy keeps phase "Running" and is not detected
          # here — confirm how run.sh / lws.yaml report test failures.
          FINAL_STATUS=$(pod_phase || true)
          if [[ "$FINAL_STATUS" != "Running" && "$FINAL_STATUS" != "Succeeded" ]]; then
            report_failure "${FINAL_STATUS:-Unknown}"
            exit 1
          fi

      - name: Generate summary
        if: always()
        run: |
          if [ -f "/root/.cache/test_summary.md" ]; then
            cat /root/.cache/test_summary.md >> "$GITHUB_STEP_SUMMARY"
          else
            echo "No summary file found." >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Post process
        if: always()
        run: |
          # listing is informational only; it must not prevent the cleanup below
          kubectl get pods -n "$NAMESPACE" || true
          # tolerate resources that were never created (earlier step failed)
          kubectl delete -f tests/e2e/multi_node/scripts/lws.yaml --ignore-not-found