[CI] Refactor multi-node CI (#3487)

### What this PR does / why we need it?
Refactor the multi-machine CI use case. The purpose of this PR is to
increase the ease of adding multi-machine CI use cases, allowing
developers to add multi-machine cluster model testing use cases
(including PD separation) by simply adding a new YAML configuration
file.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-17 09:04:31 +08:00
committed by GitHub
parent ccb6fb9ec1
commit 4c4a8458a5
18 changed files with 632 additions and 437 deletions

View File

@@ -0,0 +1,132 @@
# LeaderWorkerSet "vllm" in namespace "vllm-project".
# It declares a leader pod template and a worker pod template; both use the same CANN image,
# the same environment variables, the same entry command and the same volumes.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
namespace: vllm-project
spec:
replicas: 1
leaderWorkerTemplate:
# NOTE(review): size is presumably the number of pods per group, leader included — confirm against the LWS API.
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
# The "vllm-leader" Service in this manifest selects pods by this label.
role: leader
spec:
containers:
- name: vllm-leader
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION
value: "v0.11.0"
- name: VLLM_ASCEND_VERSION
value: "main"
# NOTE(review): the run.sh added alongside this manifest clones Mooncake from a fixed branch
# and does not reference MOONCAKE_VERSION — confirm whether this value is consumed anywhere.
- name: MOONCAKE_VERSION
value: "06cc217504a6f1b0cdaa26b096b985651b262748"
# Runs the test entry script from the shared volume, then blocks on `tail -f /dev/null`
# so the container keeps running after the script finishes.
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
tail -f /dev/null
resources:
limits:
huawei.com/ascend-1980: "16"
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: "16"
ephemeral-storage: 100Gi
cpu: 125
ports:
# Same port number the "vllm-leader" Service targets.
- containerPort: 8080
# Readiness probe is currently disabled (left commented out).
# readinessProbe:
# tcpSocket:
# port: 8080
# initialDelaySeconds: 15
# periodSeconds: 10
volumeMounts:
# /root/.cache is backed by the PVC below; run.sh is started from this path.
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
# Memory-backed emptyDir mounted at /dev/shm, capped at 15Gi.
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: nv-action-vllm-benchmarks-v2
# Host directory mounted into the container at the same path.
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
# Worker pod template: same image, env, command, resources and volumes as the leader,
# but without the "role" label and without a declared container port.
workerTemplate:
spec:
containers:
- name: vllm-worker
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION
value: "v0.11.0"
- name: VLLM_ASCEND_VERSION
value: "main"
- name: MOONCAKE_VERSION
value: "06cc217504a6f1b0cdaa26b096b985651b262748"
# Same entry command as the leader: run the test script, then keep the container running.
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
tail -f /dev/null
resources:
limits:
huawei.com/ascend-1980: "16"
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: "16"
ephemeral-storage: 100Gi
cpu: 125
volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
# Memory-backed emptyDir mounted at /dev/shm, capped at 15Gi.
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: nv-action-vllm-benchmarks-v2
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
---
# ClusterIP Service "vllm-leader" in namespace "vllm-project".
# Forwards TCP port 8080 to port 8080 of the pods matched by the selector below.
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
namespace: vllm-project
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
# Selects pods carrying both labels: the LeaderWorkerSet name "vllm" and role "leader".
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP

View File

@@ -0,0 +1,145 @@
#!/bin/bash
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Directory that holds the source checkouts. WORKSPACE must already be set in
# the environment; with nounset active the script aborts here otherwise.
SRC_DIR="${WORKSPACE}/source_code"
export SRC_DIR
#######################################
# Print NPU diagnostics: the output of `npu-smi info` followed by the content
# of the Ascend toolkit install-info file for the platform that `uname -i`
# reports.
# Outputs: writes a banner and both reports to stdout
#######################################
check_npu_info() {
  local toolkit_root="/usr/local/Ascend/ascend-toolkit/latest"

  printf '%s\n' "====> Check NPU info"
  npu-smi info
  cat "${toolkit_root}/$(uname -i)-linux/ascend_toolkit_install.info"
}
#######################################
# Point git and pip at mirror endpoints.
# Globals: PIP_EXTRA_INDEX_URL (written and exported)
# Outputs: writes a banner to stdout
#######################################
check_and_config() {
  local github_url="https://github.com/"
  local github_proxy="https://gh-proxy.test.osinfra.cn/https://github.com/"
  local pypi_index="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"

  printf '%s\n' "====> Configure mirrors and git proxy"

  # Rewrite every https://github.com/ URL so git goes through the proxy.
  git config --global "url.${github_proxy}.insteadOf" "${github_url}"

  # Default package index for pip.
  pip config set global.index-url "${pypi_index}"

  # Additional index, exported so later pip invocations inherit it.
  PIP_EXTRA_INDEX_URL="https://mirrors.huaweicloud.com/ascend/repos/pypi"
  export PIP_EXTRA_INDEX_URL
}
#######################################
# Clone vllm-ascend, vllm and Mooncake under SRC_DIR. A repository whose
# target directory already exists is skipped.
# Globals:
#   SRC_DIR (read), VLLM_ASCEND_VERSION (read), VLLM_VERSION (read)
# Outputs: writes a banner to stdout
# Returns: non-zero (the shell exits) when a required version variable is
#          unset or empty at the moment it is needed, or when git fails
#######################################
checkout_src() {
  echo "====> Checkout source code"
  mkdir -p "${SRC_DIR:?SRC_DIR must be set}"

  # vllm-ascend
  # The version is quoted and checked with :? so that an empty value fails
  # here instead of letting `-b` swallow the repository URL as its argument.
  if [[ ! -d "$SRC_DIR/vllm-ascend" ]]; then
    git clone --depth 1 \
      -b "${VLLM_ASCEND_VERSION:?VLLM_ASCEND_VERSION must be set}" \
      https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
  fi

  # vllm
  if [[ ! -d "$SRC_DIR/vllm" ]]; then
    git clone -b "${VLLM_VERSION:?VLLM_VERSION must be set}" \
      https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
  fi

  # mooncake
  # NOTE(review): the branch is hard-coded; a MOONCAKE_VERSION variable is not
  # consulted here — confirm whether the checkout should be pinned to it.
  if [[ ! -d "$SRC_DIR/Mooncake" ]]; then
    git clone -b pooling_async_memecpy_v1 \
      https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
  fi
}
#######################################
# Install the apt packages listed in vllm-ascend's packages.txt plus a fixed
# set of build tools.
# packages.txt is read one entry per line; surrounding whitespace is trimmed,
# blank lines and lines starting with '#' are ignored.
# Globals: SRC_DIR (read)
# Outputs: writes a banner to stdout
# Returns: non-zero if packages.txt cannot be read or apt-get fails
#######################################
install_sys_dependencies() {
  local packages_file="$SRC_DIR/vllm-ascend/packages.txt"
  local -a dep_list=()
  local line

  echo "====> Install system dependencies"
  apt-get update -y

  # `|| [[ -n "$line" ]]` keeps the final entry when the file does not end
  # with a newline (read returns non-zero there but still fills $line).
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Trim leading and trailing whitespace (this also drops a trailing CR).
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    if [[ -z "$line" || "$line" == \#* ]]; then
      continue
    fi
    dep_list+=("$line")
  done < "$packages_file"

  apt-get install -y "${dep_list[@]}" gcc g++ cmake libnuma-dev iproute2
}
#######################################
# Install vllm and vllm-ascend in editable mode from their checkouts under
# SRC_DIR, then modelscope and the vllm-ascend development requirements.
# Globals: SRC_DIR (read)
# Outputs: writes a banner to stdout
#######################################
install_vllm() {
  printf '%s\n' "====> Install vllm and vllm-ascend"

  local vllm_src="${SRC_DIR}/vllm"
  local ascend_src="${SRC_DIR}/vllm-ascend"

  # VLLM_TARGET_DEVICE=empty is set for this single pip invocation only.
  VLLM_TARGET_DEVICE=empty pip install -e "${vllm_src}"
  pip install -e "${ascend_src}"
  pip install modelscope

  # Development requirements (needed for pytest).
  pip install -r "${ascend_src}/requirements-dev.txt"
}
#######################################
# Build and install Mooncake from the checkout at $SRC_DIR/Mooncake, then copy
# two built shared libraries into the Ascend toolkit's Python site-packages.
# Globals:
#   SRC_DIR (read)
#   CPATH   (read if set; extended and exported)
# Outputs: writes a banner to stdout
# Returns: non-zero if a directory change or any build/install step fails;
#          on success the working directory is the one the caller started in
#######################################
install_mooncake() {
  local mooncake_dir="$SRC_DIR/Mooncake"
  local site_packages="/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/"
  local start_dir="$PWD"

  echo "====> Install mooncake"
  apt-get update -y
  apt-get install -y --no-install-recommends mpich libmpich-dev

  cd "$mooncake_dir" || return 1
  bash dependencies.sh --yes

  # NOTE(review): MPI packages are purged and mpich reinstalled right after
  # dependencies.sh — presumably because that script installs OpenMPI; confirm.
  apt purge mpich libmpich-dev -y
  apt purge openmpi-bin -y
  apt purge openmpi-bin libopenmpi-dev -y
  apt install mpich libmpich-dev -y

  # CPATH may be unset; a bare $CPATH would abort the script under `set -u`.
  # ${CPATH:+:...} also avoids a trailing ':' (an empty CPATH entry) when
  # there is no previous value.
  export CPATH="/usr/lib/aarch64-linux-gnu/mpich/include/${CPATH:+:${CPATH}}"
  export CPATH="/usr/lib/aarch64-linux-gnu/openmpi/lib:${CPATH}"

  # -p keeps a re-run working when the build directory already exists.
  mkdir -p build
  cd build || return 1
  cmake ..
  make -j
  make install

  # Paths below are relative to the build directory.
  cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so "$site_packages"
  cp mooncake-transfer-engine/src/libtransfer_engine.so "$site_packages"

  cd "$start_dir" || return 1
}
#######################################
# Force-kill every process matched by `pgrep python3` and `pgrep VLLM`, then
# wait 4 seconds.
# Best effort: finding nothing to kill, or a process exiting before the
# signal is sent, is not an error.
# Returns: exit status of the final sleep
#######################################
kill_npu_processes() {
  local pattern
  local -a pids

  for pattern in python3 VLLM; do
    pids=()
    # pgrep exits 1 when nothing matches; reading it through process
    # substitution keeps that status from tripping `set -e` / `pipefail`,
    # which would otherwise abort the whole script.
    mapfile -t pids < <(pgrep "$pattern")
    if (( ${#pids[@]} > 0 )); then
      # Intentional `|| true`: a target may already be gone by the time the
      # signal is delivered.
      kill -9 "${pids[@]}" || true
    fi
  done

  sleep 4
}
#######################################
# Run the multi-node pytest entry once per YAML config file and write a
# Markdown summary.
# For every tests/e2e/nightly/multi_node/config/models/*.yaml (relative to the
# current directory) CONFIG_YAML_PATH is exported, pytest is run, the result
# is appended to the report, and kill_npu_processes is called.
# Globals:
#   REPORT_FILE      (read, optional) report path; defaults to
#                    /root/.cache/test_summary.md
#   CONFIG_YAML_PATH (written and exported)
# Outputs: progress messages on stdout; the report file
# Returns: 0 even when individual tests fail — failures are only recorded in
#          the report; non-zero only if writing the report or the cleanup
#          step fails
#######################################
run_tests() {
  local report_file="${REPORT_FILE:-/root/.cache/test_summary.md}"
  # NOTE(review): both paths are relative to the current directory —
  # presumably the vllm-ascend checkout; confirm the caller's cwd.
  local config_dir="tests/e2e/nightly/multi_node/config/models"
  local test_script="tests/e2e/nightly/multi_node/test_multi_node.py"
  local -A results=()
  local total=0
  local passed=0
  local failed=0
  local file

  echo "====> Run tests"
  # With nullglob an empty config directory yields zero iterations instead of
  # one iteration over the literal pattern.
  shopt -s nullglob

  {
    echo "# Nightly Multi-node Test Summary"
    echo ""
    echo "| Config File | Result |"
    echo "|--------------|---------|"
  } > "$report_file"

  for file in "$config_dir"/*.yaml; do
    export CONFIG_YAML_PATH="$file"
    echo "Running test with config: $CONFIG_YAML_PATH"
    # Counters use x=$((x + 1)): the ((x++)) form returns status 1 when the
    # old value is 0, which aborts the script under `set -e`.
    if pytest -sv "$test_script"; then
      results["$file"]="✅ PASS"
      passed=$((passed + 1))
    else
      results["$file"]="❌ FAIL"
      failed=$((failed + 1))
    fi
    total=$((total + 1))
    echo "| \`$file\` | ${results[$file]} |" >> "$report_file"
    echo "------------------------------------------"
    kill_npu_processes
  done
  shopt -u nullglob

  {
    echo ""
    echo "## Summary"
    echo "- **Total:** $total"
    echo "- **Passed:** $passed"
    echo "- **Failed:** $failed"
  } >> "$report_file"

  echo
  echo "✅ Markdown report written to: $report_file"
}
#######################################
# Entry point: run every stage of the CI job in a fixed order — NPU check,
# mirror configuration, source checkout, system dependencies, vllm install,
# Mooncake install, test run.
#######################################
main() {
  local stage
  local -a stages=(
    check_npu_info
    check_and_config
    checkout_src
    install_sys_dependencies
    install_vllm
    install_mooncake
    run_tests
  )

  for stage in "${stages[@]}"; do
    "$stage"
  done
}
# Run the job, forwarding the script's command-line arguments unchanged.
main "$@"