[CI][Doc] Optimize multi-node CI (#3565)

### What this PR does / why we need it?
This pull request mainly does the following:
1. Adds a doc for multi-node CI; its main content is how the mechanism
works and how to contribute
2. Simplifies the config YAML to make it more developer-friendly
3. Optimizes the Mooncake installation script to prevent accidental
failures during installation
4. Fixes the workflow to ensure the Kubernetes resources can be applied correctly
5. Adds a Qwen3-235B-W8A8 disaggregated_prefill test
6. Adds a GLM-4.5 multi-DP test
7. Adds a 2p1d 4-node disaggregated_prefill test
8. Refactors the nightly tests
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-25 09:23:47 +08:00
committed by GitHub
parent 292cf339c3
commit 7f73c28a24
21 changed files with 1165 additions and 378 deletions

View File

@@ -0,0 +1,113 @@
#!/bin/bash
# Resumable build of Mooncake.
#
# Usage: <script> [branch] [commit]
#   $1  branch to clone      (default: pooling_async_memecpy_v1)
#   $2  commit to check out  (default: 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3)
#
# Each completed step is recorded as one line in a state file so that a rerun
# skips the steps that already finished.
set -e
set -o pipefail

# ANSI colour escapes, rendered by `echo -e`.
GREEN="\033[0;32m"
BLUE="\033[0;34m"
YELLOW="\033[0;33m"
RED="\033[0;31m"
NC="\033[0m" # No Color

branch=${1:-pooling_async_memecpy_v1}
point=${2:-9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3}
repo_url="https://github.com/AscendTransport/Mooncake"
repo_name="Mooncake"
# Anchor the state file to the directory the script was started from. The
# steps below change directory, and a relative path would make mark_done and
# is_done read and write different files depending on the current directory.
state_file="${PWD}/.build_state"

echo "[INFO] Branch: $branch"
echo "[INFO] Commit: $point"
echo "-------------------------------------------"

# mark_done STEP: append STEP to the state file.
mark_done() { printf '%s\n' "$1" >> "$state_file"; }

# is_done STEP: succeed iff STEP is recorded as a whole line in the state file.
# A missing state file simply means "nothing done yet", hence the 2>/dev/null.
is_done() { grep -Fxq -- "$1" "$state_file" 2>/dev/null; }
# Step "clone": fetch the Mooncake sources (unless the directory is already
# there) and pin the working tree to the requested commit.
if is_done "clone"; then
  echo "[SKIP] Clone step already done."
else
  echo "[STEP] Clone repository..."
  if [ ! -d "$repo_name" ]; then
    git clone -b "$branch" "$repo_url" "$repo_name"
  else
    echo "[WARN] Directory $repo_name already exists, skipping clone."
  fi
  # Operate on the checkout via -C instead of cd-ing in and out of it.
  git -C "$repo_name" fetch --all
  if ! git -C "$repo_name" checkout "$point"; then
    echo "[ERROR] Checkout failed."
    exit 1
  fi
  mark_done "clone"
fi
# Step "deps": run the repository's dependencies.sh, answering "y" to every
# prompt. A failure is tolerated on purpose (the step is still marked done).
if ! is_done "deps"; then
  cd "$repo_name"
  echo "[STEP]Installing dependencies (ignore Go failure)..."
  # With `pipefail` the pipeline status is dominated by `yes`, which is
  # terminated by SIGPIPE once dependencies.sh exits. Look at the status of
  # dependencies.sh itself so the warning is only printed on a real failure.
  deps_status=0
  yes | bash dependencies.sh || deps_status=${PIPESTATUS[1]}
  if [ "$deps_status" -ne 0 ]; then
    echo "⚠️ dependencies.sh failed (Go install likely failed), continuing..."
  fi
  cd ..
  mark_done "deps"
else
  echo "[SKIP] Dependencies already installed."
fi
# Step "mpi": replace whatever MPI packages are present with MPICH.
if ! is_done "mpi"; then
  echo "[STEP] Install MPI..."
  # Best-effort purge: the packages may simply not be installed.
  apt purge -y mpich libmpich-dev openmpi-bin libopenmpi-dev || true
  apt install -y mpich libmpich-dev
  mark_done "mpi"
else
  echo "[SKIP] MPI installation already done."
fi
# Export the include search path on every run, not only when the step above
# executes: on a resumed run the step is skipped but the build still needs it.
# The resulting order matches the original (openmpi entry, mpich entry, then
# any previous value); no empty trailing element is left when CPATH was unset.
# NOTE(review): the openmpi entry points at a lib directory of a package that
# was just purged - confirm it is still wanted.
export CPATH="/usr/lib/aarch64-linux-gnu/openmpi/lib:/usr/lib/aarch64-linux-gnu/mpich/include/${CPATH:+:${CPATH}}"
# Step "build": configure, compile and install Mooncake from a fresh build
# directory. The work happens in a subshell so the caller's working directory
# is left untouched; this keeps the relative paths used by the other steps
# (and by a resumed run) valid.
if ! is_done "build"; then
  echo "[STEP] Compile and install..."
  if [ -d "$repo_name/build" ]; then
    echo "[INFO] Removing existing build directory..."
    rm -rf "${repo_name:?}/build"
  fi
  mkdir "$repo_name/build"
  (
    cd "$repo_name/build" || exit 1
    cmake .. || { echo "[ERROR] cmake failed."; exit 1; }
    make -j || { echo "[ERROR] make failed."; exit 1; }
    make install || { echo "[ERROR] make install failed."; exit 1; }
  ) || exit 1
  mark_done "build"
else
  echo "[SKIP] Build already done."
fi

# Step "copy_lib": copy the two built shared objects into the Ascend toolkit's
# Python site-packages directory. Paths are spelled out relative to the start
# directory so the step also works when the build step was skipped.
if ! is_done "copy_lib"; then
  echo "[STEP] Copy library files..."
  mooncake_build_dir="$repo_name/build"
  ascend_site_packages=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
  cp "$mooncake_build_dir/mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so" \
    "$ascend_site_packages"
  cp "$mooncake_build_dir/mooncake-transfer-engine/src/libtransfer_engine.so" \
    "$ascend_site_packages"
  mark_done "copy_lib"
else
  echo "[SKIP] Library copy already done."
fi
# Persist the library search path in ~/.bashrc (once) and activate it for the
# current shell, then print the closing summary.
ascend_lib_dir=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages
# Single quotes keep $LD_LIBRARY_PATH literal: this is the exact line stored in
# ~/.bashrc, and the exact fixed string searched for (grep -F) so that reruns
# do not append it again.
bashrc_line='export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH'
if ! grep -Fq -- "$bashrc_line" ~/.bashrc 2>/dev/null; then
  echo -e "${YELLOW}Adding LD_LIBRARY_PATH to your PATH in ~/.bashrc${NC}"
  echo "$bashrc_line" >> ~/.bashrc
  echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal${NC}"
fi
# Safe when LD_LIBRARY_PATH is unset (also under `set -u`), and does not leave
# an empty trailing element in that case.
export LD_LIBRARY_PATH="${ascend_lib_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

echo "=========================================="
# Reset the colour after the success line so it does not bleed into the
# following output.
echo -e "${GREEN}[SUCCESS] Mooncake build completed!${NC}"
echo "You can rerun this script anytime — it will resume from the last step."
echo "=========================================="
echo "Example startup command:"
echo "mooncake_master --eviction_high_watermark_ratio 0.8 --eviction_ratio 0.05 --port 50088"

View File

@@ -17,19 +17,24 @@ spec:
- name: vllm-leader
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION
value: "v0.11.0"
- name: VLLM_ASCEND_VERSION
value: "main"
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
tail -f /dev/null
resources:
limits:
huawei.com/ascend-1980: "16"
@@ -70,19 +75,24 @@ spec:
- name: vllm-worker
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION
value: "v0.11.0"
- name: VLLM_ASCEND_VERSION
value: "main"
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
tail -f /dev/null
resources:
limits:
huawei.com/ascend-1980: "16"

View File

@@ -1,7 +1,47 @@
#!/bin/bash
set -euo pipefail
export SRC_DIR="$WORKSPACE/source_code"
# Color definitions (ANSI escapes, rendered by `echo -e` in the print helpers)
GREEN="\033[0;32m"
BLUE="\033[0;34m"
YELLOW="\033[0;33m"
RED="\033[0;31m"
NC="\033[0m" # No Color

# Configuration
GOVER=1.23.8   # Go version that install_go compares against `go version`
# LOG_DIR and OVERWRITE_LOGS are not referenced in the visible part of this
# script; kept as-is.
LOG_DIR="/root/.cache/tests/logs"
OVERWRITE_LOGS=true
SRC_DIR="$WORKSPACE/source_code"

# Put the Ascend toolkit's site-packages directory first on the library search
# path. The ${VAR:+...} form keeps this safe under `set -u` when
# LD_LIBRARY_PATH is unset and avoids an empty trailing path element.
export LD_LIBRARY_PATH="/usr/local/Ascend/ascend-toolkit/latest/python/site-packages${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# Print a section banner: an empty line followed by "=== <title> ===" wrapped
# in the BLUE / NC colour escapes.
#   $1 - section title
print_section() {
  local title=$1
  echo -e "\n${BLUE}=== ${title} ===${NC}"
}
# Print a message wrapped in the GREEN / NC colour escapes.
#   $1 - message text
print_success() {
  local message=$1
  echo -e "${GREEN}${message}${NC}"
}
# Print "✗ ERROR: <message>" wrapped in the RED / NC colour escapes, then
# terminate the shell with exit status 1.
#   $1 - message text
print_error() {
  local message=$1
  echo -e "${RED}✗ ERROR: ${message}${NC}"
  exit 1
}
# Inspect the exit status of the command that ran immediately before this
# call; if it is non-zero, hand the given message to print_error.
#   $1 - error message to report
# NOTE(review): the script enables `set -e`, so a failing plain command aborts
# the script before this check is reached - confirm this helper still has an
# effect where it is used.
check_success() {
  local previous_status=$?
  [ "$previous_status" -eq 0 ] || print_error "$1"
}
# Refuse to continue unless the effective user is root (uid 0). The hint
# names the script actually being run ($0) rather than a fixed file name.
if [ "$(id -u)" -ne 0 ]; then
  print_error "Require root permission, try sudo $0"
fi
check_npu_info() {
echo "====> Check NPU info"
@@ -22,18 +62,13 @@ checkout_src() {
# vllm-ascend
if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
git clone --depth 1 -b $VLLM_ASCEND_VERSION $VLLM_ASCEND_REMOTE_URL "$SRC_DIR/vllm-ascend"
fi
# vllm
if [ ! -d "$SRC_DIR/vllm" ]; then
git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
fi
#mooncake
if [ ! -d "$SRC_DIR/Mooncake" ]; then
git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
fi
}
install_sys_dependencies() {
@@ -57,28 +92,55 @@ install_vllm() {
pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
}
install_mooncake() {
echo "====> Install mooncake"
apt-get update -y
apt-get install -y --no-install-recommends mpich libmpich-dev
cd $SRC_DIR/Mooncake
bash dependencies.sh --yes
apt purge mpich libmpich-dev -y
apt purge openmpi-bin -y
apt purge openmpi-bin libopenmpi-dev -y
apt install mpich libmpich-dev -y
export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
download_go() {
ARCH=$(uname -m)
GOVER=1.23.8
if [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
elif [ "$ARCH" = "x86_64" ]; then
ARCH="amd64"
else
echo "Unsupported architecture: $ARCH"
exit 1
fi
# Download Go
echo "Downloading Go $GOVER..."
wget -q --show-progress https://golang.google.cn/dl/go$GOVER.linux-$ARCH.tar.gz
check_success "Failed to download Go $GOVER"
mkdir build
cd -
cd $SRC_DIR/Mooncake/build
cmake ..
make -j
make install
cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cd -
# Install Go
echo "Installing Go $GOVER..."
tar -C /usr/local -xzf go$GOVER.linux-$ARCH.tar.gz
check_success "Failed to install Go $GOVER"
# Clean up downloaded file
rm -f go$GOVER.linux-$ARCH.tar.gz
check_success "Failed to clean up Go installation file"
print_success "Go $GOVER installed successfully"
}
# Make sure Go $GOVER is available.
#
# If `go` is found and reports exactly "go$GOVER", nothing is downloaded;
# otherwise download_go is called. Afterwards /usr/local/go/bin is registered
# in ~/.bashrc (once) and put on PATH for the current shell.
# Globals: GOVER, YELLOW, NC (read); PATH (written)
install_go() {
  local installed_version
  local -r go_bin_dir=/usr/local/go/bin
  # Exact line stored in ~/.bashrc; single-quoted so $PATH stays literal.
  local -r bashrc_line='export PATH=$PATH:/usr/local/go/bin'

  # Check if Go is already installed
  if command -v go &> /dev/null; then
    installed_version=$(go version | awk '{print $3}')
    if [[ "$installed_version" == "go$GOVER" ]]; then
      echo -e "${YELLOW}Go $GOVER is already installed. Skipping...${NC}"
    else
      echo -e "${YELLOW}Found Go $installed_version. Will install Go $GOVER...${NC}"
      download_go
    fi
  else
    download_go
  fi

  # Add Go to PATH in ~/.bashrc if not already there (fixed-string match; a
  # missing ~/.bashrc counts as "not there").
  if ! grep -Fq -- "$bashrc_line" ~/.bashrc 2> /dev/null; then
    echo -e "${YELLOW}Adding Go to your PATH in ~/.bashrc${NC}"
    echo "$bashrc_line" >> ~/.bashrc
    echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal to use Go${NC}"
  fi

  # For the current shell, put the directory FIRST so that a different `go`
  # found earlier on PATH cannot shadow the one under /usr/local/go, and skip
  # the update when it is already first so repeated calls do not grow PATH.
  case "$PATH" in
    "$go_bin_dir" | "$go_bin_dir":*) ;;
    *) export PATH="${go_bin_dir}:${PATH}" ;;
  esac
}
kill_npu_processes() {
@@ -89,47 +151,14 @@ kill_npu_processes() {
}
# Run the multi-node nightly tests and record the outcome.
#
# NOTE(review): this listing appears to interleave two versions of the body
# (a per-config loop that writes a Markdown summary, followed by a single
# pytest run that writes a result file); no diff markers are visible here, so
# confirm which lines are live before relying on the comments below.
run_tests() {
echo "====> Run tests"
# nullglob: if no *.yaml file matches, the loop below runs zero times instead
# of iterating over the literal pattern.
shopt -s nullglob
declare -A results
local total=0
local passed=0
local failed=0
local REPORT_FILE="/root/.cache/test_summary.md"
# Start a fresh report: title, blank line and the Markdown table header.
echo "#Nightly Multi-node Test Summary" > "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
echo "| Config File | Result |" >> "$REPORT_FILE"
echo "|--------------|---------|" >> "$REPORT_FILE"
# One pytest invocation per model config; the config is handed over through
# the exported CONFIG_YAML_PATH environment variable.
for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do
export CONFIG_YAML_PATH="$file"
echo "Running test with config: $CONFIG_YAML_PATH"
if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then
results["$file"]="✅ PASS"
# NOTE(review): `((x++))` returns status 1 when x was 0; the script enables
# `set -e`, so the first increment of each counter may abort the run - confirm.
((passed++))
else
results["$file"]="❌ FAIL"
((failed++))
fi
((total++))
echo "| \`$file\` | ${results[$file]} |" >> "$REPORT_FILE"
echo "------------------------------------------"
# Clean up between configs (kill_npu_processes is defined elsewhere in this file).
kill_npu_processes
done
shopt -u nullglob
# Append the totals below the table.
echo "" >> "$REPORT_FILE"
echo "## Summary" >> "$REPORT_FILE"
echo "- **Total:** $total" >> "$REPORT_FILE"
echo "- **Passed:** $passed" >> "$REPORT_FILE"
echo "- **Failed:** $failed" >> "$REPORT_FILE"
echo
echo "✅ Markdown report written to: $REPORT_FILE"
# NOTE(review): with `set -e` active, a failing pytest here ends the script
# before the result file below is written - confirm this is intended.
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
kill_npu_processes
# NOTE(review): $? at this point is the exit status of kill_npu_processes (the
# command directly above), not of pytest - verify this is what should be
# recorded.
ret=$?
# Only the pod whose LWS_WORKER_INDEX is 0 writes the result file.
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
# NOTE(review): RESULT_PATH is not assigned anywhere in the visible source,
# while the pod template sets RESULT_FILE_PATH; under `set -u` an unset
# RESULT_PATH is a fatal error - verify the variable name.
mkdir -p "$(dirname "$RESULT_PATH")"
echo $ret > "$RESULT_PATH"
fi
return $ret
}
main() {
@@ -138,7 +167,12 @@ main() {
checkout_src
install_sys_dependencies
install_vllm
install_mooncake
# to speed up mooncake build process, install Go here
install_go
cd "$WORKSPACE/source_code"
. $SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \
pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3
cd "$WORKSPACE/source_code/vllm-ascend"
run_tests
}