[CI][Doc] Optimize multi-node CI (#3565)
### What this PR does / why we need it?
This pull request mainly do the following things:
1. Add a doc for multi-node CI, The main content is the mechanism
principle and how to contribute
2. Simplify the config yaml for more developer-friendly
3. Optimized the mooncake installation script to prevent accidental
failures during installation
4. Fix the workflow to ensure the kubernetes can be apply correctly
5. Add Qwen3-235B-W8A8 disaggregated_prefill test
6. Add GLM-4.5 multi dp test
7. Add 2p1d 4nodes disaggregated_prefill test
8. Refactor nightly tests
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -1,7 +1,47 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
export SRC_DIR="$WORKSPACE/source_code"
|
||||
# Color definitions
|
||||
GREEN="\033[0;32m"
|
||||
BLUE="\033[0;34m"
|
||||
YELLOW="\033[0;33m"
|
||||
RED="\033[0;31m"
|
||||
NC="\033[0m" # No Color
|
||||
|
||||
# Configuration
|
||||
GOVER=1.23.8
|
||||
LOG_DIR="/root/.cache/tests/logs"
|
||||
OVERWRITE_LOGS=true
|
||||
SRC_DIR="$WORKSPACE/source_code"
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
|
||||
# Function to print section headers
|
||||
print_section() {
|
||||
echo -e "\n${BLUE}=== $1 ===${NC}"
|
||||
}
|
||||
|
||||
# Function to print success messages
|
||||
print_success() {
|
||||
echo -e "${GREEN}✓ $1${NC}"
|
||||
}
|
||||
|
||||
# Function to print error messages and exit
|
||||
print_error() {
|
||||
echo -e "${RED}✗ ERROR: $1${NC}"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Function to check command success
|
||||
check_success() {
|
||||
if [ $? -ne 0 ]; then
|
||||
print_error "$1"
|
||||
fi
|
||||
}
|
||||
|
||||
if [ $(id -u) -ne 0 ]; then
|
||||
print_error "Require root permission, try sudo ./dependencies.sh"
|
||||
fi
|
||||
|
||||
|
||||
check_npu_info() {
|
||||
echo "====> Check NPU info"
|
||||
@@ -22,18 +62,13 @@ checkout_src() {
|
||||
|
||||
# vllm-ascend
|
||||
if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
|
||||
git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
|
||||
git clone --depth 1 -b $VLLM_ASCEND_VERSION $VLLM_ASCEND_REMOTE_URL "$SRC_DIR/vllm-ascend"
|
||||
fi
|
||||
|
||||
# vllm
|
||||
if [ ! -d "$SRC_DIR/vllm" ]; then
|
||||
git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
|
||||
fi
|
||||
|
||||
#mooncake
|
||||
if [ ! -d "$SRC_DIR/Mooncake" ]; then
|
||||
git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
|
||||
fi
|
||||
}
|
||||
|
||||
install_sys_dependencies() {
|
||||
@@ -57,28 +92,55 @@ install_vllm() {
|
||||
pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
|
||||
}
|
||||
|
||||
install_mooncake() {
|
||||
echo "====> Install mooncake"
|
||||
apt-get update -y
|
||||
apt-get install -y --no-install-recommends mpich libmpich-dev
|
||||
cd $SRC_DIR/Mooncake
|
||||
bash dependencies.sh --yes
|
||||
apt purge mpich libmpich-dev -y
|
||||
apt purge openmpi-bin -y
|
||||
apt purge openmpi-bin libopenmpi-dev -y
|
||||
apt install mpich libmpich-dev -y
|
||||
export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
|
||||
export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
|
||||
download_go() {
|
||||
ARCH=$(uname -m)
|
||||
GOVER=1.23.8
|
||||
if [ "$ARCH" = "aarch64" ]; then
|
||||
ARCH="arm64"
|
||||
elif [ "$ARCH" = "x86_64" ]; then
|
||||
ARCH="amd64"
|
||||
else
|
||||
echo "Unsupported architecture: $ARCH"
|
||||
exit 1
|
||||
fi
|
||||
# Download Go
|
||||
echo "Downloading Go $GOVER..."
|
||||
wget -q --show-progress https://golang.google.cn/dl/go$GOVER.linux-$ARCH.tar.gz
|
||||
check_success "Failed to download Go $GOVER"
|
||||
|
||||
mkdir build
|
||||
cd -
|
||||
cd $SRC_DIR/Mooncake/build
|
||||
cmake ..
|
||||
make -j
|
||||
make install
|
||||
cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
|
||||
cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
|
||||
cd -
|
||||
# Install Go
|
||||
echo "Installing Go $GOVER..."
|
||||
tar -C /usr/local -xzf go$GOVER.linux-$ARCH.tar.gz
|
||||
check_success "Failed to install Go $GOVER"
|
||||
|
||||
# Clean up downloaded file
|
||||
rm -f go$GOVER.linux-$ARCH.tar.gz
|
||||
check_success "Failed to clean up Go installation file"
|
||||
|
||||
print_success "Go $GOVER installed successfully"
|
||||
}
|
||||
|
||||
install_go() {
|
||||
# Check if Go is already installed
|
||||
if command -v go &> /dev/null; then
|
||||
GO_VERSION=$(go version | awk '{print $3}')
|
||||
if [[ "$GO_VERSION" == "go$GOVER" ]]; then
|
||||
echo -e "${YELLOW}Go $GOVER is already installed. Skipping...${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}Found Go $GO_VERSION. Will install Go $GOVER...${NC}"
|
||||
download_go
|
||||
fi
|
||||
else
|
||||
download_go
|
||||
fi
|
||||
|
||||
# Add Go to PATH if not already there
|
||||
if ! grep -q "export PATH=\$PATH:/usr/local/go/bin" ~/.bashrc; then
|
||||
echo -e "${YELLOW}Adding Go to your PATH in ~/.bashrc${NC}"
|
||||
echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
|
||||
echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal to use Go${NC}"
|
||||
fi
|
||||
export PATH=$PATH:/usr/local/go/bin
|
||||
}
|
||||
|
||||
kill_npu_processes() {
|
||||
@@ -89,47 +151,14 @@ kill_npu_processes() {
|
||||
}
|
||||
|
||||
run_tests() {
|
||||
echo "====> Run tests"
|
||||
|
||||
shopt -s nullglob
|
||||
declare -A results
|
||||
local total=0
|
||||
local passed=0
|
||||
local failed=0
|
||||
|
||||
local REPORT_FILE="/root/.cache/test_summary.md"
|
||||
echo "#Nightly Multi-node Test Summary" > "$REPORT_FILE"
|
||||
echo "" >> "$REPORT_FILE"
|
||||
echo "| Config File | Result |" >> "$REPORT_FILE"
|
||||
echo "|--------------|---------|" >> "$REPORT_FILE"
|
||||
|
||||
for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do
|
||||
export CONFIG_YAML_PATH="$file"
|
||||
echo "Running test with config: $CONFIG_YAML_PATH"
|
||||
|
||||
if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then
|
||||
results["$file"]="✅ PASS"
|
||||
((passed++))
|
||||
else
|
||||
results["$file"]="❌ FAIL"
|
||||
((failed++))
|
||||
fi
|
||||
((total++))
|
||||
|
||||
echo "| \`$file\` | ${results[$file]} |" >> "$REPORT_FILE"
|
||||
echo "------------------------------------------"
|
||||
kill_npu_processes
|
||||
done
|
||||
shopt -u nullglob
|
||||
|
||||
echo "" >> "$REPORT_FILE"
|
||||
echo "## Summary" >> "$REPORT_FILE"
|
||||
echo "- **Total:** $total" >> "$REPORT_FILE"
|
||||
echo "- **Passed:** $passed ✅" >> "$REPORT_FILE"
|
||||
echo "- **Failed:** $failed ❌" >> "$REPORT_FILE"
|
||||
|
||||
echo
|
||||
echo "✅ Markdown report written to: $REPORT_FILE"
|
||||
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
|
||||
kill_npu_processes
|
||||
ret=$?
|
||||
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
||||
mkdir -p "$(dirname "$RESULT_PATH")"
|
||||
echo $ret > "$RESULT_PATH"
|
||||
fi
|
||||
return $ret
|
||||
}
|
||||
|
||||
main() {
|
||||
@@ -138,7 +167,12 @@ main() {
|
||||
checkout_src
|
||||
install_sys_dependencies
|
||||
install_vllm
|
||||
install_mooncake
|
||||
# to speed up mooncake build process, install Go here
|
||||
install_go
|
||||
cd "$WORKSPACE/source_code"
|
||||
. $SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \
|
||||
pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3
|
||||
cd "$WORKSPACE/source_code/vllm-ascend"
|
||||
run_tests
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user