Files
xc-llm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh
Li Wang 7f73c28a24 [CI][Doc] Optimize multi-node CI (#3565)
### What this PR does / why we need it?
This pull request mainly do the following things:
1. Add a doc for multi-node CI, The main content is the mechanism
principle and how to contribute
2. Simplify the config yaml for more developer-friendly
3. Optimized the mooncake installation script to prevent accidental
failures during installation
4. Fix the workflow to ensure the kubernetes can be apply correctly
5. Add Qwen3-235B-W8A8 disaggregated_prefill test
6. Add GLM-4.5 multi dp test
7. Add 2p1d 4nodes disaggregated_prefill test
8. Refactor nightly tests
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-25 09:23:47 +08:00

114 lines
3.4 KiB
Bash

#!/bin/bash
set -e
set -o pipefail
GREEN="\033[0;32m"
BLUE="\033[0;34m"
YELLOW="\033[0;33m"
RED="\033[0;31m"
NC="\033[0m" # No Color
branch=${1:-pooling_async_memecpy_v1}
point=${2:-9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3}
repo_url="https://github.com/AscendTransport/Mooncake"
repo_name="Mooncake"
state_file=".build_state"
echo "[INFO] Branch: $branch"
echo "[INFO] Commit: $point"
echo "-------------------------------------------"
mark_done() { echo "$1" >> "$state_file"; }
is_done() { grep -Fxq "$1" "$state_file" 2>/dev/null; }
if ! is_done "clone"; then
echo "[STEP] Clone repository..."
if [ -d "$repo_name" ]; then
echo "[WARN] Directory $repo_name already exists, skipping clone."
else
git clone -b "$branch" "$repo_url" "$repo_name"
fi
cd "$repo_name"
git fetch --all
git checkout "$point" || { echo "[ERROR] Checkout failed."; exit 1; }
cd ..
mark_done "clone"
else
echo "[SKIP] Clone step already done."
fi
if ! is_done "deps"; then
cd "$repo_name"
echo "[STEP]Installing dependencies (ignore Go failure)..."
yes | bash dependencies.sh || echo "⚠️ dependencies.sh failed (Go install likely failed), continuing..."
cd ..
mark_done "deps"
else
echo "[SKIP] Dependencies already installed."
fi
if ! is_done "mpi"; then
echo "[STEP] Install MPI..."
apt purge -y mpich libmpich-dev openmpi-bin libopenmpi-dev || true
apt install -y mpich libmpich-dev
export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:${CPATH:-}
export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:${CPATH:-}
mark_done "mpi"
else
echo "[SKIP] MPI installation already done."
fi
if ! is_done "build"; then
echo "[STEP] Compile and install..."
cd "$repo_name"
if [ -d "build" ]; then
echo "[INFO] Removing existing build directory..."
rm -rf build
fi
mkdir build && cd build
cmake .. || { echo "[ERROR] cmake failed."; exit 1; }
make -j || { echo "[ERROR] make failed."; exit 1; }
make install || { echo "[ERROR] make install failed."; exit 1; }
mark_done "build"
else
echo "[SKIP] Build already done."
fi
if ! is_done "copy_lib"; then
echo "[STEP] Copy library files..."
cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so \
/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cp mooncake-transfer-engine/src/libtransfer_engine.so \
/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cd ..
mark_done "copy_lib"
else
echo "[SKIP] Library copy already done."
fi
if ! grep -q "export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH" ~/.bashrc; then
echo -e "${YELLOW}Adding LD_LIBRARY_PATH to your PATH in ~/.bashrc${NC}"
echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH' >> ~/.bashrc
echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal${NC}"
fi
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
echo "=========================================="
echo -e "${GREEN}[SUCCESS] Mooncake build completed!"
echo "You can rerun this script anytime — it will resume from the last step."
echo "=========================================="
echo "Example startup command:"
echo "mooncake_master --eviction_high_watermark_ratio 0.8 --eviction_ratio 0.05 --port 50088"