[CI] Refactor multi-node CI (#3487)
### What this PR does / why we need it? Refactor the multi-machine CI use case. The purpose of this PR is to increase the ease of adding multi-machine CI use cases, allowing developers to add multi-machine cluster model testing use cases (including PD separation) by simply adding a new YAML configuration file. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
145
tests/e2e/nightly/multi_node/scripts/run.sh
Normal file
145
tests/e2e/nightly/multi_node/scripts/run.sh
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/bin/bash
# Driver script for the nightly multi-node e2e run: checks the NPU
# environment, fetches and installs the required sources, then runs the
# multi-node test suite once per model config.
#
# Required environment:
#   WORKSPACE            - directory under which sources are checked out
#   VLLM_ASCEND_VERSION  - branch/tag of vllm-ascend to clone
#   VLLM_VERSION         - branch/tag of vllm to clone

# Abort on command failure, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Root directory for all source checkouts. The :? guard also rejects an
# empty WORKSPACE, which would otherwise silently resolve to /source_code.
export SRC_DIR="${WORKSPACE:?WORKSPACE must be set to the CI workspace directory}/source_code"
|
||||
|
||||
# Print diagnostic information about the NPU environment.
# Outputs: the output of `npu-smi info`, followed by the contents of the
#          Ascend toolkit install-info file, on stdout.
check_npu_info() {
    echo "====> Check NPU info"
    npu-smi info
    # The install-info file sits in a directory named after `uname -i`.
    cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
}
|
||||
|
||||
# Point git and pip at the mirrors used by this CI environment.
# Globals: PIP_EXTRA_INDEX_URL (exported)
# Side effects: rewrites the global git config and the global pip config.
check_and_config() {
    echo "====> Configure mirrors and git proxy"
    # Transparently rewrite every https://github.com/ URL to go through the proxy.
    git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
}
|
||||
|
||||
# Clone vllm-ascend, vllm and Mooncake into SRC_DIR.
# Globals: SRC_DIR, VLLM_ASCEND_VERSION, VLLM_VERSION (all read)
# A repository whose target directory already exists is left untouched, so
# the function can be re-run on a populated workspace.
checkout_src() {
    echo "====> Checkout source code"
    mkdir -p "$SRC_DIR"

    # vllm-ascend
    if [[ ! -d "$SRC_DIR/vllm-ascend" ]]; then
        git clone --depth 1 -b "$VLLM_ASCEND_VERSION" https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
    fi

    # vllm
    if [[ ! -d "$SRC_DIR/vllm" ]]; then
        git clone -b "$VLLM_VERSION" https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
    fi

    # mooncake
    if [[ ! -d "$SRC_DIR/Mooncake" ]]; then
        git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
    fi
}
|
||||
|
||||
# Install the apt packages listed in vllm-ascend's packages.txt plus the
# build tools this script needs.
# Globals: SRC_DIR (read)
# packages.txt format: one package per line; empty lines and lines that
# start with '#' are skipped.
install_sys_dependencies() {
    echo "====> Install system dependencies"
    apt-get update -y

    local -a dep_list=()
    local line
    # `|| [[ -n "$line" ]]` keeps the final entry when the file has no
    # trailing newline (read returns non-zero but still fills $line).
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ -n "$line" && ! "$line" =~ ^# ]]; then
            dep_list+=("$line")
        fi
    done < "$SRC_DIR/vllm-ascend/packages.txt"

    # The ${arr[@]+...} form expands to nothing for an empty array without
    # tripping `set -u` on older bash versions.
    apt-get install -y ${dep_list[@]+"${dep_list[@]}"} gcc g++ cmake libnuma-dev iproute2
}
|
||||
|
||||
# Install vllm and vllm-ascend in editable mode from the local checkouts,
# plus modelscope and the vllm-ascend development requirements.
# Globals: SRC_DIR (read)
install_vllm() {
    echo "====> Install vllm and vllm-ascend"
    # VLLM_TARGET_DEVICE=empty applies to this one pip invocation only.
    VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
    pip install -e "$SRC_DIR/vllm-ascend"
    pip install modelscope
    # Install for pytest
    pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
}
|
||||
|
||||
# Build Mooncake from the local checkout, install it, and copy the two
# transport shared libraries into the Ascend toolkit's site-packages.
# Globals: SRC_DIR (read), CPATH (prepended to and exported)
# The working directory of the caller is never changed: every step that
# needs a specific directory runs inside a subshell.
install_mooncake() {
    local mooncake_dir="$SRC_DIR/Mooncake"
    local site_packages="/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"

    echo "====> Install mooncake"
    apt-get update -y
    apt-get install -y --no-install-recommends mpich libmpich-dev

    (cd "$mooncake_dir" && bash dependencies.sh --yes)

    # Reset the MPI packages after dependencies.sh has run: purge mpich and
    # openmpi, then install mpich again.
    apt-get purge -y mpich libmpich-dev
    apt-get purge -y openmpi-bin libopenmpi-dev
    apt-get install -y mpich libmpich-dev

    # ${CPATH:+:$CPATH} avoids both an abort under `set -u` when CPATH is
    # unset and a stray trailing ':' when it is empty.
    export CPATH="/usr/lib/aarch64-linux-gnu/mpich/include/${CPATH:+:$CPATH}"
    export CPATH="/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH"

    # -p lets the function be re-run when the build directory already exists.
    mkdir -p "$mooncake_dir/build"
    (
        set -e
        cd "$mooncake_dir/build"
        cmake ..
        # Cap the job count; a bare `make -j` places no limit on parallel jobs.
        make -j"$(nproc)"
        make install
        cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so "$site_packages/"
        cp mooncake-transfer-engine/src/libtransfer_engine.so "$site_packages/"
    )
}
|
||||
|
||||
# Force-kill every process matched by `pgrep python3` and `pgrep VLLM`,
# then pause for four seconds.
# Best effort: finding nothing to kill is not an error.
kill_npu_processes() {
    # pgrep exits 1 when nothing matches; with `set -eo pipefail` that would
    # abort the whole script, so the failure is deliberately ignored.
    pgrep python3 | xargs -r kill -9 || true
    pgrep VLLM | xargs -r kill -9 || true

    sleep 4
}
|
||||
|
||||
# Run the multi-node pytest suite once per model config and write a
# Markdown summary report.
# Globals: CONFIG_YAML_PATH (exported for each run),
#          MULTI_NODE_REPORT_FILE (read, optional; defaults to
#          /root/.cache/test_summary.md)
# Config files are the *.yaml files under
# tests/e2e/nightly/multi_node/config/models, relative to the current
# directory. A failing config does not stop the remaining ones.
# NOTE(review): the function returns 0 even when some configs failed, so a
# failure is visible only in the report - confirm that this is intended.
run_tests() {
    echo "====> Run tests"

    local total=0
    local passed=0
    local failed=0
    local file outcome
    local report_file="${MULTI_NODE_REPORT_FILE:-/root/.cache/test_summary.md}"

    mkdir -p "$(dirname "$report_file")"
    {
        echo "# Nightly Multi-node Test Summary"
        echo ""
        echo "| Config File | Result |"
        echo "|--------------|---------|"
    } > "$report_file"

    # With nullglob an empty config directory yields zero iterations instead
    # of one iteration over the literal pattern.
    shopt -s nullglob
    for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do
        export CONFIG_YAML_PATH="$file"
        echo "Running test with config: $CONFIG_YAML_PATH"

        # Counters use $(( )) assignments: `((n++))` returns status 1 when n
        # was 0, which would terminate the script under `set -e`.
        if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then
            outcome="✅ PASS"
            passed=$((passed + 1))
        else
            outcome="❌ FAIL"
            failed=$((failed + 1))
        fi
        total=$((total + 1))

        echo "| \`$file\` | $outcome |" >> "$report_file"
        echo "------------------------------------------"
        kill_npu_processes
    done
    shopt -u nullglob

    {
        echo ""
        echo "## Summary"
        echo "- **Total:** $total"
        echo "- **Passed:** $passed ✅"
        echo "- **Failed:** $failed ❌"
    } >> "$report_file"

    echo
    echo "✅ Markdown report written to: $report_file"
}
|
||||
|
||||
# Run the full pipeline in order: environment check, mirror configuration,
# source checkout, dependency installation, builds, and finally the tests.
# Under `set -e` a failing step stops the pipeline.
main() {
    check_npu_info
    check_and_config
    checkout_src
    install_sys_dependencies
    install_vllm
    install_mooncake
    run_tests
}
|
||||
|
||||
# Script entry point: run the pipeline, passing along the script's arguments.
main "$@"
|
||||
Reference in New Issue
Block a user