[CI] Refator multi-node CI (#3487)

### What this PR does / why we need it? Refactor the multi-machine CI use case. The purpose of this PR is to increase the ease of adding multi-machine CI use cases, allowing developers to add multi-machine cluster model testing use cases (including PD separation) by simply adding a new YAML configuration file. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-17 09:04:31 +08:00
parent ccb6fb9ec1
commit 4c4a8458a5
18 changed files with 632 additions and 437 deletions
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+set -euo pipefail
+
+export SRC_DIR="$WORKSPACE/source_code"
+
+check_npu_info() {
+    echo "====> Check NPU info"
+    npu-smi info
+    cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
+}
+
+check_and_config() {
+    echo "====> Configure mirrors and git proxy"
+    git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
+    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
+}
+
+checkout_src() {
+    echo "====> Checkout source code"
+    mkdir -p "$SRC_DIR"
+
+    # vllm-ascend
+    if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
+        git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
+    fi
+
+    # vllm
+    if [ ! -d "$SRC_DIR/vllm" ]; then
+        git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
+    fi
+
+    #mooncake
+    if [ ! -d "$SRC_DIR/Mooncake" ]; then
+        git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
+    fi
+}
+
+install_sys_dependencies() {
+    echo "====> Install system dependencies"
+    apt-get update -y
+
+    DEP_LIST=()
+    while IFS= read -r line; do
+        [[ -n "$line" && ! "$line" =~ ^# ]] && DEP_LIST+=("$line")
+    done < "$SRC_DIR/vllm-ascend/packages.txt"
+
+    apt-get install -y "${DEP_LIST[@]}" gcc g++ cmake libnuma-dev iproute2
+}
+
+install_vllm() {
+    echo "====> Install vllm and vllm-ascend"
+    VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
+    pip install -e "$SRC_DIR/vllm-ascend"
+    pip install modelscope
+    # Install for pytest
+    pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
+}
+
+install_mooncake() {
+    echo "====> Install mooncake"
+    apt-get update -y
+    apt-get install -y --no-install-recommends mpich libmpich-dev
+    cd $SRC_DIR/Mooncake
+    bash dependencies.sh --yes
+    apt purge mpich libmpich-dev -y
+    apt purge openmpi-bin -y
+    apt purge openmpi-bin libopenmpi-dev -y
+    apt install mpich libmpich-dev -y
+    export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
+    export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
+
+    mkdir build
+    cd -
+    cd $SRC_DIR/Mooncake/build
+    cmake ..
+    make -j
+    make install
+    cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
+    cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
+    cd -
+}
+
+kill_npu_processes() {
+  pgrep python3 | xargs -r kill -9
+  pgrep VLLM | xargs -r kill -9
+
+  sleep 4
+}
+
+run_tests() {
+    echo "====> Run tests"
+
+    shopt -s nullglob
+    declare -A results
+    local total=0
+    local passed=0
+    local failed=0
+
+    local REPORT_FILE="/root/.cache/test_summary.md"
+    echo "#Nightly Multi-node Test Summary" > "$REPORT_FILE"
+    echo "" >> "$REPORT_FILE"
+    echo "| Config File | Result |" >> "$REPORT_FILE"
+    echo "|--------------|---------|" >> "$REPORT_FILE"
+
+    for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do
+        export CONFIG_YAML_PATH="$file"
+        echo "Running test with config: $CONFIG_YAML_PATH"
+
+        if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then
+            results["$file"]="✅ PASS"
+            ((passed++))
+        else
+            results["$file"]="❌ FAIL"
+            ((failed++))
+        fi
+        ((total++))
+
+        echo "| \`$file\` | ${results[$file]} |" >> "$REPORT_FILE"
+        echo "------------------------------------------"
+        kill_npu_processes
+    done
+    shopt -u nullglob
+
+    echo "" >> "$REPORT_FILE"
+    echo "## Summary" >> "$REPORT_FILE"
+    echo "- **Total:** $total" >> "$REPORT_FILE"
+    echo "- **Passed:** $passed ✅" >> "$REPORT_FILE"
+    echo "- **Failed:** $failed ❌" >> "$REPORT_FILE"
+
+    echo
+    echo "✅ Markdown report written to: $REPORT_FILE"
+}
+
+main() {
+    check_npu_info
+    check_and_config
+    checkout_src
+    install_sys_dependencies
+    install_vllm
+    install_mooncake
+    run_tests
+}
+
+main "$@"