[CI] Refactor multi-node CI (#3487)

### What this PR does / why we need it? Refactor the multi-machine CI use case. The purpose of this PR is to increase the ease of adding multi-machine CI use cases, allowing developers to add multi-machine cluster model testing use cases (including PD separation) by simply adding a new YAML configuration file. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-17 09:04:31 +08:00
parent ccb6fb9ec1
commit 4c4a8458a5
18 changed files with 632 additions and 437 deletions
--- a/tests/e2e/nightly/multi_node/scripts/lws.yaml
+++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml
@@ -0,0 +1,132 @@
+# LeaderWorkerSet for the nightly multi-node test: the leader and worker
+# containers both execute /root/.cache/tests/run.sh from the shared volume.
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: vllm
+  namespace: vllm-project
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    # NOTE(review): per the LeaderWorkerSet API, size is presumably the pod
+    # count per group with the leader included - confirm.
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          # Matched by the selector of the vllm-leader Service defined at the end of this file.
+          role: leader
+      spec:
+        containers:
+          - name: vllm-leader
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              # run.sh derives its source checkout directory from WORKSPACE.
+              - name: WORKSPACE
+                value: "/root/workspace"
+              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
+              - name: VLLM_VERSION
+                value: "v0.11.0"
+              - name: VLLM_ASCEND_VERSION
+                value: "main"
+              # NOTE(review): run.sh as added in this change does not read
+              # MOONCAKE_VERSION; it clones a fixed Mooncake branch instead.
+              - name: MOONCAKE_VERSION
+                value: "06cc217504a6f1b0cdaa26b096b985651b262748"
+            # Run the test script, then block forever so the container stays up.
+            # This sh -c script does not use `set -e`, so `tail` runs even when
+            # run.sh exits non-zero.
+            command:
+              - sh
+              - -c
+              - |
+                bash /root/.cache/tests/run.sh
+                tail -f /dev/null
+            resources:
+              # 16 units of the huawei.com/ascend-1980 extended resource per pod.
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            ports:
+              # Same port the vllm-leader Service targets.
+              - containerPort: 8080
+            # Readiness probe is currently disabled.
+            # readinessProbe:
+            #   tcpSocket:
+            #     port: 8080
+            #   initialDelaySeconds: 15
+            #   periodSeconds: 10
+            volumeMounts:
+              # Shared PVC: provides tests/run.sh and receives the
+              # test_summary.md report that run.sh writes.
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        # Memory-backed /dev/shm, capped at 15Gi.
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+        - name: shared-volume
+          persistentVolumeClaim:
+            claimName: nv-action-vllm-benchmarks-v2
+        # Host directory exposed at the same path inside the container.
+        - name: driver-tools
+          hostPath:
+            path: /usr/local/Ascend/driver/tools
+    # Worker pods: same image, env, command, resources and volumes as the
+    # leader, but without the role label and without a containerPort.
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              # run.sh derives its source checkout directory from WORKSPACE.
+              - name: WORKSPACE
+                value: "/root/workspace"
+              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
+              - name: VLLM_VERSION
+                value: "v0.11.0"
+              - name: VLLM_ASCEND_VERSION
+                value: "main"
+              # NOTE(review): run.sh as added in this change does not read
+              # MOONCAKE_VERSION; it clones a fixed Mooncake branch instead.
+              - name: MOONCAKE_VERSION
+                value: "06cc217504a6f1b0cdaa26b096b985651b262748"
+            # Run the test script, then block forever so the container stays up.
+            command:
+              - sh
+              - -c
+              - |
+                bash /root/.cache/tests/run.sh
+                tail -f /dev/null
+            resources:
+              # 16 units of the huawei.com/ascend-1980 extended resource per pod.
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            volumeMounts:
+              # Shared PVC: provides tests/run.sh and receives the
+              # test_summary.md report that run.sh writes.
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        # Memory-backed /dev/shm, capped at 15Gi.
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+        - name: shared-volume
+          persistentVolumeClaim:
+            claimName: nv-action-vllm-benchmarks-v2
+        # Host directory exposed at the same path inside the container.
+        - name: driver-tools
+          hostPath:
+            path: /usr/local/Ascend/driver/tools
+---
+# ClusterIP Service forwarding TCP port 8080 to port 8080 on the pods
+# matched by the selector below.
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-leader
+  namespace: vllm-project
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    # NOTE(review): presumably this label is applied by the LeaderWorkerSet
+    # controller to pods of the set named `vllm` - confirm.
+    leaderworkerset.sigs.k8s.io/name: vllm
+    # Set explicitly under leaderTemplate.metadata.labels in the manifest above.
+    role: leader
+  type: ClusterIP
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+#
+# Bootstrap and run the nightly multi-node e2e tests inside a pod:
+# check the NPU, configure mirrors, check out sources, install dependencies,
+# build Mooncake, then run every model config through pytest.
+#
+# Environment read by this script:
+#   WORKSPACE            root directory for the source checkouts
+#   VLLM_VERSION         git ref of vllm to clone
+#   VLLM_ASCEND_VERSION  git ref of vllm-ascend to clone
+set -euo pipefail
+
+# Abort with a clear message when WORKSPACE is unset *or empty*; an empty
+# value would otherwise silently turn SRC_DIR into "/source_code".
+export SRC_DIR="${WORKSPACE:?WORKSPACE must be set to a non-empty directory}/source_code"
+
+# Print NPU status via npu-smi, then dump the Ascend toolkit install-info
+# file for the architecture reported by `uname -i`.
+check_npu_info() {
+    printf '%s\n' "====> Check NPU info"
+    npu-smi info
+
+    local toolkit_root="/usr/local/Ascend/ascend-toolkit/latest"
+    cat "${toolkit_root}/$(uname -i)-linux/ascend_toolkit_install.info"
+}
+
+# Route git and pip through mirrors:
+#   - git: rewrite https://github.com/ URLs to go through the gh-proxy host
+#   - pip: set the global index URL and export an extra index URL
+# PIP_EXTRA_INDEX_URL is exported, so it stays set for the rest of the script.
+check_and_config() {
+    printf '%s\n' "====> Configure mirrors and git proxy"
+
+    git config --global \
+        url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf \
+        "https://github.com/"
+
+    pip config set global.index-url \
+        https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
+}
+
+checkout_src() {
+    echo "====> Checkout source code"
+    mkdir -p "$SRC_DIR"
+
+    # vllm-ascend
+    if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
+        git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
+    fi
+
+    # vllm
+    if [ ! -d "$SRC_DIR/vllm" ]; then
+        git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
+    fi
+
+    #mooncake
+    if [ ! -d "$SRC_DIR/Mooncake" ]; then
+        git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
+    fi
+}
+
+install_sys_dependencies() {
+    echo "====> Install system dependencies"
+    apt-get update -y
+
+    DEP_LIST=()
+    while IFS= read -r line; do
+        [[ -n "$line" && ! "$line" =~ ^# ]] && DEP_LIST+=("$line")
+    done < "$SRC_DIR/vllm-ascend/packages.txt"
+
+    apt-get install -y "${DEP_LIST[@]}" gcc g++ cmake libnuma-dev iproute2
+}
+
+# Install vllm and vllm-ascend in editable mode from the local checkouts,
+# then modelscope and vllm-ascend's development requirements.
+# Globals: SRC_DIR (read).
+install_vllm() {
+    printf '%s\n' "====> Install vllm and vllm-ascend"
+
+    local vllm_src="$SRC_DIR/vllm"
+    local ascend_src="$SRC_DIR/vllm-ascend"
+
+    # VLLM_TARGET_DEVICE=empty applies to this single pip invocation only.
+    VLLM_TARGET_DEVICE=empty pip install -e "$vllm_src"
+    pip install -e "$ascend_src"
+    pip install modelscope
+
+    # Development requirements (needed for pytest).
+    pip install -r "$ascend_src/requirements-dev.txt"
+}
+
+# Build Mooncake from $SRC_DIR/Mooncake, install it, and copy the two built
+# shared libraries into the Ascend toolkit's Python site-packages.
+# Globals: SRC_DIR (read), CPATH (extended and exported).
+# The working directory is restored before returning.
+install_mooncake() {
+    echo "====> Install mooncake"
+    local mooncake_dir="$SRC_DIR/Mooncake"
+    local site_packages="/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
+    local start_dir="$PWD"
+
+    apt-get update -y
+    apt-get install -y --no-install-recommends mpich libmpich-dev
+
+    cd "$mooncake_dir"
+    bash dependencies.sh --yes
+
+    # Drop any MPI packages present after dependencies.sh, then reinstall MPICH.
+    # NOTE(review): presumably dependencies.sh can pull in OpenMPI - confirm.
+    apt-get purge -y mpich libmpich-dev
+    apt-get purge -y openmpi-bin libopenmpi-dev
+    apt-get install -y mpich libmpich-dev
+
+    # ${CPATH:+...} avoids an "unbound variable" abort under `set -u` when
+    # CPATH is unset, and avoids a trailing ':' (an empty CPATH entry).
+    export CPATH="/usr/lib/aarch64-linux-gnu/mpich/include/${CPATH:+:${CPATH}}"
+    export CPATH="/usr/lib/aarch64-linux-gnu/openmpi/lib:${CPATH}"
+
+    # -p: do not fail when a previous run already created the build directory.
+    mkdir -p build
+    cd build
+    cmake ..
+    # Bound parallelism to the CPU count; a bare `make -j` is unlimited.
+    make -j "$(nproc)"
+    make install
+    cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so "$site_packages/"
+    cp mooncake-transfer-engine/src/libtransfer_engine.so "$site_packages/"
+
+    cd "$start_dir"
+}
+
+kill_npu_processes() {
+  pgrep python3 | xargs -r kill -9
+  pgrep VLLM | xargs -r kill -9
+
+  sleep 4
+}
+
+run_tests() {
+    echo "====> Run tests"
+
+    shopt -s nullglob
+    declare -A results
+    local total=0
+    local passed=0
+    local failed=0
+
+    local REPORT_FILE="/root/.cache/test_summary.md"
+    echo "#Nightly Multi-node Test Summary" > "$REPORT_FILE"
+    echo "" >> "$REPORT_FILE"
+    echo "| Config File | Result |" >> "$REPORT_FILE"
+    echo "|--------------|---------|" >> "$REPORT_FILE"
+
+    for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do
+        export CONFIG_YAML_PATH="$file"
+        echo "Running test with config: $CONFIG_YAML_PATH"
+
+        if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then
+            results["$file"]="✅ PASS"
+            ((passed++))
+        else
+            results["$file"]="❌ FAIL"
+            ((failed++))
+        fi
+        ((total++))
+
+        echo "| \`$file\` | ${results[$file]} |" >> "$REPORT_FILE"
+        echo "------------------------------------------"
+        kill_npu_processes
+    done
+    shopt -u nullglob
+
+    echo "" >> "$REPORT_FILE"
+    echo "## Summary" >> "$REPORT_FILE"
+    echo "- **Total:** $total" >> "$REPORT_FILE"
+    echo "- **Passed:** $passed ✅" >> "$REPORT_FILE"
+    echo "- **Failed:** $failed ❌" >> "$REPORT_FILE"
+
+    echo
+    echo "✅ Markdown report written to: $REPORT_FILE"
+}
+
+main() {
+    check_npu_info
+    check_and_config
+    checkout_src
+    install_sys_dependencies
+    install_vllm
+    install_mooncake
+    run_tests
+}
+
+main "$@"