[1/N][CI] Add multi node test (#3359)

### What this PR does / why we need it? This PR adds a multi-node test. As a first step, it adds a `deepseek-v3` dp+tp+ep test. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-11 14:50:46 +08:00
parent 82b6c846ca
commit 9eb103607f
11 changed files with 897 additions and 1 deletions
--- a/tests/e2e/multi_node/scripts/lws.yaml
+++ b/tests/e2e/multi_node/scripts/lws.yaml
@@ -0,0 +1,142 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet  # Leader/worker pod group; both templates below launch the same test entry script.
+metadata:
+  name: vllm
+  namespace: vllm-project
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    size: 2  # Same value as WORLD_SIZE passed to both containers below.
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader  # Selected by the vllm-leader Service.
+      spec:
+        containers:
+          - name: vllm-leader
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              - name: VLLM_USE_MODELSCOPE
+                value: "true"
+              - name: WORKSPACE  # scripts/run.sh derives SRC_DIR ($WORKSPACE/source_code) from this.
+                value: "/root/workspace"
+              - name: WORLD_SIZE
+                value: "2"
+              - name: NPU_PER_NODE  # Same count as the huawei.com/ascend-1980 resources below.
+                value: "16"
+              # vLLM and vLLM-Ascend versions under test (scripts/run.sh clones these git refs); update them when a new release is published.
+              - name: VLLM_VERSION
+                value: "v0.11.0"
+              - name: VLLM_ASCEND_VERSION
+                value: "main"
+              - name: MOONCAKE_VERSION  # Mooncake git commit hash.
+                value: "06cc217504a6f1b0cdaa26b096b985651b262748"
+            command:  # Runs run.sh from the shared volume at /root/.cache; NOTE(review): presumably a copy of scripts/run.sh - confirm how it is placed there.
+              - sh
+              - -c
+              - |
+                bash /root/.cache/tests/run.sh
+            resources:
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            ports:
+              - containerPort: 8080  # Target port of the vllm-leader Service.
+            # readinessProbe:
+            #   tcpSocket:
+            #     port: 8080
+            #   initialDelaySeconds: 15
+            #   periodSeconds: 10
+            volumeMounts:
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        - name: dshm  # Memory-backed emptyDir mounted at /dev/shm.
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+        - name: shared-volume  # PVC mounted at /root/.cache; the same claim is used by the worker.
+          persistentVolumeClaim:
+            claimName: nv-action-vllm-benchmarks-v2
+        - name: driver-tools  # Host directory exposed at the same path inside the container.
+          hostPath:
+            path: /usr/local/Ascend/driver/tools
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              - name: VLLM_USE_MODELSCOPE
+                value: "true"
+              - name: WORKSPACE  # scripts/run.sh derives SRC_DIR ($WORKSPACE/source_code) from this.
+                value: "/root/workspace"
+              - name: WORLD_SIZE
+                value: "2"
+              - name: NPU_PER_NODE  # Same count as the huawei.com/ascend-1980 resources below.
+                value: "16"
+              # vLLM and vLLM-Ascend versions under test (scripts/run.sh clones these git refs); update them when a new release is published.
+              - name: VLLM_VERSION
+                value: "v0.11.0"
+              - name: VLLM_ASCEND_VERSION
+                value: "main"
+              - name: MOONCAKE_VERSION  # Mooncake git commit hash.
+                value: "06cc217504a6f1b0cdaa26b096b985651b262748"
+            command:  # Same entry script as the leader container.
+              - sh
+              - -c
+              - |
+                bash /root/.cache/tests/run.sh
+            resources:
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            volumeMounts:
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+        - name: dshm  # Memory-backed emptyDir mounted at /dev/shm.
+          emptyDir:
+            medium: Memory
+            sizeLimit: 15Gi
+        - name: shared-volume  # PVC mounted at /root/.cache; the same claim is used by the leader.
+          persistentVolumeClaim:
+            claimName: nv-action-vllm-benchmarks-v2
+        - name: driver-tools  # Host directory exposed at the same path inside the container.
+          hostPath:
+            path: /usr/local/Ascend/driver/tools
+---
+apiVersion: v1
+kind: Service  # Cluster-internal endpoint for port 8080 of the leader pod.
+metadata:
+  name: vllm-leader
+  namespace: vllm-project
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080  # Equals the containerPort declared by the vllm-leader container.
+  selector:
+    leaderworkerset.sigs.k8s.io/name: vllm  # Value equals the LeaderWorkerSet's metadata.name.
+    role: leader  # Label set in leaderTemplate.metadata.labels.
+  type: ClusterIP
--- a/tests/e2e/multi_node/scripts/run.sh
+++ b/tests/e2e/multi_node/scripts/run.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Multi-node e2e entry point: prepares the environment, installs vLLM and
+# vLLM-Ascend from source, then runs the multi-node pytest file.
+# Required environment: WORKSPACE, VLLM_VERSION, VLLM_ASCEND_VERSION.
+set -euo pipefail
+
+# Fail early with a clear message when WORKSPACE is unset or empty; an empty
+# value would otherwise silently place the checkouts under /source_code.
+: "${WORKSPACE:?WORKSPACE must be set to the working directory}"
+export SRC_DIR="$WORKSPACE/source_code"
+
+# Print the NPU status reported by npu-smi, then dump the Ascend toolkit
+# install-info file for the current hardware platform.
+check_npu_info() {
+    local toolkit_root="/usr/local/Ascend/ascend-toolkit/latest"
+
+    printf '%s\n' "====> Check NPU info"
+    npu-smi info
+    cat "${toolkit_root}/$(uname -i)-linux/ascend_toolkit_install.info"
+}
+
+# Rewrite GitHub URLs to go through the proxy, point pip at the mirror index,
+# and export an extra index URL for later pip invocations.
+check_and_config() {
+    local proxied_prefix="https://gh-proxy.test.osinfra.cn/https://github.com/"
+    local pypi_index="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+
+    printf '%s\n' "====> Configure mirrors and git proxy"
+    # Any clone of https://github.com/... is transparently rewritten to the proxy.
+    git config --global "url.${proxied_prefix}.insteadOf" "https://github.com/"
+    pip config set global.index-url "$pypi_index"
+    export PIP_EXTRA_INDEX_URL="https://mirrors.huaweicloud.com/ascend/repos/pypi"
+}
+
+checkout_src() {
+    echo "====> Checkout source code"
+    mkdir -p "$SRC_DIR"
+
+    # vllm-ascend
+    if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
+        git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
+    fi
+
+    # vllm
+    if [ ! -d "$SRC_DIR/vllm" ]; then
+        git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
+    fi
+
+    #mooncake
+    if [ ! -d "$SRC_DIR/Mooncake" ]; then
+        git clone https://github.com/kvcache-ai/Mooncake.git "$SRC_DIR/Mooncake"
+        cd "$SRC_DIR/Mooncake"
+        git checkout 06cc217504a6f1b0cdaa26b096b985651b262748
+        cd -
+    fi
+}
+
+# Install the apt packages listed in vllm-ascend's packages.txt together with
+# a fixed set of build tools.
+# Globals: SRC_DIR (read)
+install_sys_dependencies() {
+    local packages_file="$SRC_DIR/vllm-ascend/packages.txt"
+    # Matches blank/whitespace-only lines and (optionally indented) comments.
+    local skip_re='^[[:space:]]*(#|$)'
+    local -a dep_list=()
+    local line
+
+    echo "====> Install system dependencies"
+    apt-get update -y
+
+    # The `|| [[ -n "$line" ]]` clause keeps the final entry when the file has
+    # no trailing newline.
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        if [[ "$line" =~ $skip_re ]]; then
+            continue
+        fi
+        dep_list+=("$line")
+    done < "$packages_file"
+
+    apt-get install -y "${dep_list[@]}" gcc g++ cmake libnuma-dev iproute2
+}
+
+install_vllm() {
+    echo "====> Install vllm and vllm-ascend"
+    VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
+    pip install -e "$SRC_DIR/vllm-ascend"
+    pip install modelscope
+    # Install for pytest
+    pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
+}
+
+install_mooncake() {
+    echo "====> Install mooncake"
+    apt-get update
+    apt install -y --allow-change-held-packages python3 python-is-python3
+    apt-get install -y --no-install-recommends mpich libmpich-dev
+    cd $SRC_DIR/Mooncake
+    sed -i '/option(USE_ASCEND_DIRECT)/s/OFF/ON/' mooncake-common/common.cmake
+    bash dependencies.sh --yes
+    mkdir build
+    cd -
+    cd $SRC_DIR/Mooncake/build
+    cmake ..
+    make -j
+    make install
+    cd -
+}
+
+# Run tests/e2e/multi_node/test_multi_dp.py with pytest from inside the
+# vllm-ascend checkout. The working directory stays changed afterwards.
+run_tests() {
+    local test_file="tests/e2e/multi_node/test_multi_dp.py"
+
+    printf '%s\n' "====> Run tests"
+    cd "$SRC_DIR/vllm-ascend"
+    pytest -sv "$test_file"
+}
+
+main() {
+    check_npu_info
+    check_and_config
+    checkout_src
+    install_sys_dependencies
+    install_vllm
+    #install_mooncake
+    run_tests
+}
+
+main "$@"