[1/N][CI] Add multi node test (#3359)
### What this PR does / why we need it? This PR's purpose is to add a multi-node test; as a first step, it adds a `deepseek-v3` dp+tp+ep test. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
142
tests/e2e/multi_node/scripts/lws.yaml
Normal file
142
tests/e2e/multi_node/scripts/lws.yaml
Normal file
@@ -0,0 +1,142 @@
|
||||
# Multi-node vLLM e2e test deployment using the LeaderWorkerSet (lws) API.
# One group = 1 leader pod + workers; both pod templates run the same test
# entrypoint (run.sh) pulled from the shared PVC.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: vllm
  namespace: vllm-project
spec:
  replicas: 1
  leaderWorkerTemplate:
    # size includes the leader: 2 => 1 leader + 1 worker node.
    size: 2
    # Restart the whole group if any pod dies, so ranks stay consistent.
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        containers:
          - name: vllm-leader
            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
            env:
              # Pull model weights from ModelScope instead of HuggingFace.
              - name: VLLM_USE_MODELSCOPE
                value: "true"
              - name: WORKSPACE
                value: "/root/workspace"
              # Number of nodes in the group; must match .spec.leaderWorkerTemplate.size.
              - name: WORLD_SIZE
                value: "2"
              - name: NPU_PER_NODE
                value: "16"
              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
              - name: VLLM_VERSION
                value: "v0.11.0"
              - name: VLLM_ASCEND_VERSION
                value: "main"
              # Pinned Mooncake commit consumed by run.sh.
              - name: MOONCAKE_VERSION
                value: "06cc217504a6f1b0cdaa26b096b985651b262748"
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                # Request a full node's worth of NPUs; matches NPU_PER_NODE.
                huawei.com/ascend-1980: "16"
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: "16"
                ephemeral-storage: 100Gi
                cpu: 125
            ports:
              - containerPort: 8080
            # readinessProbe:
            #   tcpSocket:
            #     port: 8080
            #   initialDelaySeconds: 15
            #   periodSeconds: 10
            volumeMounts:
              # Shared PVC carrying the test scripts (run.sh) and caches.
              - mountPath: /root/.cache
                name: shared-volume
              # Host NPU driver tools must be visible inside the container.
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              # Large shared memory segment for inter-process communication.
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: nv-action-vllm-benchmarks-v2
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
    # Worker template mirrors the leader (same image, env, mounts) minus
    # the service port; keep both in sync when editing.
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
            env:
              - name: VLLM_USE_MODELSCOPE
                value: "true"
              - name: WORKSPACE
                value: "/root/workspace"
              - name: WORLD_SIZE
                value: "2"
              - name: NPU_PER_NODE
                value: "16"
              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
              - name: VLLM_VERSION
                value: "v0.11.0"
              - name: VLLM_ASCEND_VERSION
                value: "main"
              - name: MOONCAKE_VERSION
                value: "06cc217504a6f1b0cdaa26b096b985651b262748"
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                huawei.com/ascend-1980: "16"
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: "16"
                ephemeral-storage: 100Gi
                cpu: 125
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: nv-action-vllm-benchmarks-v2
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
---
# ClusterIP service exposing the leader's HTTP endpoint inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: vllm-leader
  namespace: vllm-project
spec:
  ports:
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    # Label set automatically by lws on group pods; role narrows to the leader.
    leaderworkerset.sigs.k8s.io/name: vllm
    role: leader
  type: ClusterIP
|
||||
96
tests/e2e/multi_node/scripts/run.sh
Normal file
96
tests/e2e/multi_node/scripts/run.sh
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/bin/bash
# Multi-node e2e test entrypoint: checks out vllm/vllm-ascend/Mooncake,
# installs dependencies and runs the multi-node pytest suite.
# Required env (injected by lws.yaml): VLLM_VERSION, VLLM_ASCEND_VERSION;
# optional: WORKSPACE, MOONCAKE_VERSION.
set -euo pipefail

# Root for all source checkouts. Default WORKSPACE to the path lws.yaml
# injects so the script also runs standalone instead of dying on `set -u`.
export SRC_DIR="${WORKSPACE:-/root/workspace}/source_code"
|
||||
# Print NPU status plus the installed CANN toolkit metadata.
check_npu_info() {
  echo "====> Check NPU info"
  npu-smi info
  local arch
  arch="$(uname -i)"
  cat "/usr/local/Ascend/ascend-toolkit/latest/${arch}-linux/ascend_toolkit_install.info"
}
|
||||
|
||||
# Point git and pip at in-region mirrors/proxies to make clones and
# installs reliable from the CI network.
check_and_config() {
  echo "====> Configure mirrors and git proxy"
  # Rewrite all GitHub URLs through the osinfra proxy.
  git config --global \
    url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf \
    "https://github.com/"
  # Primary PyPI mirror plus the Ascend wheel repository as a fallback index.
  pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
  export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
}
|
||||
|
||||
# Clone vllm-ascend, vllm and Mooncake into $SRC_DIR at the versions pinned
# via VLLM_ASCEND_VERSION, VLLM_VERSION and MOONCAKE_VERSION. Each clone is
# skipped when its target directory already exists (idempotent across retries).
checkout_src() {
  echo "====> Checkout source code"
  mkdir -p "$SRC_DIR"

  # vllm-ascend (branch/tag quoted — SC2086)
  if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
    git clone --depth 1 -b "$VLLM_ASCEND_VERSION" https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
  fi

  # vllm (full clone: checking out a tag like v0.11.0 needs history)
  if [ ! -d "$SRC_DIR/vllm" ]; then
    git clone -b "$VLLM_VERSION" https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
  fi

  # Mooncake: honor MOONCAKE_VERSION (injected by lws.yaml) instead of a
  # hard-coded SHA; keep the previously pinned commit as the fallback.
  if [ ! -d "$SRC_DIR/Mooncake" ]; then
    git clone https://github.com/kvcache-ai/Mooncake.git "$SRC_DIR/Mooncake"
    git -C "$SRC_DIR/Mooncake" checkout "${MOONCAKE_VERSION:-06cc217504a6f1b0cdaa26b096b985651b262748}"
  fi
}
|
||||
|
||||
# Install the system packages listed in vllm-ascend/packages.txt plus the
# toolchain needed for native builds (gcc/g++/cmake/libnuma/iproute2).
install_sys_dependencies() {
  echo "====> Install system dependencies"
  apt-get update -y

  local -a dep_list=()
  local line
  # Keep non-empty, non-comment lines. The '|| [[ -n "$line" ]]' clause
  # also processes the final line when the file has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" && ! "$line" =~ ^# ]] && dep_list+=("$line")
  done < "$SRC_DIR/vllm-ascend/packages.txt"

  apt-get install -y "${dep_list[@]}" gcc g++ cmake libnuma-dev iproute2
}
|
||||
|
||||
# Install vllm (device-agnostic build) and the vllm-ascend plugin as
# editable checkouts, plus test-time Python dependencies.
# Order matters: vllm must be installed before vllm-ascend.
install_vllm() {
  echo "====> Install vllm and vllm-ascend"
  # VLLM_TARGET_DEVICE=empty skips vllm's device-specific compilation.
  VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
  pip install -e "$SRC_DIR/vllm-ascend"
  # Needed to download model weights (VLLM_USE_MODELSCOPE=true in lws.yaml).
  pip install modelscope
  # Install for pytest
  pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
}
|
||||
|
||||
# Build and install Mooncake from the checkout made by checkout_src.
# Currently disabled in main(); kept for when the KV-cache transfer tests land.
install_mooncake() {
  echo "====> Install mooncake"
  apt-get update
  apt install -y --allow-change-held-packages python3 python-is-python3
  apt-get install -y --no-install-recommends mpich libmpich-dev
  # Build in a subshell so the caller's working directory is untouched.
  (
    cd "$SRC_DIR/Mooncake"
    # Turn on the Ascend direct transport.
    # NOTE(review): this pattern only matches a literal
    # 'option(USE_ASCEND_DIRECT)'; verify it still matches the actual line
    # in mooncake-common/common.cmake (options usually carry a description).
    sed -i '/option(USE_ASCEND_DIRECT)/s/OFF/ON/' mooncake-common/common.cmake
    bash dependencies.sh --yes
    mkdir -p build   # -p: idempotent on retry
    cd build
    cmake ..
    make -j"$(nproc)"   # bound parallelism to available CPUs
    make install
  )
}
|
||||
|
||||
# Run the multi-node data-parallel e2e suite from the vllm-ascend checkout.
run_tests() {
  echo "====> Run tests"
  local suite="tests/e2e/multi_node/test_multi_dp.py"
  cd "$SRC_DIR/vllm-ascend"
  pytest -sv "$suite"
}
|
||||
|
||||
# Test pipeline entrypoint. Order is significant: sources must be checked
# out before install_sys_dependencies reads packages.txt, and vllm must be
# installed before the tests run.
main() {
  check_npu_info
  check_and_config
  checkout_src
  install_sys_dependencies
  install_vllm
  # Mooncake build is currently disabled; uncomment when needed.
  #install_mooncake
  run_tests
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user