[CI][Doc] Optimize multi-node CI (#3565)

### What this PR does / why we need it?
This pull request mainly does the following things:
1. Add a doc for multi-node CI; the main content covers the mechanism and how to contribute
2. Simplify the config yaml to make it more developer-friendly
3. Optimize the Mooncake installation script to prevent accidental failures during installation
4. Fix the workflow to ensure the Kubernetes resources can be applied correctly
5. Add a Qwen3-235B-W8A8 disaggregated_prefill test
6. Add a GLM-4.5 multi-DP test
7. Add a 2P1D 4-node disaggregated_prefill test
8. Refactor nightly tests
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Li Wang
2025-10-25 09:23:47 +08:00
committed by GitHub
parent 292cf339c3
commit 7f73c28a24
21 changed files with 1165 additions and 378 deletions

View File

@@ -0,0 +1,190 @@
name: 'e2e nightly test multi_node'
on:
workflow_call:
inputs:
soc_version:
required: true
type: string
description: use a2 or a3
image:
required: false
type: string
description: base image for pods
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
config_file_path:
required: true
type: string
description: the model config for multi_node test
replicas:
required: false
default: "1"
type: string
description: replicas of the k8s cluster
size:
required: false
default: "2"
type: string
description: how many pods will be pulled up via lws.yaml, i.e. the number of nodes we need
vllm_version:
required: false
default: "v0.11.0"
type: string
description: vllm version to use
vllm_ascend_remote_url:
required: false
default: https://github.com/vllm-project/vllm-ascend.git
type: string
description: used for PR-level tests
vllm_ascend_ref:
required: false
default: main
type: string
description: used for PR-level tests
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
# This is a runner with no NPU for k8s controller
runs-on: linux-aarch64-a3-0
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
steps:
- name: Install system dependencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip install jinja2-cli
apt-get update -y && apt-get install -y git curl
- name: Install kubectl
run: |
# Install kubectl
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Verify kubectl installation
kubectl version --client=true
# TODO: Add A2 tests
- name: Setup kubeconfig for A3
if: inputs.soc_version == 'a3'
run: |
# Decode and save kubeconfig
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
- name: Checkout code
uses: actions/checkout@v4
- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Clear result ret
run: |
rm -f $RESULT_FILE
- name: Launch cluster
run: |
set -e
size="${{ inputs.size }}"
replicas="${{ inputs.replicas }}"
image="${{ inputs.image }}"
config_file_path="${{ inputs.config_file_path }}"
vllm_version="${{ inputs.vllm_version }}"
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
result_file_path="$RESULT_FILE"
required_params=("size" "replicas" "image" "config_file_path")
for param in "${required_params[@]}"; do
if [ -z "${!param}" ]; then
echo "Error: Parameter '$param' is required but empty"
exit 1
fi
done
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
-D size="$size" \
-D replicas="$replicas" \
-D image="$image" \
-D config_file_path="$config_file_path" \
-D vllm_version="$vllm_version" \
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
- name: Waiting for pod ready
run: |
echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
while true; do
# get pod status
READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
if [[ "$READY_STATUS" == "true" ]]; then
echo "Pod [$LEADER_POD] is Ready!"
break
else
echo "Pod [$LEADER_POD] not ready, waiting..."
sleep 3
fi
done
- name: Stream logs
run: |
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
- name: Determine test success
run: |
TIMEOUT=600
ELAPSED=0
while [ ! -f "$RESULT_FILE" ]; do
sleep 5
ELAPSED=$((ELAPSED + 5))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "Timeout waiting for test result file"
exit 1
fi
done
RET=$(cat "$RESULT_FILE")
echo "Test result: $RET"
if [ "$RET" -ne 0 ]; then
echo "Test failed"
exit 1
else
echo "Test succeeded"
fi
- name: Post process
if: always()
run: |
kubectl get pods -n $NAMESPACE
kubectl delete -f ./lws.yaml

View File

@@ -1,125 +0,0 @@
name: 'e2e test / multi-dp'
on:
schedule:
- cron: "0 */4 * * *"
workflow_dispatch:
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
# This is a runner with no NPU for k8s controller
runs-on: linux-aarch64-a3-0
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
steps:
- name: Install system denpendencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip install jinja2-cli -y
apt-get update -y && apt-get install -y git curl
TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
- name: Install kubectl
run: |
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# get kubeconfig from secret
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
- name: Checkout code
uses: actions/checkout@v4
- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Launch cluster
run: |
jinja2 tests/e2e/multi_node/scripts/lws.yaml.jinja2 \
-D size=2 \
-D replicas=1 \
-D image="m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11" \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
- name: Waiting for pod ready
run: |
echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
while true; do
# get pod status
READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
if [[ "$READY_STATUS" == "true" ]]; then
echo "✅ Pod [$LEADER_POD] is Ready!"
break
else
echo "Pod [$LEADER_POD] not ready, waiting..."
sleep 3
fi
done
- name: Stream logs and monitor pod health
run: |
set -euo pipefail
echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
LOG_PID=$!
echo "Start monitoring Pod [$LEADER_POD] status ..."
while true; do
STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
kill $LOG_PID || true
exit 1
fi
sleep 5
done &
MONITOR_PID=$!
wait $LOG_PID || true
kill $MONITOR_PID || true
- name: Generate summary
if: always()
run: |
if [ -f "/root/.cache/test_summary.md" ]; then
cat /root/.cache/test_summary.md >> "$GITHUB_STEP_SUMMARY"
else
echo "No summary file found." >> "$GITHUB_STEP_SUMMARY"
fi
- name: Post process
if: always()
run: |
kubectl get pods -n $NAMESPACE
kubectl delete -f ./lws.yaml

View File

@@ -1,133 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / nightly'
on:
schedule:
# Run test at 24:00 Beijing time (UTC+8)
- cron: "0 16 * * *"
workflow_dispatch:
pull_request:
branches:
- 'main'
- '*-dev'
types: [labeled,opened,synchronize]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ascend-nightly-${{ github.ref }}
#cancel-in-progress: true
jobs:
qwen3-32b:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
# should add A3 chip runner when available
os: [linux-aarch64-a2-4]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
# only trigger e2e test after lint passed and the change is e2e related with pull request.
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
tests: tests/e2e/nightly/models/test_qwen3_32b.py
qwen3-32b-in8-a3:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
qwen3-32b-in8-a2:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a2-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
qwen3-235b-a22b-w8a8-eplb:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-16 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
deepseek-r1-w8a8-eplb:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-16 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
qwen3-32b-int8-a3-feature-stack3:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
qwen2-5-vl-7b:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
deepseek-r1-0528-w8a8:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-16 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py

View File

@@ -0,0 +1,60 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This workflow relates to the Atlas 800 A2 resources
# We will not limit the concurrency of jobs on A2
name: 'ascend test / nightly-a2'
on:
schedule:
# Run test at 24:00 Beijing time (UTC+8)
- cron: "0 16 * * *"
workflow_dispatch:
pull_request:
branches:
- 'main'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
concurrency:
group: ascend-nightly-${{ github.ref }}-a2
cancel-in-progress: true
jobs:
single-node-tests:
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
strategy:
fail-fast: false
matrix:
test_config:
- name: qwen3-32b
os: linux-aarch64-a2-4
tests: tests/e2e/nightly/models/test_qwen3_32b.py
- name: qwen3-32b-in8-a2
os: linux-aarch64-a2-4
tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }}

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This workflow relates to the Atlas 800 A3 resources
# **Please note**: current A3 resource pool's maximum allowed concurrency is 5*16 NPUs
# We will limit the concurrency of jobs on A3 to avoid the risk of insufficient resources
name: 'ascend test / nightly-a3'
on:
schedule:
# Run test at 24:00 Beijing time (UTC+8)
- cron: "0 16 * * *"
workflow_dispatch:
pull_request:
branches:
- 'main'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
concurrency:
group: ascend-nightly-${{ github.ref }}-a3
cancel-in-progress: true
jobs:
single-node-tests:
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
strategy:
fail-fast: false
matrix:
test_config:
- name: qwen3-32b-in8-a3
os: linux-aarch64-a3-4
tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
- name: qwen3-32b-int8-a3-feature-stack3
os: linux-aarch64-a3-4
tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
- name: qwen3-235b-a22b-w8a8-eplb
os: linux-aarch64-a3-16
tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
- name: deepseek-r1-w8a8-eplb
os: linux-aarch64-a3-16
tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
- name: qwen2-5-vl-7b
os: linux-aarch64-a3-4
tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
- name: deepseek-r1-0528-w8a8
os: linux-aarch64-a3-16
tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.test_config.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: ${{ matrix.test_config.tests }}
multi-node-tests:
needs: single-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
size: 4
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}

Binary file not shown (new image, 15 KiB)

Binary file not shown (new image, 18 KiB)

View File

@@ -108,4 +108,5 @@ If you find any problem when contributing, you can feel free to submit a PR to i
:caption: Index
:maxdepth: 1
testing
multi_node_test
:::

View File

@@ -0,0 +1,99 @@
# Multi Node Test
Multi-node CI is designed to test distributed scenarios for very large models, e.g. disaggregated_prefill and multi-DP across multiple nodes.
## How it works
The following picture shows the basic deployment view of the multi-node CI mechanism. It shows how the GitHub Action interacts with [lws](https://lws.sigs.k8s.io/docs/overview/) (a kind of Kubernetes CRD resource).
![alt text](../../assets/deployment.png)
From the workflow perspective, we can see how the final test script is executed. The key pieces are the two files [lws.yaml and run.sh](https://github.com/vllm-project/vllm-ascend/tree/main/tests/e2e/nightly/multi_node/scripts): the former defines how our k8s cluster is pulled up, and the latter is the entrypoint script executed when each pod starts. Each node executes different logic according to the [LWS_WORKER_INDEX](https://lws.sigs.k8s.io/docs/reference/labels-annotations-and-environment-variables/) environment variable, so that multiple nodes can form a distributed cluster to perform tasks.
![alt text](../../assets/workflow.png)
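To make this concrete, here is a minimal sketch of an entrypoint that branches on `LWS_WORKER_INDEX` (illustrative only, not the actual run.sh; the per-role commands are placeholders):

```bash
#!/bin/bash
# Hypothetical entrypoint sketch: LWS injects LWS_WORKER_INDEX into every pod,
# so a single script can run different logic on each node.
set -euo pipefail

NODE_INDEX="${LWS_WORKER_INDEX:-0}"

if [ "$NODE_INDEX" -eq 0 ]; then
    # Leader (index 0): hosts the api server and drives the test suite
    echo "node $NODE_INDEX: leader, running tests"
    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
else
    # Workers (index >= 1): start headless engines and stay alive for the leader
    echo "node $NODE_INDEX: headless worker"
    tail -f /dev/null
fi
```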
## How to contribute
1. Upload custom weights
If you need customized weights, for example, you quantized a W8A8 weight for DeepSeek-V3 and want it to run on CI, you are welcome to upload it to ModelScope's [vllm-ascend](https://www.modelscope.cn/organization/vllm-ascend) organization. If you do not have permission to upload, please contact @Potabk.
2. Add config yaml
As the entrypoint script [run.sh](https://github.com/vllm-project/vllm-ascend/blob/0bf3f21a987aede366ec4629ad0ffec8e32fe90d/tests/e2e/nightly/multi_node/scripts/run.sh#L106) shows, on pod startup every *.yaml file in the [directory](https://github.com/vllm-project/vllm-ascend/tree/main/tests/e2e/nightly/multi_node/config/models) is read and executed according to its configuration, so all we need to do is add a yaml like [DeepSeek-V3.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml).
Suppose you have **2 nodes** running a 1P1D setup (1 Prefiller + 1 Decoder); you may add a config file that looks like:
```yaml
test_name: "test DeepSeek-V3 disaggregated_prefill"
# the model being tested
model: "vllm-ascend/DeepSeek-V3-W8A8"
# how large the cluster is
num_nodes: 2
npu_per_node: 16
# Add all env vars you need here
env_common:
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
OMP_NUM_THREADS: 100
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
disaggregated_prefill:
enabled: true
# node indices (a list) that meet all of the following conditions:
# - prefiller
# - not headless (has an api server)
prefiller_host_index: [0]
# node indices (a list) that meet all of the following conditions:
# - decoder
# - not headless (has an api server)
decoder_host_index: [1]
# Add each node's vllm serve CLI command just as you would run it locally
deployment:
-
server_cmd: >
vllm serve ...
-
server_cmd: >
vllm serve ...
benchmarks:
perf:
# fill with performance test kwargs
acc:
# fill with accuracy test kwargs
```
3. Add the case to the nightly workflow
Currently, the multi-node test workflow is defined in [vllm_ascend_test_nightly_a2/a3.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test_nightly_a3.yaml):
```yaml
multi-node-tests:
needs: single-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
size: 4
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
```
The matrix above defines all the parameters required to add a multi-node use case. The parameters worth paying attention to (if you are adding a new case) are `size` and `config_file_path`: the former defines the number of nodes required for your use case, and the latter is the path to the configuration file you completed in step 2. To sanity-check a new case before opening a PR, see the sketch below.
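As a local sanity check (a sketch mirroring the workflow's "Launch cluster" step; `MyModel.yaml` is a placeholder for the config you added in step 2), you can render the LWS manifest with jinja2-cli and dry-run it against a cluster:

```bash
# Render the LWS manifest the same way the CI workflow does.
pip install jinja2-cli
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
  -D size=2 \
  -D replicas=1 \
  -D image="m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11" \
  -D config_file_path=tests/e2e/nightly/multi_node/config/models/MyModel.yaml \
  -D vllm_version=v0.11.0 \
  -D vllm_ascend_remote_url=https://github.com/vllm-project/vllm-ascend.git \
  -D vllm_ascend_ref=main \
  -D result_file_path=/root/.cache/tests/ret/test_result.txt \
  --outfile lws.yaml
# Validate the rendered manifest without creating any resources
kubectl apply --dry-run=client -f ./lws.yaml
```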

View File

@@ -49,6 +49,7 @@ from vllm.utils import get_open_port
from tests.e2e.model_utils import (TokensTextLogprobs,
TokensTextLogprobsPromptLogprobs)
from tests.e2e.nightly.multi_node.config.multi_node_config import NodeInfo
from vllm_ascend.ascend_config import clear_ascend_config
# TODO: remove this part after the patch is merged into vllm; if
# we do not explicitly patch here, some of them might be ineffective
@@ -115,6 +116,9 @@ class RemoteOpenAIServer:
env_dict: Optional[dict[str, str]] = None,
seed: Optional[int] = None,
auto_port: bool = True,
nodes_info: Optional[list[NodeInfo]] = None,
disaggregated_prefill: Optional[dict] = None,
proxy_port: Optional[int] = None,
max_wait_seconds: Optional[float] = None,
override_hf_configs: Optional[dict[str, Any]] = None) -> None:
if isinstance(vllm_serve_args, str):
@@ -144,13 +148,23 @@ class RemoteOpenAIServer:
"--hf-overrides",
json.dumps(override_hf_configs)
]
self.host = str(server_host)
self.port = int(server_port)
# for multi-nodes test
self.nodes_info = nodes_info
self.disaggregated_prefill = disaggregated_prefill
self.cur_index = os.getenv("LWS_WORKER_INDEX", 0)
self.proxy_port = proxy_port
self._start_server(model, vllm_serve_args, env_dict)
max_wait_seconds = max_wait_seconds or 7200
self._wait_for_server(url=self.url_for("health"),
timeout=max_wait_seconds)
if self.disaggregated_prefill:
assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided"
self._wait_for_server_pd(proxy_port=proxy_port)
else:
self._wait_for_server(url=self.url_for("health"),
timeout=max_wait_seconds)
def __enter__(self):
return self
@@ -187,6 +201,21 @@ class RemoteOpenAIServer:
if isinstance(client, httpx.Client):
client.close()
def _wait_for_server_pd(self, proxy_port: int):
# Wait for all api_server nodes ready
assert self.nodes_info is not None, "cluster info must be provided"
for node_info in self.nodes_info:
if node_info.headless:
continue
url_health = f"http://{node_info.ip}:{node_info.server_port}/health"
self._wait_for_server(url=url_health, timeout=7200)
# Wait for proxy ready
master_node = self.nodes_info[0]
url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck"
self._wait_for_server(url=url_proxy, timeout=7200)
def _wait_for_server(self, *, url: str, timeout: float):
# run health check
start = time.time()

View File

@@ -0,0 +1,163 @@
test_name: "test DeepSeek-R1-W8A8 disaggregated_prefill"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 4
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
OMP_PROC_BIND: false
OMP_NUM_THREADS: 10
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
HCCL_DETERMINISTIC: True
TASK_QUEUE_ENABLE: 1
HCCL_OP_RETRY_ENABLE: "L0:0, L1:0, L2:0"
disaggregated_prefill:
enabled: true
prefiller_host_index: [0, 1]
decoder_host_index: [2]
ranktable_gen_path: "examples/disaggregated_prefill_v1/gen_ranktable.py"
ranktable_path: "/tmp/ranktable.json"
deployment:
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-size-local 2
--tensor-parallel-size 8
--enforce-eager
--enable-expert-parallel
--seed 1024
--quantization ascend
--max-num-seqs 4
--max-model-len 36864
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--kv-transfer-config
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_producer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-size-local 2
--tensor-parallel-size 8
--enforce-eager
--enable-expert-parallel
--seed 1024
--quantization ascend
--max-num-seqs 4
--max-model-len 36864
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--kv-transfer-config
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_producer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 32
--data-parallel-size-local 16
--data-parallel-start-rank 0
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 1
--enable-expert-parallel
--seed 1024
--quantization ascend
--max-num-seqs 28
--max-model-len 36864
--max-num-batched-tokens 256
--trust-remote-code
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--kv-transfer-config
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_consumer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
--headless
--data-parallel-size 32
--data-parallel-size-local 16
--data-parallel-start-rank 16
--data-parallel-address $MASTER_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 1
--enable-expert-parallel
--seed 1024
--quantization ascend
--max-num-seqs 28
--max-model-len 36864
--max-num-batched-tokens 256
--trust-remote-code
--gpu-memory-utilization 0.9
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
--kv-transfer-config
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_consumer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}'
--additional-config
'{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}'
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 1
max_out_len: 2
batch_size: 1
baseline: 5
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/AIME2024
request_conf: vllm_api_general_chat
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
max_out_len: 10
batch_size: 32
baseline: 1
threshold: 1

View File

@@ -26,10 +26,6 @@ disaggregated_prefill:
deployment:
-
local_index: 0
master_index: 0
headless: false
env_extend:
server_cmd: >
vllm serve "vllm-ascend/DeepSeek-V3-W8A8"
--host 0.0.0.0
@@ -66,10 +62,6 @@ deployment:
}'
-
local_index: 1
master_index: 0
headless: true
env_extend:
server_cmd: >
vllm serve "vllm-ascend/DeepSeek-V3-W8A8"
--host 0.0.0.0

View File

@@ -0,0 +1,68 @@
test_name: "test GLM-4.5 multi-dp"
model: "ZhipuAI/GLM-4.5"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
OMP_NUM_THREADS: 100
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
deployment:
-
server_cmd: >
vllm serve "ZhipuAI/GLM-4.5"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 4
--data-parallel-size-local 2
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 8
--seed 1024
--enable-expert-parallel
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
-
server_cmd: >
vllm serve "ZhipuAI/GLM-4.5"
--headless
--data-parallel-size 4
--data-parallel-size-local 2
--data-parallel-start-rank 2
--data-parallel-address $MASTER_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 8
--seed 1024
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--enable-expert-parallel
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 1
max_out_len: 2
batch_size: 1
baseline: 5
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/AIME2024
request_conf: vllm_api_general_chat
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
max_out_len: 10
batch_size: 32
baseline: 1
threshold: 1

View File

@@ -11,10 +11,6 @@ env_common:
deployment:
-
local_index: 0
master_index: 0
headless: false
env_extend:
server_cmd: >
vllm serve "Qwen/Qwen3-235B-A22B"
--host 0.0.0.0
@@ -33,10 +29,6 @@ deployment:
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
-
local_index: 1
master_index: 0
headless: true
env_extend:
server_cmd: >
vllm serve "Qwen/Qwen3-235B-A22B"
--headless

View File

@@ -0,0 +1,105 @@
test_name: "test Qwen3-235B-A22B-W8A8 disaggregated_prefill"
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
OMP_NUM_THREADS: 100
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
disaggregated_prefill:
enabled: true
prefiller_host_index: [0]
decoder_host_index: [1]
deployment:
-
server_cmd: >
vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-size-local 2
--tensor-parallel-size 8
--seed 1024
--enable-expert-parallel
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--quantization ascend
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--kv-transfer-config
'{"kv_connector": "MooncakeConnector",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 2,
"tp_size": 8
}
}
}'
-
server_cmd: >
vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-size-local 2
--tensor-parallel-size 8
--seed 1024
--quantization ascend
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--enable-expert-parallel
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--kv-transfer-config
'{"kv_connector": "MooncakeConnector",
"kv_role": "kv_consumer",
"kv_port": "30200",
"engine_id": "1",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 2,
"tp_size": 8
}
}
}'
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs400
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 1
max_out_len: 2
batch_size: 1
baseline: 5
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/AIME2024
request_conf: vllm_api_general_chat
dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt
max_out_len: 10
batch_size: 32
baseline: 1
threshold: 1

View File

@@ -1,6 +1,7 @@
import logging
import os
import subprocess
from dataclasses import dataclass
from typing import Optional
import regex as re
@@ -15,6 +16,16 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port,
setup_logger()
logger = logging.getLogger(__name__)
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py"
DISAGGREGATED_PREFILL_PORT = 5333
@dataclass
class NodeInfo:
index: int
ip: str
server_cmd: str
headless: bool
server_port: int
class MultiNodeConfig:
@@ -22,38 +33,50 @@ class MultiNodeConfig:
def __init__(self,
model: str,
test_name: str,
num_nodes: int = 2,
npu_per_node: int = 16,
server_port: int = 8080,
headless: bool = False,
disaggregated_prefill: Optional[dict] = None,
envs: Optional[dict] = None,
server_cmd: str = "",
nodes_info: Optional[list[NodeInfo]] = None,
perf_cmd: Optional[str] = None,
acc_cmd: Optional[str] = None):
self.test_name = test_name
self.model = model
self.num_nodes = num_nodes
self.nodes_info = nodes_info or []
self.num_nodes = len(self.nodes_info)
self.npu_per_node = npu_per_node
self.envs = envs if envs is not None else {}
self.server_port = server_port
if disaggregated_prefill:
self.proxy_port = get_avaliable_port()
self.headless = headless
self.server_cmd = server_cmd
self.envs = envs if envs is not None else {}
self.proxy_port = get_avaliable_port()
self.perf_cmd = perf_cmd
self.acc_cmd = acc_cmd
assert perf_cmd is not None, "perf_cmd must be provided"
assert acc_cmd is not None, "acc_cmd must be provided"
assert server_cmd is not None, "server_cmd must be provided"
self.cur_index = os.getenv("LWS_WORKER_INDEX", 0)
self.cur_index = int(os.getenv("LWS_WORKER_INDEX", 0))
self.cur_ip = get_cur_ip()
self.nic_name = get_net_interface(self.cur_ip)
self.cluster_ips = get_cluster_ips(num_nodes)
self.cluster_ips = get_cluster_ips(self.num_nodes)
self.cur_node_info: NodeInfo = self.nodes_info[self.cur_index]
self.disaggregated_prefill = disaggregated_prefill
self._init_disaggregated_prefill()
self._init_dist_env()
self.server_cmd = self._expand_env_vars(self.server_cmd, self.envs)
self.server_cmd = self._expand_env_vars(self.cur_node_info.server_cmd,
self.envs)
def _init_disaggregated_prefill(self):
if self.disaggregated_prefill:
decode_host_index = self.disaggregated_prefill.get(
"decoder_host_index")
if not decode_host_index:
raise RuntimeError("got empty decode_host_index")
self.decode_start_index: int = decode_host_index[0]
self.num_prefillers = self.decode_start_index
self.num_decoders = self.num_nodes - self.num_prefillers
if self.disaggregated_prefill.get(
"ranktable_gen_path") is not None:
self._gen_ranktable()
def _init_dist_env(self):
self.envs["HCCL_IF_IP"] = self.cur_ip
@@ -62,7 +85,17 @@ class MultiNodeConfig:
self.envs["HCCL_SOCKET_IFNAME"] = self.nic_name
self.envs["LOCAL_IP"] = self.cur_ip
self.envs["NIC_NAME"] = self.nic_name
self.envs["MASTER_IP"] = self.cluster_ips[0]
if self.disaggregated_prefill:
self.envs[
"DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
"ranktable_path")
if self.cur_index < self.decode_start_index:
self.envs["MASTER_IP"] = self.cluster_ips[0]
else:
self.envs["MASTER_IP"] = self.cluster_ips[
self.decode_start_index]
ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
self.envs[
"LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
@@ -172,15 +205,21 @@ class MultiNodeConfig:
deployments = config_data.get("deployment", [])
assert len(deployments) == num_nodes, \
f"Number of deployments ({len(deployments)}) must match num_nodes ({num_nodes})"
for deployment in deployments:
if deployment.get("local_index") == int(
os.getenv("LWS_WORKER_INDEX", 0)):
envs_extend = deployment.get("env_extend", {})
if envs_extend:
envs.update(envs_extend)
server_cmd = deployment.get("server_cmd")
headless = deployment.get("headless", False)
break
cluster_ips = get_cluster_ips(num_nodes)
nodes_info = []
for index, deployment in enumerate(deployments):
# after assert len(deployments) == num_nodes, every index is guaranteed to have a match
server_cmd = deployment.get("server_cmd", "")
headless = "--headless" in server_cmd
nodes_info.append(
NodeInfo(ip=cluster_ips[index],
index=index,
headless=headless,
server_port=server_port,
server_cmd=server_cmd))
benchmarks = config_data.get("benchmarks", {})
assert benchmarks is not None, "benchmarks must be provided"
perf_cmd = benchmarks["perf"]
@@ -188,13 +227,11 @@ class MultiNodeConfig:
return cls(model=model,
test_name=test_name,
num_nodes=num_nodes,
npu_per_node=npu_per_node,
envs=envs,
server_port=server_port,
headless=headless,
disaggregated_prefill=disaggregated_prefill,
server_cmd=server_cmd,
nodes_info=nodes_info,
perf_cmd=perf_cmd,
acc_cmd=acc_cmd)
@@ -204,4 +241,52 @@ class MultiNodeConfig:
@property
def is_master(self):
return int(self.cur_index) == 0
return self.cur_index == 0
def _gen_ranktable(self):
cluster_ip = self.cluster_ips
assert len(cluster_ip) > 0
nnodes = self.num_nodes
node_rank = self.cur_index
master_addr = cluster_ip[0]
master_port = DISAGGREGATED_PREFILL_PORT
assert self.disaggregated_prefill is not None
ranktable_gen_path = self.disaggregated_prefill.get(
"ranktable_gen_path")
ranktable_path = self.disaggregated_prefill.get("ranktable_path")
assert ranktable_gen_path is not None and ranktable_path is not None
if os.path.exists(str(ranktable_path)):
return
local_host = self.cur_ip
cmd = [
"torchrun",
"--nproc_per_node",
"1",
"--nnodes",
str(nnodes),
"--node_rank",
str(node_rank),
"--master_addr",
master_addr,
"--master_port",
str(master_port),
ranktable_gen_path,
"--ranktable-path",
str(ranktable_path),
"--local-host",
local_host,
"--prefill-device-cnt",
str(self.npu_per_node * self.num_prefillers),
"--decode-device-cnt",
str(self.npu_per_node * self.num_decoders),
]
env = os.environ.copy()
assert self.nic_name is not None
env["GLOO_SOCKET_IFNAME"] = self.nic_name
subprocess.run(cmd, env=env, check=True)
assert os.path.exists(
str(ranktable_path)), "failed generate ranktable.json"

View File

@@ -0,0 +1,113 @@
#!/bin/bash
set -e
set -o pipefail
GREEN="\033[0;32m"
BLUE="\033[0;34m"
YELLOW="\033[0;33m"
RED="\033[0;31m"
NC="\033[0m" # No Color
branch=${1:-pooling_async_memecpy_v1}
point=${2:-9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3}
repo_url="https://github.com/AscendTransport/Mooncake"
repo_name="Mooncake"
state_file=".build_state"
echo "[INFO] Branch: $branch"
echo "[INFO] Commit: $point"
echo "-------------------------------------------"
# Record a completed step so reruns can skip it
mark_done() { echo "$1" >> "$state_file"; }
# Check whether a step finished in a previous run
is_done() { grep -Fxq "$1" "$state_file" 2>/dev/null; }
if ! is_done "clone"; then
echo "[STEP] Clone repository..."
if [ -d "$repo_name" ]; then
echo "[WARN] Directory $repo_name already exists, skipping clone."
else
git clone -b "$branch" "$repo_url" "$repo_name"
fi
cd "$repo_name"
git fetch --all
git checkout "$point" || { echo "[ERROR] Checkout failed."; exit 1; }
cd ..
mark_done "clone"
else
echo "[SKIP] Clone step already done."
fi
if ! is_done "deps"; then
cd "$repo_name"
echo "[STEP]Installing dependencies (ignore Go failure)..."
yes | bash dependencies.sh || echo "⚠️ dependencies.sh failed (Go install likely failed), continuing..."
cd ..
mark_done "deps"
else
echo "[SKIP] Dependencies already installed."
fi
if ! is_done "mpi"; then
echo "[STEP] Install MPI..."
apt purge -y mpich libmpich-dev openmpi-bin libopenmpi-dev || true
apt install -y mpich libmpich-dev
export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:${CPATH:-}
export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:${CPATH:-}
mark_done "mpi"
else
echo "[SKIP] MPI installation already done."
fi
if ! is_done "build"; then
echo "[STEP] Compile and install..."
cd "$repo_name"
if [ -d "build" ]; then
echo "[INFO] Removing existing build directory..."
rm -rf build
fi
mkdir build && cd build
cmake .. || { echo "[ERROR] cmake failed."; exit 1; }
make -j || { echo "[ERROR] make failed."; exit 1; }
make install || { echo "[ERROR] make install failed."; exit 1; }
mark_done "build"
else
echo "[SKIP] Build already done."
fi
if ! is_done "copy_lib"; then
echo "[STEP] Copy library files..."
cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so \
/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cp mooncake-transfer-engine/src/libtransfer_engine.so \
/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cd ..
mark_done "copy_lib"
else
echo "[SKIP] Library copy already done."
fi
if ! grep -q "export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH" ~/.bashrc; then
echo -e "${YELLOW}Adding LD_LIBRARY_PATH to your PATH in ~/.bashrc${NC}"
echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH' >> ~/.bashrc
echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal${NC}"
fi
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
echo "=========================================="
echo -e "${GREEN}[SUCCESS] Mooncake build completed!"
echo "You can rerun this script anytime — it will resume from the last step."
echo "=========================================="
echo "Example startup command:"
echo "mooncake_master --eviction_high_watermark_ratio 0.8 --eviction_ratio 0.05 --port 50088"

View File

@@ -17,19 +17,24 @@ spec:
- name: vllm-leader
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION
value: "v0.11.0"
- name: VLLM_ASCEND_VERSION
value: "main"
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
tail -f /dev/null
resources:
limits:
huawei.com/ascend-1980: "16"
@@ -70,19 +75,24 @@ spec:
- name: vllm-worker
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
env:
- name: CONFIG_YAML_PATH
value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }}
- name: WORKSPACE
value: "/root/workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION
value: "v0.11.0"
- name: VLLM_ASCEND_VERSION
value: "main"
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
command:
- sh
- -c
- |
bash /root/.cache/tests/run.sh
tail -f /dev/null
resources:
limits:
huawei.com/ascend-1980: "16"

View File

@@ -1,7 +1,47 @@
#!/bin/bash
set -euo pipefail
export SRC_DIR="$WORKSPACE/source_code"
# Color definitions
GREEN="\033[0;32m"
BLUE="\033[0;34m"
YELLOW="\033[0;33m"
RED="\033[0;31m"
NC="\033[0m" # No Color
# Configuration
GOVER=1.23.8
LOG_DIR="/root/.cache/tests/logs"
OVERWRITE_LOGS=true
SRC_DIR="$WORKSPACE/source_code"
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
# Function to print section headers
print_section() {
echo -e "\n${BLUE}=== $1 ===${NC}"
}
# Function to print success messages
print_success() {
echo -e "${GREEN}$1${NC}"
}
# Function to print error messages and exit
print_error() {
echo -e "${RED}✗ ERROR: $1${NC}"
exit 1
}
# Function to check command success
check_success() {
if [ $? -ne 0 ]; then
print_error "$1"
fi
}
if [ $(id -u) -ne 0 ]; then
print_error "Require root permission, try sudo ./dependencies.sh"
fi
check_npu_info() {
echo "====> Check NPU info"
@@ -22,18 +62,13 @@ checkout_src() {
# vllm-ascend
if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
git clone --depth 1 -b $VLLM_ASCEND_VERSION $VLLM_ASCEND_REMOTE_URL "$SRC_DIR/vllm-ascend"
fi
# vllm
if [ ! -d "$SRC_DIR/vllm" ]; then
git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
fi
#mooncake
if [ ! -d "$SRC_DIR/Mooncake" ]; then
git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake"
fi
}
install_sys_dependencies() {
@@ -57,28 +92,55 @@ install_vllm() {
pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt"
}
install_mooncake() {
echo "====> Install mooncake"
apt-get update -y
apt-get install -y --no-install-recommends mpich libmpich-dev
cd $SRC_DIR/Mooncake
bash dependencies.sh --yes
apt purge mpich libmpich-dev -y
apt purge openmpi-bin -y
apt purge openmpi-bin libopenmpi-dev -y
apt install mpich libmpich-dev -y
export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH
export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH
download_go() {
ARCH=$(uname -m)
GOVER=1.23.8
if [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
elif [ "$ARCH" = "x86_64" ]; then
ARCH="amd64"
else
echo "Unsupported architecture: $ARCH"
exit 1
fi
# Download Go
echo "Downloading Go $GOVER..."
wget -q --show-progress https://golang.google.cn/dl/go$GOVER.linux-$ARCH.tar.gz
check_success "Failed to download Go $GOVER"
mkdir build
cd -
cd $SRC_DIR/Mooncake/build
cmake ..
make -j
make install
cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/
cd -
# Install Go
echo "Installing Go $GOVER..."
tar -C /usr/local -xzf go$GOVER.linux-$ARCH.tar.gz
check_success "Failed to install Go $GOVER"
# Clean up downloaded file
rm -f go$GOVER.linux-$ARCH.tar.gz
check_success "Failed to clean up Go installation file"
print_success "Go $GOVER installed successfully"
}
install_go() {
# Check if Go is already installed
if command -v go &> /dev/null; then
GO_VERSION=$(go version | awk '{print $3}')
if [[ "$GO_VERSION" == "go$GOVER" ]]; then
echo -e "${YELLOW}Go $GOVER is already installed. Skipping...${NC}"
else
echo -e "${YELLOW}Found Go $GO_VERSION. Will install Go $GOVER...${NC}"
download_go
fi
else
download_go
fi
# Add Go to PATH if not already there
if ! grep -q "export PATH=\$PATH:/usr/local/go/bin" ~/.bashrc; then
echo -e "${YELLOW}Adding Go to your PATH in ~/.bashrc${NC}"
echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc
echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal to use Go${NC}"
fi
export PATH=$PATH:/usr/local/go/bin
}
kill_npu_processes() {
@@ -89,47 +151,14 @@ kill_npu_processes() {
}
run_tests() {
echo "====> Run tests"
shopt -s nullglob
declare -A results
local total=0
local passed=0
local failed=0
local REPORT_FILE="/root/.cache/test_summary.md"
echo "#Nightly Multi-node Test Summary" > "$REPORT_FILE"
echo "" >> "$REPORT_FILE"
echo "| Config File | Result |" >> "$REPORT_FILE"
echo "|--------------|---------|" >> "$REPORT_FILE"
for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do
export CONFIG_YAML_PATH="$file"
echo "Running test with config: $CONFIG_YAML_PATH"
if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then
results["$file"]="✅ PASS"
((passed++))
else
results["$file"]="❌ FAIL"
((failed++))
fi
((total++))
echo "| \`$file\` | ${results[$file]} |" >> "$REPORT_FILE"
echo "------------------------------------------"
kill_npu_processes
done
shopt -u nullglob
echo "" >> "$REPORT_FILE"
echo "## Summary" >> "$REPORT_FILE"
echo "- **Total:** $total" >> "$REPORT_FILE"
echo "- **Passed:** $passed" >> "$REPORT_FILE"
echo "- **Failed:** $failed" >> "$REPORT_FILE"
echo
echo "✅ Markdown report written to: $REPORT_FILE"
ret=0
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py || ret=$?
kill_npu_processes
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
mkdir -p "$(dirname "$RESULT_PATH")"
echo $ret > "$RESULT_PATH"
fi
return $ret
}
main() {
@@ -138,7 +167,12 @@ main() {
checkout_src
install_sys_dependencies
install_vllm
install_mooncake
# to speed up mooncake build process, install Go here
install_go
cd "$WORKSPACE/source_code"
. $SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \
pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3
cd "$WORKSPACE/source_code/vllm-ascend"
run_tests
}

View File

@@ -8,7 +8,10 @@ def test_multi_node() -> None:
env_dict = config.envs
# perf_cmd = config.perf_cmd
# acc_cmd = config.acc_cmd
server_port = config.server_port if not config.disaggregated_prefill else config.proxy_port
nodes_info = config.nodes_info
disaggregated_prefill = config.disaggregated_prefill
server_port = config.server_port
proxy_port = config.proxy_port
server_host = config.cluster_ips[0]
with config.launch_server_proxy(DISAGGREGATED_PREFILL_PROXY_SCRIPT):
with RemoteOpenAIServer(
@@ -18,6 +21,9 @@ def test_multi_node() -> None:
server_host=server_host,
env_dict=env_dict,
auto_port=False,
proxy_port=proxy_port,
disaggregated_prefill=disaggregated_prefill,
nodes_info=nodes_info,
max_wait_seconds=2000,
) as remote_server:
# base_url = remote_server.url_root