[CI][Doc] Optimize multi-node CI (#3565)

### What this PR does / why we need it?
This pull request mainly does the following things:
1. Add a doc for multi-node CI. The main content covers the underlying
mechanism and how to contribute
2. Simplify the config YAML to make it more developer-friendly
3. Optimize the mooncake installation script to prevent accidental
failures during installation
4. Fix the workflow to ensure the Kubernetes manifests can be applied correctly
5. Add Qwen3-235B-W8A8 disaggregated_prefill test
6. Add GLM-4.5 multi dp test
7. Add 2p1d 4nodes disaggregated_prefill test
8. Refactor nightly tests
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
17c540a993

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-25 09:23:47 +08:00
committed by GitHub
parent 292cf339c3
commit 7f73c28a24
21 changed files with 1165 additions and 378 deletions

View File

@@ -0,0 +1,190 @@
# Reusable workflow (entered through `workflow_call`) for the multi-node
# nightly e2e test. Every input is declared as a string.
name: 'e2e nightly test multi_node'

on:
  workflow_call:
    inputs:
      soc_version:
        description: use a2 or a3
        type: string
        required: true
      image:
        description: base image for pods
        type: string
        required: false
        default: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11'
      config_file_path:
        description: the model config for multi_node test
        type: string
        required: true
      replicas:
        description: replicas of the k8s cluster
        type: string
        required: false
        default: '1'
      size:
        description: 'how many pods will be pulled up via lws.yaml, indicates number of nodes we need'
        type: string
        required: false
        default: '2'
      vllm_version:
        description: vllm version to use
        type: string
        required: false
        default: 'v0.11.0'
      vllm_ascend_remote_url:
        description: used for pr level tests
        type: string
        required: false
        default: 'https://github.com/vllm-project/vllm-ascend.git'
      vllm_ascend_ref:
        description: used for pr level tests
        type: string
        required: false
        default: 'main'
# Plain bash shells skip ~/.profile and ~/.bashrc, so every `run:` step is
# started as a login shell ("bash -el {0}"); that is what activates the
# ascend-toolkit environment variables. `-e` also aborts on the first failure.
defaults:
  run:
    shell: bash -el {0}

# Allow one run per (workflow, ref) pair; a newer run cancels the one that is
# still in progress. In a reusable workflow `github.workflow` resolves to the
# name of the caller workflow.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
jobs:
# Single job that drives the multi-node test from a controller container:
# its steps render a manifest and talk to the cluster through kubectl.
e2e:
# This is a runner with no NPU for k8s controller
runs-on: linux-aarch64-a3-0
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
# File the decoded kubeconfig secret is written to by the kubeconfig step.
KUBECONFIG: /tmp/kubeconfig
# Source path of the kubectl binary copied to /usr/local/bin by "Install kubectl"
# (presumably a binary cached on the runner -- confirm).
KUBECTL: /root/.cache/.kube/kubectl
# Namespace and pod name used by the readiness, log and cleanup steps.
NAMESPACE: vllm-project
LEADER_POD: vllm-0
# File whose content is read as the test exit code by "Determine is success".
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
steps:
- name: Install system dependencies
  run: |
    # Point apt and pip at the TUNA mirrors before installing anything.
    sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
    # jinja2-cli provides the `jinja2` command used to render lws.yaml.jinja2.
    pip install jinja2-cli
    apt-get update -y && apt-get install -y git curl
- name: Install kubectl
  run: |
    # Put the kubectl binary referenced by $KUBECTL on the PATH,
    # owned by root with mode 0755.
    install --owner=root --group=root --mode=0755 "$KUBECTL" /usr/local/bin/kubectl
    # Sanity check: print the client version only (no cluster access needed).
    kubectl version --client
# TODO: Add A2 tests
- name: Setup kubeconfig for A3
  if: inputs.soc_version == 'a3'
  env:
    # Hand the secret to the shell through the environment instead of
    # templating it into the script text.
    KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
  run: |
    # A reusable workflow only sees secrets the caller passes on; fail with a
    # clear message instead of writing an empty kubeconfig.
    if [ -z "$KUBECONFIG_B64" ]; then
      echo "Error: secret KUBECONFIG_B64 is empty or was not passed to this workflow"
      exit 1
    fi
    # The file holds cluster credentials: create it readable by the owner only.
    umask 077
    # Decode and save kubeconfig
    echo "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG"
- name: Checkout code
  uses: actions/checkout@v4

- name: Prepare scripts
  run: |
    # Stage the LWS entrypoint script; -D creates missing parent directories.
    install -D \
      tests/e2e/nightly/multi_node/scripts/run.sh \
      /root/.cache/tests/run.sh

- name: Clear result ret
  run: |
    # Remove any result file left behind so a stale value is never read later.
    rm -f "$RESULT_FILE"
- name: Launch cluster
  env:
    # Workflow inputs reach the script as environment variables rather than
    # being templated into the shell source, so quotes or shell
    # metacharacters in a value cannot alter the script.
    INPUT_SIZE: ${{ inputs.size }}
    INPUT_REPLICAS: ${{ inputs.replicas }}
    INPUT_IMAGE: ${{ inputs.image }}
    INPUT_CONFIG_FILE_PATH: ${{ inputs.config_file_path }}
    INPUT_VLLM_VERSION: ${{ inputs.vllm_version }}
    INPUT_VLLM_ASCEND_REF: ${{ inputs.vllm_ascend_ref }}
    INPUT_VLLM_ASCEND_REMOTE_URL: ${{ inputs.vllm_ascend_remote_url }}
  run: |
    set -e

    size="$INPUT_SIZE"
    replicas="$INPUT_REPLICAS"
    image="$INPUT_IMAGE"
    config_file_path="$INPUT_CONFIG_FILE_PATH"
    vllm_version="$INPUT_VLLM_VERSION"
    vllm_ascend_ref="$INPUT_VLLM_ASCEND_REF"
    vllm_ascend_remote_url="$INPUT_VLLM_ASCEND_REMOTE_URL"
    result_file_path="$RESULT_FILE"

    # Refuse to render the manifest when a mandatory value is empty.
    # ${!param} is bash indirect expansion: the value of the variable named by $param.
    required_params=("size" "replicas" "image" "config_file_path")
    for param in "${required_params[@]}"; do
      if [ -z "${!param}" ]; then
        echo "Error: Parameter '$param' is required but empty"
        exit 1
      fi
    done

    # Render the LeaderWorkerSet manifest from its Jinja2 template.
    jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
      -D size="$size" \
      -D replicas="$replicas" \
      -D image="$image" \
      -D config_file_path="$config_file_path" \
      -D vllm_version="$vllm_version" \
      -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
      -D vllm_ascend_ref="$vllm_ascend_ref" \
      -D result_file_path="$result_file_path" \
      --outfile lws.yaml

    kubectl apply -f ./lws.yaml
- name: Waiting for pod ready
  run: |
    # Upper bound for the wait, in seconds; override with POD_READY_TIMEOUT.
    # Without it a pod that never becomes Ready would block until the job
    # itself times out.
    TIMEOUT="${POD_READY_TIMEOUT:-1800}"
    INTERVAL=3
    ELAPSED=0

    echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
    while true; do
      # Read the pod-level Ready condition ("True"/"False"/"Unknown"). Unlike
      # joining the per-container ready flags, it is a single value even for
      # multi-container pods. "|| true" keeps the -e shell alive while the pod
      # has not been created yet.
      READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" \
        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
      if [[ "$READY_STATUS" == "True" ]]; then
        echo "Pod [$LEADER_POD] is Ready!"
        break
      fi

      if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
        echo "Timeout: Pod [$LEADER_POD] not Ready after ${TIMEOUT}s"
        # Best-effort diagnostics before failing the step.
        kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
        exit 1
      fi

      echo "Pod [$LEADER_POD] not ready, waiting..."
      sleep "$INTERVAL"
      ELAPSED=$((ELAPSED + INTERVAL))
    done
- name: Stream logs
  run: |
    # Follow the leader pod's log; the step ends when the log stream ends.
    kubectl logs --follow "$LEADER_POD" --namespace "$NAMESPACE"
- name: Determine is success
  run: |
    # Wait up to TIMEOUT seconds for the result file to exist AND be non-empty
    # (-s), so a file that is created but not yet written is not read early.
    TIMEOUT=600
    ELAPSED=0
    while [ ! -s "$RESULT_FILE" ]; do
      sleep 5
      ELAPSED=$((ELAPSED + 5))
      if [ $ELAPSED -ge $TIMEOUT ]; then
        echo "Timeout waiting for test result file"
        exit 1
      fi
    done

    # Drop surrounding whitespace/newlines before interpreting the value.
    RET=$(tr -d '[:space:]' < "$RESULT_FILE")
    echo "Test result: $RET"

    # Only a well-formed integer equal to 0 counts as success. A numeric test
    # on garbage would error out inside the `if` and fall through to the
    # success branch, so validate the format first.
    if [[ "$RET" =~ ^[0-9]+$ ]] && [ "$RET" -eq 0 ]; then
      echo "Test succeeded"
    else
      echo "Test failed"
      exit 1
    fi
- name: Post process
  if: always()
  run: |
    # Diagnostic listing only: it must never prevent the cleanup below
    # (the default shell runs with -e).
    kubectl get pods -n "$NAMESPACE" || true
    # This step also runs after early failures, when the manifest may not
    # have been rendered or applied.
    if [ -f ./lws.yaml ]; then
      kubectl delete -f ./lws.yaml --ignore-not-found=true
    else
      echo "lws.yaml not found, nothing to delete"
    fi

View File

@@ -1,125 +0,0 @@
name: 'e2e test / multi-dp'
on:
schedule:
- cron: "0 */4 * * *"
workflow_dispatch:
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
# This is a runner with no NPU for k8s controller
runs-on: linux-aarch64-a3-0
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
steps:
- name: Install system dependencies
  run: |
    # configure apt and pip source
    sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
    # `pip install` has no -y option (it never prompts); passing it aborts the step.
    pip install jinja2-cli
    apt-get update -y && apt-get install -y git curl
    # Basic-auth header for the GitHub proxy. -w 0 disables base64 line
    # wrapping so a long token still yields a single-line header value.
    TOKEN=$(echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64 -w 0)
    git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
- name: Install kubectl
run: |
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# get kubeconfig from secret
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
- name: Checkout code
uses: actions/checkout@v4
- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Launch cluster
run: |
jinja2 tests/e2e/multi_node/scripts/lws.yaml.jinja2 \
-D size=2 \
-D replicas=1 \
-D image="m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11" \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
- name: Waiting for pod ready
run: |
echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
while true; do
# get pod status
READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
if [[ "$READY_STATUS" == "true" ]]; then
echo "✅ Pod [$LEADER_POD] is Ready!"
break
else
echo "Pod [$LEADER_POD] not ready, waiting..."
sleep 3
fi
done
# Streams the leader pod's log in one background job while a second
# background job polls the pod phase every 5 seconds.
- name: Stream logs and monitor pod health
run: |
set -euo pipefail
echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
# Background job 1: follow the log; its PID is kept so it can be waited on / killed.
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
LOG_PID=$!
echo "Start monitoring Pod [$LEADER_POD] status ..."
# Background job 2: phase watchdog (the whole loop is backgrounded by "done &").
while true; do
STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
# Any phase other than Running/Succeeded is treated as an abnormal exit.
if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
# Best-effort diagnostics; failures here are ignored.
kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
kill $LOG_PID || true
# NOTE(review): this exit runs inside the backgrounded loop, so it ends only
# that background subshell; it does not by itself make the step fail.
exit 1
fi
sleep 5
done &
MONITOR_PID=$!
# Block until the log stream ends; its exit status is deliberately ignored.
wait $LOG_PID || true
# Stop the watchdog once log streaming is over.
kill $MONITOR_PID || true
- name: Generate summary
if: always()
run: |
if [ -f "/root/.cache/test_summary.md" ]; then
cat /root/.cache/test_summary.md >> "$GITHUB_STEP_SUMMARY"
else
echo "No summary file found." >> "$GITHUB_STEP_SUMMARY"
fi
- name: Post process
if: always()
run: |
kubectl get pods -n $NAMESPACE
kubectl delete -f ./lws.yaml

View File

@@ -1,133 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / nightly'
on:
schedule:
# Run test at 24:00 Beijing time (UTC+8)
- cron: "0 16 * * *"
workflow_dispatch:
pull_request:
branches:
- 'main'
- '*-dev'
types: [labeled,opened,synchronize]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ascend-nightly-${{ github.ref }}
#cancel-in-progress: true
jobs:
qwen3-32b:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
# should add A3 chip runner when available
os: [linux-aarch64-a2-4]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
# only trigger e2e test after lint passed and the change is e2e related with pull request.
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
tests: tests/e2e/nightly/models/test_qwen3_32b.py
qwen3-32b-in8-a3:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
qwen3-32b-in8-a2:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a2-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
qwen3-235b-a22b-w8a8-eplb:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-16 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
deepseek-r1-w8a8-eplb:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-16 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
qwen3-32b-int8-a3-feature-stack3:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
qwen2-5-vl-7b:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-4 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
deepseek-r1-0528-w8a8:
if: contains(github.event.pull_request.labels.*.name, 'run-nightly')
strategy:
matrix:
os: [ linux-aarch64-a3-16 ]
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py

View File

@@ -0,0 +1,60 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This workflow is related to the Atlas 800 A2 resources.
# We do not limit the concurrency of jobs on A2.
name: 'ascend test / nightly-a2'

on:
  schedule:
    # Run test at 24:00 Beijing time (UTC+8)
    - cron: '0 16 * * *'
  workflow_dispatch:
  pull_request:
    branches:
      - 'main'

# Plain bash shells skip ~/.profile and ~/.bashrc, so every `run:` step is
# started as a login shell ("bash -el {0}") to activate the ascend-toolkit
# environment variables.
defaults:
  run:
    shell: bash -el {0}

# One run per ref for this workflow; a newer run cancels the one in progress.
concurrency:
  group: 'ascend-nightly-${{ github.ref }}-a2'
  cancel-in-progress: true

jobs:
  single-node-tests:
    # Only scheduled and manually dispatched runs execute the tests.
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix:
        test_config:
          - name: qwen3-32b
            os: linux-aarch64-a2-4
            tests: tests/e2e/nightly/models/test_qwen3_32b.py
          - name: qwen3-32b-in8-a2
            os: linux-aarch64-a2-4
            tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
      vllm: v0.11.0
      runner: ${{ matrix.test_config.os }}
      tests: ${{ matrix.test_config.tests }}

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This workflow is related to the Atlas 800 A3 resources.
# **Please note**: the current A3 resource pool's maximum allowed concurrency is 5*16 NPUs.
# We limit the concurrency of jobs on A3 to avoid the risk of insufficient resources.
name: 'ascend test / nightly-a3'

on:
  schedule:
    # Run test at 24:00 Beijing time (UTC+8)
    - cron: '0 16 * * *'
  workflow_dispatch:
  pull_request:
    branches:
      - 'main'

# Plain bash shells skip ~/.profile and ~/.bashrc, so every `run:` step is
# started as a login shell ("bash -el {0}") to activate the ascend-toolkit
# environment variables.
defaults:
  run:
    shell: bash -el {0}

# One run per ref for this workflow; a newer run cancels the one in progress.
concurrency:
  group: 'ascend-nightly-${{ github.ref }}-a3'
  cancel-in-progress: true

jobs:
  single-node-tests:
    # Only scheduled and manually dispatched runs execute the tests.
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix:
        test_config:
          - name: qwen3-32b-in8-a3
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py
          - name: qwen3-32b-int8-a3-feature-stack3
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py
          - name: qwen3-235b-a22b-w8a8-eplb
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py
          - name: deepseek-r1-w8a8-eplb
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py
          - name: qwen2-5-vl-7b
            os: linux-aarch64-a3-4
            tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py
          - name: deepseek-r1-0528-w8a8
            os: linux-aarch64-a3-16
            tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
    uses: ./.github/workflows/_e2e_nightly_single_node.yaml
    with:
      vllm: v0.11.0
      runner: ${{ matrix.test_config.os }}
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
      tests: ${{ matrix.test_config.tests }}
multi-node-tests:
  # Start only after the single-node matrix has finished; always() lets this
  # job run even when some single-node tests failed.
  needs: single-node-tests
  if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
  strategy:
    fail-fast: false
    # Run the multi-node configurations one at a time.
    max-parallel: 1
    matrix:
      test_config:
        - name: multi-node-deepseek-pd
          config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
          size: 2
        - name: multi-node-qwen3-dp
          config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
          size: 2
        - name: multi-node-dpsk-4node-pd
          config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
          size: 4
  uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
  with:
    soc_version: a3
    image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
    replicas: 1
    size: ${{ matrix.test_config.size }}
    config_file_path: ${{ matrix.test_config.config_file_path }}
  # The called workflow reads secrets.KUBECONFIG_B64. Secrets are not handed
  # to a reusable workflow implicitly, so pass the caller's secrets on.
  secrets: inherit