[Test] Add deepseek v3.2 exp nightly test (#4191)

### What this PR does / why we need it?

- skip the nightly image build when the github event is pull_request
- set imagePullPolicy to Always for multi_node tests
- move multi_node tests ahead of the single-node tests so that some resources are cleaned up first
- decouple the nightly image build from the nightly tests for fault tolerance

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
zhangxinyuehfad
2025-11-14 15:46:10 +08:00
committed by GitHub
parent 1d0f13c1a3
commit 67f2b3a031
7 changed files with 113 additions and 106 deletions

View File

@@ -106,6 +106,19 @@ jobs:
fi
cd ..
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
  # Only the DeepSeek-V3.2-Exp W8A8 test case needs the extra custom operators.
  if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
  shell: bash -l {0}
  run: |
    # A custom `shell:` template does not enable errexit, so without this a
    # failed download or install would be masked by the last command's status.
    set -e

    # Download and run the CANN custom-ops installer.
    wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
    chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
    ./CANN-custom_ops-sfa-linux.aarch64.run --quiet

    # Prepend the custom-ops locations. The previous value is appended only
    # when it is non-empty, so no trailing ':' (an empty search-path entry)
    # is left behind.
    export ASCEND_CUSTOM_OPP_PATH="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize${ASCEND_CUSTOM_OPP_PATH:+:${ASCEND_CUSTOM_OPP_PATH}}"
    export LD_LIBRARY_PATH="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

    # `export` only affects this step's shell process; write the values to
    # GITHUB_ENV so that the following steps of the job see them as well.
    {
      echo "ASCEND_CUSTOM_OPP_PATH=${ASCEND_CUSTOM_OPP_PATH}"
      echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
    } >> "$GITHUB_ENV"

    # Install the Python bindings for the custom operators.
    wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
    pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
    . /usr/local/Ascend/ascend-toolkit/set_env.sh
- name: Run vllm-project/vllm-ascend test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn

View File

@@ -2,22 +2,7 @@ name: 'image / nightly / Ubuntu / test'
on:
schedule:
- cron: '0 0,4,8,12,14 * * *'
workflow_call:
inputs:
target:
required: true
type: string
description: 'Target architecture, e.g., a2, a3'
outputs:
image-tag:
description: 'The built image tag'
value: ${{ jobs.build-and-sync.outputs.image-tag }}
secrets:
HW_USERNAME:
required: true
HW_TOKEN:
required: true
- cron: '0 0,4,8,12,15 * * *'
# This workflow builds and pushes Docker images for nightly-ci
# It will be built base on the quay.io/ascend/vllm-ascend:main
@@ -28,7 +13,7 @@ jobs:
strategy:
matrix:
target: ${{ fromJson(github.event_name == 'schedule' && '["a2","a3"]' || format('["{0}"]', inputs.target || 'a3')) }}
target: ['a2', 'a3']
outputs:
image-tag: ${{ steps.build-image.outputs.image-tag }}

View File

@@ -42,18 +42,35 @@ concurrency:
cancel-in-progress: true
jobs:
image_build:
name: nightly image build
uses: ./.github/workflows/_nightly_image_build.yaml
# Multi-node nightly job for the a2 target: each matrix entry is handed to the
# reusable workflow ./.github/workflows/_e2e_nightly_multi_node.yaml.
multi-node-tests:
name: multi-node
# Only runs for scheduled or manually dispatched (workflow_dispatch) events.
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
# Keep running the remaining matrix entries when one of them fails.
fail-fast: false
# Run the matrix entries one at a time.
max-parallel: 1
matrix:
# Each entry names a test, the config file it uses and a `size` value that is
# forwarded to the reusable workflow.
test_config:
- name: multi-node-deepseek-dp
config_file_path: DeepSeek-R1-W8A8-A2.yaml
size: 2
- name: multi-node-deepseek-dp-torchair
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
target: a2
soc_version: a2
runner: linux-aarch64-a2-0
# Fixed nightly image tag; the value does not depend on any other job's output.
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
# Repository secrets passed through to the reusable workflow.
secrets:
HW_USERNAME: ${{ secrets.HW_USERNAME }}
HW_TOKEN: ${{ secrets.HW_TOKEN }}
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
single-node-tests:
name: single-node
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: image_build
needs: multi-node-tests
strategy:
fail-fast: false
matrix:
@@ -72,33 +89,7 @@ jobs:
vllm: v0.11.0
runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }}
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
multi-node-tests:
name: multi-node
needs: [single-node-tests, image_build]
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-dp
config_file_path: DeepSeek-R1-W8A8-A2.yaml
size: 2
- name: multi-node-deepseek-dp-torchair
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a2
runner: linux-aarch64-a2-0
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
single-node-accuracy-tests:
if: >-

View File

@@ -41,18 +41,53 @@ concurrency:
cancel-in-progress: true
jobs:
image_build:
name: nightly image build
uses: ./.github/workflows/_nightly_image_build.yaml
# Multi-node nightly job for the a3 target: each matrix entry is handed to the
# reusable workflow ./.github/workflows/_e2e_nightly_multi_node.yaml.
multi-node-tests:
name: multi-node
# Only runs for scheduled or manually dispatched (workflow_dispatch) events.
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
# Keep running the remaining matrix entries when one of them fails.
fail-fast: false
# Run the matrix entries one at a time.
max-parallel: 1
matrix:
# Each entry names a test, the config file it uses and a `size` value that is
# forwarded to the reusable workflow.
test_config:
- name: multi-node-deepseek-pd
config_file_path: DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: GLM-4_5.yaml
size: 2
- name: multi-node-dpsk3.2-exp-2node
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-eplb
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
size: 4
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
target: a3
soc_version: a3
runner: linux-aarch64-a3-0
# Fixed nightly image tag; the value does not depend on any other job's output.
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
# Repository secrets passed through to the reusable workflow.
secrets:
HW_USERNAME: ${{ secrets.HW_USERNAME }}
HW_TOKEN: ${{ secrets.HW_TOKEN }}
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
single-node-tests:
name: single-node
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: image_build
needs: multi-node-tests
strategy:
fail-fast: false
matrix:
@@ -103,51 +138,6 @@ jobs:
with:
vllm: v0.11.0
runner: ${{ matrix.test_config.os }}
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }}
multi-node-tests:
name: multi-node
needs: [single-node-tests, image_build]
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: GLM-4_5.yaml
size: 2
- name: multi-node-dpsk3.2-exp-2node
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-eplb
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
size: 4
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
runner: linux-aarch64-a3-0
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}

View File

@@ -34,8 +34,6 @@ deployment:
-
server_cmd: >
vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
--host 0.0.0.0
--port $SERVER_PORT
--headless
--data-parallel-size 2
--data-parallel-size-local 1

View File

@@ -15,6 +15,7 @@ spec:
spec:
containers:
- name: vllm-leader
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
env:
- name: CONFIG_YAML_PATH
@@ -73,6 +74,7 @@ spec:
spec:
containers:
- name: vllm-worker
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
env:
- name: CONFIG_YAML_PATH

View File

@@ -92,6 +92,31 @@ check_and_config() {
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
}
install_extra_components() {
    # Download and install the extra custom-ops packages used by the
    # DeepSeek-V3.2-Exp test: the CANN custom-ops installer (.run) and the
    # matching custom_ops Python wheel, then extend the Ascend search paths.
    #
    # Returns:
    #   0 when both packages were downloaded and installed,
    #   1 when any download or install step failed.
    # The downloaded files are removed from the working directory in both cases.
    local base_url="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3"
    local run_pkg="CANN-custom_ops-sfa-linux.aarch64.run"
    local wheel="custom_ops-1.0-cp311-cp311-linux_aarch64.whl"
    local vendor_dir="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize"
    local status=0

    echo "====> Installing extra components for DeepSeek-v3.2-exp-bf16"

    # Each stage only runs when the previous one succeeded; the first failure
    # is reported and recorded instead of being silently ignored.
    if ! wget -q "${base_url}/${run_pkg}"; then
        echo "Failed to download CANN-custom_ops-sfa-linux.aarch64.run"
        status=1
    elif ! chmod +x "./${run_pkg}" || ! "./${run_pkg}" --quiet; then
        echo "Failed to install CANN-custom_ops-sfa-linux.aarch64.run"
        status=1
    elif ! wget -q "${base_url}/${wheel}"; then
        echo "Failed to download custom_ops wheel"
        status=1
    elif ! pip install "${wheel}"; then
        echo "Failed to install custom_ops wheel"
        status=1
    fi

    # Clean up the downloads on the failure paths too.
    rm -f "${run_pkg}" "${wheel}"

    if (( status != 0 )); then
        return "${status}"
    fi

    # Prepend the custom-ops locations. The previous value is appended only
    # when it is set and non-empty, which avoids a trailing ':' (an empty
    # search-path entry) and an unbound-variable error under `set -u`.
    export ASCEND_CUSTOM_OPP_PATH="${vendor_dir}${ASCEND_CUSTOM_OPP_PATH:+:${ASCEND_CUSTOM_OPP_PATH}}"
    export LD_LIBRARY_PATH="${vendor_dir}/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    source /usr/local/Ascend/ascend-toolkit/set_env.sh

    echo "====> Extra components installation completed"
}
kill_npu_processes() {
pgrep python3 | xargs -r kill -9
pgrep VLLM | xargs -r kill -9
@@ -123,6 +148,9 @@ main() {
check_npu_info
check_and_config
show_vllm_info
if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
install_extra_components
fi
cd "$WORKSPACE/vllm-ascend"
run_tests_with_log
}