[Test] Add deepseek v3.2 exp nightly test (#4191)

### What this PR does / why we need it?

- skip the nightly image build when the github event is pull_request
- set imagePullPolicy to Always for the multi_node tests
- move the multi_node tests ahead of the single-node tests so that some resources are cleaned up first
- decouple the nightly image build from the nightly tests for fault tolerance

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
zhangxinyuehfad
2025-11-14 15:46:10 +08:00
committed by GitHub
parent 1d0f13c1a3
commit 67f2b3a031
7 changed files with 113 additions and 106 deletions

View File

@@ -106,6 +106,19 @@ jobs:
fi fi
cd .. cd ..
# Installs the CANN custom-ops (SFA) run package and the matching custom_ops
# wheel. Runs only for the 'deepseek3_2-exp-w8a8' test configuration.
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
  if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
  shell: bash -l {0}
  run: |
    # A custom `shell: bash -l {0}` does not enable fail-fast on its own, so
    # turn it on for the download/install part: a failed wget, installer or
    # pip install must fail the step instead of being silently ignored.
    set -eo pipefail
    wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
    chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
    ./CANN-custom_ops-sfa-linux.aarch64.run --quiet
    export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
    export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
    wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
    pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
    # set_env.sh is third-party; source it without fail-fast, as before.
    set +e
    . /usr/local/Ascend/ascend-toolkit/set_env.sh
    # Every `run` step starts a fresh shell, so the exports above are gone
    # once this step ends. Persist them through GITHUB_ENV so that the
    # following test step can locate the custom ops.
    {
      echo "ASCEND_CUSTOM_OPP_PATH=${ASCEND_CUSTOM_OPP_PATH}"
      echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
    } >> "$GITHUB_ENV"
- name: Run vllm-project/vllm-ascend test - name: Run vllm-project/vllm-ascend test
env: env:
VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_WORKER_MULTIPROC_METHOD: spawn

View File

@@ -2,22 +2,7 @@ name: 'image / nightly / Ubuntu / test'
on: on:
schedule: schedule:
- cron: '0 0,4,8,12,14 * * *' - cron: '0 0,4,8,12,15 * * *'
workflow_call:
inputs:
target:
required: true
type: string
description: 'Target architecture, e.g., a2, a3'
outputs:
image-tag:
description: 'The built image tag'
value: ${{ jobs.build-and-sync.outputs.image-tag }}
secrets:
HW_USERNAME:
required: true
HW_TOKEN:
required: true
# This workflow builds and pushes Docker images for nightly-ci # This workflow builds and pushes Docker images for nightly-ci
# It will be built base on the quay.io/ascend/vllm-ascend:main # It will be built base on the quay.io/ascend/vllm-ascend:main
@@ -28,7 +13,7 @@ jobs:
strategy: strategy:
matrix: matrix:
target: ${{ fromJson(github.event_name == 'schedule' && '["a2","a3"]' || format('["{0}"]', inputs.target || 'a3')) }} target: ['a2', 'a3']
outputs: outputs:
image-tag: ${{ steps.build-image.outputs.image-tag }} image-tag: ${{ steps.build-image.outputs.image-tag }}

View File

@@ -42,18 +42,35 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
image_build: multi-node-tests:
name: nightly image build name: multi-node
uses: ./.github/workflows/_nightly_image_build.yaml if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-dp
config_file_path: DeepSeek-R1-W8A8-A2.yaml
size: 2
- name: multi-node-deepseek-dp-torchair
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with: with:
target: a2 soc_version: a2
runner: linux-aarch64-a2-0
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets: secrets:
HW_USERNAME: ${{ secrets.HW_USERNAME }} KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
HW_TOKEN: ${{ secrets.HW_TOKEN }}
single-node-tests: single-node-tests:
name: single-node name: single-node
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: image_build needs: multi-node-tests
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -72,33 +89,7 @@ jobs:
vllm: v0.11.0 vllm: v0.11.0
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
multi-node-tests:
name: multi-node
needs: [single-node-tests, image_build]
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-dp
config_file_path: DeepSeek-R1-W8A8-A2.yaml
size: 2
- name: multi-node-deepseek-dp-torchair
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a2
runner: linux-aarch64-a2-0
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
single-node-accuracy-tests: single-node-accuracy-tests:
if: >- if: >-

View File

@@ -41,18 +41,53 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
image_build: multi-node-tests:
name: nightly image build name: multi-node
uses: ./.github/workflows/_nightly_image_build.yaml if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: GLM-4_5.yaml
size: 2
- name: multi-node-dpsk3.2-exp-2node
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-eplb
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
size: 4
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with: with:
target: a3 soc_version: a3
runner: linux-aarch64-a3-0
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets: secrets:
HW_USERNAME: ${{ secrets.HW_USERNAME }} KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
HW_TOKEN: ${{ secrets.HW_TOKEN }}
single-node-tests: single-node-tests:
name: single-node name: single-node
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
needs: image_build needs: multi-node-tests
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@@ -103,51 +138,6 @@ jobs:
with: with:
vllm: v0.11.0 vllm: v0.11.0
runner: ${{ matrix.test_config.os }} runner: ${{ matrix.test_config.os }}
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }} tests: ${{ matrix.test_config.tests }}
name: ${{ matrix.test_config.name }} name: ${{ matrix.test_config.name }}
multi-node-tests:
name: multi-node
needs: [single-node-tests, image_build]
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: GLM-4_5.yaml
size: 2
- name: multi-node-dpsk3.2-exp-2node
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-eplb
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
size: 4
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
runner: linux-aarch64-a3-0
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
replicas: 1
size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}

View File

@@ -34,8 +34,6 @@ deployment:
- -
server_cmd: > server_cmd: >
vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \ vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
--host 0.0.0.0
--port $SERVER_PORT
--headless --headless
--data-parallel-size 2 --data-parallel-size 2
--data-parallel-size-local 1 --data-parallel-size-local 1

View File

@@ -15,6 +15,7 @@ spec:
spec: spec:
containers: containers:
- name: vllm-leader - name: vllm-leader
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }} image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
env: env:
- name: CONFIG_YAML_PATH - name: CONFIG_YAML_PATH
@@ -73,6 +74,7 @@ spec:
spec: spec:
containers: containers:
- name: vllm-worker - name: vllm-worker
imagePullPolicy: Always
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }} image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
env: env:
- name: CONFIG_YAML_PATH - name: CONFIG_YAML_PATH

View File

@@ -92,6 +92,31 @@ check_and_config() {
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
} }
# Install the extra components required by the DeepSeek-V3.2-Exp-bf16 test:
# the CANN custom-ops (SFA) run package and the matching custom_ops wheel.
#
# On success, exports ASCEND_CUSTOM_OPP_PATH and LD_LIBRARY_PATH so that they
# include the custom-ops vendor directory, and sources the Ascend toolkit
# set_env.sh into the current shell.
#
# Returns:
#   0 when both components were downloaded and installed,
#   1 when any download or install step failed.
# The downloaded artifacts are removed from the working directory either way.
install_extra_components() {
    local base_url="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3"
    local run_pkg="CANN-custom_ops-sfa-linux.aarch64.run"
    local wheel="custom_ops-1.0-cp311-cp311-linux_aarch64.whl"
    local vendor_dir="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize"
    local status=0

    echo "====> Installing extra components for DeepSeek-v3.2-exp-bf16"

    # -O pins the output name: without it wget writes "<name>.1" when a stale
    # copy already exists, and the stale file would be the one installed.
    if ! wget -q -O "${run_pkg}" "${base_url}/${run_pkg}"; then
        echo "Failed to download CANN-custom_ops-sfa-linux.aarch64.run"
        status=1
    elif ! { chmod +x "./${run_pkg}" && "./${run_pkg}" --quiet; }; then
        echo "Failed to install CANN-custom_ops-sfa-linux.aarch64.run"
        status=1
    elif ! wget -q -O "${wheel}" "${base_url}/${wheel}"; then
        echo "Failed to download custom_ops wheel"
        status=1
    elif ! pip install "${wheel}"; then
        echo "Failed to install custom_ops wheel"
        status=1
    fi

    # Single cleanup path: also removes partial downloads after a failure.
    rm -f "${run_pkg}" "${wheel}"
    if [[ "${status}" -ne 0 ]]; then
        return "${status}"
    fi

    # Append the previous value only when it is non-empty. A trailing ':' in
    # LD_LIBRARY_PATH would add the current directory to the search path.
    export ASCEND_CUSTOM_OPP_PATH="${vendor_dir}${ASCEND_CUSTOM_OPP_PATH:+:${ASCEND_CUSTOM_OPP_PATH}}"
    export LD_LIBRARY_PATH="${vendor_dir}/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
    source /usr/local/Ascend/ascend-toolkit/set_env.sh

    echo "====> Extra components installation completed"
}
kill_npu_processes() { kill_npu_processes() {
pgrep python3 | xargs -r kill -9 pgrep python3 | xargs -r kill -9
pgrep VLLM | xargs -r kill -9 pgrep VLLM | xargs -r kill -9
@@ -123,6 +148,9 @@ main() {
check_npu_info check_npu_info
check_and_config check_and_config
show_vllm_info show_vllm_info
if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
install_extra_components
fi
cd "$WORKSPACE/vllm-ascend" cd "$WORKSPACE/vllm-ascend"
run_tests_with_log run_tests_with_log
} }