[Test] Add deepseek v3.2 exp nightly test (#4191)
### What this PR does / why we need it?
- skip the nightly image build when the github event is pull_request
- set imagePullPolicy to Always for the multi_node tests
- move the multi_node tests ahead of the single-node tests so that some resources are cleaned up first
- decouple the nightly image build from the nightly tests for fault tolerance
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
13
.github/workflows/_e2e_nightly_single_node.yaml
vendored
13
.github/workflows/_e2e_nightly_single_node.yaml
vendored
@@ -106,6 +106,19 @@ jobs:
|
||||
fi
|
||||
cd ..
|
||||
|
||||
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
|
||||
if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
|
||||
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
|
||||
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
|
||||
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
. /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
- name: Run vllm-project/vllm-ascend test
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
|
||||
19
.github/workflows/_nightly_image_build.yaml
vendored
19
.github/workflows/_nightly_image_build.yaml
vendored
@@ -2,22 +2,7 @@ name: 'image / nightly / Ubuntu / test'
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 0,4,8,12,14 * * *'
|
||||
workflow_call:
|
||||
inputs:
|
||||
target:
|
||||
required: true
|
||||
type: string
|
||||
description: 'Target architecture, e.g., a2, a3'
|
||||
outputs:
|
||||
image-tag:
|
||||
description: 'The built image tag'
|
||||
value: ${{ jobs.build-and-sync.outputs.image-tag }}
|
||||
secrets:
|
||||
HW_USERNAME:
|
||||
required: true
|
||||
HW_TOKEN:
|
||||
required: true
|
||||
- cron: '0 0,4,8,12,15 * * *'
|
||||
|
||||
# This workflow builds and pushes Docker images for nightly-ci
|
||||
# It will be built base on the quay.io/ascend/vllm-ascend:main
|
||||
@@ -28,7 +13,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
target: ${{ fromJson(github.event_name == 'schedule' && '["a2","a3"]' || format('["{0}"]', inputs.target || 'a3')) }}
|
||||
target: ['a2', 'a3']
|
||||
|
||||
outputs:
|
||||
image-tag: ${{ steps.build-image.outputs.image-tag }}
|
||||
|
||||
@@ -42,18 +42,35 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
image_build:
|
||||
name: nightly image build
|
||||
uses: ./.github/workflows/_nightly_image_build.yaml
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
test_config:
|
||||
- name: multi-node-deepseek-dp
|
||||
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
||||
size: 2
|
||||
- name: multi-node-deepseek-dp-torchair
|
||||
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
target: a2
|
||||
soc_version: a2
|
||||
runner: linux-aarch64-a2-0
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||
replicas: 1
|
||||
size: ${{ matrix.test_config.size }}
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
secrets:
|
||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
||||
|
||||
single-node-tests:
|
||||
name: single-node
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
needs: image_build
|
||||
needs: multi-node-tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -72,33 +89,7 @@ jobs:
|
||||
vllm: v0.11.0
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
|
||||
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
needs: [single-node-tests, image_build]
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
test_config:
|
||||
- name: multi-node-deepseek-dp
|
||||
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
||||
size: 2
|
||||
- name: multi-node-deepseek-dp-torchair
|
||||
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
soc_version: a2
|
||||
runner: linux-aarch64-a2-0
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
|
||||
replicas: 1
|
||||
size: ${{ matrix.test_config.size }}
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
secrets:
|
||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||
|
||||
single-node-accuracy-tests:
|
||||
if: >-
|
||||
|
||||
@@ -41,18 +41,53 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
image_build:
|
||||
name: nightly image build
|
||||
uses: ./.github/workflows/_nightly_image_build.yaml
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
test_config:
|
||||
- name: multi-node-deepseek-pd
|
||||
config_file_path: DeepSeek-V3.yaml
|
||||
size: 2
|
||||
- name: multi-node-qwen3-dp
|
||||
config_file_path: Qwen3-235B-A3B.yaml
|
||||
size: 2
|
||||
- name: multi-node-dpsk-4node-pd
|
||||
config_file_path: DeepSeek-R1-W8A8.yaml
|
||||
size: 4
|
||||
- name: multi-node-qwenw8a8-2node
|
||||
config_file_path: Qwen3-235B-W8A8.yaml
|
||||
size: 2
|
||||
- name: multi-node-glm-2node
|
||||
config_file_path: GLM-4_5.yaml
|
||||
size: 2
|
||||
- name: multi-node-dpsk3.2-exp-2node
|
||||
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
||||
size: 2
|
||||
- name: multi-node-deepseek-r1-w8a8-eplb
|
||||
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
|
||||
size: 4
|
||||
- name: multi-node-qwenw8a8-2node-eplb
|
||||
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
target: a3
|
||||
soc_version: a3
|
||||
runner: linux-aarch64-a3-0
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||
replicas: 1
|
||||
size: ${{ matrix.test_config.size }}
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
secrets:
|
||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||
|
||||
single-node-tests:
|
||||
name: single-node
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
needs: image_build
|
||||
needs: multi-node-tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -103,51 +138,6 @@ jobs:
|
||||
with:
|
||||
vllm: v0.11.0
|
||||
runner: ${{ matrix.test_config.os }}
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
|
||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||
tests: ${{ matrix.test_config.tests }}
|
||||
name: ${{ matrix.test_config.name }}
|
||||
|
||||
multi-node-tests:
|
||||
name: multi-node
|
||||
needs: [single-node-tests, image_build]
|
||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
test_config:
|
||||
- name: multi-node-deepseek-pd
|
||||
config_file_path: DeepSeek-V3.yaml
|
||||
size: 2
|
||||
- name: multi-node-qwen3-dp
|
||||
config_file_path: Qwen3-235B-A3B.yaml
|
||||
size: 2
|
||||
- name: multi-node-dpsk-4node-pd
|
||||
config_file_path: DeepSeek-R1-W8A8.yaml
|
||||
size: 4
|
||||
- name: multi-node-qwenw8a8-2node
|
||||
config_file_path: Qwen3-235B-W8A8.yaml
|
||||
size: 2
|
||||
- name: multi-node-glm-2node
|
||||
config_file_path: GLM-4_5.yaml
|
||||
size: 2
|
||||
- name: multi-node-dpsk3.2-exp-2node
|
||||
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
||||
size: 2
|
||||
- name: multi-node-deepseek-r1-w8a8-eplb
|
||||
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
|
||||
size: 4
|
||||
- name: multi-node-qwenw8a8-2node-eplb
|
||||
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
|
||||
size: 2
|
||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||
with:
|
||||
soc_version: a3
|
||||
runner: linux-aarch64-a3-0
|
||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
|
||||
replicas: 1
|
||||
size: ${{ matrix.test_config.size }}
|
||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||
secrets:
|
||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||
|
||||
@@ -34,8 +34,6 @@ deployment:
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--headless
|
||||
--data-parallel-size 2
|
||||
--data-parallel-size-local 1
|
||||
|
||||
@@ -15,6 +15,7 @@ spec:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-leader
|
||||
imagePullPolicy: Always
|
||||
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
|
||||
env:
|
||||
- name: CONFIG_YAML_PATH
|
||||
@@ -73,6 +74,7 @@ spec:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-worker
|
||||
imagePullPolicy: Always
|
||||
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
|
||||
env:
|
||||
- name: CONFIG_YAML_PATH
|
||||
|
||||
@@ -92,6 +92,31 @@ check_and_config() {
|
||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
}
|
||||
|
||||
install_extra_components() {
|
||||
echo "====> Installing extra components for DeepSeek-v3.2-exp-bf16"
|
||||
|
||||
if ! wget -q https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run; then
|
||||
echo "Failed to download CANN-custom_ops-sfa-linux.aarch64.run"
|
||||
return 1
|
||||
fi
|
||||
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
|
||||
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
|
||||
|
||||
if ! wget -q https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl; then
|
||||
echo "Failed to download custom_ops wheel"
|
||||
return 1
|
||||
fi
|
||||
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
|
||||
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
rm -f CANN-custom_ops-sfa-linux.aarch64.run \
|
||||
custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
echo "====> Extra components installation completed"
|
||||
}
|
||||
|
||||
kill_npu_processes() {
|
||||
pgrep python3 | xargs -r kill -9
|
||||
pgrep VLLM | xargs -r kill -9
|
||||
@@ -123,6 +148,9 @@ main() {
|
||||
check_npu_info
|
||||
check_and_config
|
||||
show_vllm_info
|
||||
if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
|
||||
install_extra_components
|
||||
fi
|
||||
cd "$WORKSPACE/vllm-ascend"
|
||||
run_tests_with_log
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user