[Test] Add deepseek v3.2 exp nightly test (#4191)
### What this PR does / why we need it?
- skip the nightly image build when the github event is pull_request
- set imagePullPolicy to Always for the multi_node tests
- move the multi_node tests ahead of the single-node tests so that some resources are cleaned up first
- decouple the nightly image build from the nightly tests for fault tolerance
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
13
.github/workflows/_e2e_nightly_single_node.yaml
vendored
13
.github/workflows/_e2e_nightly_single_node.yaml
vendored
@@ -106,6 +106,19 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
cd ..
|
cd ..
|
||||||
|
|
||||||
|
# Downloads and installs the SFA custom-op package (.run installer) and its
# Python wheel. Runs only when the workflow input `name` is
# 'deepseek3_2-exp-w8a8'.
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
  if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
  shell: bash -l {0}
  run: |
    # A custom shell template is executed exactly as written (no implicit
    # -e / pipefail), so enable fail-fast explicitly: a failed download or
    # install must fail this step instead of being silently ignored.
    set -eo pipefail

    wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
    chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
    ./CANN-custom_ops-sfa-linux.aarch64.run --quiet

    # ${VAR:+:${VAR}} appends the previous value only when it is non-empty,
    # which avoids a trailing ':' (an empty search-path entry).
    export ASCEND_CUSTOM_OPP_PATH="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize${ASCEND_CUSTOM_OPP_PATH:+:${ASCEND_CUSTOM_OPP_PATH}}"
    export LD_LIBRARY_PATH="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

    # Every step runs in its own shell process, so the exports above vanish
    # when this step ends. Persist them through GITHUB_ENV so that later
    # steps of the job see the custom-op paths.
    {
      echo "ASCEND_CUSTOM_OPP_PATH=${ASCEND_CUSTOM_OPP_PATH}"
      echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}"
    } >> "$GITHUB_ENV"

    wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
    pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl

    # set_env.sh is a vendor script that is not written for errexit; relax it
    # before sourcing so the step status stays that of the script itself.
    # NOTE(review): this only affects the current step's shell - later steps
    # presumably pick it up again through the login shell; confirm.
    set +e
    . /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||||
|
|
||||||
- name: Run vllm-project/vllm-ascend test
|
- name: Run vllm-project/vllm-ascend test
|
||||||
env:
|
env:
|
||||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||||
|
|||||||
19
.github/workflows/_nightly_image_build.yaml
vendored
19
.github/workflows/_nightly_image_build.yaml
vendored
@@ -2,22 +2,7 @@ name: 'image / nightly / Ubuntu / test'
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
schedule:
|
schedule:
|
||||||
- cron: '0 0,4,8,12,14 * * *'
|
- cron: '0 0,4,8,12,15 * * *'
|
||||||
workflow_call:
|
|
||||||
inputs:
|
|
||||||
target:
|
|
||||||
required: true
|
|
||||||
type: string
|
|
||||||
description: 'Target architecture, e.g., a2, a3'
|
|
||||||
outputs:
|
|
||||||
image-tag:
|
|
||||||
description: 'The built image tag'
|
|
||||||
value: ${{ jobs.build-and-sync.outputs.image-tag }}
|
|
||||||
secrets:
|
|
||||||
HW_USERNAME:
|
|
||||||
required: true
|
|
||||||
HW_TOKEN:
|
|
||||||
required: true
|
|
||||||
|
|
||||||
# This workflow builds and pushes Docker images for nightly-ci
|
# This workflow builds and pushes Docker images for nightly-ci
|
||||||
# It will be built base on the quay.io/ascend/vllm-ascend:main
|
# It will be built base on the quay.io/ascend/vllm-ascend:main
|
||||||
@@ -28,7 +13,7 @@ jobs:
|
|||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
target: ${{ fromJson(github.event_name == 'schedule' && '["a2","a3"]' || format('["{0}"]', inputs.target || 'a3')) }}
|
target: ['a2', 'a3']
|
||||||
|
|
||||||
outputs:
|
outputs:
|
||||||
image-tag: ${{ steps.build-image.outputs.image-tag }}
|
image-tag: ${{ steps.build-image.outputs.image-tag }}
|
||||||
|
|||||||
@@ -42,18 +42,35 @@ concurrency:
|
|||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
image_build:
|
multi-node-tests:
|
||||||
name: nightly image build
|
name: multi-node
|
||||||
uses: ./.github/workflows/_nightly_image_build.yaml
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
max-parallel: 1
|
||||||
|
matrix:
|
||||||
|
test_config:
|
||||||
|
- name: multi-node-deepseek-dp
|
||||||
|
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-deepseek-dp-torchair
|
||||||
|
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
||||||
|
size: 2
|
||||||
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
target: a2
|
soc_version: a2
|
||||||
|
runner: linux-aarch64-a2-0
|
||||||
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||||
|
replicas: 1
|
||||||
|
size: ${{ matrix.test_config.size }}
|
||||||
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
secrets:
|
secrets:
|
||||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
||||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
|
||||||
single-node-tests:
|
single-node-tests:
|
||||||
name: single-node
|
name: single-node
|
||||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
needs: image_build
|
needs: multi-node-tests
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -72,33 +89,7 @@ jobs:
|
|||||||
vllm: v0.11.0
|
vllm: v0.11.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||||
|
|
||||||
multi-node-tests:
|
|
||||||
name: multi-node
|
|
||||||
needs: [single-node-tests, image_build]
|
|
||||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
max-parallel: 1
|
|
||||||
matrix:
|
|
||||||
test_config:
|
|
||||||
- name: multi-node-deepseek-dp
|
|
||||||
config_file_path: DeepSeek-R1-W8A8-A2.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-deepseek-dp-torchair
|
|
||||||
config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml
|
|
||||||
size: 2
|
|
||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
|
||||||
with:
|
|
||||||
soc_version: a2
|
|
||||||
runner: linux-aarch64-a2-0
|
|
||||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }}
|
|
||||||
replicas: 1
|
|
||||||
size: ${{ matrix.test_config.size }}
|
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
|
||||||
secrets:
|
|
||||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
|
||||||
|
|
||||||
single-node-accuracy-tests:
|
single-node-accuracy-tests:
|
||||||
if: >-
|
if: >-
|
||||||
|
|||||||
@@ -41,18 +41,53 @@ concurrency:
|
|||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
image_build:
|
multi-node-tests:
|
||||||
name: nightly image build
|
name: multi-node
|
||||||
uses: ./.github/workflows/_nightly_image_build.yaml
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
max-parallel: 1
|
||||||
|
matrix:
|
||||||
|
test_config:
|
||||||
|
- name: multi-node-deepseek-pd
|
||||||
|
config_file_path: DeepSeek-V3.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-qwen3-dp
|
||||||
|
config_file_path: Qwen3-235B-A3B.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-dpsk-4node-pd
|
||||||
|
config_file_path: DeepSeek-R1-W8A8.yaml
|
||||||
|
size: 4
|
||||||
|
- name: multi-node-qwenw8a8-2node
|
||||||
|
config_file_path: Qwen3-235B-W8A8.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-glm-2node
|
||||||
|
config_file_path: GLM-4_5.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-dpsk3.2-exp-2node
|
||||||
|
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
||||||
|
size: 2
|
||||||
|
- name: multi-node-deepseek-r1-w8a8-eplb
|
||||||
|
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
|
||||||
|
size: 4
|
||||||
|
- name: multi-node-qwenw8a8-2node-eplb
|
||||||
|
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
|
||||||
|
size: 2
|
||||||
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
target: a3
|
soc_version: a3
|
||||||
|
runner: linux-aarch64-a3-0
|
||||||
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||||
|
replicas: 1
|
||||||
|
size: ${{ matrix.test_config.size }}
|
||||||
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
secrets:
|
secrets:
|
||||||
HW_USERNAME: ${{ secrets.HW_USERNAME }}
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||||
HW_TOKEN: ${{ secrets.HW_TOKEN }}
|
|
||||||
single-node-tests:
|
single-node-tests:
|
||||||
name: single-node
|
name: single-node
|
||||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
needs: image_build
|
needs: multi-node-tests
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -103,51 +138,6 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: v0.11.0
|
vllm: v0.11.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
name: ${{ matrix.test_config.name }}
|
name: ${{ matrix.test_config.name }}
|
||||||
|
|
||||||
multi-node-tests:
|
|
||||||
name: multi-node
|
|
||||||
needs: [single-node-tests, image_build]
|
|
||||||
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
max-parallel: 1
|
|
||||||
matrix:
|
|
||||||
test_config:
|
|
||||||
- name: multi-node-deepseek-pd
|
|
||||||
config_file_path: DeepSeek-V3.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-qwen3-dp
|
|
||||||
config_file_path: Qwen3-235B-A3B.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-dpsk-4node-pd
|
|
||||||
config_file_path: DeepSeek-R1-W8A8.yaml
|
|
||||||
size: 4
|
|
||||||
- name: multi-node-qwenw8a8-2node
|
|
||||||
config_file_path: Qwen3-235B-W8A8.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-glm-2node
|
|
||||||
config_file_path: GLM-4_5.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-dpsk3.2-exp-2node
|
|
||||||
config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
|
||||||
size: 2
|
|
||||||
- name: multi-node-deepseek-r1-w8a8-eplb
|
|
||||||
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
|
|
||||||
size: 4
|
|
||||||
- name: multi-node-qwenw8a8-2node-eplb
|
|
||||||
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
|
|
||||||
size: 2
|
|
||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
|
||||||
with:
|
|
||||||
soc_version: a3
|
|
||||||
runner: linux-aarch64-a3-0
|
|
||||||
image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }}
|
|
||||||
replicas: 1
|
|
||||||
size: ${{ matrix.test_config.size }}
|
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
|
||||||
secrets:
|
|
||||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
|
||||||
|
|
||||||
@@ -34,8 +34,6 @@ deployment:
|
|||||||
-
|
-
|
||||||
server_cmd: >
|
server_cmd: >
|
||||||
vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
|
vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \
|
||||||
--host 0.0.0.0
|
|
||||||
--port $SERVER_PORT
|
|
||||||
--headless
|
--headless
|
||||||
--data-parallel-size 2
|
--data-parallel-size 2
|
||||||
--data-parallel-size-local 1
|
--data-parallel-size-local 1
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: vllm-leader
|
- name: vllm-leader
|
||||||
|
imagePullPolicy: Always
|
||||||
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
|
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
|
||||||
env:
|
env:
|
||||||
- name: CONFIG_YAML_PATH
|
- name: CONFIG_YAML_PATH
|
||||||
@@ -73,6 +74,7 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: vllm-worker
|
- name: vllm-worker
|
||||||
|
imagePullPolicy: Always
|
||||||
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
|
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }}
|
||||||
env:
|
env:
|
||||||
- name: CONFIG_YAML_PATH
|
- name: CONFIG_YAML_PATH
|
||||||
|
|||||||
@@ -92,6 +92,31 @@ check_and_config() {
|
|||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Download and install the extra components required by the
# DeepSeek-V3.2-Exp-bf16 case: the SFA custom-op package (.run installer)
# and the matching custom_ops Python wheel.
# Globals:
#   ASCEND_CUSTOM_OPP_PATH (read, exported) - custom-op vendor dir prepended
#   LD_LIBRARY_PATH        (read, exported) - custom-op lib dir prepended
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout; install failures and warnings on stderr.
# Returns:
#   0 when both artifacts were downloaded and installed, 1 otherwise.
#######################################
install_extra_components() {
  local base_url="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3"
  local run_pkg="CANN-custom_ops-sfa-linux.aarch64.run"
  local wheel="custom_ops-1.0-cp311-cp311-linux_aarch64.whl"
  local customize_dir="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize"
  local ascend_env="/usr/local/Ascend/ascend-toolkit/set_env.sh"
  local install_rc=0

  echo "====> Installing extra components for DeepSeek-v3.2-exp-bf16"

  # Each stage runs only if the previous one succeeded; the first failure is
  # recorded so the function never reports success after a broken install.
  if ! wget -q "${base_url}/${run_pkg}"; then
    echo "Failed to download CANN-custom_ops-sfa-linux.aarch64.run"
    install_rc=1
  elif ! chmod +x "./${run_pkg}" || ! "./${run_pkg}" --quiet; then
    echo "Failed to install ${run_pkg}" >&2
    install_rc=1
  elif ! wget -q "${base_url}/${wheel}"; then
    echo "Failed to download custom_ops wheel"
    install_rc=1
  elif ! pip install "./${wheel}"; then
    echo "Failed to install custom_ops wheel" >&2
    install_rc=1
  fi

  # Remove the downloaded artifacts on every path, including failures
  # (a partially downloaded file would otherwise be left behind).
  rm -f -- "${run_pkg}" "${wheel}"
  if (( install_rc != 0 )); then
    return "${install_rc}"
  fi

  # ${VAR:+:${VAR}} appends the previous value only when it is non-empty.
  # A bare "dir:${VAR}" leaves a trailing ':' when VAR is unset, and an empty
  # LD_LIBRARY_PATH entry makes the loader search the current directory.
  export ASCEND_CUSTOM_OPP_PATH="${customize_dir}${ASCEND_CUSTOM_OPP_PATH:+:${ASCEND_CUSTOM_OPP_PATH}}"
  export LD_LIBRARY_PATH="${customize_dir}/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

  if [[ -r "${ascend_env}" ]]; then
    # shellcheck disable=SC1090,SC1091
    source "${ascend_env}"
  else
    # Best effort, as before: a missing toolkit env script is not fatal here.
    echo "warning: ${ascend_env} not found, skipping" >&2
  fi

  echo "====> Extra components installation completed"
}
|
||||||
|
|
||||||
kill_npu_processes() {
|
kill_npu_processes() {
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
pgrep VLLM | xargs -r kill -9
|
pgrep VLLM | xargs -r kill -9
|
||||||
@@ -123,6 +148,9 @@ main() {
|
|||||||
check_npu_info
|
check_npu_info
|
||||||
check_and_config
|
check_and_config
|
||||||
show_vllm_info
|
show_vllm_info
|
||||||
|
if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
|
||||||
|
install_extra_components
|
||||||
|
fi
|
||||||
cd "$WORKSPACE/vllm-ascend"
|
cd "$WORKSPACE/vllm-ascend"
|
||||||
run_tests_with_log
|
run_tests_with_log
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user