From 67f2b3a0313c09805fc3a02daf3c22b2e0ad324c Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Fri, 14 Nov 2025 15:46:10 +0800 Subject: [PATCH] [Test] Add deepseek v3.2 exp nightly test (#4191) ### What this PR does / why we need it? - skip the nightly image build when the GitHub event is pull_request - set imagePullPolicy to Always for the multi_node tests - move the multi_node tests ahead of the single-node tests so that some resources are cleaned up first - decouple the nightly image build from the nightly tests for fault tolerance - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379 --------- Signed-off-by: hfadzxy Signed-off-by: wangli Co-authored-by: wangli --- .../workflows/_e2e_nightly_single_node.yaml | 13 +++ .github/workflows/_nightly_image_build.yaml | 19 +--- .../vllm_ascend_test_nightly_a2.yaml | 59 +++++------- .../vllm_ascend_test_nightly_a3.yaml | 96 +++++++++---------- .../config/models/DeepSeek-V3_2-Exp-bf16.yaml | 2 - .../multi_node/scripts/lws.yaml.jinja2 | 2 + tests/e2e/nightly/multi_node/scripts/run.sh | 28 ++++++ 7 files changed, 113 insertions(+), 106 deletions(-) diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index b60e4613..8b4a4257 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -106,6 +106,19 @@ jobs: fi cd .. 
+ - name: Install custom-ops (for DeepSeek-V3.2-Exp) + if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }} + shell: bash -l {0} + run: | + wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run + chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run + ./CANN-custom_ops-sfa-linux.aarch64.run --quiet + export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH} + export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH} + wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl + pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl + . /usr/local/Ascend/ascend-toolkit/set_env.sh + - name: Run vllm-project/vllm-ascend test env: VLLM_WORKER_MULTIPROC_METHOD: spawn diff --git a/.github/workflows/_nightly_image_build.yaml b/.github/workflows/_nightly_image_build.yaml index 25570cd0..1d342ead 100644 --- a/.github/workflows/_nightly_image_build.yaml +++ b/.github/workflows/_nightly_image_build.yaml @@ -2,22 +2,7 @@ name: 'image / nightly / Ubuntu / test' on: schedule: - - cron: '0 0,4,8,12,14 * * *' - workflow_call: - inputs: - target: - required: true - type: string - description: 'Target architecture, e.g., a2, a3' - outputs: - image-tag: - description: 'The built image tag' - value: ${{ jobs.build-and-sync.outputs.image-tag }} - secrets: - HW_USERNAME: - required: true - HW_TOKEN: - required: true + - cron: '0 0,4,8,12,15 * * *' # This workflow builds and pushes Docker images for nightly-ci # It will be built base on the quay.io/ascend/vllm-ascend:main @@ -28,7 +13,7 @@ jobs: strategy: matrix: - target: ${{ fromJson(github.event_name == 'schedule' && '["a2","a3"]' || format('["{0}"]', inputs.target || 'a3')) }} + target: ['a2', 'a3'] outputs: image-tag: ${{ steps.build-image.outputs.image-tag }} diff --git 
a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml index 8c1c0997..4baa3332 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -42,18 +42,35 @@ concurrency: cancel-in-progress: true jobs: - image_build: - name: nightly image build - uses: ./.github/workflows/_nightly_image_build.yaml + multi-node-tests: + name: multi-node + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + strategy: + fail-fast: false + max-parallel: 1 + matrix: + test_config: + - name: multi-node-deepseek-dp + config_file_path: DeepSeek-R1-W8A8-A2.yaml + size: 2 + - name: multi-node-deepseek-dp-torchair + config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml + size: 2 + uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: - target: a2 + soc_version: a2 + runner: linux-aarch64-a2-0 + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' + replicas: 1 + size: ${{ matrix.test_config.size }} + config_file_path: ${{ matrix.test_config.config_file_path }} secrets: - HW_USERNAME: ${{ secrets.HW_USERNAME }} - HW_TOKEN: ${{ secrets.HW_TOKEN }} + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} + single-node-tests: name: single-node if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: image_build + needs: multi-node-tests strategy: fail-fast: false matrix: @@ -72,33 +89,7 @@ jobs: vllm: v0.11.0 runner: ${{ matrix.test_config.os }} tests: ${{ matrix.test_config.tests }} - image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }} - - multi-node-tests: - name: multi-node - needs: [single-node-tests, image_build] - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - strategy: - fail-fast: false - 
max-parallel: 1 - matrix: - test_config: - - name: multi-node-deepseek-dp - config_file_path: DeepSeek-R1-W8A8-A2.yaml - size: 2 - - name: multi-node-deepseek-dp-torchair - config_file_path: DeepSeek-R1-W8A8-A2-torchair.yaml - size: 2 - uses: ./.github/workflows/_e2e_nightly_multi_node.yaml - with: - soc_version: a2 - runner: linux-aarch64-a2-0 - image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2')) }} - replicas: 1 - size: ${{ matrix.test_config.size }} - config_file_path: ${{ matrix.test_config.config_file_path }} - secrets: - KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' single-node-accuracy-tests: if: >- diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml index 6ff88d15..f425ce3c 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -41,18 +41,53 @@ concurrency: cancel-in-progress: true jobs: - image_build: - name: nightly image build - uses: ./.github/workflows/_nightly_image_build.yaml + multi-node-tests: + name: multi-node + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + strategy: + fail-fast: false + max-parallel: 1 + matrix: + test_config: + - name: multi-node-deepseek-pd + config_file_path: DeepSeek-V3.yaml + size: 2 + - name: multi-node-qwen3-dp + config_file_path: Qwen3-235B-A3B.yaml + size: 2 + - name: multi-node-dpsk-4node-pd + config_file_path: DeepSeek-R1-W8A8.yaml + size: 4 + - name: multi-node-qwenw8a8-2node + config_file_path: Qwen3-235B-W8A8.yaml + size: 2 + - name: multi-node-glm-2node + config_file_path: GLM-4_5.yaml + size: 2 + - name: multi-node-dpsk3.2-exp-2node + config_file_path: DeepSeek-V3_2-Exp-bf16.yaml + size: 2 + - name: multi-node-deepseek-r1-w8a8-eplb + 
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml + size: 4 + - name: multi-node-qwenw8a8-2node-eplb + config_file_path: Qwen3-235B-W8A8-EPLB.yaml + size: 2 + uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: - target: a3 + soc_version: a3 + runner: linux-aarch64-a3-0 + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' + replicas: 1 + size: ${{ matrix.test_config.size }} + config_file_path: ${{ matrix.test_config.config_file_path }} secrets: - HW_USERNAME: ${{ secrets.HW_USERNAME }} - HW_TOKEN: ${{ secrets.HW_TOKEN }} + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} + single-node-tests: name: single-node if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: image_build + needs: multi-node-tests strategy: fail-fast: false matrix: @@ -103,51 +138,6 @@ jobs: with: vllm: v0.11.0 runner: ${{ matrix.test_config.os }} - image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }} + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' tests: ${{ matrix.test_config.tests }} name: ${{ matrix.test_config.name }} - - multi-node-tests: - name: multi-node - needs: [single-node-tests, image_build] - if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - strategy: - fail-fast: false - max-parallel: 1 - matrix: - test_config: - - name: multi-node-deepseek-pd - config_file_path: DeepSeek-V3.yaml - size: 2 - - name: multi-node-qwen3-dp - config_file_path: Qwen3-235B-A3B.yaml - size: 2 - - name: multi-node-dpsk-4node-pd - config_file_path: DeepSeek-R1-W8A8.yaml - size: 4 - - name: multi-node-qwenw8a8-2node - config_file_path: Qwen3-235B-W8A8.yaml - size: 2 - - name: multi-node-glm-2node - config_file_path: GLM-4_5.yaml - size: 2 - - name: multi-node-dpsk3.2-exp-2node - config_file_path: DeepSeek-V3_2-Exp-bf16.yaml - 
size: 2 - - name: multi-node-deepseek-r1-w8a8-eplb - config_file_path: DeepSeek-R1-W8A8-EPLB.yaml - size: 4 - - name: multi-node-qwenw8a8-2node-eplb - config_file_path: Qwen3-235B-W8A8-EPLB.yaml - size: 2 - uses: ./.github/workflows/_e2e_nightly_multi_node.yaml - with: - soc_version: a3 - runner: linux-aarch64-a3-0 - image: ${{ fromJSON(format('"{0}"', needs.image_build.outputs.image-tag || 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3')) }} - replicas: 1 - size: ${{ matrix.test_config.size }} - config_file_path: ${{ matrix.test_config.config_file_path }} - secrets: - KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} - \ No newline at end of file diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml index ff07210b..6dafd3cc 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3_2-Exp-bf16.yaml @@ -34,8 +34,6 @@ deployment: - server_cmd: > vllm serve Yanguan/DeepSeek-V3.2-Exp-bf16 \ - --host 0.0.0.0 - --port $SERVER_PORT --headless --data-parallel-size 2 --data-parallel-size-local 1 diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index a18d91a4..fa993db7 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -15,6 +15,7 @@ spec: spec: containers: - name: vllm-leader + imagePullPolicy: Always image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }} env: - name: CONFIG_YAML_PATH @@ -73,6 +74,7 @@ spec: spec: containers: - name: vllm-worker + imagePullPolicy: Always image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3") }} env: - name: CONFIG_YAML_PATH diff --git 
a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh
index 080e0ea8..48d1c39d 100644
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -92,6 +92,48 @@ check_and_config() {
     export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 }
 
+# Install the extra components needed by the DeepSeek-V3.2-Exp bf16 config:
+# the CANN custom-ops package (.run installer) and the custom_ops wheel.
+# On success, ASCEND_CUSTOM_OPP_PATH and LD_LIBRARY_PATH are extended in the
+# current shell and the toolkit's set_env.sh is sourced.
+# Returns: 0 on success, 1 if a download or install step fails.
+install_extra_components() {
+    local base_url="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3"
+    local ops_run="CANN-custom_ops-sfa-linux.aarch64.run"
+    local ops_whl="custom_ops-1.0-cp311-cp311-linux_aarch64.whl"
+    local customize_dir="/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize"
+    local failed_step=""
+
+    echo "====> Installing extra components for DeepSeek-v3.2-exp-bf16"
+
+    # Run the steps in order and record the first failure, so the downloaded
+    # artifacts are removed on every exit path before returning.
+    if ! wget -q -O "${ops_run}" "${base_url}/${ops_run}"; then
+        failed_step="download ${ops_run}"
+    elif ! chmod +x "./${ops_run}" || ! "./${ops_run}" --quiet; then
+        failed_step="run ${ops_run}"
+    elif ! wget -q -O "${ops_whl}" "${base_url}/${ops_whl}"; then
+        failed_step="download custom_ops wheel"
+    elif ! pip install "${ops_whl}"; then
+        failed_step="install custom_ops wheel"
+    fi
+
+    rm -f -- "${ops_run}" "${ops_whl}"
+    if [[ -n "${failed_step}" ]]; then
+        echo "Failed to ${failed_step}" >&2
+        return 1
+    fi
+
+    # ${VAR:+:${VAR}} appends the old value only when it is set and non-empty,
+    # which avoids a trailing ':' (an empty LD_LIBRARY_PATH entry means the
+    # current directory) and an unbound-variable error under `set -u`.
+    export ASCEND_CUSTOM_OPP_PATH="${customize_dir}${ASCEND_CUSTOM_OPP_PATH:+:${ASCEND_CUSTOM_OPP_PATH}}"
+    export LD_LIBRARY_PATH="${customize_dir}/op_api/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+    echo "====> Extra components installation completed"
+}
+
 kill_npu_processes() {
     pgrep python3 | xargs -r kill -9
     pgrep VLLM | xargs -r kill -9
@@ -123,6 +165,9 @@ main() {
     check_npu_info
     check_and_config
     show_vllm_info
+    if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
+        install_extra_components
+    fi
     cd "$WORKSPACE/vllm-ascend"
     run_tests_with_log
 }