diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml new file mode 100644 index 00000000..680d7985 --- /dev/null +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -0,0 +1,190 @@ +name: 'e2e nightly test multi_node' + +on: + workflow_call: + inputs: + soc_version: + required: true + type: string + description: use a2 or a3 + image: + required: false + type: string + description: base image for pods + default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11" + config_file_path: + required: true + type: string + description: the model config for multi_node test + replicas: + required: false + default: "1" + type: string + description: replicas of the k8s cluster + size: + required: false + default: "2" + type: string + description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need + vllm_version: + required: false + default: "v0.11.0" + type: string + description: vllm version to use + vllm_ascend_remote_url: + required: false + default: https://github.com/vllm-project/vllm-ascend.git + type: string + description: used for pr level tests + vllm_ascend_ref: + required: false + default: main + type: string + description: used for pr level tests + + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. 
+defaults: + run: + shell: bash -el {0} + +# only cancel in-progress runs of the same workflow +# and ignore the lint / 8 cards test type +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + e2e: + # This is a runner with no NPU for k8s controller + runs-on: linux-aarch64-a3-0 + container: + image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + env: + KUBECONFIG: /tmp/kubeconfig + KUBECTL: /root/.cache/.kube/kubectl + NAMESPACE: vllm-project + LEADER_POD: vllm-0 + RESULT_FILE: /root/.cache/tests/ret/test_result.txt + steps: + - name: Install system denpendencies + run: | + # configure apt and pip source + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + pip install jinja2-cli + + apt-get update -y && apt-get install -y git curl + + - name: Install kubectl + run: | + # Install kubectl + install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl + + # Verify kubectl installation + kubectl version --client=true + + # TODO: Add A2 tests + - name: Setup kubeconfig for A3 + if: inputs.soc_version == 'a3' + run: | + # Decode and save kubeconfig + echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Prepare scripts + run: | + # prepare for lws entrypoint scripts + install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh + + - name: Clear result ret + run: | + rm -f $RESULT_FILE + + - name: Launch cluster + run: | + set -e + + size="${{ inputs.size }}" + replicas="${{ inputs.replicas }}" + image="${{ inputs.image }}" + config_file_path="${{ inputs.config_file_path }}" + vllm_version="${{ inputs.vllm_version }}" + vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}" + vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}" + result_file_path="$RESULT_FILE" + + required_params=("size" 
"replicas" "image" "config_file_path") + for param in "${required_params[@]}"; do + if [ -z "${!param}" ]; then + echo "Error: Parameter '$param' is required but empty" + exit 1 + fi + done + + jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \ + -D size="$size" \ + -D replicas="$replicas" \ + -D image="$image" \ + -D config_file_path="$config_file_path" \ + -D vllm_version="$vllm_version" \ + -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \ + -D vllm_ascend_ref="$vllm_ascend_ref" \ + -D result_file_path="$result_file_path" \ + --outfile lws.yaml + + kubectl apply -f ./lws.yaml + + - name: Waiting for pod ready + run: | + echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..." + + while true; do + # get pod status + READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}') + + if [[ "$READY_STATUS" == "true" ]]; then + echo "Pod [$LEADER_POD] is Ready!" + break + else + echo "Pod [$LEADER_POD] not ready, waiting..." + sleep 3 + fi + done + + - name: Stream logs + run: | + kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" + + - name: Determine is success + run: | + TIMEOUT=600 + ELAPSED=0 + while [ ! 
-f "$RESULT_FILE" ]; do + sleep 5 + ELAPSED=$((ELAPSED + 5)) + if [ $ELAPSED -ge $TIMEOUT ]; then + echo "Timeout waiting for test result file" + exit 1 + fi + done + + RET=$(cat "$RESULT_FILE") + echo "Test result: $RET" + + if [ "$RET" -ne 0 ]; then + echo "Test failed" + exit 1 + else + echo "Test succeeded" + fi + + - name: Post process + if: always() + run: | + kubectl get pods -n $NAMESPACE + kubectl delete -f ./lws.yaml diff --git a/.github/workflows/_e2e_nightly.yaml b/.github/workflows/_e2e_nightly_single_node.yaml similarity index 100% rename from .github/workflows/_e2e_nightly.yaml rename to .github/workflows/_e2e_nightly_single_node.yaml diff --git a/.github/workflows/multi_node_test.yaml b/.github/workflows/multi_node_test.yaml deleted file mode 100644 index 9ea3776b..00000000 --- a/.github/workflows/multi_node_test.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: 'e2e test / multi-dp' - -on: - schedule: - - cron: "0 */4 * * *" - workflow_dispatch: - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. 
-defaults: - run: - shell: bash -el {0} - -# only cancel in-progress runs of the same workflow -# and ignore the lint / 8 cards test type -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e: - # This is a runner with no NPU for k8s controller - runs-on: linux-aarch64-a3-0 - container: - image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - env: - KUBECONFIG: /tmp/kubeconfig - KUBECTL: /root/.cache/.kube/kubectl - NAMESPACE: vllm-project - LEADER_POD: vllm-0 - steps: - - name: Install system denpendencies - run: | - # configure apt and pip source - sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - pip install jinja2-cli -y - - apt-get update -y && apt-get install -y git curl - - TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64` - git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN" - - - name: Install kubectl - run: | - install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl - - # get kubeconfig from secret - echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG - - - name: Checkout code - uses: actions/checkout@v4 - - - name: Prepare scripts - run: | - # prepare for lws entrypoint scripts - install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh - - - name: Launch cluster - run: | - jinja2 tests/e2e/multi_node/scripts/lws.yaml.jinja2 \ - -D size=2 \ - -D replicas=1 \ - -D image="m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11" \ - --outfile lws.yaml - - kubectl apply -f ./lws.yaml - - - name: Waiting for pod ready - run: | - echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..." 
- - while true; do - # get pod status - READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}') - - if [[ "$READY_STATUS" == "true" ]]; then - echo "✅ Pod [$LEADER_POD] is Ready!" - break - else - echo "Pod [$LEADER_POD] not ready, waiting..." - sleep 3 - fi - done - - - name: Stream logs and monitor pod health - run: | - set -euo pipefail - - echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..." - kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" & - LOG_PID=$! - - echo "Start monitoring Pod [$LEADER_POD] status ..." - while true; do - STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}') - if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then - echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS" - kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true - kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true - kill $LOG_PID || true - exit 1 - fi - sleep 5 - done & - - MONITOR_PID=$! - wait $LOG_PID || true - kill $MONITOR_PID || true - - - name: Generate summary - if: always() - run: | - if [ -f "/root/.cache/test_summary.md" ]; then - cat /root/.cache/test_summary.md >> "$GITHUB_STEP_SUMMARY" - else - echo "No summary file found." >> "$GITHUB_STEP_SUMMARY" - fi - - - name: Post process - if: always() - run: | - kubectl get pods -n $NAMESPACE - kubectl delete -f ./lws.yaml diff --git a/.github/workflows/vllm_ascend_test_nightly.yaml b/.github/workflows/vllm_ascend_test_nightly.yaml deleted file mode 100644 index ccc246ae..00000000 --- a/.github/workflows/vllm_ascend_test_nightly.yaml +++ /dev/null @@ -1,133 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -name: 'ascend test / nightly' - -on: - schedule: - # Run test at 24:00 Beijing time (UTC+8) - - cron: "0 16 * * *" - workflow_dispatch: - pull_request: - branches: - - 'main' - - '*-dev' - types: [labeled,opened,synchronize] - -# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly -# declared as "shell: bash -el {0}" on steps that need to be properly activated. -# It's used to activate ascend-toolkit environment variables. -defaults: - run: - shell: bash -el {0} - -# only cancel in-progress runs of the same workflow -# and ignore the lint / 1 card / 4 cards test type -concurrency: - group: ascend-nightly-${{ github.ref }} - #cancel-in-progress: true - -jobs: - qwen3-32b: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - # should add A3 chip runner when available - os: [linux-aarch64-a2-4] - # Note (yikun): If CI resource are limited we can split job into two chain jobs - # only trigger e2e test after lint passed and the change is e2e related with pull request. 
- uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - tests: tests/e2e/nightly/models/test_qwen3_32b.py - qwen3-32b-in8-a3: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a3-4 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py - qwen3-32b-in8-a2: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a2-4 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py - qwen3-235b-a22b-w8a8-eplb: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a3-16 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py - deepseek-r1-w8a8-eplb: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a3-16 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py - qwen3-32b-int8-a3-feature-stack3: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a3-4 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - image: 
swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py - qwen2-5-vl-7b: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a3-4 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py - deepseek-r1-0528-w8a8: - if: contains(github.event.pull_request.labels.*.name, 'run-nightly') - strategy: - matrix: - os: [ linux-aarch64-a3-16 ] - uses: ./.github/workflows/_e2e_nightly.yaml - with: - vllm: v0.11.0 - runner: ${{ matrix.os }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 - tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml new file mode 100644 index 00000000..809babed --- /dev/null +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +# This workflow related to the resources atlas 800 A2 +# We will not limit the concurrency of jobs on A2 +name: 'ascend test / nightly-a2' + +on: + schedule: + # Run test at 24:00 Beijing time (UTC+8) + - cron: "0 16 * * *" + workflow_dispatch: + pull_request: + branches: + - 'main' + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} + +# only cancel in-progress runs of the same workflow +concurrency: + group: ascend-nightly-${{ github.ref }}-a2 + cancel-in-progress: true + +jobs: + single-node-tests: + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + strategy: + fail-fast: false + matrix: + test_config: + - name: qwen3-32b + os: linux-aarch64-a2-4 + tests: tests/e2e/nightly/models/test_qwen3_32b.py + - name: qwen3-32b-in8-a2 + os: linux-aarch64-a2-4 + tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py + uses: ./.github/workflows/_e2e_nightly_single_node.yaml + with: + vllm: v0.11.0 + runner: ${{ matrix.test_config.os }} + tests: ${{ matrix.test_config.tests }} diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml new file mode 100644 index 00000000..70d2b9c9 --- /dev/null +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -0,0 +1,98 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +# This workflow related to the resources atlas 800 A3 +# **Please note**: current A3 resource pool's maximum allowed concurrency is 5*16 NPUs +# We will limit the concurrency of jobs on A3 to avoid the risk of insufficient resources +name: 'ascend test / nightly-a3' + +on: + schedule: + # Run test at 24:00 Beijing time (UTC+8) + - cron: "0 16 * * *" + workflow_dispatch: + pull_request: + branches: + - 'main' + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. 
+defaults: + run: + shell: bash -el {0} + +concurrency: + group: ascend-nightly-${{ github.ref }}-a3 + cancel-in-progress: true + +jobs: + single-node-tests: + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + strategy: + fail-fast: false + matrix: + test_config: + - name: qwen3-32b-in8-a3 + os: linux-aarch64-a3-4 + tests: tests/e2e/nightly/models/test_qwen3_32b_int8.py + - name: qwen3-32b-int8-a3-feature-stack3 + os: linux-aarch64-a3-4 + tests: tests/e2e/nightly/features/test_qwen3_32b_int8_a3_feature_stack3.py + - name: qwen3-235b-a22b-w8a8-eplb + os: linux-aarch64-a3-16 + tests: tests/e2e/nightly/models/test_qwen3_235b_a22b_w8a8_eplb.py + - name: deepseek-r1-w8a8-eplb + os: linux-aarch64-a3-16 + tests: tests/e2e/nightly/models/test_deepseek_r1_w8a8_eplb.py + - name: qwen2-5-vl-7b + os: linux-aarch64-a3-4 + tests: tests/e2e/nightly/models/test_qwen2_5_vl_7b.py + - name: deepseek-r1-0528-w8a8 + os: linux-aarch64-a3-16 + tests: tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py + uses: ./.github/workflows/_e2e_nightly_single_node.yaml + with: + vllm: v0.11.0 + runner: ${{ matrix.test_config.os }} + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + tests: ${{ matrix.test_config.tests }} + + multi-node-tests: + needs: single-node-tests + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + strategy: + fail-fast: false + max-parallel: 1 + matrix: + test_config: + - name: multi-node-deepseek-pd + config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml + size: 2 + - name: multi-node-qwen3-dp + config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml + size: 2 + - name: multi-node-dpsk-4node-pd + config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml + size: 4 + uses: ./.github/workflows/_e2e_nightly_multi_node.yaml + with: + soc_version: a3 + image: 
m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + replicas: 1 + size: ${{ matrix.test_config.size }} + config_file_path: ${{ matrix.test_config.config_file_path }} diff --git a/docs/source/assets/deployment.png b/docs/source/assets/deployment.png new file mode 100644 index 00000000..c8b90b75 Binary files /dev/null and b/docs/source/assets/deployment.png differ diff --git a/docs/source/assets/workflow.png b/docs/source/assets/workflow.png new file mode 100644 index 00000000..ba41aa7c Binary files /dev/null and b/docs/source/assets/workflow.png differ diff --git a/docs/source/developer_guide/contribution/index.md b/docs/source/developer_guide/contribution/index.md index 82280ed1..fbb9e376 100644 --- a/docs/source/developer_guide/contribution/index.md +++ b/docs/source/developer_guide/contribution/index.md @@ -108,4 +108,5 @@ If you find any problem when contributing, you can feel free to submit a PR to i :caption: Index :maxdepth: 1 testing +multi_node_test ::: diff --git a/docs/source/developer_guide/contribution/multi_node_test.md b/docs/source/developer_guide/contribution/multi_node_test.md new file mode 100644 index 00000000..6e96084c --- /dev/null +++ b/docs/source/developer_guide/contribution/multi_node_test.md @@ -0,0 +1,99 @@ +# Multi Node Test + +Multi-Node CI is designed to test distributed scenarios of very large models, eg: disaggregated_prefill multi DP across multi nodes and so on. 
+ +## How it works + +The following picture shows the basic deployment view of the multi-node CI mechanism. It shows how the GitHub Actions workflow interacts with [lws](https://lws.sigs.k8s.io/docs/overview/) (a kind of Kubernetes CRD resource). + +![alt text](../../assets/deployment.png) + +From the workflow perspective, we can see how the final test script is executed. The key pieces are these two files: [lws.yaml and run.sh](https://github.com/vllm-project/vllm-ascend/tree/main/tests/e2e/nightly/multi_node/scripts). The former defines how our k8s cluster is pulled up, and the latter defines the entry script that runs when the pod is started. Each node executes different logic according to the [LWS_WORKER_INDEX](https://lws.sigs.k8s.io/docs/reference/labels-annotations-and-environment-variables/) environment variable, so that multiple nodes can form a distributed cluster to perform tasks. + +![alt text](../../assets/workflow.png) + +## How to contribute + +1. Upload custom weights + + If you need customized weights (for example, you quantized a w8a8 weight for DeepSeek-V3 and want it to run on CI), you are welcome to upload them to ModelScope's [vllm-ascend](https://www.modelscope.cn/organization/vllm-ascend) organization. If you do not have permission to upload, please contact @Potabk. + +2. Add config yaml + + As the entrypoint script [run.sh](https://github.com/vllm-project/vllm-ascend/blob/0bf3f21a987aede366ec4629ad0ffec8e32fe90d/tests/e2e/nightly/multi_node/scripts/run.sh#L106) shows, starting a k8s pod means traversing all *.yaml files in the [directory](https://github.com/vllm-project/vllm-ascend/tree/main/tests/e2e/nightly/multi_node/config/models), then reading and executing each one according to its configuration, so all we need to do is add a YAML file like [DeepSeek-V3.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml). 
+ + Suppose you have **2 nodes** running a 1P1D setup (1 Prefiller + 1 Decoder): + + You may add a config file that looks like: + + ```yaml + test_name: "test DeepSeek-V3 disaggregated_prefill" + # the model being tested + model: "vllm-ascend/DeepSeek-V3-W8A8" + # how large the cluster is + num_nodes: 2 + npu_per_node: 16 + # Add all the env vars you need here + env_common: + VLLM_USE_MODELSCOPE: true + OMP_PROC_BIND: false + OMP_NUM_THREADS: 100 + HCCL_BUFFSIZE: 1024 + SERVER_PORT: 8080 + disaggregated_prefill: + enabled: true + # node indexes (a list) which meet all the conditions: + # - prefiller + # - not headless (has an API server) + prefiller_host_index: [0] + # node indexes (a list) which meet all the conditions: + # - decoder + # - not headless (has an API server) + decoder_host_index: [1] + + # Add each node's vllm serve CLI command just as you would run it locally + deployment: + - + server_cmd: > + vllm serve ... + - + server_cmd: > + vllm serve ... + benchmarks: + perf: + # fill with performance test kwargs + acc: + # fill with accuracy test kwargs + ``` + +3. 
Add the case to nightly workflow +Currently, the multi-node test workflow is defined in [vllm_ascend_test_nightly_a2/a3.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test_nightly_a3.yaml) + + ```yaml + multi-node-tests: + needs: single-node-tests + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + strategy: + fail-fast: false + max-parallel: 1 + matrix: + test_config: + - name: multi-node-deepseek-pd + config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml + size: 2 + - name: multi-node-qwen3-dp + config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml + size: 2 + - name: multi-node-dpsk-4node-pd + config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml + size: 4 + uses: ./.github/workflows/_e2e_nightly_multi_node.yaml + with: + soc_version: a3 + image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + replicas: 1 + size: ${{ matrix.test_config.size }} + config_file_path: ${{ matrix.test_config.config_file_path }} + ``` + +The matrix above defines all the parameters required to add a multi-machine use case. The parameters worth paying attention to (when you are adding a new use case) are `size` and the path to the YAML configuration file. The former defines the number of nodes required for your use case, and the latter is the path to the configuration file you completed in step 2. 
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 452faa17..da715724 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -49,6 +49,7 @@ from vllm.utils import get_open_port from tests.e2e.model_utils import (TokensTextLogprobs, TokensTextLogprobsPromptLogprobs) +from tests.e2e.nightly.multi_node.config.multi_node_config import NodeInfo from vllm_ascend.ascend_config import clear_ascend_config # TODO: remove this part after the patch merged into vllm, if # we not explicitly patch here, some of them might be effectiveless @@ -115,6 +116,9 @@ class RemoteOpenAIServer: env_dict: Optional[dict[str, str]] = None, seed: Optional[int] = None, auto_port: bool = True, + nodes_info: Optional[list[NodeInfo]] = None, + disaggregated_prefill: Optional[dict] = None, + proxy_port: Optional[int] = None, max_wait_seconds: Optional[float] = None, override_hf_configs: Optional[dict[str, Any]] = None) -> None: if isinstance(vllm_serve_args, str): @@ -144,13 +148,23 @@ class RemoteOpenAIServer: "--hf-overrides", json.dumps(override_hf_configs) ] + self.host = str(server_host) self.port = int(server_port) + # for multi-nodes test + self.nodes_info = nodes_info + self.disaggregated_prefill = disaggregated_prefill + self.cur_index = os.getenv("LWS_WORKER_INDEX", 0) + self.proxy_port = proxy_port self._start_server(model, vllm_serve_args, env_dict) max_wait_seconds = max_wait_seconds or 7200 - self._wait_for_server(url=self.url_for("health"), - timeout=max_wait_seconds) + if self.disaggregated_prefill: + assert proxy_port is not None, "for disaggregated_prefill, proxy port must be provided" + self._wait_for_server_pd(proxy_port=proxy_port) + else: + self._wait_for_server(url=self.url_for("health"), + timeout=max_wait_seconds) def __enter__(self): return self @@ -187,6 +201,21 @@ class RemoteOpenAIServer: if isinstance(client, httpx.Client): client.close() + def _wait_for_server_pd(self, proxy_port: int): + # Wait for all api_server nodes ready + assert 
self.nodes_info is not None, "cluster info must be provided" + for node_info in self.nodes_info: + if node_info.headless: + continue + + url_health = f"http://{node_info.ip}:{node_info.server_port}/health" + self._wait_for_server(url=url_health, timeout=7200) + + # Wait for proxy ready + master_node = self.nodes_info[0] + url_proxy = f"http://{master_node.ip}:{proxy_port}/healthcheck" + self._wait_for_server(url=url_proxy, timeout=7200) + def _wait_for_server(self, *, url: str, timeout: float): # run health check start = time.time() diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml new file mode 100644 index 00000000..e3b1db18 --- /dev/null +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml @@ -0,0 +1,163 @@ +test_name: "test DeepSeek-R1-W8A8 disaggregated_prefill" +model: "vllm-ascend/DeepSeek-R1-0528-W8A8" +num_nodes: 4 +npu_per_node: 16 +env_common: + VLLM_USE_MODELSCOPE: true + HCCL_BUFFSIZE: 1024 + SERVER_PORT: 8080 + OMP_PROC_BIND: false + OMP_NUM_THREADS: 10 + PYTORCH_NPU_ALLOC_CONF: expandable_segments:True + HCCL_DETERMINISTIC: True + TASK_QUEUE_ENABLE: 1 + HCCL_OP_RETRY_ENABLE: "L0:0, L1:0, L2:0" + +disaggregated_prefill: + enabled: true + prefiller_host_index: [0, 1] + decoder_host_index: [2] + ranktable_gen_path: "examples/disaggregated_prefill_v1/gen_ranktable.py" + ranktable_path: "/tmp/ranktable.json" + +deployment: + - + server_cmd: > + vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 2 + --data-parallel-size-local 2 + --tensor-parallel-size 8 + --enforce-eager + --enable-expert-parallel + --seed 1024 + --quantization ascend + --max-num-seqs 4 + --max-model-len 36864 + --max-num-batched-tokens 16384 + --trust-remote-code + --gpu-memory-utilization 0.9 + --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --kv-transfer-config + '{"kv_connector": 
"LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' + --additional-config + '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}' + + - + server_cmd: > + vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 2 + --data-parallel-size-local 2 + --tensor-parallel-size 8 + --enforce-eager + --enable-expert-parallel + --seed 1024 + --quantization ascend + --max-num-seqs 4 + --max-model-len 36864 + --max-num-batched-tokens 16384 + --trust-remote-code + --gpu-memory-utilization 0.9 + --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --kv-transfer-config + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' + --additional-config + '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false,"enable_multistream_shared_expert":false},"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}' + - + server_cmd: > + vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 32 + --data-parallel-size-local 16 + --data-parallel-start-rank 0 + --data-parallel-address $LOCAL_IP + --data-parallel-rpc-port 13389 + --tensor-parallel-size 1 + --enable-expert-parallel + --seed 1024 + --quantization ascend + --max-num-seqs 28 + --max-model-len 36864 + --max-num-batched-tokens 256 + --trust-remote-code + --gpu-memory-utilization 0.9 + --speculative-config '{"num_speculative_tokens": 1, 
"method":"deepseek_mtp"}' + --kv-transfer-config + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' + --additional-config + '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}' + - + server_cmd: > + vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 + --headless + --data-parallel-size 32 + --data-parallel-size-local 16 + --data-parallel-start-rank 16 + --data-parallel-address $MASTER_IP + --data-parallel-rpc-port 13389 + --tensor-parallel-size 1 + --enable-expert-parallel + --seed 1024 + --quantization ascend + --max-num-seqs 28 + --max-model-len 36864 + --max-num-batched-tokens 256 + --trust-remote-code + --gpu-memory-utilization 0.9 + --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --kv-transfer-config + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' + --additional-config + '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":true,"enable_multistream_mla":true,"graph_batch_sizes":[28],"use_cached_graph":true,"enable_super_kernel":false},"multistream_overlap_shared_expert":true}' +benchmarks: + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 1 + max_out_len: 2 + batch_size: 1 + baseline: 5 + threshold: 0.97 + acc: + case_type: accuracy + dataset_path: vllm-ascend/AIME2024 + 
request_conf: vllm_api_general_chat + dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt + max_out_len: 10 + batch_size: 32 + baseline: 1 + threshold: 1 diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml index 2d292232..94ecb61d 100644 --- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml @@ -26,10 +26,6 @@ disaggregated_prefill: deployment: - - local_index: 0 - master_index: 0 - headless: false - env_extend: server_cmd: > vllm serve "vllm-ascend/DeepSeek-V3-W8A8" --host 0.0.0.0 @@ -66,10 +62,6 @@ deployment: }' - - local_index: 1 - master_index: 0 - headless: true - env_extend: server_cmd: > vllm serve "vllm-ascend/DeepSeek-V3-W8A8" --host 0.0.0.0 diff --git a/tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml b/tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml new file mode 100644 index 00000000..70c7cc7e --- /dev/null +++ b/tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml @@ -0,0 +1,68 @@ +test_name: "test GLM-4.5 multi-dp" +model: "ZhipuAI/GLM-4.5" +num_nodes: 2 +npu_per_node: 16 +env_common: + VLLM_USE_MODELSCOPE: true + OMP_PROC_BIND: false + OMP_NUM_THREADS: 100 + HCCL_BUFFSIZE: 1024 + SERVER_PORT: 8080 + +deployment: + - + server_cmd: > + vllm serve "ZhipuAI/GLM-4.5" + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 4 + --data-parallel-size-local 2 + --data-parallel-address $LOCAL_IP + --data-parallel-rpc-port 13389 + --tensor-parallel-size 8 + --seed 1024 + --enable-expert-parallel + --max-num-seqs 16 + --max-model-len 8192 + --max-num-batched-tokens 8192 + --trust-remote-code + --no-enable-prefix-caching + --gpu-memory-utilization 0.9 + - + server_cmd: > + vllm serve "ZhipuAI/GLM-4.5" + --headless + --data-parallel-size 4 + --data-parallel-size-local 2 + --data-parallel-start-rank 2 + --data-parallel-address $MASTER_IP + --data-parallel-rpc-port 13389 + 
--tensor-parallel-size 8 + --seed 1024 + --max-num-seqs 16 + --max-model-len 8192 + --max-num-batched-tokens 8192 + --enable-expert-parallel + --trust-remote-code + --no-enable-prefix-caching + --gpu-memory-utilization 0.9 +benchmarks: + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 1 + max_out_len: 2 + batch_size: 1 + baseline: 5 + threshold: 0.97 + acc: + case_type: accuracy + dataset_path: vllm-ascend/AIME2024 + request_conf: vllm_api_general_chat + dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt + max_out_len: 10 + batch_size: 32 + baseline: 1 + threshold: 1 diff --git a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml index d2b2095f..f0ac5e88 100644 --- a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml +++ b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml @@ -11,10 +11,6 @@ env_common: deployment: - - local_index: 0 - master_index: 0 - headless: false - env_extend: server_cmd: > vllm serve "Qwen/Qwen3-235B-A22B" --host 0.0.0.0 @@ -33,10 +29,6 @@ deployment: --no-enable-prefix-caching --gpu-memory-utilization 0.9 - - local_index: 1 - master_index: 0 - headless: true - env_extend: server_cmd: > vllm serve "Qwen/Qwen3-235B-A22B" --headless diff --git a/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml new file mode 100644 index 00000000..ca7033a3 --- /dev/null +++ b/tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml @@ -0,0 +1,105 @@ +test_name: "test Qwen3-235B-A22B-W8A8 disaggregated_prefill" +model: "vllm-ascend/Qwen3-235B-A22B-W8A8" +num_nodes: 2 +npu_per_node: 16 +env_common: + VLLM_USE_MODELSCOPE: true + OMP_PROC_BIND: false + OMP_NUM_THREADS: 100 + HCCL_BUFFSIZE: 1024 + SERVER_PORT: 8080 +disaggregated_prefill: + 
enabled: true + prefiller_host_index: [0] + decoder_host_index: [1] + +deployment: + - + server_cmd: > + vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8" + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 2 + --data-parallel-size-local 2 + --tensor-parallel-size 8 + --seed 1024 + --enable-expert-parallel + --max-num-seqs 16 + --max-model-len 8192 + --max-num-batched-tokens 8192 + --quantization ascend + --trust-remote-code + --no-enable-prefix-caching + --gpu-memory-utilization 0.9 + --kv-transfer-config + '{"kv_connector": "MooncakeConnector", + "kv_role": "kv_producer", + "kv_port": "30000", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", + "kv_connector_extra_config": { + "prefill": { + "dp_size": 2, + "tp_size": 8 + }, + "decode": { + "dp_size": 2, + "tp_size": 8 + } + } + }' + + - + server_cmd: > + vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8" + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 2 + --data-parallel-size-local 2 + --tensor-parallel-size 8 + --seed 1024 + --quantization ascend + --max-num-seqs 16 + --max-model-len 8192 + --max-num-batched-tokens 8192 + --enable-expert-parallel + --trust-remote-code + --no-enable-prefix-caching + --gpu-memory-utilization 0.9 + --kv-transfer-config + '{"kv_connector": "MooncakeConnector", + "kv_role": "kv_consumer", + "kv_port": "30200", + "engine_id": "1", + "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", + "kv_connector_extra_config": { + "prefill": { + "dp_size": 2, + "tp_size": 8 + }, + "decode": { + "dp_size": 2, + "tp_size": 8 + } + } + }' +benchmarks: + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 1 + max_out_len: 2 + batch_size: 1 + baseline: 5 + threshold: 0.97 + acc: + case_type: accuracy + dataset_path: vllm-ascend/AIME2024 + request_conf: vllm_api_general_chat + dataset_conf: 
aime2024/aime2024_gen_0_shot_chat_prompt + max_out_len: 10 + batch_size: 32 + baseline: 1 + threshold: 1 diff --git a/tests/e2e/nightly/multi_node/config/multi_node_config.py b/tests/e2e/nightly/multi_node/config/multi_node_config.py index 2106b829..18ee6cc3 100644 --- a/tests/e2e/nightly/multi_node/config/multi_node_config.py +++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py @@ -1,6 +1,7 @@ import logging import os import subprocess +from dataclasses import dataclass from typing import Optional import regex as re @@ -15,6 +16,16 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port, setup_logger() logger = logging.getLogger(__name__) DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py" +DISAGGEGATED_PREFILL_PORT = 5333 + + +@dataclass +class NodeInfo: + index: int + ip: str + server_cmd: str + headless: bool + server_port: int class MultiNodeConfig: @@ -22,38 +33,50 @@ class MultiNodeConfig: def __init__(self, model: str, test_name: str, - num_nodes: int = 2, npu_per_node: int = 16, server_port: int = 8080, - headless: bool = False, disaggregated_prefill: Optional[dict] = None, envs: Optional[dict] = None, - server_cmd: str = "", + nodes_info: Optional[list[NodeInfo]] = None, perf_cmd: Optional[str] = None, acc_cmd: Optional[str] = None): self.test_name = test_name self.model = model - self.num_nodes = num_nodes + self.nodes_info = nodes_info or [] + self.num_nodes = len(self.nodes_info) self.npu_per_node = npu_per_node - self.envs = envs if envs is not None else {} self.server_port = server_port - if disaggregated_prefill: - self.proxy_port = get_avaliable_port() - self.headless = headless - self.server_cmd = server_cmd + self.envs = envs if envs is not None else {} + self.proxy_port = get_avaliable_port() self.perf_cmd = perf_cmd self.acc_cmd = acc_cmd assert perf_cmd is not None, "perf_cmd must be provided" assert acc_cmd is not None, "acc_cmd must be 
provided" - assert server_cmd is not None, "server_cmd must be provided" - self.cur_index = os.getenv("LWS_WORKER_INDEX", 0) + self.cur_index = int(os.getenv("LWS_WORKER_INDEX", 0)) self.cur_ip = get_cur_ip() self.nic_name = get_net_interface(self.cur_ip) - self.cluster_ips = get_cluster_ips(num_nodes) + self.cluster_ips = get_cluster_ips(self.num_nodes) + self.cur_node_info: NodeInfo = self.nodes_info[self.cur_index] self.disaggregated_prefill = disaggregated_prefill + self._init_disaggregated_prefill() + self._init_dist_env() - self.server_cmd = self._expand_env_vars(self.server_cmd, self.envs) + self.server_cmd = self._expand_env_vars(self.cur_node_info.server_cmd, + self.envs) + + def _init_disaggregated_prefill(self): + if self.disaggregated_prefill: + decode_host_index = self.disaggregated_prefill.get( + "decoder_host_index") + if not decode_host_index: + raise RuntimeError("got empty decode_host_index") + self.decode_start_index: int = decode_host_index[0] + self.num_prefillers = self.decode_start_index + self.num_decoders = self.num_nodes - self.num_prefillers + if self.disaggregated_prefill.get( + "ranktable_gen_path") is not None: + self._gen_ranktable() def _init_dist_env(self): self.envs["HCCL_IF_IP"] = self.cur_ip @@ -62,7 +85,17 @@ class MultiNodeConfig: self.envs["HCCL_SOCKET_IFNAME"] = self.nic_name self.envs["LOCAL_IP"] = self.cur_ip self.envs["NIC_NAME"] = self.nic_name - self.envs["MASTER_IP"] = self.cluster_ips[0] + # Default master is node 0 so non-disaggregated configs still get MASTER_IP; the disaggregated-prefill branch below overrides it per role. + self.envs["MASTER_IP"] = self.cluster_ips[0] + if self.disaggregated_prefill: + self.envs[ + "DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get( + "ranktable_path") + if self.cur_index < self.decode_start_index: + self.envs["MASTER_IP"] = self.cluster_ips[0] + else: + self.envs["MASTER_IP"] = self.cluster_ips[ + self.decode_start_index] ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages" self.envs[ "LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}" @@ -172,15 +205,21
@@ class MultiNodeConfig: deployments = config_data.get("deployment", []) assert len(deployments) == num_nodes, \ f"Number of deployments ({len(deployments)}) must match num_nodes ({num_nodes})" - for deployment in deployments: - if deployment.get("local_index") == int( - os.getenv("LWS_WORKER_INDEX", 0)): - envs_extend = deployment.get("env_extend", {}) - if envs_extend: - envs.update(envs_extend) - server_cmd = deployment.get("server_cmd") - headless = deployment.get("headless", False) - break + + cluster_ips = get_cluster_ips(num_nodes) + nodes_info = [] + + for index, deployment in enumerate(deployments): + # after assert len(deployments) == num_nodes, we can assume that this will must have a match + server_cmd = deployment.get("server_cmd", "") + headless = "--headless" in server_cmd + nodes_info.append( + NodeInfo(ip=cluster_ips[index], + index=index, + headless=headless, + server_port=server_port, + server_cmd=server_cmd)) + benchmarks = config_data.get("benchmarks", {}) assert benchmarks is not None, "benchmarks must be provided" perf_cmd = benchmarks["perf"] @@ -188,13 +227,11 @@ class MultiNodeConfig: return cls(model=model, test_name=test_name, - num_nodes=num_nodes, npu_per_node=npu_per_node, envs=envs, server_port=server_port, - headless=headless, disaggregated_prefill=disaggregated_prefill, - server_cmd=server_cmd, + nodes_info=nodes_info, perf_cmd=perf_cmd, acc_cmd=acc_cmd) @@ -204,4 +241,52 @@ class MultiNodeConfig: @property def is_master(self): - return int(self.cur_index) == 0 + return self.cur_index == 0 + + def _gen_ranktable(self): + cluster_ip = self.cluster_ips + assert len(cluster_ip) > 0 + nnodes = self.num_nodes + node_rank = self.cur_index + master_addr = cluster_ip[0] + master_port = DISAGGEGATED_PREFILL_PORT + assert self.disaggregated_prefill is not None + ranktable_gen_path = self.disaggregated_prefill.get( + "ranktable_gen_path") + ranktable_path = self.disaggregated_prefill.get("ranktable_path") + assert ranktable_gen_path is not 
None and ranktable_path is not None + if os.path.exists(str(ranktable_path)): + return + + local_host = self.cur_ip + + cmd = [ + "torchrun", + "--nproc_per_node", + "1", + "--nnodes", + str(nnodes), + "--node_rank", + str(node_rank), + "--master_addr", + master_addr, + "--master_port", + str(master_port), + ranktable_gen_path, + "--ranktable-path", + str(ranktable_path), + "--local-host", + local_host, + "--prefill-device-cnt", + str(self.npu_per_node * self.num_prefillers), + "--decode-device-cnt", + str(self.npu_per_node * self.num_decoders), + ] + + env = os.environ.copy() + assert self.nic_name is not None + env["GLOO_SOCKET_IFNAME"] = self.nic_name + + subprocess.run(cmd, env=env, check=True) + assert os.path.exists( + str(ranktable_path)), "failed generate ranktable.json" diff --git a/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh b/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh new file mode 100644 index 00000000..5dc966a1 --- /dev/null +++ b/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +set -e +set -o pipefail + +GREEN="\033[0;32m" +BLUE="\033[0;34m" +YELLOW="\033[0;33m" +RED="\033[0;31m" +NC="\033[0m" # No Color + +branch=${1:-pooling_async_memecpy_v1} +point=${2:-9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3} + +repo_url="https://github.com/AscendTransport/Mooncake" +repo_name="Mooncake" +state_file=".build_state" + +echo "[INFO] Branch: $branch" +echo "[INFO] Commit: $point" +echo "-------------------------------------------" + + +mark_done() { echo "$1" >> "$state_file"; } +is_done() { grep -Fxq "$1" "$state_file" 2>/dev/null; } + +if ! is_done "clone"; then + echo "[STEP] Clone repository..." + if [ -d "$repo_name" ]; then + echo "[WARN] Directory $repo_name already exists, skipping clone." + else + git clone -b "$branch" "$repo_url" "$repo_name" + fi + cd "$repo_name" + git fetch --all + git checkout "$point" || { echo "[ERROR] Checkout failed."; exit 1; } + cd .. 
+ mark_done "clone" +else + echo "[SKIP] Clone step already done." +fi + + +if ! is_done "deps"; then + cd "$repo_name" + echo "[STEP]Installing dependencies (ignore Go failure)..." + yes | bash dependencies.sh || echo "⚠️ dependencies.sh failed (Go install likely failed), continuing..." + cd .. + mark_done "deps" +else + echo "[SKIP] Dependencies already installed." +fi + + +if ! is_done "mpi"; then + echo "[STEP] Install MPI..." + apt purge -y mpich libmpich-dev openmpi-bin libopenmpi-dev || true + apt install -y mpich libmpich-dev + mark_done "mpi" +else + echo "[SKIP] MPI installation already done." +fi +# Export the MPI include paths on every run: a resumed run skips the install step, but the build below still needs them. +export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:${CPATH:-} +export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:${CPATH:-} + +if ! is_done "build"; then + echo "[STEP] Compile and install..." + cd "$repo_name" + + if [ -d "build" ]; then + echo "[INFO] Removing existing build directory..." + rm -rf build + fi + mkdir build && cd build + cmake .. || { echo "[ERROR] cmake failed."; exit 1; } + make -j || { echo "[ERROR] make failed."; exit 1; } + make install || { echo "[ERROR] make install failed."; exit 1; } + cd ../.. # back to the starting directory, where the relative state file lives + mark_done "build" +else + echo "[SKIP] Build already done." +fi + + +if ! is_done "copy_lib"; then + echo "[STEP] Copy library files..." + # Paths are anchored at $repo_name/build so this step also works when the build step was skipped on resume. + cp "$repo_name/build/mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so" \ + /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/ + cp "$repo_name/build/mooncake-transfer-engine/src/libtransfer_engine.so" \ + /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/ + mark_done "copy_lib" +else + echo "[SKIP] Library copy already done." +fi + + +if !
grep -qF 'export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH' ~/.bashrc 2>/dev/null; then # single quotes + -F: match the literal line written below instead of the expanded variable + echo -e "${YELLOW}Adding LD_LIBRARY_PATH to your PATH in ~/.bashrc${NC}" + echo 'export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH' >> ~/.bashrc + echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal${NC}" +fi +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} + + +echo "==========================================" +echo -e "${GREEN}[SUCCESS] Mooncake build completed!${NC}" +echo "You can rerun this script anytime — it will resume from the last step." +echo "==========================================" + +echo "Example startup command:" +echo "mooncake_master --eviction_high_watermark_ratio 0.8 --eviction_ratio 0.05 --port 50088" diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index 58f1a810..dfaa1956 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -17,19 +17,24 @@ spec: - name: vllm-leader image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }} env: + - name: CONFIG_YAML_PATH + value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }} - name: WORKSPACE value: "/root/workspace" # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_VERSION - value: "v0.11.0" + value: "{{ vllm_version | default("v0.11.0") }}" - name: VLLM_ASCEND_VERSION - value: "main" + value: {{ vllm_ascend_ref | default("main") }} + - name: VLLM_ASCEND_REMOTE_URL + value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} + - name: RESULT_FILE_PATH + value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} command: - sh - -c - | bash /root/.cache/tests/run.sh - tail -f /dev/null resources: limits: huawei.com/ascend-1980: "16" @@ -70,19 +75,24 @@ spec: - name: vllm-worker image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }} env: + - name: CONFIG_YAML_PATH + value: {{ config_file_path | default("tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml") }} - name: WORKSPACE value: "/root/workspace" # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here. - name: VLLM_VERSION - value: "v0.11.0" + value: "{{ vllm_version | default("v0.11.0") }}" - name: VLLM_ASCEND_VERSION - value: "main" + value: {{ vllm_ascend_ref | default("main") }} + - name: VLLM_ASCEND_REMOTE_URL + value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }} + - name: RESULT_FILE_PATH + value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }} command: - sh - -c - | bash /root/.cache/tests/run.sh - tail -f /dev/null resources: limits: huawei.com/ascend-1980: "16" diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index 2cf22783..544bb034 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -1,7 +1,47 @@ #!/bin/bash set -euo pipefail -export SRC_DIR="$WORKSPACE/source_code" +# Color definitions +GREEN="\033[0;32m" +BLUE="\033[0;34m" +YELLOW="\033[0;33m" +RED="\033[0;31m" +NC="\033[0m" # No Color + +# Configuration +GOVER=1.23.8 +LOG_DIR="/root/.cache/tests/logs" +OVERWRITE_LOGS=true +SRC_DIR="$WORKSPACE/source_code" +export
LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} # ':+' avoids an unbound-variable abort under 'set -u' and a stray trailing ':' + +# Function to print section headers +print_section() { + echo -e "\n${BLUE}=== $1 ===${NC}" +} + +# Function to print success messages +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +# Function to print error messages and exit +print_error() { + echo -e "${RED}✗ ERROR: $1${NC}" + exit 1 +} + +# Function to check command success +check_success() { + if [ $? -ne 0 ]; then + print_error "$1" + fi +} + +if [ "$(id -u)" -ne 0 ]; then + print_error "Require root permission, try: sudo $0" +fi + check_npu_info() { echo "====> Check NPU info" @@ -22,18 +62,13 @@ checkout_src() { # vllm-ascend if [ ! -d "$SRC_DIR/vllm-ascend" ]; then - git clone --depth 1 -b $VLLM_ASCEND_VERSION https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend" + git clone --depth 1 -b "$VLLM_ASCEND_VERSION" "$VLLM_ASCEND_REMOTE_URL" "$SRC_DIR/vllm-ascend" fi # vllm if [ ! -d "$SRC_DIR/vllm" ]; then git clone -b $VLLM_VERSION https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm" fi - - #mooncake - if [ !
-d "$SRC_DIR/Mooncake" ]; then - git clone -b pooling_async_memecpy_v1 https://github.com/AscendTransport/Mooncake "$SRC_DIR/Mooncake" - fi } install_sys_dependencies() { @@ -57,28 +92,55 @@ install_vllm() { pip install -r "$SRC_DIR/vllm-ascend/requirements-dev.txt" } -install_mooncake() { - echo "====> Install mooncake" - apt-get update -y - apt-get install -y --no-install-recommends mpich libmpich-dev - cd $SRC_DIR/Mooncake - bash dependencies.sh --yes - apt purge mpich libmpich-dev -y - apt purge openmpi-bin -y - apt purge openmpi-bin libopenmpi-dev -y - apt install mpich libmpich-dev -y - export CPATH=/usr/lib/aarch64-linux-gnu/mpich/include/:$CPATH - export CPATH=/usr/lib/aarch64-linux-gnu/openmpi/lib:$CPATH +download_go() { + ARCH=$(uname -m) + GOVER=1.23.8 + if [ "$ARCH" = "aarch64" ]; then + ARCH="arm64" + elif [ "$ARCH" = "x86_64" ]; then + ARCH="amd64" + else + echo "Unsupported architecture: $ARCH" + exit 1 + fi + # Download Go + echo "Downloading Go $GOVER..." + wget -q --show-progress https://golang.google.cn/dl/go$GOVER.linux-$ARCH.tar.gz + check_success "Failed to download Go $GOVER" - mkdir build - cd - - cd $SRC_DIR/Mooncake/build - cmake .. - make -j - make install - cp mooncake-transfer-engine/src/transport/ascend_transport/hccl_transport/ascend_transport_c/libascend_transport_mem.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/ - cp mooncake-transfer-engine/src/libtransfer_engine.so /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/ - cd - + # Install Go + echo "Installing Go $GOVER..." 
+ tar -C /usr/local -xzf go$GOVER.linux-$ARCH.tar.gz + check_success "Failed to install Go $GOVER" + + # Clean up downloaded file + rm -f go$GOVER.linux-$ARCH.tar.gz + check_success "Failed to clean up Go installation file" + + print_success "Go $GOVER installed successfully" +} + +install_go() { + # Check if Go is already installed + if command -v go &> /dev/null; then + GO_VERSION=$(go version | awk '{print $3}') + if [[ "$GO_VERSION" == "go$GOVER" ]]; then + echo -e "${YELLOW}Go $GOVER is already installed. Skipping...${NC}" + else + echo -e "${YELLOW}Found Go $GO_VERSION. Will install Go $GOVER...${NC}" + download_go + fi + else + download_go + fi + + # Add Go to PATH if not already there + if ! grep -q "export PATH=\$PATH:/usr/local/go/bin" ~/.bashrc; then + echo -e "${YELLOW}Adding Go to your PATH in ~/.bashrc${NC}" + echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc + echo -e "${YELLOW}Please run 'source ~/.bashrc' or start a new terminal to use Go${NC}" + fi + export PATH=$PATH:/usr/local/go/bin } kill_npu_processes() { @@ -89,47 +151,14 @@ kill_npu_processes() { } run_tests() { - echo "====> Run tests" - - shopt -s nullglob - declare -A results - local total=0 - local passed=0 - local failed=0 - - local REPORT_FILE="/root/.cache/test_summary.md" - echo "#Nightly Multi-node Test Summary" > "$REPORT_FILE" - echo "" >> "$REPORT_FILE" - echo "| Config File | Result |" >> "$REPORT_FILE" - echo "|--------------|---------|" >> "$REPORT_FILE" - - for file in tests/e2e/nightly/multi_node/config/models/*.yaml; do - export CONFIG_YAML_PATH="$file" - echo "Running test with config: $CONFIG_YAML_PATH" - - if pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py; then - results["$file"]="✅ PASS" - ((passed++)) - else - results["$file"]="❌ FAIL" - ((failed++)) - fi - ((total++)) - - echo "| \`$file\` | ${results[$file]} |" >> "$REPORT_FILE" - echo "------------------------------------------" - kill_npu_processes - done - shopt -u nullglob - - echo "" >>
"$REPORT_FILE" - echo "## Summary" >> "$REPORT_FILE" - echo "- **Total:** $total" >> "$REPORT_FILE" - echo "- **Passed:** $passed ✅" >> "$REPORT_FILE" - echo "- **Failed:** $failed ❌" >> "$REPORT_FILE" - - echo - echo "✅ Markdown report written to: $REPORT_FILE" + local ret=0 + pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py || ret=$? # keep pytest's own status; a bare failure would abort the script under 'set -e' + kill_npu_processes || true # best-effort cleanup must not mask the test status + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then # only the leader (index 0) publishes the result + mkdir -p "$(dirname "$RESULT_FILE_PATH")" + echo "$ret" > "$RESULT_FILE_PATH" + fi + return "$ret" } main() { @@ -138,7 +167,12 @@ main() { checkout_src install_sys_dependencies install_vllm - install_mooncake + # to speed up mooncake build process, install Go here + install_go + cd "$WORKSPACE/source_code" + . $SRC_DIR/vllm-ascend/tests/e2e/nightly/multi_node/scripts/build_mooncake.sh \ + pooling_async_memecpy_v1 9d96b2e1dd76cc601d76b1b4c5f6e04605cd81d3 + cd "$WORKSPACE/source_code/vllm-ascend" run_tests } diff --git a/tests/e2e/nightly/multi_node/test_multi_node.py b/tests/e2e/nightly/multi_node/test_multi_node.py index c1e85c84..3808dc3f 100644 --- a/tests/e2e/nightly/multi_node/test_multi_node.py +++ b/tests/e2e/nightly/multi_node/test_multi_node.py @@ -8,7 +8,10 @@ def test_multi_node() -> None: env_dict = config.envs # perf_cmd = config.perf_cmd # acc_cmd = config.acc_cmd - server_port = config.server_port if not config.disaggregated_prefill else config.proxy_port + nodes_info = config.nodes_info + disaggregated_prefill = config.disaggregated_prefill + server_port = config.server_port + proxy_port = config.proxy_port server_host = config.cluster_ips[0] with config.launch_server_proxy(DISAGGREGATED_PREFILL_PROXY_SCRIPT): with RemoteOpenAIServer( @@ -18,6 +21,9 @@ def test_multi_node() -> None: server_host=server_host, env_dict=env_dict, auto_port=False, + proxy_port=proxy_port, + disaggregated_prefill=disaggregated_prefill, + nodes_info=nodes_info, max_wait_seconds=2000, ) as remote_server: # base_url = remote_server.url_root