diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 249526fa..ca2854da 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -7,6 +7,10 @@ on: required: true type: string description: use a2 or a3 + runner: + required: false + type: string + default: linux-aarch64-a3-0 image: required: false type: string @@ -62,7 +66,7 @@ concurrency: jobs: e2e: # This is a runner with no NPU for k8s controller - runs-on: linux-aarch64-a3-0 + runs-on: ${{ inputs.runner }} container: image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 env: @@ -90,8 +94,7 @@ jobs: kubectl version --client=true # TODO: Add A2 tests - - name: Setup kubeconfig for A3 - if: inputs.soc_version == 'a3' + - name: Decode kubeconfig from secrets run: | # Decode and save kubeconfig echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG @@ -129,6 +132,12 @@ jobs: fi done + if [ "${{ inputs.soc_version }}" = "a3" ]; then + npu_per_node=16 + else + npu_per_node=8 + fi + jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \ -D size="$size" \ -D replicas="$replicas" \ @@ -138,6 +147,7 @@ jobs: -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \ -D vllm_ascend_ref="$vllm_ascend_ref" \ -D result_file_path="$result_file_path" \ + -D npu_per_node="$npu_per_node" \ --outfile lws.yaml kubectl apply -f ./lws.yaml diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml index 4e9925e4..8380346f 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -61,3 +61,25 @@ jobs: vllm: v0.11.0 runner: ${{ matrix.test_config.os }} tests: ${{ matrix.test_config.tests }} + + multi-node-tests: + needs: single-node-tests + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + strategy: + fail-fast: false + 
max-parallel: 1 + matrix: + test_config: + - name: multi-node-deepseek-dp + config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml + size: 2 + uses: ./.github/workflows/_e2e_nightly_multi_node.yaml + with: + soc_version: a2 + runner: linux-aarch64-a2-0 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + replicas: 1 + size: ${{ matrix.test_config.size }} + config_file_path: ${{ matrix.test_config.config_file_path }} + secrets: + KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml index cbf8a2e0..7a34b234 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -104,10 +104,10 @@ jobs: uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: soc_version: a3 + runner: linux-aarch64-a3-0 image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} secrets: KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }} - diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml new file mode 100644 index 00000000..6f7774c4 --- /dev/null +++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml @@ -0,0 +1,57 @@ +test_name: "test DeepSeek-R1-W8A8 on A2" +model: "vllm-ascend/DeepSeek-R1-0528-W8A8" +num_nodes: 2 +npu_per_node: 8 +env_common: + VLLM_USE_MODELSCOPE: true + HCCL_BUFFSIZE: 1024 + SERVER_PORT: 8080 + OMP_PROC_BIND: false + OMP_NUM_THREADS: 10 + + +deployment: + - + server_cmd: > + vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 + --host 0.0.0.0 + --port $SERVER_PORT + --data-parallel-size 4 + --data-parallel-size-local 2 + --data-parallel-address $LOCAL_IP + --data-parallel-rpc-port 13399 + 
--no-enable-prefix-caching + --max-num-seqs 16 + --tensor-parallel-size 4 + --max-model-len 36864 + --max-num-batched-tokens 6000 + --enable-expert-parallel + --trust-remote-code + --quantization ascend + --gpu-memory-utilization 0.9 + --enforce-eager + --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}' + + - + server_cmd: > + vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 + --headless + --data-parallel-size 4 + --data-parallel-rpc-port 13399 + --data-parallel-size-local 2 + --data-parallel-start-rank 2 + --data-parallel-address $MASTER_IP + --no-enable-prefix-caching + --max-num-seqs 16 + --tensor-parallel-size 4 + --max-model-len 36864 + --max-num-batched-tokens 6000 + --enable-expert-parallel + --trust-remote-code + --quantization ascend + --gpu-memory-utilization 0.9 + --enforce-eager + --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' + --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}' +benchmarks: diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index dfaa1956..e6b5e98d 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -37,11 +37,11 @@ spec: bash /root/.cache/tests/run.sh resources: limits: - huawei.com/ascend-1980: "16" + huawei.com/ascend-1980: "{{ npu_per_node | default('16') }}" memory: 512Gi ephemeral-storage: 100Gi requests: - huawei.com/ascend-1980: "16" + huawei.com/ascend-1980: "{{ npu_per_node | default('16') }}" ephemeral-storage: 100Gi cpu: 125 ports: @@ -95,11 +95,11 @@ spec: bash /root/.cache/tests/run.sh resources: limits: - huawei.com/ascend-1980: "16" + huawei.com/ascend-1980: "{{ npu_per_node | default('16') }}" memory: 512Gi 
ephemeral-storage: 100Gi requests: - huawei.com/ascend-1980: "16" + huawei.com/ascend-1980: "{{ npu_per_node | default('16') }}" ephemeral-storage: 100Gi cpu: 125 volumeMounts: