[CI] Add multi-node test case for a2 (#3805)
### What this PR does / why we need it?
This patch adds a multi-node test case for a2.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main: c9461e05a4
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
16
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
16
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -7,6 +7,10 @@ on:
|
|||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
description: use a2 or a3
|
description: use a2 or a3
|
||||||
|
runner:
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
default: linux-aarch64-a3-0
|
||||||
image:
|
image:
|
||||||
required: false
|
required: false
|
||||||
type: string
|
type: string
|
||||||
@@ -62,7 +66,7 @@ concurrency:
|
|||||||
jobs:
|
jobs:
|
||||||
e2e:
|
e2e:
|
||||||
# This is a runner with no NPU for k8s controller
|
# This is a runner with no NPU for k8s controller
|
||||||
runs-on: linux-aarch64-a3-0
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
@@ -90,8 +94,7 @@ jobs:
|
|||||||
kubectl version --client=true
|
kubectl version --client=true
|
||||||
|
|
||||||
# TODO: Add A2 tests
|
# TODO: Add A2 tests
|
||||||
- name: Setup kubeconfig for A3
|
- name: Decode kubeconfig from secrets
|
||||||
if: inputs.soc_version == 'a3'
|
|
||||||
run: |
|
run: |
|
||||||
# Decode and save kubeconfig
|
# Decode and save kubeconfig
|
||||||
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
|
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
|
||||||
@@ -129,6 +132,12 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if [ "${{ inputs.soc_version }}" = "a3" ]; then
|
||||||
|
npu_per_node=16
|
||||||
|
else
|
||||||
|
npu_per_node=8
|
||||||
|
fi
|
||||||
|
|
||||||
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
|
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
|
||||||
-D size="$size" \
|
-D size="$size" \
|
||||||
-D replicas="$replicas" \
|
-D replicas="$replicas" \
|
||||||
@@ -138,6 +147,7 @@ jobs:
|
|||||||
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
|
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
|
||||||
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
||||||
-D result_file_path="$result_file_path" \
|
-D result_file_path="$result_file_path" \
|
||||||
|
-D npu_per_node="$npu_per_node" \
|
||||||
--outfile lws.yaml
|
--outfile lws.yaml
|
||||||
|
|
||||||
kubectl apply -f ./lws.yaml
|
kubectl apply -f ./lws.yaml
|
||||||
|
|||||||
@@ -61,3 +61,25 @@ jobs:
|
|||||||
vllm: v0.11.0
|
vllm: v0.11.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
|
|
||||||
|
multi-node-tests:
|
||||||
|
needs: single-node-tests
|
||||||
|
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
max-parallel: 1
|
||||||
|
matrix:
|
||||||
|
test_config:
|
||||||
|
- name: multi-node-deepseek-dp
|
||||||
|
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
|
||||||
|
size: 2
|
||||||
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
|
with:
|
||||||
|
soc_version: a2
|
||||||
|
runner: linux-aarch64-a2-0
|
||||||
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||||
|
replicas: 1
|
||||||
|
size: ${{ matrix.test_config.size }}
|
||||||
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
|
secrets:
|
||||||
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
||||||
|
|||||||
@@ -104,10 +104,10 @@ jobs:
|
|||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
soc_version: a3
|
soc_version: a3
|
||||||
|
runner: linux-aarch64-a3-0
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
||||||
replicas: 1
|
replicas: 1
|
||||||
size: ${{ matrix.test_config.size }}
|
size: ${{ matrix.test_config.size }}
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
secrets:
|
secrets:
|
||||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,57 @@
|
|||||||
|
test_name: "test DeepSeek-R1-W8A8 on A2"
|
||||||
|
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
|
||||||
|
num_nodes: 2
|
||||||
|
npu_per_node: 8
|
||||||
|
env_common:
|
||||||
|
VLLM_USE_MODELSCOPE: true
|
||||||
|
HCCL_BUFFSIZE: 1024
|
||||||
|
SERVER_PORT: 8080
|
||||||
|
OMP_PROC_BIND: false
|
||||||
|
OMP_NUM_THREADS: 10
|
||||||
|
|
||||||
|
|
||||||
|
deployment:
|
||||||
|
-
|
||||||
|
server_cmd: >
|
||||||
|
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||||
|
--host 0.0.0.0
|
||||||
|
--port $SERVER_PORT
|
||||||
|
--data-parallel-size 4
|
||||||
|
--data-parallel-size-local 2
|
||||||
|
--data-parallel-address $LOCAL_IP
|
||||||
|
--data-parallel-rpc-port 13399
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
--max-num-seqs 16
|
||||||
|
--tensor-parallel-size 4
|
||||||
|
--max-model-len 36864
|
||||||
|
--max-num-batched-tokens 6000
|
||||||
|
--enable-expert-parallel
|
||||||
|
--trust-remote-code
|
||||||
|
--quantization ascend
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
--enforce-eager
|
||||||
|
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
|
||||||
|
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||||
|
|
||||||
|
-
|
||||||
|
server_cmd: >
|
||||||
|
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
|
||||||
|
--headless
|
||||||
|
--data-parallel-size 4
|
||||||
|
--data-parallel-rpc-port 13399
|
||||||
|
--data-parallel-size-local 2
|
||||||
|
--data-parallel-start-rank 2
|
||||||
|
--data-parallel-address $MASTER_IP
|
||||||
|
--no-enable-prefix-caching
|
||||||
|
--max-num-seqs 16
|
||||||
|
--tensor-parallel-size 4
|
||||||
|
--max-model-len 36864
|
||||||
|
--max-num-batched-tokens 6000
|
||||||
|
--enable-expert-parallel
|
||||||
|
--trust-remote-code
|
||||||
|
--quantization ascend
|
||||||
|
--gpu-memory-utilization 0.9
|
||||||
|
--enforce-eager
|
||||||
|
--speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
|
||||||
|
--additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
|
||||||
|
benchmarks:
|
||||||
@@ -37,11 +37,11 @@ spec:
|
|||||||
bash /root/.cache/tests/run.sh
|
bash /root/.cache/tests/run.sh
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
huawei.com/ascend-1980: "16"
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
memory: 512Gi
|
memory: 512Gi
|
||||||
ephemeral-storage: 100Gi
|
ephemeral-storage: 100Gi
|
||||||
requests:
|
requests:
|
||||||
huawei.com/ascend-1980: "16"
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
ephemeral-storage: 100Gi
|
ephemeral-storage: 100Gi
|
||||||
cpu: 125
|
cpu: 125
|
||||||
ports:
|
ports:
|
||||||
@@ -95,11 +95,11 @@ spec:
|
|||||||
bash /root/.cache/tests/run.sh
|
bash /root/.cache/tests/run.sh
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
huawei.com/ascend-1980: "16"
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
memory: 512Gi
|
memory: 512Gi
|
||||||
ephemeral-storage: 100Gi
|
ephemeral-storage: 100Gi
|
||||||
requests:
|
requests:
|
||||||
huawei.com/ascend-1980: "16"
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
ephemeral-storage: 100Gi
|
ephemeral-storage: 100Gi
|
||||||
cpu: 125
|
cpu: 125
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
|
|||||||
Reference in New Issue
Block a user