[CI] Move nightly-a2 test to hk (#5807)

### What this PR does / why we need it?
For this patch, initial testing involved connecting two nodes from the HK
region to the nightly A2 job.

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2026-01-12 22:58:35 +08:00
committed by GitHub
parent 2a010a1f0e
commit 75c92a3640
5 changed files with 146 additions and 53 deletions

View File

@@ -69,35 +69,12 @@ jobs:
# This is the runner with no NPU for k8s controller # This is the runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }} runs-on: ${{ inputs.runner }}
container: container:
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11 image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
env: env:
KUBECONFIG: /tmp/kubeconfig KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project NAMESPACE: vllm-project
LEADER_POD: vllm-0 LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
steps: steps:
- name: Install system dependencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip install jinja2-cli
- name: Install kubectl
run: |
# Install kubectl
arch=$(uname -m)
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Verify kubectl installation
kubectl version --client=true
- name: Decode kubeconfig from secrets - name: Decode kubeconfig from secrets
run: | run: |
# Decode and save kubeconfig # Decode and save kubeconfig
@@ -110,8 +87,6 @@ jobs:
run: | run: |
# prepare for lws entrypoint scripts # prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
# clear log directory
rm -fr $RESULT_FILE
- name: Clear resources - name: Clear resources
run: | run: |
@@ -157,10 +132,6 @@ jobs:
replicas="${{ inputs.replicas }}" replicas="${{ inputs.replicas }}"
image="${{ inputs.image }}" image="${{ inputs.image }}"
config_file_path="${{ inputs.config_file_path }}" config_file_path="${{ inputs.config_file_path }}"
vllm_version="${{ inputs.vllm_version }}"
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
result_file_path="$RESULT_FILE"
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}" fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
@@ -174,19 +145,17 @@ jobs:
if [ "${{ inputs.soc_version }}" = "a3" ]; then if [ "${{ inputs.soc_version }}" = "a3" ]; then
npu_per_node=16 npu_per_node=16
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
else else
npu_per_node=8 npu_per_node=8
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
fi fi
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \ jinja2 $TEMPLATE_FILE \
-D size="$size" \ -D size="$size" \
-D replicas="$replicas" \ -D replicas="$replicas" \
-D image="$image" \ -D image="$image" \
-D config_file_path="$config_file_path" \ -D config_file_path="$config_file_path" \
-D vllm_version="$vllm_version" \
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \
-D npu_per_node="$npu_per_node" \ -D npu_per_node="$npu_per_node" \
-D fail_tag="$fail_tag" \ -D fail_tag="$fail_tag" \
--outfile lws.yaml --outfile lws.yaml

View File

@@ -93,13 +93,13 @@ jobs:
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with: with:
soc_version: a2 soc_version: a2
runner: linux-aarch64-a2-0 runner: linux-amd64-cpu-8-hk
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1 replicas: 1
size: ${{ matrix.test_config.size }} size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }} config_file_path: ${{ matrix.test_config.config_file_path }}
secrets: secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }}
single-node-accuracy-tests: single-node-accuracy-tests:
if: >- if: >-

View File

@@ -0,0 +1,138 @@
# Jinja2-templated manifest for the multi-node nightly e2e test on A2 hardware.
# Rendered by the CI workflow via jinja2-cli (variables: replicas, size, image,
# config_file_path, fail_tag, npu_per_node) and applied with kubectl.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: vllm
  namespace: vllm-project
spec:
  # Number of leader/worker groups; the CI workflow passes replicas=1.
  replicas: {{ replicas | default(1) }}
  leaderWorkerTemplate:
    # Total pods per group (1 leader + size-1 workers).
    size: {{ size | default(2) }}
    # No automatic restart: a failed pod should surface as a test failure.
    restartPolicy: None
    leaderTemplate:
      metadata:
        labels:
          # Matched by the Service selector below to expose only the leader.
          role: leader
      spec:
        # Gang-schedule leader and workers together via Volcano.
        schedulerName: volcano
        # Allows scheduling onto nodes tainted instance=vllm:NoSchedule,
        # i.e. nodes reserved for this test workload.
        tolerations:
          - key: "instance"
            operator: "Equal"
            value: "vllm"
            effect: "NoSchedule"
        containers:
          - name: vllm-leader
            imagePullPolicy: Always
            image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
            env:
              # Test-case config consumed by run.sh (mounted from shared PVC).
              - name: CONFIG_YAML_PATH
                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/vllm-workspace"
              # Marker string the workflow greps for to detect a failed run.
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                # Ascend NPU device-plugin resource; npu_per_node is 8 for A2
                # (the "16" default matches the A3 template) — confirm against
                # the workflow's soc_version branch.
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                ephemeral-storage: 100Gi
                cpu: 125
            ports:
              # vLLM API endpoint, exposed through the vllm-leader Service.
              - containerPort: 8080
            # readinessProbe:
            #   tcpSocket:
            #     port: 8080
            #   initialDelaySeconds: 15
            #   periodSeconds: 10
            volumeMounts:
              # Shared cache (test scripts, models) backed by the PVC below.
              - mountPath: /root/.cache
                name: shared-volume
              # Host Ascend driver tools made visible inside the container.
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              # Enlarged shared memory for inter-process communication.
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              # HK-region cluster PVC; must pre-exist in the namespace.
              claimName: vllm-project-hk001
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
    # Worker pods: same image/env/volumes as the leader, but no Service
    # port exposure (only the leader serves traffic).
    workerTemplate:
      spec:
        schedulerName: volcano
        tolerations:
          - key: "instance"
            operator: "Equal"
            value: "vllm"
            effect: "NoSchedule"
        containers:
          - name: vllm-worker
            imagePullPolicy: Always
            image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
            env:
              - name: CONFIG_YAML_PATH
                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                ephemeral-storage: 100Gi
                cpu: 125
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: vllm-project-hk001
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
---
# Cluster-internal Service fronting the leader pod's vLLM API.
apiVersion: v1
kind: Service
metadata:
  name: vllm-leader
  namespace: vllm-project
spec:
  ports:
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    # Selects only the leader pod of the "vllm" LeaderWorkerSet.
    leaderworkerset.sigs.k8s.io/name: vllm
    role: leader
  type: ClusterIP

View File

@@ -22,13 +22,6 @@ spec:
value: {{ config_file_path | default("DeepSeek-V3.yaml") }} value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE - name: WORKSPACE
value: "/vllm-workspace" value: "/vllm-workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_ASCEND_VERSION
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG - name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }} value: {{ fail_tag | default("FAIL_TAG") }}
command: command:
@@ -81,13 +74,6 @@ spec:
value: {{ config_file_path | default("DeepSeek-V3.yaml") }} value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE - name: WORKSPACE
value: "/vllm-workspace" value: "/vllm-workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_ASCEND_VERSION
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG - name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }} value: {{ fail_tag | default("FAIL_TAG") }}
command: command: