[CI] Move nightly-a2 test to hk (#5807)

### What this PR does / why we need it?
This patch moves the nightly-A2 multi-node test to the HK region; initial
testing involved connecting two nodes from the HK region to the nightly A2 job.

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2026-01-12 22:58:35 +08:00
committed by GitHub
parent 2a010a1f0e
commit 75c92a3640
5 changed files with 146 additions and 53 deletions

View File

@@ -69,35 +69,12 @@ jobs:
# This is the runner with no NPU for k8s controller # This is the runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }} runs-on: ${{ inputs.runner }}
container: container:
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11 image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
env: env:
KUBECONFIG: /tmp/kubeconfig KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project NAMESPACE: vllm-project
LEADER_POD: vllm-0 LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
steps: steps:
- name: Install system denpendencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip install jinja2-cli
- name: Install kubectl
run: |
# Install kubectl
arch=$(uname -m)
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Verify kubectl installation
kubectl version --client=true
- name: Decode kubeconfig from secrets - name: Decode kubeconfig from secrets
run: | run: |
# Decode and save kubeconfig # Decode and save kubeconfig
@@ -110,8 +87,6 @@ jobs:
run: | run: |
# prepare for lws entrypoint scripts # prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
# clear log directory
rm -fr $RESULT_FILE
- name: Clear resources - name: Clear resources
run: | run: |
@@ -157,10 +132,6 @@ jobs:
replicas="${{ inputs.replicas }}" replicas="${{ inputs.replicas }}"
image="${{ inputs.image }}" image="${{ inputs.image }}"
config_file_path="${{ inputs.config_file_path }}" config_file_path="${{ inputs.config_file_path }}"
vllm_version="${{ inputs.vllm_version }}"
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
result_file_path="$RESULT_FILE"
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}" fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
@@ -174,19 +145,17 @@ jobs:
if [ "${{ inputs.soc_version }}" = "a3" ]; then if [ "${{ inputs.soc_version }}" = "a3" ]; then
npu_per_node=16 npu_per_node=16
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
else else
npu_per_node=8 npu_per_node=8
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
fi fi
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \ jinja2 $TEMPLATE_FILE \
-D size="$size" \ -D size="$size" \
-D replicas="$replicas" \ -D replicas="$replicas" \
-D image="$image" \ -D image="$image" \
-D config_file_path="$config_file_path" \ -D config_file_path="$config_file_path" \
-D vllm_version="$vllm_version" \
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \
-D npu_per_node="$npu_per_node" \ -D npu_per_node="$npu_per_node" \
-D fail_tag="$fail_tag" \ -D fail_tag="$fail_tag" \
--outfile lws.yaml --outfile lws.yaml

View File

@@ -93,13 +93,13 @@ jobs:
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with: with:
soc_version: a2 soc_version: a2
runner: linux-aarch64-a2-0 runner: linux-amd64-cpu-8-hk
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
replicas: 1 replicas: 1
size: ${{ matrix.test_config.size }} size: ${{ matrix.test_config.size }}
config_file_path: ${{ matrix.test_config.config_file_path }} config_file_path: ${{ matrix.test_config.config_file_path }}
secrets: secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }} KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }}
single-node-accuracy-tests: single-node-accuracy-tests:
if: >- if: >-

View File

@@ -0,0 +1,138 @@
# LeaderWorkerSet template for the A2 multi-node nightly e2e job (HK cluster).
# Rendered with jinja2-cli from the CI workflow; template variables:
#   replicas, size, image, config_file_path, fail_tag, npu_per_node.
# NOTE(review): indentation reconstructed — the source view had lost all
# leading whitespace; values and Jinja expressions are preserved verbatim.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: vllm
  namespace: vllm-project
spec:
  replicas: {{ replicas | default(1) }}
  leaderWorkerTemplate:
    # size = total pods per group (1 leader + size-1 workers).
    size: {{ size | default(2) }}
    # NOTE(review): confirm "None" is a valid RestartPolicy for this LWS
    # API version (v1 documents Default / RecreateGroupOnPodRestart).
    restartPolicy: None
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        schedulerName: volcano
        # Only runs on nodes tainted instance=vllm:NoSchedule.
        tolerations:
          - key: "instance"
            operator: "Equal"
            value: "vllm"
            effect: "NoSchedule"
        containers:
          - name: vllm-leader
            imagePullPolicy: Always
            image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
            env:
              - name: CONFIG_YAML_PATH
                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            # Entrypoint script is staged into the shared PVC by the workflow
            # (install -D tests/e2e/nightly/multi_node/scripts/run.sh).
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                ephemeral-storage: 100Gi
                cpu: 125
            ports:
              - containerPort: 8080
            # readinessProbe:
            #   tcpSocket:
            #     port: 8080
            #   initialDelaySeconds: 15
            #   periodSeconds: 10
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          # Shared cache/results volume; claim name is HK-cluster specific.
          - name: shared-volume
            persistentVolumeClaim:
              claimName: vllm-project-hk001
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
    workerTemplate:
      spec:
        schedulerName: volcano
        tolerations:
          - key: "instance"
            operator: "Equal"
            value: "vllm"
            effect: "NoSchedule"
        containers:
          - name: vllm-worker
            imagePullPolicy: Always
            image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
            env:
              - name: CONFIG_YAML_PATH
                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            # Worker resources mirror the leader's, minus the service port.
            resources:
              limits:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                ephemeral-storage: 100Gi
                cpu: 125
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: vllm-project-hk001
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
---
# ClusterIP service exposing the leader pod's HTTP endpoint inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: vllm-leader
  namespace: vllm-project
spec:
  ports:
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    leaderworkerset.sigs.k8s.io/name: vllm
    role: leader
  type: ClusterIP

View File

@@ -22,13 +22,6 @@ spec:
value: {{ config_file_path | default("DeepSeek-V3.yaml") }} value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE - name: WORKSPACE
value: "/vllm-workspace" value: "/vllm-workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_ASCEND_VERSION
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG - name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }} value: {{ fail_tag | default("FAIL_TAG") }}
command: command:
@@ -81,13 +74,6 @@ spec:
value: {{ config_file_path | default("DeepSeek-V3.yaml") }} value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
- name: WORKSPACE - name: WORKSPACE
value: "/vllm-workspace" value: "/vllm-workspace"
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
- name: VLLM_ASCEND_VERSION
value: {{ vllm_ascend_ref | default("main") }}
- name: VLLM_ASCEND_REMOTE_URL
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
- name: FAIL_TAG - name: FAIL_TAG
value: {{ fail_tag | default("FAIL_TAG") }} value: {{ fail_tag | default("FAIL_TAG") }}
command: command:

View File

@@ -167,8 +167,8 @@ run_tests_with_log() {
if [ $ret -eq 0 ]; then if [ $ret -eq 0 ]; then
print_success "All tests passed!" print_success "All tests passed!"
else else
print_failure "Some tests failed, please check the error stack above for details.\ print_failure "Some tests failed, please check the error stack above for details. \
If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary." If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
fi fi
fi fi
} }