[CI] Move nightly-a2 test to hk (#5807)
### What this PR does / why we need it?
As initial testing, this patch connects two nodes from the HK region
to the nightly A2 tests.
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
39
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
39
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -69,35 +69,12 @@ jobs:
|
|||||||
# This is the runner with no NPU for k8s controller
|
# This is the runner with no NPU for k8s controller
|
||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
|
||||||
env:
|
env:
|
||||||
KUBECONFIG: /tmp/kubeconfig
|
KUBECONFIG: /tmp/kubeconfig
|
||||||
KUBECTL: /root/.cache/.kube/kubectl
|
|
||||||
NAMESPACE: vllm-project
|
NAMESPACE: vllm-project
|
||||||
LEADER_POD: vllm-0
|
LEADER_POD: vllm-0
|
||||||
RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
|
|
||||||
steps:
|
steps:
|
||||||
- name: Install system denpendencies
|
|
||||||
run: |
|
|
||||||
# configure apt and pip source
|
|
||||||
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
|
||||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
|
||||||
pip install jinja2-cli
|
|
||||||
|
|
||||||
- name: Install kubectl
|
|
||||||
run: |
|
|
||||||
# Install kubectl
|
|
||||||
arch=$(uname -m)
|
|
||||||
|
|
||||||
if echo "$arch" | grep -qiE "arm|aarch64"; then
|
|
||||||
echo "Detected ARM architecture: $arch"
|
|
||||||
KUBECTL="$KUBECTL"_arm
|
|
||||||
fi
|
|
||||||
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
|
|
||||||
|
|
||||||
# Verify kubectl installation
|
|
||||||
kubectl version --client=true
|
|
||||||
|
|
||||||
- name: Decode kubeconfig from secrets
|
- name: Decode kubeconfig from secrets
|
||||||
run: |
|
run: |
|
||||||
# Decode and save kubeconfig
|
# Decode and save kubeconfig
|
||||||
@@ -110,8 +87,6 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
# prepare for lws entrypoint scripts
|
# prepare for lws entrypoint scripts
|
||||||
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
|
||||||
# clear log directory
|
|
||||||
rm -fr $RESULT_FILE
|
|
||||||
|
|
||||||
- name: Clear resources
|
- name: Clear resources
|
||||||
run: |
|
run: |
|
||||||
@@ -157,10 +132,6 @@ jobs:
|
|||||||
replicas="${{ inputs.replicas }}"
|
replicas="${{ inputs.replicas }}"
|
||||||
image="${{ inputs.image }}"
|
image="${{ inputs.image }}"
|
||||||
config_file_path="${{ inputs.config_file_path }}"
|
config_file_path="${{ inputs.config_file_path }}"
|
||||||
vllm_version="${{ inputs.vllm_version }}"
|
|
||||||
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
|
|
||||||
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
|
|
||||||
result_file_path="$RESULT_FILE"
|
|
||||||
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
|
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
|
||||||
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
|
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
|
||||||
|
|
||||||
@@ -174,19 +145,17 @@ jobs:
|
|||||||
|
|
||||||
if [ "${{ inputs.soc_version }}" = "a3" ]; then
|
if [ "${{ inputs.soc_version }}" = "a3" ]; then
|
||||||
npu_per_node=16
|
npu_per_node=16
|
||||||
|
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
|
||||||
else
|
else
|
||||||
npu_per_node=8
|
npu_per_node=8
|
||||||
|
TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
|
jinja2 $TEMPLATE_FILE \
|
||||||
-D size="$size" \
|
-D size="$size" \
|
||||||
-D replicas="$replicas" \
|
-D replicas="$replicas" \
|
||||||
-D image="$image" \
|
-D image="$image" \
|
||||||
-D config_file_path="$config_file_path" \
|
-D config_file_path="$config_file_path" \
|
||||||
-D vllm_version="$vllm_version" \
|
|
||||||
-D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
|
|
||||||
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
|
||||||
-D result_file_path="$result_file_path" \
|
|
||||||
-D npu_per_node="$npu_per_node" \
|
-D npu_per_node="$npu_per_node" \
|
||||||
-D fail_tag="$fail_tag" \
|
-D fail_tag="$fail_tag" \
|
||||||
--outfile lws.yaml
|
--outfile lws.yaml
|
||||||
|
|||||||
4
.github/workflows/nightly_test_a2.yaml
vendored
4
.github/workflows/nightly_test_a2.yaml
vendored
@@ -93,13 +93,13 @@ jobs:
|
|||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
soc_version: a2
|
soc_version: a2
|
||||||
runner: linux-aarch64-a2-0
|
runner: linux-amd64-cpu-8-hk
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||||
replicas: 1
|
replicas: 1
|
||||||
size: ${{ matrix.test_config.size }}
|
size: ${{ matrix.test_config.size }}
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
secrets:
|
secrets:
|
||||||
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
|
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }}
|
||||||
|
|
||||||
single-node-accuracy-tests:
|
single-node-accuracy-tests:
|
||||||
if: >-
|
if: >-
|
||||||
|
|||||||
138
tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
Normal file
138
tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||||
|
kind: LeaderWorkerSet
|
||||||
|
metadata:
|
||||||
|
name: vllm
|
||||||
|
namespace: vllm-project
|
||||||
|
spec:
|
||||||
|
replicas: {{ replicas | default(1) }}
|
||||||
|
leaderWorkerTemplate:
|
||||||
|
size: {{ size | default(2) }}
|
||||||
|
restartPolicy: None
|
||||||
|
leaderTemplate:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
role: leader
|
||||||
|
spec:
|
||||||
|
schedulerName: volcano
|
||||||
|
tolerations:
|
||||||
|
- key: "instance"
|
||||||
|
operator: "Equal"
|
||||||
|
value: "vllm"
|
||||||
|
effect: "NoSchedule"
|
||||||
|
containers:
|
||||||
|
- name: vllm-leader
|
||||||
|
imagePullPolicy: Always
|
||||||
|
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
|
||||||
|
env:
|
||||||
|
- name: CONFIG_YAML_PATH
|
||||||
|
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
||||||
|
- name: WORKSPACE
|
||||||
|
value: "/vllm-workspace"
|
||||||
|
- name: FAIL_TAG
|
||||||
|
value: {{ fail_tag | default("FAIL_TAG") }}
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
bash /root/.cache/tests/run.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
|
memory: 512Gi
|
||||||
|
ephemeral-storage: 100Gi
|
||||||
|
requests:
|
||||||
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
|
ephemeral-storage: 100Gi
|
||||||
|
cpu: 125
|
||||||
|
ports:
|
||||||
|
- containerPort: 8080
|
||||||
|
# readinessProbe:
|
||||||
|
# tcpSocket:
|
||||||
|
# port: 8080
|
||||||
|
# initialDelaySeconds: 15
|
||||||
|
# periodSeconds: 10
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /root/.cache
|
||||||
|
name: shared-volume
|
||||||
|
- mountPath: /usr/local/Ascend/driver/tools
|
||||||
|
name: driver-tools
|
||||||
|
- mountPath: /dev/shm
|
||||||
|
name: dshm
|
||||||
|
volumes:
|
||||||
|
- name: dshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 15Gi
|
||||||
|
- name: shared-volume
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: vllm-project-hk001
|
||||||
|
- name: driver-tools
|
||||||
|
hostPath:
|
||||||
|
path: /usr/local/Ascend/driver/tools
|
||||||
|
workerTemplate:
|
||||||
|
spec:
|
||||||
|
schedulerName: volcano
|
||||||
|
tolerations:
|
||||||
|
- key: "instance"
|
||||||
|
operator: "Equal"
|
||||||
|
value: "vllm"
|
||||||
|
effect: "NoSchedule"
|
||||||
|
containers:
|
||||||
|
- name: vllm-worker
|
||||||
|
imagePullPolicy: Always
|
||||||
|
image: {{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}
|
||||||
|
env:
|
||||||
|
- name: CONFIG_YAML_PATH
|
||||||
|
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
||||||
|
- name: WORKSPACE
|
||||||
|
value: "/vllm-workspace"
|
||||||
|
- name: FAIL_TAG
|
||||||
|
value: {{ fail_tag | default("FAIL_TAG") }}
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
bash /root/.cache/tests/run.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
|
memory: 512Gi
|
||||||
|
ephemeral-storage: 100Gi
|
||||||
|
requests:
|
||||||
|
huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
|
||||||
|
ephemeral-storage: 100Gi
|
||||||
|
cpu: 125
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /root/.cache
|
||||||
|
name: shared-volume
|
||||||
|
- mountPath: /usr/local/Ascend/driver/tools
|
||||||
|
name: driver-tools
|
||||||
|
- mountPath: /dev/shm
|
||||||
|
name: dshm
|
||||||
|
volumes:
|
||||||
|
- name: dshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 15Gi
|
||||||
|
- name: shared-volume
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: vllm-project-hk001
|
||||||
|
- name: driver-tools
|
||||||
|
hostPath:
|
||||||
|
path: /usr/local/Ascend/driver/tools
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: vllm-leader
|
||||||
|
namespace: vllm-project
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 8080
|
||||||
|
protocol: TCP
|
||||||
|
targetPort: 8080
|
||||||
|
selector:
|
||||||
|
leaderworkerset.sigs.k8s.io/name: vllm
|
||||||
|
role: leader
|
||||||
|
type: ClusterIP
|
||||||
@@ -22,13 +22,6 @@ spec:
|
|||||||
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
||||||
- name: WORKSPACE
|
- name: WORKSPACE
|
||||||
value: "/vllm-workspace"
|
value: "/vllm-workspace"
|
||||||
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
|
|
||||||
- name: VLLM_ASCEND_VERSION
|
|
||||||
value: {{ vllm_ascend_ref | default("main") }}
|
|
||||||
- name: VLLM_ASCEND_REMOTE_URL
|
|
||||||
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
|
|
||||||
- name: RESULT_FILE_PATH
|
|
||||||
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
|
|
||||||
- name: FAIL_TAG
|
- name: FAIL_TAG
|
||||||
value: {{ fail_tag | default("FAIL_TAG") }}
|
value: {{ fail_tag | default("FAIL_TAG") }}
|
||||||
command:
|
command:
|
||||||
@@ -81,13 +74,6 @@ spec:
|
|||||||
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
||||||
- name: WORKSPACE
|
- name: WORKSPACE
|
||||||
value: "/vllm-workspace"
|
value: "/vllm-workspace"
|
||||||
# Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
|
|
||||||
- name: VLLM_ASCEND_VERSION
|
|
||||||
value: {{ vllm_ascend_ref | default("main") }}
|
|
||||||
- name: VLLM_ASCEND_REMOTE_URL
|
|
||||||
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
|
|
||||||
- name: RESULT_FILE_PATH
|
|
||||||
value: {{ result_file_path | default("/root/.cache/tests/ret") }}
|
|
||||||
- name: FAIL_TAG
|
- name: FAIL_TAG
|
||||||
value: {{ fail_tag | default("FAIL_TAG") }}
|
value: {{ fail_tag | default("FAIL_TAG") }}
|
||||||
command:
|
command:
|
||||||
|
|||||||
@@ -167,8 +167,8 @@ run_tests_with_log() {
|
|||||||
if [ $ret -eq 0 ]; then
|
if [ $ret -eq 0 ]; then
|
||||||
print_success "All tests passed!"
|
print_success "All tests passed!"
|
||||||
else
|
else
|
||||||
print_failure "Some tests failed, please check the error stack above for details.\
|
print_failure "Some tests failed, please check the error stack above for details. \
|
||||||
If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
|
If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user