[CI] Move nightly-a2 test to hk (#5807)

### What this PR does / why we need it? This patch moves the nightly A2 multi-node test to the HK region. Initial testing involved connecting two nodes from the HK region to the nightly A2 job. - vLLM version: v0.13.0 - vLLM main: 2f4e6548ef --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2026-01-12 22:58:35 +08:00
parent 2a010a1f0e
commit 75c92a3640
5 changed files with 146 additions and 53 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -69,35 +69,12 @@ jobs:
    # This is the runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-cpu
      env:
        KUBECONFIG: /tmp/kubeconfig
        KUBECTL: /root/.cache/.kube/kubectl
        NAMESPACE: vllm-project
        LEADER_POD: vllm-0
        RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
    steps:
        - name: Install system denpendencies
          run: |
           # configure apt and pip source
           sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
           pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
           pip install jinja2-cli
        - name: Install kubectl
          run: |
            # Install kubectl
            arch=$(uname -m)
            if echo "$arch" | grep -qiE "arm|aarch64"; then
              echo "Detected ARM architecture: $arch"
              KUBECTL="$KUBECTL"_arm
            fi
            install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
            # Verify kubectl installation
            kubectl version --client=true
        - name: Decode kubeconfig from secrets
          run: |
            # Decode and save kubeconfig
@@ -110,8 +87,6 @@ jobs:
          run: |
            # prepare for lws entrypoint scripts
            install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
            # clear log directory
            rm -fr $RESULT_FILE
        - name: Clear resources
          run: |
@@ -157,10 +132,6 @@ jobs:
            replicas="${{ inputs.replicas }}"
            image="${{ inputs.image }}"
            config_file_path="${{ inputs.config_file_path }}"
            vllm_version="${{ inputs.vllm_version }}"
            vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
            vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
            result_file_path="$RESULT_FILE"
            fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
            echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
@@ -174,19 +145,17 @@ jobs:
            if [ "${{ inputs.soc_version }}" = "a3" ]; then
              npu_per_node=16
              TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2"
            else
              npu_per_node=8
              TEMPLATE_FILE="tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2"
            fi
-            jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
+            jinja2 $TEMPLATE_FILE \
              -D size="$size" \
              -D replicas="$replicas" \
              -D image="$image" \
              -D config_file_path="$config_file_path" \
              -D vllm_version="$vllm_version" \
              -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
              -D vllm_ascend_ref="$vllm_ascend_ref" \
              -D result_file_path="$result_file_path" \
              -D npu_per_node="$npu_per_node" \
              -D fail_tag="$fail_tag" \
              --outfile lws.yaml
--- a/.github/workflows/nightly_test_a2.yaml
+++ b/.github/workflows/nightly_test_a2.yaml
@@ -93,13 +93,13 @@ jobs:
    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
    with:
      soc_version: a2
-      runner: linux-aarch64-a2-0
+      runner: linux-amd64-cpu-8-hk
      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
      replicas: 1
      size: ${{ matrix.test_config.size }}
      config_file_path: ${{ matrix.test_config.config_file_path }}
    secrets:
-      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}
+      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_HK_001_INTERNAL_B64 }}
  single-node-accuracy-tests:
    if: >-
--- a/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws-a2.yaml.jinja2
@@ -0,0 +1,138 @@
 # LeaderWorkerSet for the nightly multi-node e2e test on A2 hardware.
 #
 # This file is a Jinja2 template: the CI workflow renders it with the jinja2
 # CLI and passes each variable via -D. Variables consumed here:
 #   replicas         - number of leader/worker groups (fallback: 1)
 #   size             - pods per group, leader included (fallback: 2)
 #   image            - container image for leader and workers
 #   config_file_path - test config exported as CONFIG_YAML_PATH
 #   fail_tag         - marker exported as FAIL_TAG
 #   npu_per_node     - NPUs requested per pod (fallback: 8)
 # A fallback is used only when the variable is not defined at render time.
 #
 # Templated string values are single-quoted so that an empty or
 # number/boolean-looking substitution still renders as a YAML string.
 apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
  name: vllm
  namespace: vllm-project
 spec:
  replicas: {{ replicas | default(1) }}
  leaderWorkerTemplate:
    size: {{ size | default(2) }}
    restartPolicy: None
    leaderTemplate:
      metadata:
        labels:
          # Selected by the vllm-leader Service defined later in this file.
          role: leader
      spec:
        schedulerName: volcano
        # Allows scheduling onto nodes tainted instance=vllm:NoSchedule.
        tolerations:
          - key: "instance"
            operator: "Equal"
            value: "vllm"
            effect: "NoSchedule"
        containers:
          - name: vllm-leader
            imagePullPolicy: Always
            image: '{{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}'
            env:
              - name: CONFIG_YAML_PATH
                value: '{{ config_file_path | default("DeepSeek-V3.yaml") }}'
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: '{{ fail_tag | default("FAIL_TAG") }}'
            # run.sh is read from the shared volume mounted at /root/.cache.
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                # The CI workflow passes npu_per_node=8 for every SoC other
                # than a3, so the fallback matches that value.
                huawei.com/ascend-1980: {{ npu_per_node | default("8") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("8") }}
                ephemeral-storage: 100Gi
                cpu: 125
            ports:
              - containerPort: 8080
            # readinessProbe:
            #   tcpSocket:
            #     port: 8080
            #   initialDelaySeconds: 15
            #   periodSeconds: 10
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
        # Memory-backed /dev/shm, capped at 15Gi.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 15Gi
        # PVC mounted at /root/.cache in the leader and every worker.
        - name: shared-volume
          persistentVolumeClaim:
            claimName: vllm-project-hk001
        # Driver tools directory taken from the host node.
        - name: driver-tools
          hostPath:
            path: /usr/local/Ascend/driver/tools
    # Worker pods mirror the leader container spec, minus the role label and
    # the exposed container port.
    workerTemplate:
      spec:
        schedulerName: volcano
        tolerations:
          - key: "instance"
            operator: "Equal"
            value: "vllm"
            effect: "NoSchedule"
        containers:
          - name: vllm-worker
            imagePullPolicy: Always
            image: '{{ image | default("swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2") }}'
            env:
              - name: CONFIG_YAML_PATH
                value: '{{ config_file_path | default("DeepSeek-V3.yaml") }}'
              - name: WORKSPACE
                value: "/vllm-workspace"
              - name: FAIL_TAG
                value: '{{ fail_tag | default("FAIL_TAG") }}'
            command:
              - sh
              - -c
              - |
                bash /root/.cache/tests/run.sh
            resources:
              limits:
                huawei.com/ascend-1980: {{ npu_per_node | default("8") }}
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: {{ npu_per_node | default("8") }}
                ephemeral-storage: 100Gi
                cpu: 125
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 15Gi
        - name: shared-volume
          persistentVolumeClaim:
            claimName: vllm-project-hk001
        - name: driver-tools
          hostPath:
            path: /usr/local/Ascend/driver/tools
 ---
 # Cluster-internal Service in front of the leader pod of the "vllm"
 # LeaderWorkerSet.
 kind: Service
 apiVersion: v1
 metadata:
  namespace: vllm-project
  name: vllm-leader
 spec:
  type: ClusterIP
  # Matches pods carrying both labels below. "role: leader" is set in the
  # leaderTemplate of this file; the name label is presumably applied by the
  # LeaderWorkerSet controller - confirm against the LWS documentation.
  selector:
    role: leader
    leaderworkerset.sigs.k8s.io/name: vllm
  # Forwards Service port 8080 to container port 8080 over TCP.
  ports:
    - name: http
      protocol: TCP
      port: 8080
      targetPort: 8080
--- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -22,13 +22,6 @@ spec:
                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/vllm-workspace"
              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
              - name: VLLM_ASCEND_VERSION
                value: {{ vllm_ascend_ref | default("main") }}
              - name: VLLM_ASCEND_REMOTE_URL
                value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
              - name: RESULT_FILE_PATH
                value: {{ result_file_path | default("/root/.cache/tests/ret") }}
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            command:
@@ -81,13 +74,6 @@ spec:
                value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
              - name: WORKSPACE
                value: "/vllm-workspace"
              # Set vLLM version and vLLM-Ascend version here, once there is a new release, update here.
              - name: VLLM_ASCEND_VERSION
                value: {{ vllm_ascend_ref | default("main") }}
              - name: VLLM_ASCEND_REMOTE_URL
                value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
              - name: RESULT_FILE_PATH
                value: {{ result_file_path | default("/root/.cache/tests/ret") }}
              - name: FAIL_TAG
                value: {{ fail_tag | default("FAIL_TAG") }}
            command:
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -167,8 +167,8 @@ run_tests_with_log() {
        if [ $ret -eq 0 ]; then
            print_success "All tests passed!"
        else
-            print_failure "Some tests failed, please check the error stack above for details.\
+            print_failure "Some tests failed, please check the error stack above for details. \
-            If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
+If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
        fi
    fi
 }