[CI] Optimize nightly CI (#3858)

### What this PR does / why we need it?
This patch optimizes the nightly CI:
1. Fix the ais_bench error caused by a `None` repo_type
2. Fix the kubectl installation error on A2 runners with ARM architecture
3. Fix the multi_node CI error where it could not determine whether the
job succeeded
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
83f478bb19

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-29 22:30:19 +08:00
committed by GitHub
parent cba69e117e
commit 4a2ab13743
8 changed files with 110 additions and 39 deletions

View File

@@ -65,6 +65,7 @@ concurrency:
jobs:
e2e:
name: ${{ inputs.config_file_path }}
# This is a runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }}
container:
@@ -112,9 +113,10 @@ jobs:
# prepare for lws entrypoint scripts
install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Clear result ret
- name: Clear resources
run: |
rm -f $RESULT_FILE
# pre clear the crd resources created by lws
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
- name: Launch cluster
run: |
@@ -153,6 +155,8 @@ jobs:
-D vllm_ascend_ref="$vllm_ascend_ref" \
-D result_file_path="$result_file_path" \
-D npu_per_node="$npu_per_node" \
-D controller_name="$HOSTNAME" \
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
--outfile lws.yaml
kubectl apply -f ./lws.yaml
@@ -178,29 +182,6 @@ jobs:
run: |
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
- name: Determine is success
run: |
TIMEOUT=300
ELAPSED=0
while [ ! -f "$RESULT_FILE" ]; do
sleep 5
ELAPSED=$((ELAPSED + 5))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "Timeout waiting for test result file"
exit 1
fi
done
RET=$(cat "$RESULT_FILE")
echo "Test result: $RET"
if [ "$RET" -ne 0 ]; then
echo "Test failed"
exit 1
else
echo "Test succeeded"
fi
- name: Post process
if: always()
run: |

View File

@@ -49,12 +49,10 @@ concurrency:
jobs:
e2e-nightly:
name: e2e-nightly
name: ${{ inputs.tests }}
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.image }}
env:
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
@@ -111,5 +109,4 @@ jobs:
VLLM_USE_MODELSCOPE: True
VLLM_CI_RUNNER: ${{ inputs.runner }}
run: |
# TODO: enable more tests
pytest -sv ${{ inputs.tests }}

View File

@@ -0,0 +1,57 @@
# Reusable workflow: tear down leftover LeaderWorkerSet (LWS) resources
# after multi-node nightly e2e runs, so stale pods cannot block later jobs.
name: 'resource clear'
on:
workflow_call:
inputs:
# Controller runner label; overridable by the caller.
runner:
required: false
type: string
default: linux-aarch64-a3-0
secrets:
# Base64-encoded kubeconfig granting access to the test cluster.
KUBECONFIG_B64:
required: true
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
jobs:
resource_clear:
# This is a runner with no NPU for k8s controller
runs-on: ${{ inputs.runner }}
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
# Paths/names shared by the steps below; KUBECTL points at a
# pre-cached binary on the runner (an `_arm` variant exists alongside).
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
steps:
- name: Install kubectl
run: |
# Install kubectl
arch=$(uname -m)
# Select the ARM build of the cached binary on arm/aarch64 hosts.
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Verify kubectl installation
kubectl version --client=true
- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
- name: Clear LWS resources
# always() so cleanup still runs if an earlier step failed.
if: always()
run: |
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found

View File

@@ -101,6 +101,12 @@ jobs:
- name: multi-node-dpsk-4node-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
@@ -111,3 +117,12 @@ jobs:
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
clear_resources:
needs: multi-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
uses: ./.github/workflows/_kill_lws_resources.yaml
with:
runner: linux-aarch64-a3-0
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}

View File

@@ -15,7 +15,7 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port,
setup_logger()
logger = logging.getLogger(__name__)
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py"
DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
DISAGGEGATED_PREFILL_PORT = 5333

View File

@@ -7,7 +7,7 @@ spec:
replicas: {{ replicas | default(1) }}
leaderWorkerTemplate:
size: {{ size | default(2) }}
restartPolicy: RecreateGroupOnPodRestart
restartPolicy: None
leaderTemplate:
metadata:
labels:
@@ -30,6 +30,10 @@ spec:
value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
- name: RESULT_FILE_PATH
value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
- name: CONTROLLER_NAME
value: {{ controller_name | default("placeholder") }}
- name: SECRET
value: {{ kb_secret | default("placeholder") }}
command:
- sh
- -c

View File

@@ -162,14 +162,31 @@ kill_npu_processes() {
}
run_tests() {
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
set +e
kill_npu_processes
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
ret=$?
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
mkdir -p "$(dirname "$RESULT_FILE_PATH")"
echo $ret > "$RESULT_FILE_PATH"
if [ $ret -eq 0 ]; then
print_success "All tests passed!"
else
print_error "Some tests failed!"
kubectl delete pod $CONTROLLER_NAME -n vllm-project
fi
fi
return $ret
set -e
}
# Install kubectl from a pre-cached binary and configure cluster access.
# Reads: $SECRET — base64-encoded kubeconfig (injected via the LWS template
# as the CONTROLLER/SECRET env vars; see the template hunk in this commit).
# Side effects: installs /usr/local/bin/kubectl, writes /tmp/kubeconfig,
# and exports KUBECONFIG for all subsequent kubectl calls in this shell.
install_kubectl() {
arch=$(uname -m)
KUBECTL=/root/.cache/.kube/kubectl
# The cache holds per-architecture binaries; use the `_arm` variant on
# arm/aarch64 hosts (case-insensitive match).
if echo "$arch" | grep -qiE "arm|aarch64"; then
echo "Detected ARM architecture: $arch"
KUBECTL="$KUBECTL"_arm
fi
# Copy the binary into PATH with root ownership and exec permission.
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# Decode the kubeconfig secret so kubectl can reach the controller cluster.
echo "$SECRET" | base64 -d > /tmp/kubeconfig
export KUBECONFIG=/tmp/kubeconfig
}
main() {
@@ -177,6 +194,7 @@ main() {
check_and_config
checkout_src
install_sys_dependencies
install_kubectl
install_vllm
install_ais_bench
# to speed up mooncake build process, install Go here

View File

@@ -284,12 +284,12 @@ def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
def maybe_download_from_modelscope(
model: str,
repo_type: str | None = None,
repo_type: str = "model",
revision: str | None = None,
download_dir: str | None = None,
ignore_patterns: str | list[str] | None = None,
allow_patterns: list[str] | str | None = None,
) -> str | None:
) -> str:
"""
Download model/dataset from ModelScope hub.
Returns the path to the downloaded model, or None if the model is not
@@ -310,5 +310,4 @@ def maybe_download_from_modelscope(
)
else:
model_path = model
return model_path
return None
return model_path