[CI] Optimize nightly CI (#3858)

### What this PR does / why we need it? This patch optimizes the nightly CI: 1. Fix the ais_bench error caused by a None repo_type 2. Fix the kubectl installation error on A2 with the ARM architecture 3. Fix the multi_node CI being unable to determine whether the job succeeded ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: 83f478bb19 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-29 22:30:19 +08:00
parent cba69e117e
commit 4a2ab13743
8 changed files with 110 additions and 39 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -65,6 +65,7 @@ concurrency:
 jobs:
  e2e:
    name: ${{ inputs.config_file_path }}
    # This is a runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
@@ -112,9 +113,10 @@ jobs:
            # prepare for lws entrypoint scripts
            install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh
-        - name: Clear result ret
+        - name: Clear resources
          run: |
-            rm -f $RESULT_FILE
+            # pre clear the crd resources created by lws
            kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
        - name: Launch cluster
          run: |
@@ -153,6 +155,8 @@ jobs:
              -D vllm_ascend_ref="$vllm_ascend_ref" \
              -D result_file_path="$result_file_path" \
              -D npu_per_node="$npu_per_node" \
              -D controller_name="$HOSTNAME" \
              -D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
              --outfile lws.yaml
            kubectl apply -f ./lws.yaml
@@ -178,29 +182,6 @@ jobs:
          run: |
            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
        - name: Determine is success
          run: |
            TIMEOUT=300
            ELAPSED=0
            while [ ! -f "$RESULT_FILE" ]; do
              sleep 5
              ELAPSED=$((ELAPSED + 5))
              if [ $ELAPSED -ge $TIMEOUT ]; then
                echo "Timeout waiting for test result file"
                exit 1
              fi
            done
            RET=$(cat "$RESULT_FILE")
            echo "Test result: $RET"
            if [ "$RET" -ne 0 ]; then
              echo "Test failed"
              exit 1
            else
              echo "Test succeeded"
            fi
        - name: Post process
          if: always()
          run: |
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -49,12 +49,10 @@ concurrency:
 jobs:
  e2e-nightly:
-    name: e2e-nightly
+    name: ${{ inputs.tests }}
    runs-on: ${{ inputs.runner }}
    container:
      image: ${{ inputs.image }}
      env:
        VLLM_USE_MODELSCOPE: True
    steps:
      - name: Check npu and CANN info
        run: |
@@ -111,5 +109,4 @@ jobs:
          VLLM_USE_MODELSCOPE: True
          VLLM_CI_RUNNER: ${{ inputs.runner }}
        run: |
          # TODO: enable more tests
          pytest -sv ${{ inputs.tests }}
--- a/.github/workflows/_kill_lws_resources.yaml
+++ b/.github/workflows/_kill_lws_resources.yaml
@@ -0,0 +1,57 @@
 # Reusable workflow that deletes the `vllm` LeaderWorkerSet from the cluster
 # reached through the kubeconfig supplied by the caller.
 name: 'resource clear'
 on:
  workflow_call:
    inputs:
      # Label of the runner the job is scheduled on; optional, with a default.
      runner:
        required: false
        type: string
        default: linux-aarch64-a3-0
    secrets:
      # Base64-encoded kubeconfig; decoded into $KUBECONFIG by a step below.
      KUBECONFIG_B64:
        required: true
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
 # It's used to activate ascend-toolkit environment variables.
 defaults:
  run:
    shell: bash -el {0}
 jobs:
  resource_clear:
    # This is a runner with no NPU for k8s controller
    runs-on: ${{ inputs.runner }}
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
      env:
        # File the decoded kubeconfig is written to.
        KUBECONFIG: /tmp/kubeconfig
        # Path of the kubectl binary that the install step copies to
        # /usr/local/bin/kubectl (an `_arm` suffix is appended on ARM hosts).
        KUBECTL: /root/.cache/.kube/kubectl
        # Namespace the LeaderWorkerSet is deleted from.
        NAMESPACE: vllm-project
        # NOTE(review): LEADER_POD and RESULT_FILE are not referenced by any
        # step in this workflow - confirm whether they can be dropped.
        LEADER_POD: vllm-0
        RESULT_FILE: /root/.cache/tests/ret/test_result.txt
    steps:
        # Installs the architecture-matching kubectl binary from $KUBECTL and
        # prints the client version as a sanity check.
        - name: Install kubectl
          run: |
            # Install kubectl
            arch=$(uname -m)
            if echo "$arch" | grep -qiE "arm|aarch64"; then
              echo "Detected ARM architecture: $arch"
              KUBECTL="$KUBECTL"_arm
            fi
            install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
            # Verify kubectl installation
            kubectl version --client=true
        # Writes the decoded KUBECONFIG_B64 secret to $KUBECONFIG.
        # NOTE(review): the secret is interpolated directly into the script
        # text; consider passing it through the step's `env:` instead.
        - name: Decode kubeconfig from secrets
          run: |
            # Decode and save kubeconfig
            echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
        # `if: always()` keeps this cleanup step running even when an earlier
        # step failed; `--ignore-not-found` makes a missing resource a no-op.
        - name: Clear LWS resources
          if: always()
          run: |
            kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
--- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml
+++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -101,6 +101,12 @@ jobs:
          - name: multi-node-dpsk-4node-pd
            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
            size: 4
          - name: multi-node-qwenw8a8-2node
            config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
            size: 2
          - name: multi-node-glm-2node
            config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
            size: 2
    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
    with:
      soc_version: a3
@@ -111,3 +117,12 @@ jobs:
      config_file_path: ${{ matrix.test_config.config_file_path }}
    secrets:
      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
  clear_resources:
    needs: multi-node-tests
    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    uses: ./.github/workflows/_kill_lws_resources.yaml
    with:
      runner: linux-aarch64-a3-0
    secrets:
      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
--- a/tests/e2e/nightly/multi_node/config/multi_node_config.py
+++ b/tests/e2e/nightly/multi_node/config/multi_node_config.py
@@ -15,7 +15,7 @@ from tests.e2e.nightly.multi_node.config.utils import (get_avaliable_port,
 setup_logger()
 logger = logging.getLogger(__name__)
-DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py"
+DISAGGREGATED_PREFILL_PROXY_SCRIPT = "examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py"
 DISAGGEGATED_PREFILL_PORT = 5333
--- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
+++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -7,7 +7,7 @@ spec:
  replicas: {{ replicas | default(1) }}
  leaderWorkerTemplate:
    size: {{ size | default(2) }}
-    restartPolicy: RecreateGroupOnPodRestart
+    restartPolicy: None
    leaderTemplate:
      metadata:
        labels:
@@ -30,6 +30,10 @@ spec:
                value: {{ vllm_ascend_remote_url | default("https://github.com/vllm-project/vllm-ascend.git") }}
              - name: RESULT_FILE_PATH
                value: {{ result_file_path | default("/root/.cache/tests/ret/test_result.txt") }}
              - name: CONTROLLER_NAME
                value: {{ controller_name | default("placeholder") }}
              - name: SECRET
                value: {{ kb_secret | default("placeholder") }}
            command:
              - sh
              - -c
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -162,14 +162,31 @@ kill_npu_processes() {
 }
 run_tests() {
-    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
+    set +e
    kill_npu_processes
    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
    ret=$?
    if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
-        mkdir -p "$(dirname "$RESULT_FILE_PATH")"
+        if [ $ret -eq 0 ]; then
-        echo $ret > "$RESULT_FILE_PATH"
+            print_success "All tests passed!"
        else
            print_error "Some tests failed!"
            kubectl delete pod $CONTROLLER_NAME -n vllm-project
        fi
-    return $ret
+    fi
    set -e
 }
 install_kubectl() {
    arch=$(uname -m)
    KUBECTL=/root/.cache/.kube/kubectl
    if echo "$arch" | grep -qiE "arm|aarch64"; then
        echo "Detected ARM architecture: $arch"
        KUBECTL="$KUBECTL"_arm
    fi
    install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
    echo "$SECRET" | base64 -d > /tmp/kubeconfig
    export KUBECONFIG=/tmp/kubeconfig
 }
 main() {
@@ -177,6 +194,7 @@ main() {
    check_and_config
    checkout_src
    install_sys_dependencies
    install_kubectl
    install_vllm
    install_ais_bench
    # to speed up mooncake build process, install Go here
--- a/tools/aisbench.py
+++ b/tools/aisbench.py
@@ -284,12 +284,12 @@ def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
 def maybe_download_from_modelscope(
    model: str,
-    repo_type: str | None = None,
+    repo_type: str = "model",
    revision: str | None = None,
    download_dir: str | None = None,
    ignore_patterns: str | list[str] | None = None,
    allow_patterns: list[str] | str | None = None,
-) -> str | None:
+) -> str:
    """
    Download model/dataset from ModelScope hub.
    Returns the path to the downloaded model, or None if the model is not
@@ -311,4 +311,3 @@ def maybe_download_from_modelscope(
        else:
            model_path = model
    return model_path
    return None