[CI] Optimize nightly CI (#3898)
### What this PR does / why we need it?
This patch mainly fixes the problem of not being able to determine the
exit status of the pod's entrypoint script, along with some other tiny
optimizations:
1. Shorten the wait-for-server timeout
2. Fix a typo
3. Fix the issue of ais_bench failing to correctly access the proxy URL
in a PD separation scenario.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
24
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
24
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -60,13 +60,13 @@ defaults:
|
||||
# only cancel in-progress runs of the same workflow
|
||||
# and ignore the lint / 8 cards test type
|
||||
concurrency:
|
||||
group: ascend-nightly-${{ github.ref }}-${{ inputs.config_file_path }}
|
||||
group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
name: ${{ inputs.config_file_path }}
|
||||
# This is a runner with no NPU for k8s controller
|
||||
# This is the runner with no NPU for k8s controller
|
||||
runs-on: ${{ inputs.runner }}
|
||||
container:
|
||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
||||
@@ -75,7 +75,7 @@ jobs:
|
||||
KUBECTL: /root/.cache/.kube/kubectl
|
||||
NAMESPACE: vllm-project
|
||||
LEADER_POD: vllm-0
|
||||
RESULT_FILE: /root/.cache/tests/ret/test_result.txt
|
||||
RESULT_FILE: /root/.cache/tests/ret_${{ inputs.soc_version }}
|
||||
steps:
|
||||
- name: Install system denpendencies
|
||||
run: |
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
pip install jinja2-cli
|
||||
|
||||
apt-get update -y && apt-get install -y git curl
|
||||
#apt-get update -y && apt-get install -y git curl
|
||||
|
||||
- name: Install kubectl
|
||||
run: |
|
||||
@@ -117,8 +117,8 @@ jobs:
|
||||
run: |
|
||||
# pre clear the crd resources created by lws
|
||||
kubectl delete leaderworkerset vllm -n "$NAMESPACE" --ignore-not-found
|
||||
|
||||
- name: Launch cluster
|
||||
id: launcher
|
||||
run: |
|
||||
set -e
|
||||
|
||||
@@ -130,6 +130,8 @@ jobs:
|
||||
vllm_ascend_ref="${{ inputs.vllm_ascend_ref }}"
|
||||
vllm_ascend_remote_url="${{ inputs.vllm_ascend_remote_url }}"
|
||||
result_file_path="$RESULT_FILE"
|
||||
fail_tag=FAIL_TAG_"${{ inputs.config_file_path }}"
|
||||
echo "FAIL_TAG=${fail_tag}" >> $GITHUB_ENV
|
||||
|
||||
required_params=("size" "replicas" "image" "config_file_path")
|
||||
for param in "${required_params[@]}"; do
|
||||
@@ -155,8 +157,7 @@ jobs:
|
||||
-D vllm_ascend_ref="$vllm_ascend_ref" \
|
||||
-D result_file_path="$result_file_path" \
|
||||
-D npu_per_node="$npu_per_node" \
|
||||
-D controller_name="$HOSTNAME" \
|
||||
-D kb_secret=${{ secrets.KUBECONFIG_B64 }} \
|
||||
-D fail_tag="$fail_tag" \
|
||||
--outfile lws.yaml
|
||||
|
||||
kubectl apply -f ./lws.yaml
|
||||
@@ -180,7 +181,14 @@ jobs:
|
||||
|
||||
- name: Stream logs
|
||||
run: |
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
|
||||
set -euo pipefail
|
||||
echo "Looking for logs containing: $FAIL_TAG"
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
|
||||
echo "$line"
|
||||
if echo "$line" | grep -q "$FAIL_TAG"; then
|
||||
exit 1 # workflow step failed
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Post process
|
||||
if: always()
|
||||
|
||||
Reference in New Issue
Block a user