From c2f776b846af7054f88c448980b08bbf4bcf5cb9 Mon Sep 17 00:00:00 2001
From: Li Wang <wangli858794774@gmail.com>
Date: Fri, 26 Dec 2025 11:39:07 +0800
Subject: [PATCH] [Nightly] Initial logging for nightly multi-node testing
 (#5362)

### What this PR does / why we need it?
Currently, the nightly multi-node job only shows the leader (master)
node's logs, streamed via the Kubernetes API. That is not enough to
locate a problem when one of the other nodes fails. This pull request
therefore also streams each worker pod's logs to a file and uploads
those files as a workflow artifact.

Next step: write the logs into a structured directory layout that
includes the logs from each node as well as the plog.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/bc0a5a0c089844b17cb93f3294348f411e523586

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .../workflows/_e2e_nightly_multi_node.yaml    | 36 +++++++++++++++++--
 .../workflows/_e2e_nightly_single_node.yaml   |  7 ----
 tests/e2e/conftest.py                         |  2 +-
 tests/e2e/nightly/multi_node/scripts/run.sh   | 17 +++------
 4 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 04f2aa72..93ad51fc 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -252,14 +252,46 @@ jobs:
         - name: Stream logs
           run: |
             set -euo pipefail
+
+            size="${{ inputs.size }}"
+            pids=()
+
+            cleanup() {
+              echo "Cleaning up background log streams..."
+              for pid in "${pids[@]}"; do
+                kill "$pid" 2>/dev/null || true
+              done
+            }
+            trap cleanup EXIT
+
+            for i in $(seq 1 $((size - 1))); do
+              POD="vllm-0-${i}"
+
+              echo "==== Collecting logs from worker pod: $POD ===="
+              kubectl logs -f "$POD" -n "$NAMESPACE" \
+                > "/tmp/${POD}_logs.txt" 2>&1 &
+
+              pids+=($!)
+            done
+
+            echo "==== Streaming logs from leader pod: $LEADER_POD ===="
             echo "Looking for logs containing: $FAIL_TAG"
-            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
+
+            kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
               echo "$line"
               if echo "$line" | grep -q "$FAIL_TAG"; then
-                exit 1   # workflow step failed
+                exit 1
               fi
             done
 
+        - name: Upload logs
+          if: always()
+          uses: actions/upload-artifact@v4
+          with:
+            name: pod-logs
+            path: /tmp/vllm*_logs.txt
+            retention-days: 7
+
         - name: Post process
           if: always()
           run: |
diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml
index 6bb61115..7fab7613 100644
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -66,13 +66,6 @@ jobs:
           npu-smi info
           cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
 
-      - name: Sync from vllm-Ascend main branch
-        working-directory: /vllm-workspace/vllm-ascend
-        run: |
-          git config --global --add safe.directory "$(pwd)"
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-          git pull origin main
-
       - name: Show vLLM and vLLM-Ascend version
         working-directory: /vllm-workspace
         run: |
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index be9d2a2d..1d993c1c 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -257,7 +257,7 @@ class RemoteOpenAIServer:
                 except RequestException:
                     all_ready = False
                     if should_log:
-                        logger.info(f"[WAIT] {url}: connection failed")
+                        logger.debug(f"[WAIT] {url}: connection failed")
 
                     # check unexpected exit
                     result = self._poll()
diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh
index 2e02f744..8ef48bf4 100644
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -13,6 +13,7 @@ LOG_DIR="/root/.cache/tests/logs"
 OVERWRITE_LOGS=true
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark
+export VLLM_LOGGING_LEVEL="DEBUG"
 
 # Function to print section headers
 print_section() {
@@ -124,27 +125,18 @@ kill_npu_processes() {
   sleep 4
 }
 
-upgrade_vllm_ascend_scr() {
-    # Fix me(Potabk): Remove this once our image build use 
-    # The separate architecture build process currently suffers from errors during cross-compilation
-    # causing the image to fail to build correctly. 
-    # This results in the nightly test code not being the latest version.
-    cd "$WORKSPACE/vllm-ascend"
-    git pull origin main
-    
-}
-
 run_tests_with_log() {
     set +e
     kill_npu_processes
-    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
+    pytest -sv --show-capture=no tests/e2e/nightly/multi_node/test_multi_node.py
     ret=$?
     set -e
     if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
         if [ $ret -eq 0 ]; then
             print_success "All tests passed!"
         else
-            print_failure "Some tests failed!"
+            print_failure "Some tests failed, please check the error stack above for details.\
+            If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
         fi
     fi
 }
@@ -156,7 +148,6 @@ main() {
     if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
         install_extra_components
     fi
-    upgrade_vllm_ascend_scr
     cd "$WORKSPACE/vllm-ascend"
     run_tests_with_log
 }