[Nightly] Initial logging for nightly multi-node testing (#5362)

### What this PR does / why we need it?
Currently, our multi-node logs only show the master node's logs (via the
Kubernetes API), which is insufficient for effective problem
localization if other nodes experience issues. Therefore, this pull
request adds the ability to upload logs for other nodes.

Next plan: output logs in a structured directory layout, including the logs
from each node and the plog (Ascend process logs).
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: release/v0.13.0
- vLLM main:
bc0a5a0c08

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-12-26 11:39:07 +08:00
committed by GitHub
parent 320877d488
commit c2f776b846
4 changed files with 39 additions and 23 deletions

View File

@@ -252,14 +252,46 @@ jobs:
- name: Stream logs - name: Stream logs
run: | run: |
set -euo pipefail set -euo pipefail
size="${{ inputs.size }}"
pids=()

# Kill every background `kubectl logs -f` stream when the step exits,
# so the job does not leave lingering processes behind.
cleanup() {
  echo "Cleaning up background log streams..."
  # Guard: expanding an empty array with "${pids[@]}" errors under
  # `set -u` on bash < 4.4 (possible when size == 1 → no workers).
  (( ${#pids[@]} == 0 )) && return 0
  for pid in "${pids[@]}"; do
    kill "$pid" 2>/dev/null || true
  done
}
trap cleanup EXIT

# Stream each worker pod's logs to a file in the background; pod index 0
# is the leader and is streamed in the foreground below.
for (( i = 1; i < size; i++ )); do
  POD="vllm-0-${i}"
  echo "==== Collecting logs from worker pod: $POD ===="
  kubectl logs -f "$POD" -n "$NAMESPACE" \
    > "/tmp/${POD}_logs.txt" 2>&1 &
  pids+=("$!")
done
echo "==== Streaming logs from leader pod: $LEADER_POD ===="
echo "Looking for logs containing: $FAIL_TAG" echo "Looking for logs containing: $FAIL_TAG"
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
echo "$line" echo "$line"
if echo "$line" | grep -q "$FAIL_TAG"; then if echo "$line" | grep -q "$FAIL_TAG"; then
exit 1 # workflow step failed exit 1
fi fi
done done
- name: Upload logs
if: always()
uses: actions/upload-artifact@v4
with:
name: pod-logs
path: /tmp/vllm*_logs.txt
retention-days: 7
- name: Post process - name: Post process
if: always() if: always()
run: | run: |

View File

@@ -66,13 +66,6 @@ jobs:
npu-smi info npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Sync from vllm-Ascend main branch
working-directory: /vllm-workspace/vllm-ascend
run: |
git config --global --add safe.directory "$(pwd)"
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
git pull origin main
- name: Show vLLM and vLLM-Ascend version - name: Show vLLM and vLLM-Ascend version
working-directory: /vllm-workspace working-directory: /vllm-workspace
run: | run: |

View File

@@ -257,7 +257,7 @@ class RemoteOpenAIServer:
except RequestException: except RequestException:
all_ready = False all_ready = False
if should_log: if should_log:
logger.info(f"[WAIT] {url}: connection failed") logger.debug(f"[WAIT] {url}: connection failed")
# check unexpected exit # check unexpected exit
result = self._poll() result = self._poll()

View File

@@ -13,6 +13,7 @@ LOG_DIR="/root/.cache/tests/logs"
OVERWRITE_LOGS=true OVERWRITE_LOGS=true
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark
export VLLM_LOGGING_LEVEL="DEBUG"
# Function to print section headers # Function to print section headers
print_section() { print_section() {
@@ -124,27 +125,18 @@ kill_npu_processes() {
sleep 4 sleep 4
} }
upgrade_vllm_ascend_scr() {
# Fix me(Potabk): Remove this once our image build use
# The separate architecture build process currently suffers from errors during cross-compilation
# causing the image to fail to build correctly.
# This results in the nightly test code not being the latest version.
cd "$WORKSPACE/vllm-ascend"
git pull origin main
}
run_tests_with_log() { run_tests_with_log() {
set +e set +e
kill_npu_processes kill_npu_processes
pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py pytest -sv --show-capture=no tests/e2e/nightly/multi_node/test_multi_node.py
ret=$? ret=$?
set -e set -e
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
if [ $ret -eq 0 ]; then if [ $ret -eq 0 ]; then
print_success "All tests passed!" print_success "All tests passed!"
else else
print_failure "Some tests failed!" print_failure "Some tests failed, please check the error stack above for details.\
If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
fi fi
fi fi
} }
@@ -156,7 +148,6 @@ main() {
if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
install_extra_components install_extra_components
fi fi
upgrade_vllm_ascend_scr
cd "$WORKSPACE/vllm-ascend" cd "$WORKSPACE/vllm-ascend"
run_tests_with_log run_tests_with_log
} }