[Nightly] Initial logging for nightly multi-node testing (#5362)
### What this PR does / why we need it?
Currently, our multi-node job only shows the master node's logs (via the
Kubernetes API), which is not enough to localize a problem effectively
when one of the other nodes runs into issues. This pull request therefore
also captures the logs of the other nodes and uploads them as a workflow
artifact.
Next step: write the logs into a structured directory, including the logs
from each node and the polog.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main: bc0a5a0c08
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
36
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
36
.github/workflows/_e2e_nightly_multi_node.yaml
vendored
@@ -252,14 +252,46 @@ jobs:
|
||||
- name: Stream logs
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
size="${{ inputs.size }}"
|
||||
pids=()
|
||||
|
||||
cleanup() {
|
||||
echo "Cleaning up background log streams..."
|
||||
for pid in "${pids[@]}"; do
|
||||
kill "$pid" 2>/dev/null || true
|
||||
done
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
for i in $(seq 1 $((size - 1))); do
|
||||
POD="vllm-0-${i}"
|
||||
|
||||
echo "==== Collecting logs from worker pod: $POD ===="
|
||||
kubectl logs -f "$POD" -n "$NAMESPACE" \
|
||||
> "/tmp/${POD}_logs.txt" 2>&1 &
|
||||
|
||||
pids+=($!)
|
||||
done
|
||||
|
||||
echo "==== Streaming logs from leader pod: $LEADER_POD ===="
|
||||
echo "Looking for logs containing: $FAIL_TAG"
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
|
||||
|
||||
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
|
||||
echo "$line"
|
||||
if echo "$line" | grep -q "$FAIL_TAG"; then
|
||||
exit 1 # workflow step failed
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Upload logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: pod-logs
|
||||
path: /tmp/vllm*_logs.txt
|
||||
retention-days: 7
|
||||
|
||||
- name: Post process
|
||||
if: always()
|
||||
run: |
|
||||
|
||||
@@ -66,13 +66,6 @@ jobs:
|
||||
npu-smi info
|
||||
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
|
||||
|
||||
- name: Sync from vllm-Ascend main branch
|
||||
working-directory: /vllm-workspace/vllm-ascend
|
||||
run: |
|
||||
git config --global --add safe.directory "$(pwd)"
|
||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||
git pull origin main
|
||||
|
||||
- name: Show vLLM and vLLM-Ascend version
|
||||
working-directory: /vllm-workspace
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user