Nightly e2e: collect worker pod logs and stop syncing test code from main

Text before the first "diff --git" header is commentary; patch tools skip it.

What this patch does
  * .github/workflows/_e2e_nightly_multi_node.yaml
      - "Stream logs" now follows every worker pod (vllm-0-1 .. vllm-0-<size-1>)
        in the background, writing /tmp/<pod>_logs.txt, and kills those
        streams from an EXIT trap. The leader pod is still streamed in the
        foreground and the step fails when a line contains $FAIL_TAG.
      - New "Upload logs" step publishes /tmp/vllm*_logs.txt as an artifact.
  * .github/workflows/_e2e_nightly_single_node.yaml
      - Removes the "Sync from vllm-Ascend main branch" step.
  * tests/e2e/conftest.py
      - Demotes the "[WAIT] ... connection failed" message from info to debug.
  * tests/e2e/nightly/multi_node/scripts/run.sh
      - Exports VLLM_LOGGING_LEVEL=DEBUG, removes upgrade_vllm_ascend_scr and
        its call, passes --show-capture=no to pytest, and prints a longer
        failure hint.

Review notes
  - Leading whitespace on hunk lines was reconstructed from a copy in which
    line breaks and indentation had been collapsed. If a hunk is rejected,
    retry with whitespace-insensitive matching (git apply --ignore-whitespace
    or patch -l).
  - The "index" lines are carried over unchanged; the post-image hashes of the
    two files whose added lines were revised here are therefore not
    authoritative.

diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
index 04f2aa72..93ad51fc 100644
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -252,14 +252,53 @@ jobs:
       - name: Stream logs
         run: |
           set -euo pipefail
+
+          size="${{ inputs.size }}"
+          pids=()
+
+          # Stop the background worker log streams on every exit path.
+          cleanup() {
+            echo "Cleaning up background log streams..."
+            # ${pids[@]+...} expands to nothing when the array is empty, so
+            # `set -u` cannot abort the trap on bash releases before 4.4.
+            for pid in ${pids[@]+"${pids[@]}"}; do
+              kill "$pid" 2>/dev/null || true
+            done
+          }
+          trap cleanup EXIT
+
+          # Follow each worker pod in the background, one file per pod under /tmp.
+          for ((i = 1; i < size; i++)); do
+            POD="vllm-0-${i}"
+
+            echo "==== Collecting logs from worker pod: $POD ===="
+            kubectl logs -f "$POD" -n "$NAMESPACE" \
+              > "/tmp/${POD}_logs.txt" 2>&1 &
+
+            pids+=("$!")
+          done
+
+          echo "==== Streaming logs from leader pod: $LEADER_POD ===="
           echo "Looking for logs containing: $FAIL_TAG"
-          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while read -r line; do
+
+          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" | while IFS= read -r line; do
             echo "$line"
             if echo "$line" | grep -q "$FAIL_TAG"; then
-              exit 1  # workflow step failed
+              exit 1 # fail this step once the failure tag shows up
             fi
           done
 
+      # NOTE(review): upload-artifact@v4 rejects a second artifact with the same
+      # name in one workflow run; confirm this workflow is invoked only once per
+      # run, or make the name unique per invocation.
+      - name: Upload logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: pod-logs
+          path: /tmp/vllm*_logs.txt
+          retention-days: 7
+
       - name: Post process
         if: always()
         run: |
diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml
index 6bb61115..7fab7613 100644
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -66,13 +66,6 @@ jobs:
           npu-smi info
           cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
 
-      - name: Sync from vllm-Ascend main branch
-        working-directory: /vllm-workspace/vllm-ascend
-        run: |
-          git config --global --add safe.directory "$(pwd)"
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-          git pull origin main
-
       - name: Show vLLM and vLLM-Ascend version
         working-directory: /vllm-workspace
         run: |
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
index be9d2a2d..1d993c1c 100644
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -257,7 +257,7 @@ class RemoteOpenAIServer:
                 except RequestException:
                     all_ready = False
                     if should_log:
-                        logger.info(f"[WAIT] {url}: connection failed")
+                        logger.debug(f"[WAIT] {url}: connection failed")
 
             # check unexpected exit
             result = self._poll()
diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh
index 2e02f744..8ef48bf4 100644
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -13,6 +13,7 @@ LOG_DIR="/root/.cache/tests/logs"
 OVERWRITE_LOGS=true
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark
+export VLLM_LOGGING_LEVEL="DEBUG"
 
 # Function to print section headers
 print_section() {
@@ -124,27 +125,21 @@ kill_npu_processes() {
     sleep 4
 }
 
-upgrade_vllm_ascend_scr() {
-    # Fix me(Potabk): Remove this once our image build use
-    # The separate architecture build process currently suffers from errors during cross-compilation
-    # causing the image to fail to build correctly.
-    # This results in the nightly test code not being the latest version.
-    cd "$WORKSPACE/vllm-ascend"
-    git pull origin main
-
-}
-
 run_tests_with_log() {
     set +e
     kill_npu_processes
-    pytest -sv tests/e2e/nightly/multi_node/test_multi_node.py
+    pytest -sv --show-capture=no tests/e2e/nightly/multi_node/test_multi_node.py
     ret=$?
     set -e
     if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
         if [ $ret -eq 0 ]; then
             print_success "All tests passed!"
         else
-            print_failure "Some tests failed!"
+            # Assemble the message in two steps: a backslash-newline inside the
+            # quotes would carry the next line's indentation into the output.
+            local fail_msg="Some tests failed, please check the error stack above for details."
+            fail_msg+=" If this is insufficient to pinpoint the error, please download and review the logs of all other nodes from the job's summary."
+            print_failure "$fail_msg"
         fi
     fi
 }
@@ -156,7 +151,6 @@ main() {
     if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
         install_extra_components
     fi
-    upgrade_vllm_ascend_scr
     cd "$WORKSPACE/vllm-ascend"
     run_tests_with_log
 }