[CI]Add CI summary log (#7202)

### What this PR does / why we need it?
This PR adds a new CI log summarizer, `ci_log_summary.py`, and wires it
into unit-test and e2e workflows so failed jobs publish a structured
failure summary to the GitHub step summary.
Examples:
- `python3 .github/workflows/scripts/ci_log_summary.py --log-file
/tmp/unit-test.log --mode ut --step-name "Unit test"`
- `python3 .github/workflows/scripts/ci_log_summary.py --run-id
23127187822 --format json`

A maintenance note is added to `ci_utils.py` to clarify that the `START`
/ `PASSED` / `FAILED (exit code X)` log lines are parsed by
`ci_log_summary.py`, so any future format changes must be coordinated
with the corresponding summarizer regexes.

🤖 Generated with [Codex](mailto:noreply@openai.com)
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: meihanc <jcccx.cmh@gmail.com>
Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
meihanc
2026-03-19 09:32:06 +08:00
committed by GitHub
parent e8f7b2e3f1
commit ab9cd2e305
6 changed files with 1154 additions and 14 deletions

View File

@@ -92,20 +92,33 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-singlecard-light-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-singlecard-light-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize singlecard-light failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run singlecard-light test" \
--log-file /tmp/e2e-singlecard-light-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -183,20 +196,33 @@ jobs:
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 2 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-singlecard-full-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 2
--auto-partition-size 2 \
2>&1 | tee /tmp/e2e-singlecard-full-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize singlecard-full failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run singlecard-full test" \
--log-file /tmp/e2e-singlecard-full-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
uses: actions/upload-artifact@v4
@@ -271,20 +297,33 @@ jobs:
- name: Run vllm-project/vllm-ascend test (light)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-2card-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-2card-light-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-2card-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-2card-light-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize multicard-2-light failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run multicard-2-light test" \
--log-file /tmp/e2e-2card-light-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -360,20 +399,33 @@ jobs:
- name: Run vllm-project/vllm-ascend test (full)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-2-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-2card-full-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-2-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-2card-full-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize multicard-2-full failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run multicard-2-full test" \
--log-file /tmp/e2e-2card-full-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -389,9 +441,21 @@ jobs:
if: ${{ inputs.type == 'full' && matrix.part == 0 }}
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
python3 -m pip uninstall -y triton-ascend
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py \
2>&1 | tee /tmp/e2e-non-triton.log
exit ${PIPESTATUS[0]}
- name: Summarize non-triton failure
if: ${{ always() && inputs.type == 'full' && matrix.part == 0 }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run multicard-2-full test (non triton)" \
--log-file /tmp/e2e-non-triton.log \
--output "$GITHUB_STEP_SUMMARY"
e2e-4-cards-full:
name: multicard-4-full
@@ -457,20 +521,33 @@ jobs:
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-4-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-4card-full-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-4-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-4card-full-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize multicard-4-full failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run vllm-project/vllm-ascend test for V1 Engine" \
--log-file /tmp/e2e-4card-full-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -540,9 +617,21 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
pytest -sv --durations=0 tests/e2e/310p/singlecard/test_dense_model_singlecard.py \
tests/e2e/310p/singlecard/test_vl_model_singlecard.py
tests/e2e/310p/singlecard/test_vl_model_singlecard.py \
2>&1 | tee /tmp/e2e-310p-singlecard.log
exit ${PIPESTATUS[0]}
- name: Summarize 310p singlecard failure
if: ${{ always() && inputs.contains_310 }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run vllm-project/vllm-ascend test" \
--log-file /tmp/e2e-310p-singlecard.log \
--output "$GITHUB_STEP_SUMMARY"
e2e_310p-4cards:
name: 310p multicards 4cards
@@ -602,8 +691,20 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
pytest -sv --durations=0 \
tests/e2e/310p/multicard/test_dense_model_multicard.py \
tests/e2e/310p/multicard/test_moe_model_multicard.py \
tests/e2e/310p/multicard/test_vl_model_multicard.py
tests/e2e/310p/multicard/test_vl_model_multicard.py \
2>&1 | tee /tmp/e2e-310p-4cards.log
exit ${PIPESTATUS[0]}
- name: Summarize 310p multicards failure
if: ${{ always() && inputs.contains_310 }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run vllm-project/vllm-ascend test" \
--log-file /tmp/e2e-310p-4cards.log \
--output "$GITHUB_STEP_SUMMARY"