[CI]Add CI summary log (#7202)

### What this PR does / why we need it?
This PR adds a new CI log summarizer, `ci_log_summary.py`, and wires it
into unit-test and e2e workflows so failed jobs publish a structured
failure summary to the GitHub step summary.
Examples:
- `python3 .github/workflows/scripts/ci_log_summary.py --log-file
/tmp/unit-test.log --mode ut --step-name "Unit test"`
- `python3 .github/workflows/scripts/ci_log_summary.py --run-id
23127187822 --format json`

A maintenance note is added to `ci_utils.py` to clarify that the `START`
/ `PASSED` / `FAILED (exit code X)` log lines are parsed by
`ci_log_summary.py`, so any future format changes must be coordinated
with the corresponding summarizer regexes.

🤖 Generated with [Codex](mailto:noreply@openai.com)
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: meihanc <jcccx.cmh@gmail.com>
Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
meihanc
2026-03-19 09:32:06 +08:00
committed by GitHub
parent e8f7b2e3f1
commit ab9cd2e305
6 changed files with 1154 additions and 14 deletions

View File

@@ -92,20 +92,33 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-singlecard-light-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-singlecard-light-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize singlecard-light failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run singlecard-light test" \
--log-file /tmp/e2e-singlecard-light-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -183,20 +196,33 @@ jobs:
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 2 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-singlecard-full-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-singlecard \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 2
--auto-partition-size 2 \
2>&1 | tee /tmp/e2e-singlecard-full-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize singlecard-full failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run singlecard-full test" \
--log-file /tmp/e2e-singlecard-full-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
uses: actions/upload-artifact@v4
@@ -271,20 +297,33 @@ jobs:
- name: Run vllm-project/vllm-ascend test (light)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-2card-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-2card-light-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-2card-light \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-2card-light-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize multicard-2-light failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run multicard-2-light test" \
--log-file /tmp/e2e-2card-light-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -360,20 +399,33 @@ jobs:
- name: Run vllm-project/vllm-ascend test (full)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-2-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-2card-full-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-2-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-2card-full-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize multicard-2-full failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run multicard-2-full test" \
--log-file /tmp/e2e-2card-full-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -389,9 +441,21 @@ jobs:
if: ${{ inputs.type == 'full' && matrix.part == 0 }}
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
python3 -m pip uninstall -y triton-ascend
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py \
2>&1 | tee /tmp/e2e-non-triton.log
exit ${PIPESTATUS[0]}
- name: Summarize non-triton failure
if: ${{ always() && inputs.type == 'full' && matrix.part == 0 }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run multicard-2-full test (non triton)" \
--log-file /tmp/e2e-non-triton.log \
--output "$GITHUB_STEP_SUMMARY"
e2e-4-cards-full:
name: multicard-4-full
@@ -457,20 +521,33 @@ jobs:
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-4-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1 \
--auto-upgrade-estimated-times \
--continue-on-error
--continue-on-error \
2>&1 | tee /tmp/e2e-4card-full-part${{ matrix.part }}.log
else
python3 .github/workflows/scripts/run_suite.py \
--suite e2e-multicard-4-cards \
--auto-partition-id "${{ matrix.part }}" \
--auto-partition-size 1
--auto-partition-size 1 \
2>&1 | tee /tmp/e2e-4card-full-part${{ matrix.part }}.log
fi
exit ${PIPESTATUS[0]}
- name: Summarize multicard-4-full failure
if: ${{ always() }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run vllm-project/vllm-ascend test for V1 Engine" \
--log-file /tmp/e2e-4card-full-part${{ matrix.part }}.log \
--output "$GITHUB_STEP_SUMMARY"
- name: Upload timing data
@@ -540,9 +617,21 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
pytest -sv --durations=0 tests/e2e/310p/singlecard/test_dense_model_singlecard.py \
tests/e2e/310p/singlecard/test_vl_model_singlecard.py
tests/e2e/310p/singlecard/test_vl_model_singlecard.py \
2>&1 | tee /tmp/e2e-310p-singlecard.log
exit ${PIPESTATUS[0]}
- name: Summarize 310p singlecard failure
if: ${{ always() && inputs.contains_310 }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run vllm-project/vllm-ascend test" \
--log-file /tmp/e2e-310p-singlecard.log \
--output "$GITHUB_STEP_SUMMARY"
e2e_310p-4cards:
name: 310p multicards 4cards
@@ -602,8 +691,20 @@ jobs:
env:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
shell: bash
run: |
set -o pipefail
pytest -sv --durations=0 \
tests/e2e/310p/multicard/test_dense_model_multicard.py \
tests/e2e/310p/multicard/test_moe_model_multicard.py \
tests/e2e/310p/multicard/test_vl_model_multicard.py
tests/e2e/310p/multicard/test_vl_model_multicard.py \
2>&1 | tee /tmp/e2e-310p-4cards.log
exit ${PIPESTATUS[0]}
- name: Summarize 310p multicards failure
if: ${{ always() && inputs.contains_310 }}
run: |
python3 .github/workflows/scripts/ci_log_summary.py \
--step-name "Run vllm-project/vllm-ascend test" \
--log-file /tmp/e2e-310p-4cards.log \
--output "$GITHUB_STEP_SUMMARY"