[CI] Add CI summary log (#7202)
### What this PR does / why we need it?
This PR adds a new CI log summarizer, `ci_log_summary.py`, and wires it
into unit-test and e2e workflows so failed jobs publish a structured
failure summary to the GitHub step summary.
Examples:
- `python3 .github/workflows/scripts/ci_log_summary.py --log-file
/tmp/unit-test.log --mode ut --step-name "Unit test"`
- `python3 .github/workflows/scripts/ci_log_summary.py --run-id
23127187822 --format json`
A maintenance note is added to `ci_utils.py` to clarify that the `START`
/ `PASSED` / `FAILED (exit code X)` log lines are parsed by
`ci_log_summary.py`, so any future format changes must be coordinated
with the corresponding summarizer regexes.
🤖 Generated with Codex <noreply@openai.com>
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: meihanc <jcccx.cmh@gmail.com>
Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
127
.github/workflows/_e2e_test.yaml
vendored
127
.github/workflows/_e2e_test.yaml
vendored
@@ -92,20 +92,33 @@ jobs:
|
||||
env:
|
||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-singlecard-light \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1 \
|
||||
--auto-upgrade-estimated-times \
|
||||
--continue-on-error
|
||||
--continue-on-error \
|
||||
2>&1 | tee /tmp/e2e-singlecard-light-part${{ matrix.part }}.log
|
||||
else
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-singlecard-light \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1
|
||||
--auto-partition-size 1 \
|
||||
2>&1 | tee /tmp/e2e-singlecard-light-part${{ matrix.part }}.log
|
||||
fi
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize singlecard-light failure
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run singlecard-light test" \
|
||||
--log-file /tmp/e2e-singlecard-light-part${{ matrix.part }}.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
|
||||
- name: Upload timing data
|
||||
@@ -183,20 +196,33 @@ jobs:
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-singlecard \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 2 \
|
||||
--auto-upgrade-estimated-times \
|
||||
--continue-on-error
|
||||
--continue-on-error \
|
||||
2>&1 | tee /tmp/e2e-singlecard-full-part${{ matrix.part }}.log
|
||||
else
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-singlecard \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 2
|
||||
--auto-partition-size 2 \
|
||||
2>&1 | tee /tmp/e2e-singlecard-full-part${{ matrix.part }}.log
|
||||
fi
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize singlecard-full failure
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run singlecard-full test" \
|
||||
--log-file /tmp/e2e-singlecard-full-part${{ matrix.part }}.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
- name: Upload timing data
|
||||
uses: actions/upload-artifact@v4
|
||||
@@ -271,20 +297,33 @@ jobs:
|
||||
- name: Run vllm-project/vllm-ascend test (light)
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-2card-light \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1 \
|
||||
--auto-upgrade-estimated-times \
|
||||
--continue-on-error
|
||||
--continue-on-error \
|
||||
2>&1 | tee /tmp/e2e-2card-light-part${{ matrix.part }}.log
|
||||
else
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-2card-light \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1
|
||||
--auto-partition-size 1 \
|
||||
2>&1 | tee /tmp/e2e-2card-light-part${{ matrix.part }}.log
|
||||
fi
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize multicard-2-light failure
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run multicard-2-light test" \
|
||||
--log-file /tmp/e2e-2card-light-part${{ matrix.part }}.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
|
||||
- name: Upload timing data
|
||||
@@ -360,20 +399,33 @@ jobs:
|
||||
- name: Run vllm-project/vllm-ascend test (full)
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-multicard-2-cards \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1 \
|
||||
--auto-upgrade-estimated-times \
|
||||
--continue-on-error
|
||||
--continue-on-error \
|
||||
2>&1 | tee /tmp/e2e-2card-full-part${{ matrix.part }}.log
|
||||
else
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-multicard-2-cards \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1
|
||||
--auto-partition-size 1 \
|
||||
2>&1 | tee /tmp/e2e-2card-full-part${{ matrix.part }}.log
|
||||
fi
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize multicard-2-full failure
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run multicard-2-full test " \
|
||||
--log-file /tmp/e2e-2card-full-part${{ matrix.part }}.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
|
||||
- name: Upload timing data
|
||||
@@ -389,9 +441,21 @@ jobs:
|
||||
if: ${{ inputs.type == 'full' && matrix.part == 0 }}
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
python3 -m pip uninstall -y triton-ascend
|
||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py
|
||||
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py \
|
||||
2>&1 | tee /tmp/e2e-non-triton.log
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize non-triton failure
|
||||
if: ${{ always() && inputs.type == 'full' && matrix.part == 0 }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run multicard-2-full test (non triton)" \
|
||||
--log-file /tmp/e2e-non-triton.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
e2e-4-cards-full:
|
||||
name: multicard-4-full
|
||||
@@ -457,20 +521,33 @@ jobs:
|
||||
- name: Run vllm-project/vllm-ascend test for V1 Engine
|
||||
env:
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
if [ "${{ inputs.continue_on_error }}" = "true" ]; then
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-multicard-4-cards \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1 \
|
||||
--auto-upgrade-estimated-times \
|
||||
--continue-on-error
|
||||
--continue-on-error \
|
||||
2>&1 | tee /tmp/e2e-4card-full-part${{ matrix.part }}.log
|
||||
else
|
||||
python3 .github/workflows/scripts/run_suite.py \
|
||||
--suite e2e-multicard-4-cards \
|
||||
--auto-partition-id "${{ matrix.part }}" \
|
||||
--auto-partition-size 1
|
||||
--auto-partition-size 1 \
|
||||
2>&1 | tee /tmp/e2e-4card-full-part${{ matrix.part }}.log
|
||||
fi
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize multicard-4-full failure
|
||||
if: ${{ always() }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run vllm-project/vllm-ascend test for V1 Engine" \
|
||||
--log-file /tmp/e2e-4card-full-part${{ matrix.part }}.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
|
||||
- name: Upload timing data
|
||||
@@ -540,9 +617,21 @@ jobs:
|
||||
env:
|
||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
pytest -sv --durations=0 tests/e2e/310p/singlecard/test_dense_model_singlecard.py \
|
||||
tests/e2e/310p/singlecard/test_vl_model_singlecard.py
|
||||
tests/e2e/310p/singlecard/test_vl_model_singlecard.py \
|
||||
2>&1 | tee /tmp/e2e-310p-singlecard.log
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize 310p singlecard failure
|
||||
if: ${{ always() && inputs.contains_310 }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run vllm-project/vllm-ascend test" \
|
||||
--log-file /tmp/e2e-310p-singlecard.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
e2e_310p-4cards:
|
||||
name: 310p multicards 4cards
|
||||
@@ -602,8 +691,20 @@ jobs:
|
||||
env:
|
||||
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
|
||||
VLLM_WORKER_MULTIPROC_METHOD: spawn
|
||||
shell: bash
|
||||
run: |
|
||||
set -o pipefail
|
||||
pytest -sv --durations=0 \
|
||||
tests/e2e/310p/multicard/test_dense_model_multicard.py \
|
||||
tests/e2e/310p/multicard/test_moe_model_multicard.py \
|
||||
tests/e2e/310p/multicard/test_vl_model_multicard.py
|
||||
tests/e2e/310p/multicard/test_vl_model_multicard.py \
|
||||
2>&1 | tee /tmp/e2e-310p-4cards.log
|
||||
exit ${PIPESTATUS[0]}
|
||||
|
||||
- name: Summarize 310p multicards failure
|
||||
if: ${{ always() && inputs.contains_310 }}
|
||||
run: |
|
||||
python3 .github/workflows/scripts/ci_log_summary.py \
|
||||
--step-name "Run vllm-project/vllm-ascend test" \
|
||||
--log-file /tmp/e2e-310p-4cards.log \
|
||||
--output "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
Reference in New Issue
Block a user