[ci]use H20 to run disaggregation test (#11543)
This commit is contained in:
106
.github/workflows/pr-test-h20.yml
vendored
106
.github/workflows/pr-test-h20.yml
vendored
@@ -1,106 +0,0 @@
|
||||
name: PR Test (H20)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
types: [synchronize, labeled]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
required: true
|
||||
type: choice
|
||||
default: 'release'
|
||||
options:
|
||||
- 'release'
|
||||
- 'nightly'
|
||||
|
||||
concurrency:
|
||||
group: pr-test-h20-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check-changes:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
h20_files: ${{ steps.filter.outputs.h20_files }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Fail if the PR does not have the 'run-ci' label
|
||||
if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
|
||||
run: |
|
||||
echo "This pull request does not have the 'run-ci' label. Failing the workflow."
|
||||
exit 1
|
||||
|
||||
- name: Fail if the PR is a draft
|
||||
if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
|
||||
run: |
|
||||
echo "This pull request is a draft. Failing the workflow."
|
||||
exit 1
|
||||
|
||||
- name: Detect file changes
|
||||
id: filter
|
||||
uses: dorny/paths-filter@v3
|
||||
with:
|
||||
filters: |
|
||||
h20_files:
|
||||
- "python/sglang/srt/models/deepseek*"
|
||||
- "python/sglang/srt/layers/moe/**"
|
||||
- ".github/workflows/pr-test-h20.yml"
|
||||
- "python/pyproject.toml"
|
||||
|
||||
per-commit-8-gpu-h20:
|
||||
needs: [check-changes]
|
||||
if: needs.check-changes.outputs.h20_files == 'true'
|
||||
runs-on: 8-gpu-h20
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
|
||||
run: |
|
||||
cd test/srt
|
||||
python3 run_suite.py --suite per-commit-8-gpu-h20
|
||||
|
||||
pr-test-h20-finish:
|
||||
needs: [
|
||||
check-changes,
|
||||
per-commit-8-gpu-h20,
|
||||
]
|
||||
if: always()
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check all dependent job statuses
|
||||
run: |
|
||||
# Convert the 'needs' context to a JSON string
|
||||
json_needs='${{ toJson(needs) }}'
|
||||
|
||||
# Get a list of all job names from the JSON keys
|
||||
job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')
|
||||
|
||||
for job in $job_names; do
|
||||
# For each job, extract its result
|
||||
result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')
|
||||
|
||||
# Print the job name and its result
|
||||
echo "$job: $result"
|
||||
|
||||
# Check for failure or cancellation and exit if found
|
||||
if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
|
||||
echo "The above jobs failed."
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
# If the loop completes, all jobs were successful
|
||||
echo "All jobs completed successfully"
|
||||
exit 0
|
||||
33
.github/workflows/pr-test.yml
vendored
33
.github/workflows/pr-test.yml
vendored
@@ -350,6 +350,39 @@ jobs:
|
||||
cd test/srt
|
||||
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
||||
|
||||
unit-test-backend-8-gpu-h20:
|
||||
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
|
||||
if: always() && !failure() && !cancelled() &&
|
||||
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
|
||||
runs-on: 8-gpu-h20
|
||||
env:
|
||||
SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
part: [0, 1]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Download artifacts
|
||||
if: needs.check-changes.outputs.sgl_kernel == 'true'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: sgl-kernel/dist/
|
||||
merge-multiple: true
|
||||
pattern: wheel-python3.10-cuda12.9
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 20
|
||||
run: |
|
||||
cd test/srt
|
||||
python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
|
||||
|
||||
performance-test-1-gpu-part-1:
|
||||
needs: [check-changes, sgl-kernel-build-wheels]
|
||||
if: always() && !failure() && !cancelled() &&
|
||||
|
||||
Reference in New Issue
Block a user