From 195a59fe238f7dd32d5af97d829fc327653b579d Mon Sep 17 00:00:00 2001 From: Sai Enduri Date: Wed, 1 Oct 2025 01:12:28 -0700 Subject: [PATCH] Refactor AMD CI. (#11128) --- .github/workflows/pr-test-amd.yml | 48 ++++++++----------------------- test/srt/run_suite.py | 6 ++-- 2 files changed, 15 insertions(+), 39 deletions(-) diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 0a2d01a21..3efa5c2f1 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -30,7 +30,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-1, linux-mi325-gpu-1] + runner: [linux-mi325-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -56,7 +56,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-2, linux-mi325-gpu-2, linux-mi35x-gpu-2] + runner: [linux-mi325-gpu-2] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -80,7 +80,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-1, linux-mi325-gpu-1] + runner: [linux-mi325-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -104,7 +104,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-1, linux-mi325-gpu-1] + runner: [linux-mi325-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -144,7 +144,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-1, linux-mi325-gpu-1] + runner: [linux-mi325-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -178,7 +178,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-2, linux-mi325-gpu-2] + runner: [linux-mi325-gpu-2] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -222,8 +222,8 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-1, linux-mi325-gpu-1] - part: [0, 1, 2, 3, 4, 5, 6, 7] + runner: [linux-mi325-gpu-1] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -238,40 +238,16 @@ jobs: run: bash scripts/ci/amd_ci_install_dependency.sh - name: Run test - timeout-minutes: 50 + timeout-minutes: 30 run: | - bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 8 - - unit-test-backend-1-gpu-amd-mi35x: - if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci') - strategy: - fail-fast: false - matrix: - runner: [linux-mi35x-gpu-1] - runs-on: ${{matrix.runner}} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Start CI container - run: bash scripts/ci/amd_ci_start_container.sh - env: - GITHUB_WORKSPACE: ${{ github.workspace }} - - - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh - - - name: Run test - timeout-minutes: 50 - run: | - bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x + bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12 unit-test-backend-2-gpu-amd: if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci') strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-2, linux-mi325-gpu-2] + runner: [linux-mi325-gpu-2] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -319,7 +295,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux-mi300-gpu-1, linux-mi325-gpu-1] + runner: [linux-mi325-gpu-1] runs-on: ${{matrix.runner}} steps: - name: Checkout code diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index a93b3f47c..71862a7e8 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -184,7 +184,7 @@ suite_amd = { TestFile("lora/test_multi_lora_backend.py", 60), TestFile("lora/test_lora_cuda_graph.py", 250), TestFile("lora/test_lora_qwen3.py", 97), - TestFile("models/test_embedding_models.py", 73), + # TestFile("models/test_embedding_models.py", 73), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127 TestFile("models/test_compressed_tensors_models.py", 42), TestFile("models/test_qwen_models.py", 82), TestFile("models/test_reward_models.py", 132), @@ -246,7 +246,7 @@ suite_amd = { TestFile("test_triton_attention_backend.py", 150), # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701 TestFile("test_wave_attention_kernels.py", 2), - TestFile("test_wave_attention_backend.py", 150), + # TestFile("test_wave_attention_backend.py", 150), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127 ], "per-commit-amd-mi35x": [ TestFile("test_mla.py", 242), @@ -257,7 +257,7 @@ suite_amd = { TestFile("rl/test_update_weights_from_distributed.py", 103), TestFile("test_data_parallelism.py", 73), TestFile("test_load_weights_from_remote_instance.py", 72), - TestFile("test_patch_torch.py", 19), + # TestFile("test_patch_torch.py", 19), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127 ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150),