From 195a59fe238f7dd32d5af97d829fc327653b579d Mon Sep 17 00:00:00 2001
From: Sai Enduri <saimanas.enduri@amd.com>
Date: Wed, 1 Oct 2025 01:12:28 -0700
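Subject: [PATCH] Refactor AMD CI. (#11128)

Restrict the runner matrix of the AMD PR-test jobs touched here to the
linux-mi325 runners, dropping the linux-mi300 and linux-mi35x entries,
and remove the unit-test-backend-1-gpu-amd-mi35x job.

Run the per-commit-amd suite in 12 auto-partitions instead of 8 and
lower that step's timeout from 50 to 30 minutes.

Temporarily disable models/test_embedding_models.py,
test_wave_attention_backend.py and test_patch_torch.py in suite_amd;
see https://github.com/sgl-project/sglang/issues/11127.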
---
 .github/workflows/pr-test-amd.yml | 48 ++++++++-----------------------
 test/srt/run_suite.py             |  6 ++--
 2 files changed, 15 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 0a2d01a21..3efa5c2f1 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -30,7 +30,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        runner: [linux-mi325-gpu-1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -56,7 +56,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2, linux-mi35x-gpu-2]
+        runner: [linux-mi325-gpu-2]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -80,7 +80,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        runner: [linux-mi325-gpu-1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -104,7 +104,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        runner: [linux-mi325-gpu-1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -144,7 +144,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        runner: [linux-mi325-gpu-1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -178,7 +178,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
+        runner: [linux-mi325-gpu-2]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -222,8 +222,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
-        part: [0, 1, 2, 3, 4, 5, 6, 7]
+        runner: [linux-mi325-gpu-1]
+        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -238,40 +238,16 @@ jobs:
         run: bash scripts/ci/amd_ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 50
+        timeout-minutes: 30
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
-
-  unit-test-backend-1-gpu-amd-mi35x:
-    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux-mi35x-gpu-1]
-    runs-on: ${{matrix.runner}}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Start CI container
-        run: bash scripts/ci/amd_ci_start_container.sh
-        env:
-          GITHUB_WORKSPACE: ${{ github.workspace }}
-
-      - name: Install dependencies
-        run: bash scripts/ci/amd_ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 50
-        run: |
-          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12
 
   unit-test-backend-2-gpu-amd:
     if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
+        runner: [linux-mi325-gpu-2]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -319,7 +295,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        runner: [linux-mi325-gpu-1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index a93b3f47c..71862a7e8 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -184,7 +184,7 @@ suite_amd = {
         TestFile("lora/test_multi_lora_backend.py", 60),
         TestFile("lora/test_lora_cuda_graph.py", 250),
         TestFile("lora/test_lora_qwen3.py", 97),
-        TestFile("models/test_embedding_models.py", 73),
+        # TestFile("models/test_embedding_models.py", 73), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
         TestFile("models/test_compressed_tensors_models.py", 42),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 132),
@@ -246,7 +246,7 @@ suite_amd = {
         TestFile("test_triton_attention_backend.py", 150),
         # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
         TestFile("test_wave_attention_kernels.py", 2),
-        TestFile("test_wave_attention_backend.py", 150),
+        # TestFile("test_wave_attention_backend.py", 150), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
     ],
     "per-commit-amd-mi35x": [
         TestFile("test_mla.py", 242),
@@ -257,7 +257,7 @@ suite_amd = {
         TestFile("rl/test_update_weights_from_distributed.py", 103),
         TestFile("test_data_parallelism.py", 73),
         TestFile("test_load_weights_from_remote_instance.py", 72),
-        TestFile("test_patch_torch.py", 19),
+        # TestFile("test_patch_torch.py", 19), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
     ],
     "per-commit-4-gpu-amd": [
         TestFile("test_pp_single_node.py", 150),