[AMD] Add unit-test-sgl-kernel-amd to AMD CI (#7539)

2025-06-29 15:50:09 -07:00
parent b691dcc490
commit 3b3f1e3aeb
3 changed files with 55 additions and 12 deletions
--- a/test/srt/test_custom_allreduce.py
+++ b/test/srt/test_custom_allreduce.py
@@ -56,22 +56,30 @@ def multi_process_parallel(


 class TestCustomAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        512,
+        4096,
+        32768,
+        262144,
+        2097152,
+        16777216,
+        33554432,
+    ]  # 512 ... 32Mi elements (used as tensor length, not a byte count)
+    WORLD_SIZES = [2, 4, 6, 8]
+    TEST_LOOP = 10
+
    @classmethod
    def setUpClass(cls):
-        random.seed(42)
-        # 512B to 32MB
-        cls.test_sizes = [512, 4096, 32768, 262144, 2097152, 16777216, 33554432]
-        cls.world_sizes = [2, 4, 6, 8]
-        cls.test_loop = 10
+        random.seed(42)  # keep the deterministic seed

    def test_graph_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
            if world_size > torch.cuda.device_count():
                continue
            multi_process_parallel(world_size, self, self.graph_allreduce)

    def test_eager_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
            if world_size > torch.cuda.device_count():
                continue
            multi_process_parallel(world_size, self, self.eager_allreduce)
@@ -102,9 +110,9 @@ class TestCustomAllReduce(CustomTestCase):
        torch.cuda.synchronize()
        del data

-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                    with graph_capture() as graph_capture_context:
                        # use integers so result matches NCCL exactly
                        inp1 = torch.randint(
@@ -151,9 +159,9 @@ class TestCustomAllReduce(CustomTestCase):
        initialize_model_parallel(tensor_model_parallel_size=world_size)
        group = get_tensor_model_parallel_group().device_group

-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                    inp1 = torch.randint(
                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
                    )