[2/N] Added the core structure of elastic EP and the eplb algorithm with faulty rank (#10606)

Co-authored-by: Xun Sun <UNIDY2002@outlook.com>
Co-authored-by: Shangming Cai <csmthu@gmail.com>
2025-10-22 16:13:31 +08:00
parent e028af6998
commit 904655c5fd
7 changed files with 297 additions and 234 deletions
--- a/test/srt/ep/test_mooncake_ep_small.py
+++ b/test/srt/ep/test_mooncake_ep_small.py
@@ -3,6 +3,7 @@ from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_disaggregation_utils import get_rdma_devices_args
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -11,8 +12,12 @@ from sglang.test.test_utils import (
    popen_launch_server,
 )

+ib_devices = get_rdma_devices_args()
+
+
+class TestTP(CustomTestCase):
+    extra_args = []

-class TestPureDP(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
@@ -25,13 +30,10 @@ class TestPureDP(CustomTestCase):
                "--trust-remote-code",
                "--tp",
                "4",
-                "--enable-dp-attention",
-                "--dp",
-                "4",
                "--elastic-ep-backend",
                "mooncake",
                "--mooncake-ib-device",
-                "mlx5_roce0,mlx5_roce1,mlx5_roce2,mlx5_roce3,mlx5_roce4,mlx5_roce5,mlx5_roce6,mlx5_roce7",
+                ib_devices,
                "--moe-a2a-backend",
                "deepep",
                "--deepep-mode",
@@ -44,6 +46,7 @@ class TestPureDP(CustomTestCase):
                "512",
                "--mem-fraction-static",
                "0.5",
+                *cls.extra_args,
            ],
        )

@@ -67,219 +70,73 @@ class TestPureDP(CustomTestCase):
        self.assertGreater(metrics["accuracy"], 0.60)


-class TestHybridDPTP(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "4",
-                "--enable-dp-attention",
-                "--dp",
-                "2",
-                "--elastic-ep-backend",
-                "mooncake",
-                "--mooncake-ib-device",
-                "mlx5_roce0,mlx5_roce1,mlx5_roce2,mlx5_roce3,mlx5_roce4,mlx5_roce5,mlx5_roce6,mlx5_roce7",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "low_latency",
-                "--chunked-prefill-size",
-                "512",
-                "--cuda-graph-max-bs",
-                "128",
-                "--max-running-requests",
-                "256",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
+class TestPureDP(TestTP):
+    extra_args = [
+        "--tp",
+        "4",
+        "--enable-dp-attention",
+        "--dp",
+        "4",
+    ]


-class TestTP(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "4",
-                "--elastic-ep-backend",
-                "mooncake",
-                "--mooncake-ib-device",
-                "mlx5_roce0,mlx5_roce1,mlx5_roce2,mlx5_roce3,mlx5_roce4,mlx5_roce5,mlx5_roce6,mlx5_roce7",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "low_latency",
-                "--chunked-prefill-size",
-                "512",
-                "--cuda-graph-max-bs",
-                "128",
-                "--max-running-requests",
-                "128",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
+class TestHybridDPTP(TestTP):
+    extra_args = [
+        "--tp",
+        "4",
+        "--enable-dp-attention",
+        "--dp",
+        "2",
+    ]


-class TestNoGatherdBuffer(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "4",
-                "--enable-dp-attention",
-                "--dp",
-                "4",
-                "--moe-dense-tp-size",
-                "1",
-                "--enable-dp-lm-head",
-                "--elastic-ep-backend",
-                "mooncake",
-                "--mooncake-ib-device",
-                "mlx5_roce0,mlx5_roce1,mlx5_roce2,mlx5_roce3,mlx5_roce4,mlx5_roce5,mlx5_roce6,mlx5_roce7",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "low_latency",
-                "--chunked-prefill-size",
-                "512",
-                "--cuda-graph-max-bs",
-                "32",
-                "--max-running-requests",
-                "512",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
+class TestNoGatherdBuffer(TestTP):
+    extra_args = [
+        "--tp",
+        "4",
+        "--enable-dp-attention",
+        "--dp",
+        "4",
+        "--moe-dense-tp-size",
+        "1",
+    ]


-class TestTBO(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--tp",
-                "4",
-                "--enable-dp-attention",
-                "--dp",
-                "4",
-                "--moe-dense-tp-size",
-                "1",
-                "--elastic-ep-backend",
-                "mooncake",
-                "--mooncake-ib-device",
-                "mlx5_roce0,mlx5_roce1,mlx5_roce2,mlx5_roce3,mlx5_roce4,mlx5_roce5,mlx5_roce6,mlx5_roce7",
-                "--moe-a2a-backend",
-                "deepep",
-                "--deepep-mode",
-                "low_latency",
-                "--chunked-prefill-size",
-                "512",
-                "--enable-two-batch-overlap",
-                "--cuda-graph-max-bs",
-                "128",
-                "--max-running-requests",
-                "512",
-            ],
-        )
+class TestTBO(TestTP):
+    extra_args = [
+        "--tp",
+        "4",
+        "--enable-dp-attention",
+        "--dp",
+        "4",
+        "--moe-dense-tp-size",
+        "1",
+        "--enable-two-batch-overlap",
+    ]

-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)

-    def test_gsm8k(self):
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
+class TestMooncakeWitchEPLB(TestTP):
+    extra_args = [
+        "--tp",
+        "4",
+        "--enable-dp-attention",
+        "--dp",
+        "4",
+        "--moe-dense-tp-size",
+        "1",
+        "--enable-two-batch-overlap",
+        "--enable-eplb",
+        "--ep-num-redundant-experts",
+        "4",
+        "--eplb-rebalance-num-iterations",
+        "50",
+        "--expert-distribution-recorder-buffer-size",
+        "50",
+        "--expert-distribution-recorder-mode",
+        "stat",
+        "--ep-dispatch-algorithm",
+        "static",
+    ]


 if __name__ == "__main__":