Qwen3 MoE: support two-batch overlap (#6598)

2025-05-26 14:08:16 +08:00
parent 16f69b1f65
commit f9bab3d591
5 changed files with 355 additions and 32 deletions
--- a/test/srt/test_two_batch_overlap.py
+++ b/test/srt/test_two_batch_overlap.py
@@ -9,6 +9,7 @@ from sglang.srt.two_batch_overlap import compute_split_seq_index
 from sglang.srt.utils import kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
+    DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
@@ -104,5 +105,32 @@ class TestTwoBatchOverlapUnitTest(unittest.TestCase):
            self.assertEqual(actual, expect)


+class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):  # Subclass that overrides only setUpClass; everything else comes from TestTwoBatchOverlap.
+    @classmethod
+    def setUpClass(cls):  # Class-level fixture: stores the model/URL/key on cls and starts the server via popen_launch_server.
+        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST  # Model constant imported from sglang.test.test_utils.
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"  # NOTE(review): not passed to popen_launch_server here; presumably read by inherited tests -- confirm.
+        cls.process = popen_launch_server(  # Return value kept on cls; presumably cleaned up by an inherited tearDownClass -- TODO confirm.
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[  # Extra command-line flags forwarded to the launched server.
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--dp",
+                "2",
+                "--enable-dp-attention",
+                "--enable-deepep-moe",
+                "--deepep-mode",
+                "normal",
+                "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
+                "--enable-two-batch-overlap",  # The feature this test class exercises.
+            ],
+            env={"SGL_ENABLE_JIT_DEEPGEMM": "0", **os.environ},  # os.environ is unpacked last, so an already-set SGL_ENABLE_JIT_DEEPGEMM overrides the "0" default.
+        )
+
+
 if __name__ == "__main__":
    unittest.main()