Add piecewise CUDA graph support for Qwen3-MoE (#11845)

2025-10-21 10:55:49 +08:00
parent 74de76c685
commit 8374a96e49
4 changed files with 71 additions and 6 deletions
--- a/test/srt/test_piecewise_cuda_graph.py
+++ b/test/srt/test_piecewise_cuda_graph.py
@@ -55,5 +55,45 @@ class TestPiecewiseCudaGraphBenchmark(CustomTestCase):
        self.assertLess(prefill_latency, 0.015)


+class TestPiecewiseCudaGraphQwen3MoE(CustomTestCase):
+    """Launch Qwen/Qwen3-Coder-30B-A3B-Instruct with piecewise CUDA graph enabled and check eval accuracy."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(  # one launch shared by every test method in this class
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--enable-piecewise-cuda-graph",
+                "--piecewise-cuda-graph-compiler",
+                "eager",  # value passed to --piecewise-cuda-graph-compiler
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)  # tear down the process started in setUpClass
+
+    def test_gsm8k_accuracy(self):
+        """Run the "mgsm_en" eval against the launched server and require a score of at least 0.90."""
+        num_examples = 2000  # NOTE(review): may exceed what "mgsm_en" contains; confirm how run_eval caps it
+
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",  # NOTE(review): method name and print label say GSM8K; confirm intended eval
+            num_examples=num_examples,
+            num_threads=min(num_examples, 1024),  # never more threads than examples, capped at 1024
+        )
+
+        metrics = run_eval(args)
+        print(f"GSM8K Accuracy: {metrics['score']:.3f}")
+
+        self.assertGreaterEqual(metrics["score"], 0.90)  # minimum acceptable score for this configuration
+
+
 if __name__ == "__main__":
    unittest.main()