feat: support compatibility between MTP and two-batch-overlap (#7225)

Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
2025-06-27 01:10:27 -07:00
parent 1b95162008
commit 41650b0d70
7 changed files with 325 additions and 38 deletions
--- a/test/srt/test_dp_attention.py
+++ b/test/srt/test_dp_attention.py
@@ -137,5 +137,86 @@ class TestDPAttentionDP2TP2DeepseekV3MTP(CustomTestCase):
        self.assertGreater(avg_spec_accept_length, 2.5)


+# TODO: enable this MTP + DP-attention + two-batch-overlap (TBO) test later
+# class TestDPAttentionDP2TP2DeepseekV3MTPTBO(CustomTestCase):
+#     @classmethod
+#     def setUpClass(cls):
+#         import os
+
+#         # print debug log for tbo
+#         os.environ["SGLANG_TBO_DEBUG"] = "1"
+#         cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         other_args = [
+#             "--trust-remote-code",
+#             "--disable-radix",
+#             "--speculative-algorithm",
+#             "EAGLE",
+#             "--speculative-num-steps",
+#             "2",
+#             "--speculative-eagle-topk",
+#             "4",
+#             "--speculative-num-draft-tokens",
+#             "4",
+#             "--speculative-draft",
+#             DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+#             "--tp-size",
+#             "2",
+#             "--enable-dp-attention",
+#             "--dp-size",
+#             "2",
+#             "--enable-two-batch-overlap",
+#             "--enable-deepep-moe",
+#             "--deepep-mode",
+#             "low_latency",
+#             "--chunked-prefill-size",
+#             "256",
+#             "--cuda-graph-max-bs",
+#             "32",
+#             "--max-running-requests",
+#             "32",
+#         ]
+#         if not is_in_amd_ci():
+#             other_args += ["--mem-frac", "0.7"]
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             other_args=other_args,
+#         )
+
+#     @classmethod
+#     def tearDownClass(cls):
+#         kill_process_tree(cls.process.pid)
+
+#     def test_gsm8k(self):
+#         requests.get(self.base_url + "/flush_cache")
+
+#         args = SimpleNamespace(
+#             num_shots=5,
+#             data_path=None,
+#             num_questions=200,
+#             max_new_tokens=512,
+#             parallel=128,
+#             host="http://127.0.0.1",
+#             port=int(self.base_url.split(":")[-1]),
+#         )
+#         metrics = run_eval_few_shot_gsm8k(args)
+#         print(metrics)
+
+#         self.assertGreater(metrics["accuracy"], 0.60)
+
+#         server_info = requests.get(self.base_url + "/get_server_info")
+#         avg_spec_accept_length = server_info.json()["internal_states"][0][
+#             "avg_spec_accept_length"
+#         ]
+#         print(
+#             f"###test_gsm8k (deepseek-v3 mtp + dp + tbo):\n"
+#             f"accuracy={metrics['accuracy']:.3f}\n"
+#             f"{avg_spec_accept_length=:.3f}\n"
+#         )
+#         self.assertGreater(avg_spec_accept_length, 2.3)
+
+
 if __name__ == "__main__":
    unittest.main()