Support NextN (MTP) speculative decoding for DeepSeek-V3/R1 (#3582)

2025-02-15 05:28:34 +08:00
parent fb4c9c3a30
commit 862dd76c76
7 changed files with 437 additions and 7 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -262,14 +262,17 @@ class ServerArgs:
            )

        # Speculative Decoding
-        if self.speculative_algorithm == "EAGLE":
+        if (
+            self.speculative_algorithm == "EAGLE"
+            or self.speculative_algorithm == "NEXTN"
+        ):
            self.prefill_only_one_req = True
            self.disable_cuda_graph_padding = True
            self.disable_radix_cache = True
            self.disable_overlap_schedule = True
            self.chunked_prefill_size = -1
            logger.info(
-                "The radix cache, chunked prefill, and overlap scheduler are disabled because of using eagle speculative decoding."
+                f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
            )

        # GGUF
@@ -705,7 +708,7 @@ class ServerArgs:
        parser.add_argument(
            "--speculative-algorithm",
            type=str,
-            choices=["EAGLE"],
+            choices=["EAGLE", "NEXTN"],
            help="Speculative algorithm.",
        )
        parser.add_argument(