Standalone speculative decoding (#10090)

2025-09-07 20:55:09 -07:00
parent 400d3b97ae
commit 8cda5a622c
11 changed files with 285 additions and 9 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -473,9 +473,14 @@ class ServerArgs:
                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
                    reserved_mem = 32 * 1024

+                # draft model and larger cuda graph buffers
                if self.speculative_algorithm is not None:
-                    # draft model and larger cuda graph buffers
-                    reserved_mem += 2 * 1024
+                    if self.speculative_algorithm == "STANDALONE":
+                        # Standalone speculative decoding needs more memory than other speculative
+                        # decoding algorithms since the draft model is typically larger.
+                        reserved_mem += 6 * 1024
+                    else:
+                        reserved_mem += 2 * 1024
                if self.enable_dp_attention:
                    reserved_mem += 4 * 1024

@@ -704,7 +709,12 @@ class ServerArgs:
            # NEXTN shares the same implementation of EAGLE
            self.speculative_algorithm = "EAGLE"

-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
            if self.max_running_requests is None:
                self.max_running_requests = 48
            self.disable_overlap_schedule = True
@@ -1499,7 +1509,7 @@ class ServerArgs:
        parser.add_argument(
            "--speculative-algorithm",
            type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
            help="Speculative algorithm.",
        )
        parser.add_argument(
@@ -2635,7 +2645,9 @@ def auto_choose_speculative_params(self: ServerArgs):
    """
    hf_config = self.get_hf_config()
    arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
    if arch in ["LlamaForCausalLM"]:
        # The default value for llama
        return (5, 4, 8)