Add speculator attention backend switch (#9981)

2025-09-08 06:44:36 +02:00
parent 3b99f23c44
commit 8c5930f08a
6 changed files with 130 additions and 54 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -262,6 +262,7 @@ class ServerArgs:
    speculative_accept_threshold_single: float = 1.0
    speculative_accept_threshold_acc: float = 1.0
    speculative_token_map: Optional[str] = None
+    speculative_attention_backend: str = "prefill"

    # Expert parallelism
    ep_size: int = 1
@@ -1561,6 +1562,13 @@ class ServerArgs:
            help="The path of the draft model's small vocab table.",
            default=ServerArgs.speculative_token_map,
        )
+        parser.add_argument(
+            "--speculative-attention-backend",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend to use for speculative decoding operations (both target verify and draft extend). 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_backend,
+        )

        # Expert parallelism
        parser.add_argument(