Clean up code for speculative attention mode (#10149)

2025-09-08 17:38:06 -07:00
parent 148022fc36
commit 8ad700f735
7 changed files with 14 additions and 35 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -262,7 +262,7 @@ class ServerArgs:
    speculative_accept_threshold_single: float = 1.0
    speculative_accept_threshold_acc: float = 1.0
    speculative_token_map: Optional[str] = None
-    speculative_attention_backend: str = "prefill"
+    speculative_attention_mode: str = "prefill"

    # Expert parallelism
    ep_size: int = 1
@@ -1563,11 +1563,11 @@ class ServerArgs:
            default=ServerArgs.speculative_token_map,
        )
        parser.add_argument(
-            "--speculative-attention-backend",
+            "--speculative-attention-mode",
            type=str,
            choices=["prefill", "decode"],
-            help="Attention backend to use for speculative decoding operations (both target verify and draft extend). 'prefill' (default) or 'decode'.",
-            default=ServerArgs.speculative_attention_backend,
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
        )

        # Expert parallelism