Cleaning codes for speculative attention mode (#10149)

This commit is contained in:
Baizhou Zhang
2025-09-08 17:38:06 -07:00
committed by GitHub
parent 148022fc36
commit 8ad700f735
7 changed files with 14 additions and 35 deletions

View File

@@ -262,7 +262,7 @@ class ServerArgs:
speculative_accept_threshold_single: float = 1.0
speculative_accept_threshold_acc: float = 1.0
speculative_token_map: Optional[str] = None
- speculative_attention_backend: str = "prefill"
+ speculative_attention_mode: str = "prefill"
# Expert parallelism
ep_size: int = 1
@@ -1563,11 +1563,11 @@ class ServerArgs:
default=ServerArgs.speculative_token_map,
)
parser.add_argument(
- "--speculative-attention-backend",
+ "--speculative-attention-mode",
type=str,
choices=["prefill", "decode"],
- help="Attention backend to use for speculative decoding operations (both target verify and draft extend). 'prefill' (default) or 'decode'.",
- default=ServerArgs.speculative_attention_backend,
+ help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+ default=ServerArgs.speculative_attention_mode,
)
# Expert parallelism