Add speculator attention backend switch (#9981)

This commit is contained in:
cicirori
2025-09-08 06:44:36 +02:00
committed by GitHub
parent 3b99f23c44
commit 8c5930f08a
6 changed files with 130 additions and 54 deletions

View File

@@ -262,6 +262,7 @@ class ServerArgs:
speculative_accept_threshold_single: float = 1.0
speculative_accept_threshold_acc: float = 1.0
speculative_token_map: Optional[str] = None
speculative_attention_backend: str = "prefill"
# Expert parallelism
ep_size: int = 1
@@ -1561,6 +1562,13 @@ class ServerArgs:
help="The path of the draft model's small vocab table.",
default=ServerArgs.speculative_token_map,
)
parser.add_argument(
"--speculative-attention-backend",
type=str,
choices=["prefill", "decode"],
help="Attention backend to use for speculative decoding operations (both target verify and draft extend). 'prefill' (default) or 'decode'.",
default=ServerArgs.speculative_attention_backend,
)
# Expert parallelism
parser.add_argument(