Add speculator attention backend switch (#9981)
This commit is contained in:
@@ -262,6 +262,7 @@ class ServerArgs:
|
||||
speculative_accept_threshold_single: float = 1.0
|
||||
speculative_accept_threshold_acc: float = 1.0
|
||||
speculative_token_map: Optional[str] = None
|
||||
speculative_attention_backend: str = "prefill"
|
||||
|
||||
# Expert parallelism
|
||||
ep_size: int = 1
|
||||
@@ -1561,6 +1562,13 @@ class ServerArgs:
|
||||
help="The path of the draft model's small vocab table.",
|
||||
default=ServerArgs.speculative_token_map,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speculative-attention-backend",
|
||||
type=str,
|
||||
choices=["prefill", "decode"],
|
||||
help="Attention backend to use for speculative decoding operations (both target verify and draft extend). 'prefill' (default) or 'decode'.",
|
||||
default=ServerArgs.speculative_attention_backend,
|
||||
)
|
||||
|
||||
# Expert parallelism
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user