Add Eagle Speculative Decoding to FA3 Backend (#4951)
Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
Co-authored-by: zcnrex <zcnrex@gmail.com>
This commit is contained in:
@@ -184,6 +184,19 @@ class EAGLEWorker(TpModelWorker):
|
||||
self.draft_extend_attn_backend = None
|
||||
self.padded_static_len = self.speculative_num_steps + 1
|
||||
self.has_prefill_wrapper_verify = True
|
||||
elif self.server_args.attention_backend == "fa3":
|
||||
from sglang.srt.layers.attention.flashattention_backend import (
|
||||
FlashAttentionMultiStepBackend,
|
||||
)
|
||||
|
||||
self.draft_attn_backend = FlashAttentionMultiStepBackend(
|
||||
self.draft_model_runner,
|
||||
self.topk,
|
||||
self.speculative_num_steps,
|
||||
)
|
||||
self.draft_extend_attn_backend = None
|
||||
self.padded_static_len = self.speculative_num_steps + 1
|
||||
self.has_prefill_wrapper_verify = False
|
||||
else:
|
||||
raise ValueError(
|
||||
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
|
||||
|
||||
Reference in New Issue
Block a user