Add Eagle Speculative Decoding to FA3 Backend (#4951)

Co-authored-by: hebiao064 <hebiaobuaa@gmail.com>
Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
Co-authored-by: zcnrex <zcnrex@gmail.com>
This commit is contained in:
Qingquan Song
2025-04-02 13:09:02 -07:00
committed by GitHub
parent e9c6ce461d
commit e983e43248
2 changed files with 214 additions and 31 deletions

View File

@@ -184,6 +184,19 @@ class EAGLEWorker(TpModelWorker):
self.draft_extend_attn_backend = None
self.padded_static_len = self.speculative_num_steps + 1
self.has_prefill_wrapper_verify = True
elif self.server_args.attention_backend == "fa3":
from sglang.srt.layers.attention.flashattention_backend import (
FlashAttentionMultiStepBackend,
)
self.draft_attn_backend = FlashAttentionMultiStepBackend(
self.draft_model_runner,
self.topk,
self.speculative_num_steps,
)
self.draft_extend_attn_backend = None
self.padded_static_len = self.speculative_num_steps + 1
self.has_prefill_wrapper_verify = False
else:
raise ValueError(
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"