Support nextn for flashinfer mla attention backend (#4218)

This commit is contained in:
Baizhou Zhang
2025-03-09 00:01:54 -08:00
committed by GitHub
parent 89ccb533ad
commit 9fb48f951f
5 changed files with 393 additions and 58 deletions

View File

@@ -123,6 +123,16 @@ class EAGLEWorker(TpModelWorker):
self.topk,
self.speculative_num_steps,
)
elif self.server_args.attention_backend == "flashinfer_mla":
from sglang.srt.layers.attention.flashinfer_mla_backend import (
FlashInferMLAMultiStepDraftBackend,
)
self.draft_attn_backend = FlashInferMLAMultiStepDraftBackend(
self.model_runner,
self.topk,
self.speculative_num_steps,
)
else:
raise ValueError(
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"