[Feat] Support FlashMLA backend with MTP and FP8 KV cache (#6109)

Co-authored-by: Yingyi <yingyihuang2000@outlook.com>
Co-authored-by: neiltian <neiltian@tencent.com>
Co-authored-by: lukec <118525388+sleepcoo@users.noreply.github.com>
Co-authored-by: kexueyu <kexueyu@tencent.com>
Co-authored-by: vincentmeng <vincentmeng@tencent.com>
Co-authored-by: pengmeng <pengmeng@tencent.com>
commit 2e4babdb0a (parent 44a3783d13)
Author: quinnrong94
Date:   2025-05-15 15:48:09 +08:00
Committed by: GitHub

8 changed files with 443 additions and 86 deletions


@@ -199,6 +199,19 @@ class EAGLEWorker(TpModelWorker):
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = False
+        elif self.server_args.attention_backend == "flashmla":
+            from sglang.srt.layers.attention.flashmla_backend import (
+                FlashMLAMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = FlashMLAMultiStepDraftBackend(
+                self.draft_model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+            self.draft_extend_attn_backend = None
+            self.padded_static_len = self.speculative_num_steps + 1
+            self.has_prefill_wrapper_verify = False
         else:
             raise ValueError(
                 f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"