Revert "Add fast decode plan for flashinfer mla" (#4008)

Author:       Lianmin Zheng
Date:         2025-03-02 19:29:10 -08:00
Committed by: GitHub
Parent:       fa56106731
Commit:       9e1014cf99

9 changed files with 52 additions and 156 deletions

@@ -269,10 +269,9 @@ class FlashInferAttnBackend(AttentionBackend):
         num_tokens: int,
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
-        forward_mode: ForwardMode,
         encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
         spec_info: Optional[SpecInfo],
-        **kwargs,
     ):
         if forward_mode.is_decode_or_idle():
             decode_wrappers = []
@@ -340,10 +339,9 @@ class FlashInferAttnBackend(AttentionBackend):
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
-        forward_mode: ForwardMode,
         encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
         spec_info: Optional[SpecInfo],
-        **kwargs,
     ):
         if forward_mode.is_decode_or_idle():
             self.indices_updater_decode.update(
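
Both hunks apply the same pair of changes: forward_mode is moved back to its
original position after encoder_lens, and the trailing **kwargs introduced by
the reverted fast-decode-plan commit is dropped. The sketch below shows the
resulting signatures; it is a reconstruction, not the repository's code. The
method names, the stub types, and everything outside the visible parameter
lists are assumptions for illustration, since the hunks do not show the
enclosing def lines.

from typing import Any, Optional

import torch


class AttentionBackend:
    """Stand-in for sglang's attention-backend base class (assumed)."""


class ForwardMode:
    """Stand-in for sglang's forward-mode type (assumed)."""

    def is_decode_or_idle(self) -> bool:
        return True


SpecInfo = Any  # stand-in for sglang's speculative-decoding info type (assumed)


class FlashInferAttnBackend(AttentionBackend):
    def init_forward_metadata_capture_cuda_graph(  # method name assumed
        self,
        num_tokens: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        encoder_lens: Optional[torch.Tensor],
        forward_mode: ForwardMode,  # restored position: after encoder_lens
        spec_info: Optional[SpecInfo],  # no trailing **kwargs after the revert
    ):
        # Per the hunk, decode/idle modes start from a fresh wrapper list.
        if forward_mode.is_decode_or_idle():
            decode_wrappers = []

    def init_forward_metadata_replay_cuda_graph(  # method name assumed
        self,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        seq_lens_sum: int,
        encoder_lens: Optional[torch.Tensor],
        forward_mode: ForwardMode,  # restored position: after encoder_lens
        spec_info: Optional[SpecInfo],  # no trailing **kwargs after the revert
    ):
        if forward_mode.is_decode_or_idle():
            # self.indices_updater_decode.update( ... ) -- the call is
            # truncated in the hunk, so its arguments are not reproduced here.
            ...

Dropping **kwargs presumably makes the signatures strict again, so a caller
that still passes the reverted commit's extra arguments fails immediately
instead of having them silently swallowed.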