Enable ragged FMHA chunked prefill for the trtllm_mla and cutlass_mla attention backends (#9480)

This commit is contained in:
Elfie Guo
2025-08-21 23:01:36 -07:00
committed by GitHub
parent 61a0e600df
commit 243e745d07

View File

@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
attention_backend == "flashinfer"
or attention_backend == "fa3"
or attention_backend == "flashmla"
or attention_backend == "trtllm_mla"
or attention_backend == "cutlass_mla"
):
# Use MHA with chunked KV cache when prefilling on long sequences.
sum_extend_prefix_lens = (