Add trtllm_mla and cutlass_mla for ragged fmha for chunked prefill (#9480)
@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
             attention_backend == "flashinfer"
             or attention_backend == "fa3"
             or attention_backend == "flashmla"
+            or attention_backend == "trtllm_mla"
+            or attention_backend == "cutlass_mla"
         ):
             # Use MHA with chunked KV cache when prefilling on long sequences.
             sum_extend_prefix_lens = (
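
The hunk extends the set of attention backends for which the chunked-prefill MHA path (ragged FMHA over the cached prefix) is taken, adding trtllm_mla and cutlass_mla alongside flashinfer, fa3, and flashmla. Below is a minimal sketch, not the actual SGLang source, of how such a backend allow-list check can be expressed; the set name and helper function are hypothetical.

    # Sketch only: illustrates the allow-list pattern behind the diff above.
    # MHA_CHUNKED_PREFILL_BACKENDS and _should_use_mha_chunked_prefill are
    # hypothetical names, not identifiers from the SGLang codebase.
    MHA_CHUNKED_PREFILL_BACKENDS = {
        "flashinfer",
        "fa3",
        "flashmla",
        "trtllm_mla",   # added by this commit
        "cutlass_mla",  # added by this commit
    }

    def _should_use_mha_chunked_prefill(attention_backend: str) -> bool:
        # Use MHA with chunked KV cache when prefilling on long sequences,
        # but only for backends that support this code path.
        return attention_backend in MHA_CHUNKED_PREFILL_BACKENDS

Keeping the supported backends in a single collection avoids a growing chain of `or` comparisons as more MLA backends gain chunked-prefill support.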