Add trtllm_mla and cutlass_mla for ragged fmha for chunked prefill (#9480)
This commit is contained in:
@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
|
|||||||
attention_backend == "flashinfer"
|
attention_backend == "flashinfer"
|
||||||
or attention_backend == "fa3"
|
or attention_backend == "fa3"
|
||||||
or attention_backend == "flashmla"
|
or attention_backend == "flashmla"
|
||||||
|
or attention_backend == "trtllm_mla"
|
||||||
|
or attention_backend == "cutlass_mla"
|
||||||
):
|
):
|
||||||
# Use MHA with chunked KV cache when prefilling on long sequences.
|
# Use MHA with chunked KV cache when prefilling on long sequences.
|
||||||
sum_extend_prefix_lens = (
|
sum_extend_prefix_lens = (
|
||||||
|
|||||||
Reference in New Issue
Block a user