Add trtllm_mla and cutlass_mla to the attention backends that use ragged FMHA for chunked prefill (#9480)

2025-08-21 23:01:36 -07:00
parent 61a0e600df
commit 243e745d07
1 changed file with 2 additions and 0 deletions
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
            attention_backend == "flashinfer"
            or attention_backend == "fa3"
            or attention_backend == "flashmla"
            or attention_backend == "trtllm_mla"
            or attention_backend == "cutlass_mla"
        ):
            # Use MHA with chunked KV cache when prefilling on long sequences.
            sum_extend_prefix_lens = (