From 243e745d0758a7214d29fe644d88f5c3b5c3d9ff Mon Sep 17 00:00:00 2001
From: Elfie Guo <164945471+elfiegg@users.noreply.github.com>
Date: Thu, 21 Aug 2025 23:01:36 -0700
Subject: [PATCH] Add trtllm_mla and cutlass_mla for ragged fmha for chunked
 prefill (#9480)

---
 python/sglang/srt/models/deepseek_v2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 391627c7a..95b962fa3 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
             attention_backend == "flashinfer"
             or attention_backend == "fa3"
             or attention_backend == "flashmla"
+            or attention_backend == "trtllm_mla"
+            or attention_backend == "cutlass_mla"
         ):
             # Use MHA with chunked KV cache when prefilling on long sequences.
             sum_extend_prefix_lens = (