diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index b299c3037..241c46cf9 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -583,13 +583,19 @@ class DeepseekV2AttentionMLA(nn.Module):
             return AttnForwardMethod.MLA
         elif self.attention_backend == "fa3":
             # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences.
+            # Bind the name unconditionally: a guarded assignment would leave
+            # `sum_extend_prefix_lens` undefined (UnboundLocalError) whenever
+            # extend_prefix_lens_cpu is None. A missing list is treated as an
+            # empty one, so the "no prefix" case falls into the `== 0` branch.
+            sum_extend_prefix_lens = sum(forward_batch.extend_prefix_lens_cpu or [])
             if (
                 forward_batch.forward_mode.is_extend()
                 and not self.disable_chunked_prefix_cache
                 and not forward_batch.forward_mode.is_target_verify()
                 and not forward_batch.forward_mode.is_draft_extend()
-                and sum(forward_batch.extend_prefix_lens_cpu)
-                >= self.chunked_prefix_cache_threshold
+                and (
+                    sum_extend_prefix_lens >= self.chunked_prefix_cache_threshold
+                    or sum_extend_prefix_lens == 0
+                )
             ):
                 return AttnForwardMethod.MHA_CHUNKED_KV
             else: