From 4418f599a54699181b35d89b0def2697cccb721a Mon Sep 17 00:00:00 2001 From: JieXin Liang Date: Tue, 22 Apr 2025 16:41:41 +0800 Subject: [PATCH] Fix FA3 DeepSeek prefill performance regression (#5624) Co-authored-by: ispobock --- python/sglang/srt/models/deepseek_v2.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index b299c3037..241c46cf9 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -583,13 +583,17 @@ class DeepseekV2AttentionMLA(nn.Module): return AttnForwardMethod.MLA elif self.attention_backend == "fa3": # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences. + if forward_batch.extend_prefix_lens_cpu is not None: + sum_extend_prefix_lens = sum(forward_batch.extend_prefix_lens_cpu) if ( forward_batch.forward_mode.is_extend() and not self.disable_chunked_prefix_cache and not forward_batch.forward_mode.is_target_verify() and not forward_batch.forward_mode.is_draft_extend() - and sum(forward_batch.extend_prefix_lens_cpu) - >= self.chunked_prefix_cache_threshold + and ( + sum_extend_prefix_lens >= self.chunked_prefix_cache_threshold + or sum_extend_prefix_lens == 0 + ) ): return AttnForwardMethod.MHA_CHUNKED_KV else: