From 1f30c05d4a43105a7dfd985499f3d7afe13b244b Mon Sep 17 00:00:00 2001 From: JieXin Liang Date: Mon, 19 May 2025 03:50:15 +0800 Subject: [PATCH] [fix] fix fa3 forward_decode with spec_decode (#6395) Co-authored-by: Stefan He --- .../sglang/srt/layers/attention/flashattention_backend.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py index a626ff0d8..4ce337580 100644 --- a/python/sglang/srt/layers/attention/flashattention_backend.py +++ b/python/sglang/srt/layers/attention/flashattention_backend.py @@ -918,8 +918,11 @@ class FlashAttentionBackend(AttentionBackend): and local_attn_metadata is not None and (hasattr(layer, "use_irope") and layer.use_irope) ) - # We do cascade attention for Draft Decode with topk > 1 - use_cascade_attn = self.topk > 1 +
+ # When Spec Decode is enabled, forward_decode can be called in two modes:
 + # 1. DRAFT_DECODE: we enable cascade attention when topk > 1 + # 2. IDLE: we don't need cascade attention, spec_info will be None in this case + use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1 # Calculate window size (can be moved to metadata if layer properties don't change) # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1