From 59911195417ec64a8055a163cdcb9d089532f0a8 Mon Sep 17 00:00:00 2001 From: "jacky.cheng" Date: Tue, 30 Sep 2025 14:51:30 +0800 Subject: [PATCH] [Fix] Resolve performance drop in speculative decoding aiter backend (#11087) --- python/sglang/srt/layers/attention/aiter_backend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py index 188d772c7..f1b2da5f8 100644 --- a/python/sglang/srt/layers/attention/aiter_backend.py +++ b/python/sglang/srt/layers/attention/aiter_backend.py @@ -619,7 +619,11 @@ class AiterAttnBackend(AttentionBackend): assert len(k.shape) == 3 assert len(v.shape) == 3 - if forward_batch.forward_mode.is_extend(): + if ( + forward_batch.forward_mode.is_extend() + and not forward_batch.forward_mode.is_target_verify() + and not forward_batch.forward_mode.is_draft_extend() + ): if kv_indices.shape[0] == 0: o = flash_attn_varlen_func( q,