[Fix] Resolve performance drop in speculative decoding aiter backend (#11087)

2025-09-30 14:51:30 +08:00
parent 424591d53d
commit 5991119541
1 changed file with 5 additions and 1 deletion
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -619,7 +619,11 @@ class AiterAttnBackend(AttentionBackend):
            assert len(k.shape) == 3
            assert len(v.shape) == 3

-            if forward_batch.forward_mode.is_extend():
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
                if kv_indices.shape[0] == 0:
                    o = flash_attn_varlen_func(
                        q,