[Fix] Resolve performance drop in speculative decoding aiter backend (#11087)

This commit is contained in:
jacky.cheng
2025-09-30 14:51:30 +08:00
committed by GitHub
parent 424591d53d
commit 5991119541

View File

@@ -619,7 +619,11 @@ class AiterAttnBackend(AttentionBackend):
assert len(k.shape) == 3
assert len(v.shape) == 3
-if forward_batch.forward_mode.is_extend():
+if (
+forward_batch.forward_mode.is_extend()
+and not forward_batch.forward_mode.is_target_verify()
+and not forward_batch.forward_mode.is_draft_extend()
+):
if kv_indices.shape[0] == 0:
o = flash_attn_varlen_func(
q,