Support speculative decoding in hybrid attention backend (#9573)

This commit is contained in:
Qiaolin Yu
2025-08-28 01:11:42 -07:00
committed by GitHub
parent c377923304
commit 4a4772ae03
3 changed files with 83 additions and 26 deletions

View File

@@ -1440,14 +1440,12 @@ class ModelRunner:
else self.server_args.attention_backend
)
if self.decode_attention_backend_str != self.prefill_attention_backend_str:
assert (
self.server_args.speculative_algorithm is None
), "Currently HybridAttentionBackend does not support speculative decoding."
from sglang.srt.layers.attention.hybrid_attn_backend import (
HybridAttnBackend,
)
attn_backend = HybridAttnBackend(
self,
decode_backend=self._get_attention_backend_from_str(
self.decode_attention_backend_str
),