Add speculator attention backend switch (#9981)

This commit is contained in:
cicirori
2025-09-08 06:44:36 +02:00
committed by GitHub
parent 3b99f23c44
commit 8c5930f08a
6 changed files with 130 additions and 54 deletions

View File

@@ -191,7 +191,7 @@ class EAGLEWorker(TpModelWorker):
 # Initialize decode attention backend
 self.draft_attn_backend = self._create_decode_backend()
-# Initialize prefill attention backend
+# Initialize draft extend attention backend (respects speculative_attention_backend setting)
 self.draft_extend_attn_backend = self._create_draft_extend_backend()
 self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
@@ -234,11 +234,15 @@ class EAGLEWorker(TpModelWorker):
 "trtllm_mha": self._create_trtllm_mha_prefill_backend,
 "trtllm_mla": self._create_trtllm_mla_prefill_backend,
 }
+backend_name = (
+"decode_attention_backend"
+if self.server_args.speculative_attention_backend == "decode"
+else "prefill_attention_backend"
+)
 return self._create_backend(
-"prefill_attention_backend",
+backend_name,
 backend_map,
-"EAGLE is not supported in prefill attention backend {backend_type}",
+"EAGLE is not supported in attention backend {backend_type}",
 )
 def _create_flashinfer_decode_backend(self):