Add speculator attention backend switch (#9981)
This commit is contained in:
@@ -191,7 +191,7 @@ class EAGLEWorker(TpModelWorker):
|
||||
# Initialize decode attention backend
|
||||
self.draft_attn_backend = self._create_decode_backend()
|
||||
|
||||
# Initialize prefill attention backend
|
||||
# Initialize draft extend attention backend (respects speculative_attention_backend setting)
|
||||
self.draft_extend_attn_backend = self._create_draft_extend_backend()
|
||||
|
||||
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
|
||||
@@ -234,11 +234,15 @@ class EAGLEWorker(TpModelWorker):
|
||||
"trtllm_mha": self._create_trtllm_mha_prefill_backend,
|
||||
"trtllm_mla": self._create_trtllm_mla_prefill_backend,
|
||||
}
|
||||
|
||||
backend_name = (
|
||||
"decode_attention_backend"
|
||||
if self.server_args.speculative_attention_backend == "decode"
|
||||
else "prefill_attention_backend"
|
||||
)
|
||||
return self._create_backend(
|
||||
"prefill_attention_backend",
|
||||
backend_name,
|
||||
backend_map,
|
||||
"EAGLE is not supported in prefill attention backend {backend_type}",
|
||||
"EAGLE is not supported in attention backend {backend_type}",
|
||||
)
|
||||
|
||||
def _create_flashinfer_decode_backend(self):
|
||||
|
||||
Reference in New Issue
Block a user