Add draft extend CUDA graph for Triton backend (#6705)

This commit is contained in:
Ke Bao
2025-05-29 15:13:07 +08:00
committed by GitHub
parent c673727e0e
commit 7e41290082
2 changed files with 51 additions and 1 deletion

View File

@@ -179,6 +179,7 @@ class EAGLEWorker(TpModelWorker):
self.has_prefill_wrapper_verify = True
elif self.server_args.attention_backend == "triton":
from sglang.srt.layers.attention.triton_backend import (
+TritonAttnBackend,
TritonMultiStepDraftBackend,
)
@@ -187,7 +188,10 @@ class EAGLEWorker(TpModelWorker):
self.topk,
self.speculative_num_steps,
)
-self.draft_extend_attn_backend = None
+self.draft_extend_attn_backend = TritonAttnBackend(
+self.draft_model_runner,
+skip_prefill=False,
+)
self.padded_static_len = self.speculative_num_steps + 1
self.has_prefill_wrapper_verify = False
elif self.server_args.attention_backend == "fa3":