[fix] fix illegal mem access and clean up triton attention backend (#4571)
This commit is contained in:
@@ -349,7 +349,6 @@ class FlashInferAttnBackend(AttentionBackend):
|
||||
def init_forward_metadata_replay_cuda_graph(
|
||||
self,
|
||||
bs: int,
|
||||
num_kv_heads: int,
|
||||
req_pool_indices: torch.Tensor,
|
||||
seq_lens: torch.Tensor,
|
||||
seq_lens_sum: int,
|
||||
@@ -1063,7 +1062,6 @@ class FlashInferMultiStepDraftBackend:
|
||||
def call_fn(i, forward_batch):
|
||||
self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
|
||||
bs,
|
||||
-1,
|
||||
forward_batch.req_pool_indices,
|
||||
forward_batch.seq_lens,
|
||||
seq_lens_sum=-1,
|
||||
|
||||
Reference in New Issue
Block a user