[fix] fix illegal mem access and clean up triton attention backend (#4571)

This commit is contained in:
JieXin Liang
2025-03-20 17:01:52 +08:00
committed by GitHub
parent fad86a6863
commit 9e93ef3f8e
7 changed files with 124 additions and 125 deletions

View File

@@ -349,7 +349,6 @@ class FlashInferAttnBackend(AttentionBackend):
     def init_forward_metadata_replay_cuda_graph(
         self,
         bs: int,
-        num_kv_heads: int,
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
@@ -1063,7 +1062,6 @@ class FlashInferMultiStepDraftBackend:
         def call_fn(i, forward_batch):
             self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
                 bs,
-                -1,
                 forward_batch.req_pool_indices,
                 forward_batch.seq_lens,
                 seq_lens_sum=-1,