[Fix] Fix cuda graph padding for triton attention backend (#1782)

This commit is contained in:
Lianmin Zheng
2024-10-24 12:33:15 -07:00
committed by GitHub
parent 0089c4bc96
commit fc82f5a743
6 changed files with 3 additions and 19 deletions

View File

@@ -38,7 +38,7 @@ class ReqToTokenPool:
self.size = size
self.max_context_len = max_context_len
self.device = device
-        self.req_to_token = torch.empty(
+        self.req_to_token = torch.zeros(
(size, max_context_len), dtype=torch.int32, device=device
)
self.free_slots = list(range(size))