[Fix] Fix cuda graph padding for triton attention backend (#1782)
This commit is contained in:
@@ -38,7 +38,7 @@ class ReqToTokenPool:
|
||||
self.size = size
|
||||
self.max_context_len = max_context_len
|
||||
self.device = device
|
||||
- self.req_to_token = torch.empty(
|
||||
+ self.req_to_token = torch.zeros(
|
||||
(size, max_context_len), dtype=torch.int32, device=device
|
||||
)
|
||||
self.free_slots = list(range(size))
|
||||
|
||||
Reference in New Issue
Block a user