[Fix] Fix cuda graph padding for triton attention backend (#1782)

This commit is contained in:
Lianmin Zheng
2024-10-24 12:33:15 -07:00
committed by GitHub
parent 0089c4bc96
commit fc82f5a743
6 changed files with 3 additions and 19 deletions

View File

@@ -38,7 +38,7 @@ class ReqToTokenPool:
self.size = size
self.max_context_len = max_context_len
self.device = device
-        self.req_to_token = torch.empty(
+        self.req_to_token = torch.zeros(
(size, max_context_len), dtype=torch.int32, device=device
)
self.free_slots = list(range(size))