[Fix] fix flashinfer usage for window attention (#1107)
@@ -342,15 +342,14 @@ class ModelRunner:
             dtype=torch.uint8,
             device="cuda",
         )
-        self.flashinfer_prefill_wrapper_ragged = (
-            BatchPrefillWithRaggedKVCacheWrapper(
-                self.flashinfer_workspace_buffer, "NHD"
-            )
-        )
+        self.flashinfer_prefill_wrapper_ragged = []
+        self.flashinfer_prefill_wrapper_paged = []
+        self.flashinfer_decode_wrapper = []
+        for i in range(2):
+            self.flashinfer_prefill_wrapper_ragged.append(
+                BatchPrefillWithRaggedKVCacheWrapper(
+                    self.flashinfer_workspace_buffer, "NHD"
+                )
+            )
+            self.flashinfer_prefill_wrapper_paged.append(
+                BatchPrefillWithPagedKVCacheWrapper(
+                    self.flashinfer_workspace_buffer, "NHD"
+                )
+            )
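For context, a minimal sketch of the pattern this diff moves to, assuming flashinfer is installed and a CUDA device is available. Window-attention models interleave sliding-window and full-attention layers, which is presumably why the runner now keeps a pair of wrappers per kind rather than a single one. The buffer size, the index convention (0 = sliding window, 1 = full attention), and the pick_wrappers helper below are illustrative assumptions, not taken from the commit; only the flashinfer wrapper classes and their (workspace_buffer, kv_layout) constructor arguments come from the diff itself.

    # Sketch of the two-wrapper-per-mode pattern; names other than the
    # flashinfer classes are hypothetical.
    import torch
    from flashinfer import (
        BatchPrefillWithPagedKVCacheWrapper,
        BatchPrefillWithRaggedKVCacheWrapper,
    )

    # flashinfer wrappers need a scratch workspace buffer; the size here
    # is a guess, and sharing one buffer mirrors the diff above.
    workspace_buffer = torch.empty(
        128 * 1024 * 1024,
        dtype=torch.uint8,
        device="cuda",
    )

    # One wrapper per attention mode (assumed convention:
    # index 0 = sliding-window layers, index 1 = full-attention layers).
    prefill_wrapper_ragged = []
    prefill_wrapper_paged = []
    for _ in range(2):
        prefill_wrapper_ragged.append(
            BatchPrefillWithRaggedKVCacheWrapper(workspace_buffer, "NHD")
        )
        prefill_wrapper_paged.append(
            BatchPrefillWithPagedKVCacheWrapper(workspace_buffer, "NHD")
        )

    def pick_wrappers(layer_uses_sliding_window: bool):
        """Hypothetical helper: select the wrapper pair for a layer."""
        idx = 0 if layer_uses_sliding_window else 1
        return prefill_wrapper_ragged[idx], prefill_wrapper_paged[idx]

Under these assumptions, each layer would index into the lists according to whether it uses sliding-window or full attention, instead of all layers sharing a single wrapper whose metadata only fits one of the two modes.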