[Fix] Window attention compatible with RadixAttention and chunked prefill (#1112)

2024-08-15 10:33:20 -07:00
parent 9195d1362a
commit 93d4e354d8
5 changed files with 37 additions and 56 deletions
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -334,11 +334,7 @@ class ModelRunner:
                dtype=torch.uint8,
                device="cuda",
            )
-            self.flashinfer_prefill_wrapper_ragged = (
-                BatchPrefillWithRaggedKVCacheWrapper(
-                    self.flashinfer_workspace_buffer, "NHD"
-                )
-            )
+            self.flashinfer_prefill_wrapper_ragged = None
            self.flashinfer_prefill_wrapper_paged = []
            self.flashinfer_decode_wrapper = []
            for i in range(2):