[Fix] Compatibility of window attention and cuda graph (#1090)

This commit is contained in:
Ying Sheng
2024-08-14 10:37:01 -07:00
committed by GitHub
parent a34dd86a7d
commit 96a2093ef0
7 changed files with 70 additions and 39 deletions

View File

@@ -453,10 +453,12 @@ class ServerArgs:
logger.info(
f"When using sliding window in gemma-2, disable radix_cache, regex_jump_forward, and turn on flashinfer."
)
# FIXME: sliding window attention is not yet compatible with radix attention, so the radix cache is disabled here.
self.disable_radix_cache = True
# FIXME: sliding window attention is not yet compatible with regex jump forward, so jump forward is disabled here.
self.disable_regex_jump_forward = True
self.disable_flashinfer = False
self.disable_cuda_graph = True
# FIXME: sliding window attention is not yet compatible with chunked prefill, so chunked prefill is disabled here.
self.chunked_prefill_size = None