Adjust flashinfer workspace size for Qwen2 models (#2879)
This commit is contained in:
@@ -84,6 +84,10 @@ class FlashInferAttnBackend(AttentionBackend):
|
||||
self.num_wrappers = 1
|
||||
self.dispatch_reason = None
|
||||
|
||||
# Qwen2 models require a larger flashinfer workspace, so override the configured size
|
||||
if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
|
||||
global_config.flashinfer_workspace_size = 512 * 1024 * 1024
|
||||
|
||||
# Allocate buffers
|
||||
self.workspace_buffer = torch.empty(
|
||||
global_config.flashinfer_workspace_size,
|
||||
|
||||
Reference in New Issue
Block a user