Adjust flashinfer workspace size for Qwen2 models (#2879)
This commit is contained in:
@@ -84,6 +84,10 @@ class FlashInferAttnBackend(AttentionBackend):
|
|||||||
self.num_wrappers = 1
|
self.num_wrappers = 1
|
||||||
self.dispatch_reason = None
|
self.dispatch_reason = None
|
||||||
|
|
||||||
|
# Qwen2 models require higher flashinfer workspace size
|
||||||
|
if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
|
||||||
|
global_config.flashinfer_workspace_size = 512 * 1024 * 1024
|
||||||
|
|
||||||
# Allocate buffers
|
# Allocate buffers
|
||||||
self.workspace_buffer = torch.empty(
|
self.workspace_buffer = torch.empty(
|
||||||
global_config.flashinfer_workspace_size,
|
global_config.flashinfer_workspace_size,
|
||||||
|
|||||||
Reference in New Issue
Block a user