diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index f03839462..6a4636128 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -84,6 +84,10 @@ class FlashInferAttnBackend(AttentionBackend):
         self.num_wrappers = 1
         self.dispatch_reason = None
 
+        # Qwen2 models need a larger flashinfer workspace; grow it, but never shrink a larger configured value
+        if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
+            global_config.flashinfer_workspace_size = max(global_config.flashinfer_workspace_size, 512 * 1024 * 1024)
+
         # Allocate buffers
         self.workspace_buffer = torch.empty(
             global_config.flashinfer_workspace_size,