From c19d84829c7de194d3965cb0edd414de24c145d8 Mon Sep 17 00:00:00 2001
From: Ke Bao <ISPObaoke@163.com>
Date: Tue, 14 Jan 2025 13:34:22 +0800
Subject: [PATCH] Adjust flashinfer workspace size for Qwen2 models (#2879)

---
 python/sglang/srt/layers/attention/flashinfer_backend.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index f03839462..6a4636128 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -84,6 +84,10 @@ class FlashInferAttnBackend(AttentionBackend):
             self.num_wrappers = 1
             self.dispatch_reason = None
 
+        # Qwen2 models require a larger flashinfer workspace size than the default
+        if "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures:
+            global_config.flashinfer_workspace_size = 512 * 1024 * 1024
+
         # Allocate buffers
         self.workspace_buffer = torch.empty(
             global_config.flashinfer_workspace_size,