From e94e60d6fbb39d967638347c01a711cbe82e2c42 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 21 Jun 2024 17:32:36 -0700 Subject: [PATCH] make flashinfer workspace larger --- python/sglang/srt/managers/controller/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py index ecca79976..942f29070 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/managers/controller/model_runner.py @@ -360,7 +360,7 @@ class ModelRunner: use_tensor_cores = False workspace_buffer = torch.empty( - 32 * 1024 * 1024, dtype=torch.int8, device="cuda" + 128 * 1024 * 1024, dtype=torch.int8, device="cuda" ) self.flashinfer_prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD"