From 535c838674ee9e8c991a8887f12dcad725b9c307 Mon Sep 17 00:00:00 2001 From: JieXin Liang Date: Thu, 29 May 2025 14:25:18 +0800 Subject: [PATCH] [fix] more mem for draft_extend cuda_graph (#6726) --- python/sglang/srt/server_args.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 6b15d702a..9c843a315 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -271,6 +271,9 @@ class ServerArgs: mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem, (gpu_mem - reserve_mem) / gpu_mem, ) + else: + if self.speculative_algorithm is not None: + self.mem_fraction_static *= 0.95 # Set chunked prefill size, which depends on the gpu memory capacity if self.chunked_prefill_size is None: