From e4b6133b785a48918e326d79e26b61a33036b84d Mon Sep 17 00:00:00 2001 From: JieXin Liang Date: Wed, 30 Apr 2025 08:01:12 +0800 Subject: [PATCH] [fix] relax mem_fraction_static for h200 (#5893) Co-authored-by: alcanerian --- .../srt/model_executor/cuda_graph_runner.py | 2 +- python/sglang/srt/server_args.py | 29 ++++++++++--------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 4b20071d5..b36f15a86 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -135,7 +135,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): gpu_mem = get_device_memory_capacity() # Batch size of each rank will not become so large when DP is on - if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1: + if gpu_mem is not None and gpu_mem > 96 * 1024: capture_bs += list(range(160, 257, 8)) if max(capture_bs) > model_runner.req_to_token_pool.size: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index d06ae705f..29585a7f9 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -222,20 +222,23 @@ class ServerArgs: # Set mem fraction static, which depends on the tensor parallelism size if self.mem_fraction_static is None: - if gpu_mem <= 81920: - if self.tp_size >= 16: - self.mem_fraction_static = 0.79 - elif self.tp_size >= 8: - self.mem_fraction_static = 0.81 - elif self.tp_size >= 4: - self.mem_fraction_static = 0.85 - elif self.tp_size >= 2: - self.mem_fraction_static = 0.87 - else: - self.mem_fraction_static = 0.88 + if self.tp_size >= 16: + self.mem_fraction_static = 0.79 + elif self.tp_size >= 8: + self.mem_fraction_static = 0.81 + elif self.tp_size >= 4: + self.mem_fraction_static = 0.85 + elif self.tp_size >= 2: + self.mem_fraction_static = 0.87 else: - # FIXME: more fine grained 
auto-selection polices - self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem + self.mem_fraction_static = 0.88 + if gpu_mem > 96 * 1024: + mem_fraction = self.mem_fraction_static + self.mem_fraction_static = min( + mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem, + (gpu_mem - 1024 * 18) + / gpu_mem, # 15 GB + an additional 3 GB for CUDA graphs + ) # Set chunked prefill size, which depends on the gpu memory capacity if self.chunked_prefill_size is None: