From e69a219074683dc4c3a100db4da7ea9204018f94 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin
Date: Tue, 22 Apr 2025 06:15:00 +0800
Subject: [PATCH] Enhance GPU memory settings (#5604)

---
 python/sglang/srt/mem_cache/memory_pool.py    | 12 ++++++
 .../srt/model_executor/cuda_graph_runner.py   | 25 ++++++++----
 python/sglang/srt/server_args.py              | 39 +++++++------------
 python/sglang/srt/utils.py                    | 14 +++++++
 4 files changed, 59 insertions(+), 31 deletions(-)

diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py
index 9f5ecbdbe..615ec3a2b 100644
--- a/python/sglang/srt/mem_cache/memory_pool.py
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -448,6 +448,18 @@ class MLATokenToKVPool(KVCache):
         self.layer_transfer_counter = None
         self.page_size = page_size
 
+        kv_size = self.get_kv_size_bytes()
+        logger.info(
+            f"KV Cache is allocated. #tokens: {size}, KV size: {kv_size / GB:.2f} GB"
+        )
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "kv_buffer")
+        kv_size_bytes = 0
+        for kv_cache in self.kv_buffer:
+            kv_size_bytes += np.prod(kv_cache.shape) * kv_cache.dtype.itemsize
+        return kv_size_bytes
+
     # for disagg
     def get_contiguous_buf_infos(self):
         # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index b1fa22614..b64c05580 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -35,7 +35,11 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
 )
 from sglang.srt.patch_torch import monkey_patch_torch_compile
-from sglang.srt.utils import get_available_gpu_memory, is_hip
+from sglang.srt.utils import (
+    get_available_gpu_memory,
+    get_whatever_gpu_memory_capacity,
+    is_hip,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
@@ -132,6 +136,11 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     if _is_hip:
         capture_bs += list(range(160, 257, 8))
 
+    gpu_mem = get_whatever_gpu_memory_capacity()
+
+    if gpu_mem is not None and gpu_mem / 1024 > 120:
+        capture_bs += list(range(160, 256, 8))
+
     if max(capture_bs) > model_runner.req_to_token_pool.size:
         # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
         # is very small. We add more values here to make sure we capture the maximum bs.
@@ -140,12 +149,13 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         ]
     capture_bs = list(sorted(set(capture_bs)))
-    capture_bs = [
-        bs
-        for bs in capture_bs
-        if bs <= model_runner.req_to_token_pool.size
-        and bs <= server_args.cuda_graph_max_bs
-    ]
+
+    assert len(capture_bs) > 0 and capture_bs[0] > 0
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+
+    if server_args.cuda_graph_max_bs:
+        capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
+
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
@@ -186,6 +196,7 @@ class CudaGraphRunner:
 
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+
         self.capture_forward_mode = ForwardMode.DECODE
         self.capture_hidden_mode = CaptureHiddenMode.NULL
         self.num_tokens_per_bs = 1
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index ec3f11437..ed0c6bf51 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-    get_hpu_memory_capacity,
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_whatever_gpu_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -221,28 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        if is_cuda():
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
+        gpu_mem = get_whatever_gpu_memory_capacity(self.device)
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem is None or gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
+                # FIXME: more fine-grained auto-selection policies
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
@@ -271,8 +264,6 @@ class ServerArgs:
                 self.cuda_graph_max_bs = 8
             else:
                 self.cuda_graph_max_bs = 80
-        else:
-            self.cuda_graph_max_bs = 160
 
         # Set kernel backends for hpu device
         if self.device == "hpu":
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index b93c28c39..5cf0e1607 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -1160,6 +1160,20 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_whatever_gpu_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
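
Below is an illustrative sketch, not part of the patch, of what the new
mem_fraction_static auto-selection does: for GPUs above 80 GiB (81920 MiB)
it reserves a flat 13 GiB and hands the remainder to the static pool,
instead of using the fixed tp-size table kept for smaller devices. The
capacities in the example are hypothetical, chosen only to show the
resulting fractions.

    # Sketch of the auto-selection rule; gpu_mem is assumed to be in MiB,
    # matching what get_nvgpu_memory_capacity() and friends return.
    def auto_mem_fraction_static(gpu_mem_mib: int, tp_size: int = 1) -> float:
        if gpu_mem_mib <= 81920:  # <= 80 GiB: the old tp-size table applies
            for threshold, frac in ((16, 0.79), (8, 0.81), (4, 0.85), (2, 0.87)):
                if tp_size >= threshold:
                    return frac
            return 0.88
        # > 80 GiB: reserve a flat 13 GiB, use the rest for the static pool
        return (gpu_mem_mib - 1024 * 13) / gpu_mem_mib

    # Hypothetical 96 GiB and 141 GiB devices:
    for mib in (98304, 144384):
        print(mib, round(auto_mem_fraction_static(mib), 3))
    # -> 98304 0.865
    # -> 144384 0.908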
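
A second sketch, also illustrative and using made-up buffer shapes, mirrors
the byte accounting that the new MLATokenToKVPool.get_kv_size_bytes() method
performs: each buffer contributes prod(shape) * dtype.itemsize bytes, and
the total is logged in GB at allocation time.

    import numpy as np
    import torch

    GB = 1024**3
    # Made-up MLA-style buffers: 4 layers x 65536 tokens x 1 head x 576 dims
    kv_buffer = [torch.zeros(65536, 1, 576, dtype=torch.bfloat16) for _ in range(4)]

    kv_size_bytes = 0
    for kv_cache in kv_buffer:
        # elements * bytes-per-element, as in get_kv_size_bytes()
        kv_size_bytes += np.prod(kv_cache.shape) * kv_cache.dtype.itemsize
    print(f"KV size: {kv_size_bytes / GB:.2f} GB")  # 0.28 GB for these shapes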