def get_cpu_memory_capacity():
    """Return the usable per-rank CPU memory capacity in MB, or ``None``.

    Per-rank capacity is taken as the *smallest* NUMA node's ``MemTotal``
    (each tensor-parallel rank is pinned to one node, so the smallest node
    bounds every rank). Returns ``None`` when the user supplied a custom
    core binding, because per-rank capacity cannot be attributed then.

    Returns:
        float | None: capacity in MB (MiB), or ``None`` for custom bindings.
    """
    # Per-rank memory capacity cannot be determined for customized core settings
    if os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", ""):
        return None
    n_numa_node: int = len(get_cpu_ids_by_node())
    if n_numa_node == 0:
        # Cannot determine the NUMA config; fall back to total memory and
        # avoid a ZeroDivisionError / min([]) in the per-node paths below.
        # psutil reports bytes; convert to MB.
        return float(psutil.virtual_memory().total // (1 << 20))
    try:
        numa_mem_list = []
        file_prefix = "/sys/devices/system/node/"
        for numa_id in range(n_numa_node):
            file_meminfo = f"node{numa_id}/meminfo"
            with open(os.path.join(file_prefix, file_meminfo), "r") as f:
                # 1st line contains 'MemTotal':
                #   "Node 0 MemTotal:  65834244 kB"  ->  field 3 is the KB value
                line = f.readline()
            numa_mem_list.append(int(line.split()[3]))
        # Retrieved value in KB, need MB
        return float(min(numa_mem_list) // 1024)
    except (OSError, IndexError, ValueError):
        # Broadened from FileNotFoundError: sysfs may be unreadable
        # (PermissionError in containers) or its first line malformed
        # (IndexError/ValueError) — this path is best-effort, so fall back
        # to splitting total RAM evenly rather than crashing startup.
        numa_mem = psutil.virtual_memory().total / n_numa_node
        # Retrieved value in Byte, need MB
        return float(numa_mem // (1 << 20))