[CPU] fix OOM when mem-fraction is not set (#9090)

This commit is contained in:
Zaili Wang
2025-09-11 14:52:22 +08:00
committed by GitHub
parent 4aa1e69bc7
commit ef959d7b85
6 changed files with 29 additions and 16 deletions

View File

@@ -87,7 +87,7 @@ srt_hip = [
]
# https://docs.sglang.ai/platforms/cpu_server.html
-srt_cpu = ["sglang[runtime_common]"]
+srt_cpu = ["sglang[runtime_common]", "intel-openmp"]
# https://docs.sglang.ai/platforms/ascend_npu.html
srt_npu = ["sglang[runtime_common]"]

View File

@@ -1673,10 +1673,9 @@ class ModelRunner:
def init_threads_binding(self):
omp_cpuids = os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", "all")
+cpu_ids_by_node = get_cpu_ids_by_node()
+n_numa_node = len(cpu_ids_by_node)
 if omp_cpuids == "all":
-cpu_ids_by_node = get_cpu_ids_by_node()
-n_numa_node = len(cpu_ids_by_node)
assert self.tp_size <= n_numa_node, (
f"SGLANG_CPU_OMP_THREADS_BIND is not set, in this case, "
f"tp_size {self.tp_size} should be smaller than or equal to number of numa node on the machine {n_numa_node}. "
@@ -1693,7 +1692,18 @@ class ModelRunner:
)
self.local_omp_cpuid = cpu_ids_by_node[self.tp_rank]
else:
-self.local_omp_cpuid = omp_cpuids.split("|")[self.tp_rank]
+threads_bind_list = omp_cpuids.split("|")
+assert self.tp_size == len(threads_bind_list), (
+    f"SGLANG_CPU_OMP_THREADS_BIND setting must be aligned with TP size parameter ({self.tp_size}). "
+    f"Please double check your settings."
+)
+self.local_omp_cpuid = threads_bind_list[self.tp_rank]
+if self.tp_size > n_numa_node:
+    logger.warning(
+        f"TP size ({self.tp_size})is larger than numa node number ({n_numa_node}), "
+        f"in this case the available memory amount of each rank cannot be determined in prior. "
+        f"Please set proper `--max-total-tokens` to avoid the out-of-memory error."
+    )
def apply_torch_tp(self):
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")

View File

@@ -434,7 +434,9 @@ def get_available_gpu_memory(
elif device == "cpu":
# TODO: rename the variables in the current function to be not GPU specific
-free_gpu_memory = psutil.virtual_memory().available
+total_free_memory = psutil.virtual_memory().available
+n_numa_node: int = len(get_cpu_ids_by_node())
+free_gpu_memory = round(total_free_memory / n_numa_node, 3)
elif device == "npu":
num_gpus = torch.npu.device_count()
assert gpu_id < num_gpus