From 10189d08dde1096f5759316c0a6ff05962714c4b Mon Sep 17 00:00:00 2001 From: HAI Date: Mon, 25 Nov 2024 14:57:32 -0800 Subject: [PATCH] [Performance]: Process affinity to CPU cores with multiple sockets support (#2171) --- python/sglang/srt/managers/scheduler.py | 4 +++ python/sglang/srt/utils.py | 35 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 1d1cf3688..2ae705422 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -72,6 +72,7 @@ from sglang.srt.utils import ( configure_logger, crash_on_warnings, get_zmq_socket, + gpu_proc_affinity, kill_parent_process, set_random_seed, suppress_other_loggers, @@ -1393,6 +1394,9 @@ def run_scheduler_process( dp_rank: Optional[int], pipe_writer, ): + # set cpu affinity to this gpu process + gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id) + # [For Router] if env var "DP_RANK" exist, set dp_rank to the value of the env var if dp_rank is None and "DP_RANK" in os.environ: dp_rank = int(os.environ["DP_RANK"]) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index e947d1a92..0222824e6 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -15,6 +15,7 @@ import base64 import ipaddress +import itertools import json import logging import os @@ -987,3 +988,37 @@ def direct_register_custom_op( my_lib.impl(op_name, op_func, "CUDA") if fake_impl is not None: my_lib._register_fake(op_name, fake_impl) + + +def gpu_proc_affinity( + tp_size: int, + nnodes: int, + gpu_id: int, +): + # current process + pid = os.getpid() + p = psutil.Process(pid) + + tp_size_per_node = tp_size // nnodes + + # total physical cores + total_pcores = psutil.cpu_count(logical=False) + # physical cores per TP (N.B. more Cores than GPUs on node) + num_cores_bind = total_pcores // tp_size_per_node + + # able to handle multiple DP per node + start_cpu_id = (gpu_id * num_cores_bind) % total_pcores + end_cpu_id = start_cpu_id + num_cores_bind + + if psutil.cpu_count() != psutil.cpu_count(logical=False): + # HT on + upper_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)] + lower_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)] + bind_cpu_ids = list(itertools.chain(upper_cpu_ids, lower_cpu_ids)) + else: + # HT off + bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)] + + # set cpu_affinity to current process + p.cpu_affinity(bind_cpu_ids) + logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")