[Performance]: Process affinity to CPU cores with multiple sockets support (#2171)
This commit is contained in:
@@ -15,6 +15,7 @@
|
||||
|
||||
import base64
|
||||
import ipaddress
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -987,3 +988,37 @@ def direct_register_custom_op(
|
||||
my_lib.impl(op_name, op_func, "CUDA")
|
||||
if fake_impl is not None:
|
||||
my_lib._register_fake(op_name, fake_impl)
|
||||
|
||||
|
||||
def gpu_proc_affinity(
    tp_size: int,
    nnodes: int,
    gpu_id: int,
) -> None:
    """Pin the current process to the block of CPU cores assigned to ``gpu_id``.

    The node's physical cores are split evenly between the tensor-parallel
    ranks hosted on one node (``tp_size // nnodes``), and the block selected
    by ``gpu_id`` becomes this process's CPU affinity. When the logical CPU
    count differs from the physical count, the matching sibling logical CPUs
    are bound as well.

    Args:
        tp_size: Total tensor-parallel size across all nodes.
        nnodes: Number of nodes taking part in the deployment.
        gpu_id: GPU index used to select which block of cores to bind.

    Raises:
        ZeroDivisionError: If ``nnodes`` is 0.
    """
    pid = os.getpid()
    proc = psutil.Process(pid)

    # Query both counts once so every later step sees the same topology.
    total_pcores = psutil.cpu_count(logical=False)
    total_lcores = psutil.cpu_count()

    # psutil reports None when the physical core count cannot be determined;
    # affinity is only an optimization, so leave it untouched instead of
    # failing on the arithmetic below.
    if not total_pcores:
        logger.warning(
            f"Process {pid} gpu_id {gpu_id}: physical core count is unavailable, "
            "CPU affinity is left unchanged"
        )
        return

    # Clamp to 1 so tp_size < nnodes cannot produce a zero divisor.
    tp_size_per_node = max(tp_size // nnodes, 1)

    # Physical cores per TP rank (N.B. expects more cores than GPUs on a node).
    num_cores_bind = total_pcores // tp_size_per_node

    # The modulo wraps gpu_id back into the core range, which lets several
    # data-parallel groups share one node.
    start_cpu_id = (gpu_id * num_cores_bind) % total_pcores
    end_cpu_id = start_cpu_id + num_cores_bind

    bind_cpu_ids = list(range(start_cpu_id, end_cpu_id))
    if total_lcores != total_pcores:
        # HT on: also bind the sibling hardware threads. This assumes the
        # sibling of physical core i is logical CPU i + total_pcores.
        bind_cpu_ids.extend(
            cpu_id + total_pcores for cpu_id in range(start_cpu_id, end_cpu_id)
        )

    # Apply the affinity to the current process and report what was set.
    proc.cpu_affinity(bind_cpu_ids)
    logger.info(
        f"Process {pid} gpu_id {gpu_id} is running on CPUs: {proc.cpu_affinity()}"
    )
|
||||
|
||||
Reference in New Issue
Block a user