# Source file: xc-llm-ascend/vllm_ascend/cpu_binding.py
import os
import subprocess
from dataclasses import dataclass
from itertools import accumulate
from typing import Dict, List, Optional, Tuple, Union
import psutil
import torch_npu
from vllm.logger import logger
# Comma-separated list of NPU device IDs visible to this process; None means
# "no restriction" and all devices reported by npu-smi are used.
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
# Optional override for how many CPU cores to bind per device; when unset,
# bind_cpus() derives the count from the NUMA node's core count and a ratio.
CPU_BINDING_NUM = os.getenv("CPU_BINDING_NUM")
def execute_command(cmd_list):
    """Run *cmd_list* (argv-style, ``shell=False``) and return decoded stdout.

    Args:
        cmd_list: Command and arguments as a list, e.g. ``["npu-smi", "info"]``.

    Returns:
        The child's stdout decoded with the default encoding.

    Raises:
        subprocess.TimeoutExpired: if the command does not finish in time.
            The child is killed first, so a hung ``npu-smi``/``lspci`` call
            cannot leak a running process (the original code left the child
            alive, and ``Popen.__exit__`` would then block on it forever).
    """
    with subprocess.Popen(cmd_list,
                          shell=False,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE) as p:
        try:
            out, _ = p.communicate(timeout=1000)
        except subprocess.TimeoutExpired:
            # Per the subprocess docs: kill, then drain the pipes, then
            # re-raise so the caller still sees the timeout.
            p.kill()
            p.communicate()
            raise
    return out.decode()
@dataclass
class DeviceInfo:
    """
    Parse a single line of ``npu-smi info -m`` device information into
    structured data.

    A line has the form ``<npu_id> <chip_id> <chip_logic_id> <chip_name>``.
    ``chip_logic_id`` is numeric for compute chips; for chips without a
    logical ID npu-smi prints a placeholder (e.g. "-"), which is kept as a
    string so that callers filtering on ``isinstance(chip_logic_id, int)``
    can exclude such chips.
    """
    _info_line: str = ""
    npu_id: int = 0
    chip_id: int = 0
    chip_logic_id: Union[int, str] = 0
    chip_name: str = ""

    def __post_init__(self):
        npu_id_str, chip_id_str, chip_logic_id_str, self.chip_name = self._info_line.strip(
        ).split(None, 3)
        self.npu_id = int(npu_id_str)
        self.chip_id = int(chip_id_str)
        if chip_logic_id_str.isnumeric():
            self.chip_logic_id = int(chip_logic_id_str)
        else:
            # Bug fix: previously the field silently stayed at its default 0
            # (an int), so non-compute chips were mistaken for logical
            # device 0 by isinstance-based filters downstream.
            self.chip_logic_id = chip_logic_id_str
class NpuHbmInfo:
    """Process-wide cache of NPU memory (HBM/DDR) capacity and usage.

    All state is stored on the class itself, so each value is queried from
    ``npu-smi`` at most once per process and shared across callers.
    """
    # Logical NPU ids visible to this process, indexed by rank.
    visible_npu_ids: Optional[List[int]] = None
    # Cached memory capacity in bytes (set by get_hbm_capacity).
    hbm_capacity: Optional[int] = None
    # NOTE(review): get_hbm_usage() returns without ever assigning this,
    # so usage is re-queried on every call — confirm whether caching was
    # intended.
    hbm_usage: Optional[int] = None

    @classmethod
    def set_visible_devices(cls, world_size):
        """
        Determine which NPUs are visible to the current process and cache their
        logical NPU IDs in `cls.visible_npu_ids`.
        """
        if cls.visible_npu_ids:
            return
        if ASCEND_RT_VISIBLE_DEVICES is None:
            # No restriction: every device reported by npu-smi is visible.
            devices = sorted(list(_get_device_map_info().keys()))
        else:
            devices = [int(x) for x in ASCEND_RT_VISIBLE_DEVICES.split(",")]
            devices_str = ASCEND_RT_VISIBLE_DEVICES
            devices = [int(x) for x in devices_str.split(",")]
        device_map_info = _get_device_map_info()
        npu_ids = []
        for device in devices:
            device_info = device_map_info.get(device)
            if device_info is None:
                raise RuntimeError(
                    f"Device {device} not found in device_map_info")
            npu_ids.append(device_info.npu_id)
        cls.visible_npu_ids = npu_ids

    @classmethod
    def get_hbm_capacity(cls, rank, world_size, need_nz):
        """
        Query and cache the HBM (or DDR) capacity in **bytes** for the NPU assigned
        to the current process.
        """
        soc_version = torch_npu._C._npu_get_soc_version()
        if cls.hbm_capacity:
            return cls.hbm_capacity
        if not cls.visible_npu_ids:
            cls.set_visible_devices(world_size)
        assert cls.visible_npu_ids is not None
        npu_id = cls.visible_npu_ids[rank]
        memory_info = execute_command(
            ["npu-smi", "info", "-i", f"{npu_id}", "-t",
             "memory"]).split("\n")[1:]
        # Choose which table row to read depending on the SoC generation and
        # the memory layout flag (soc 240 reports a plain "Capacity" row).
        if soc_version == 240:
            hbm_capacity_key = 'Capacity(MB)'
        elif not need_nz:
            hbm_capacity_key = 'HBM Capacity(MB)'
        else:
            hbm_capacity_key = 'DDR Capacity(MB)'
        for line in memory_info:
            try:
                # NOTE(review): maxsplit=2 means a row with two or more colons
                # yields three parts and is skipped via ValueError — presumably
                # the rows of interest contain a single colon; confirm against
                # actual npu-smi output (maxsplit=1 may have been intended).
                key, value = line.strip().split(':', 2)
                if key.strip() == hbm_capacity_key:
                    # Value is reported in MB; convert to bytes.
                    cls.hbm_capacity = int(value.strip()) * 1024 * 1024
                    return cls.hbm_capacity
            except ValueError:
                # Not a "key: value" row — ignore.
                pass
        raise ValueError('not found valid hbm capactiy')

    @classmethod
    def get_hbm_usage(cls, rank, world_size, need_nz):
        """
        Return the current HBM or DDR usage
        ratio (0-1) for the NPU assigned to the given rank.
        """
        if cls.hbm_usage:
            return cls.hbm_usage
        if not cls.visible_npu_ids:
            cls.set_visible_devices(world_size)
        assert cls.visible_npu_ids is not None
        npu_id = cls.visible_npu_ids[rank]
        usage_info = execute_command(
            ["npu-smi", "info", "-i", f"{npu_id}", "-t",
             "usages"]).split("\n")[1:]
        soc_version = torch_npu._C._npu_get_soc_version()
        if soc_version == 240:
            hbm_capacity_key = 'Memory Usage Rate(%)'
        elif not need_nz:
            hbm_capacity_key = 'HBM Usage Rate(%)'
        else:
            hbm_capacity_key = 'DDR Usage Rate(%)'
        for line in usage_info:
            try:
                key, value = line.strip().split(':', 2)
                if key.strip() == hbm_capacity_key:
                    # NOTE(review): the +1 inflates the reported percentage by
                    # one point — looks like a deliberate safety margin so the
                    # caller never under-estimates usage; confirm intent.
                    hbm_usage = (float(value.strip()) + 1) / 100
                    return hbm_usage
            except ValueError:
                # Not a "key: value" row — ignore.
                pass
        raise ValueError('not found valid hbm usage')
def _get_device_map_info() -> Dict[int, DeviceInfo]:
    """
    Return a mapping from logical chip ID (int) to its DeviceInfo object,
    built from ``npu-smi info -m`` output (header row excluded).
    """
    raw_rows = execute_command(["npu-smi", "info",
                                "-m"]).strip().split("\n")
    mapping: Dict[int, DeviceInfo] = {}
    # Skip the header row; keep only chips that expose a numeric logic id.
    for row in raw_rows[1:]:
        info = DeviceInfo(row.strip())
        if isinstance(info.chip_logic_id, int):
            mapping[info.chip_logic_id] = info
    return mapping
def _get_pcie_info(devices: List[int], keyword="PCIeBusInfo"):
    """
    Query each NPU in *devices* via ``npu-smi info -t board`` and return a
    mapping from logical device ID to its PCIe bus address.
    """
    device_map_info = _get_device_map_info()
    pcie_tbl = {}
    for dev in devices:
        info = device_map_info.get(dev)
        if not info:
            raise RuntimeError(
                "Can not get device info, you can use BIND_CPU=0 to skip.")
        board_lines = execute_command([
            "npu-smi", "info", "-t", "board", "-i", f"{info.npu_id}",
            "-c", f"{info.chip_id}"
        ]).strip().split("\n")
        for raw in board_lines:
            # Collapse all whitespace so "PCIe Bus Info : ..." matches.
            condensed = ''.join(raw.split())
            if condensed.startswith(keyword):
                # Everything after "<keyword>:" is the bus address.
                pcie_tbl[dev] = condensed[len(keyword) + 1:]
                break
    return pcie_tbl
def _get_numa_info(pcie_tbl, keyword="NUMAnode"):
    """
    From a device->PCIe-address table, build two mappings via ``lspci``:
    device -> NUMA node, and NUMA node -> list of devices.
    """
    device_numa_tbl: Dict[int, int] = {}  # device id -> numa id
    numa_devices_tbl: Dict[int, List[int]] = {}  # numa id -> device ids
    for device, pcie_no in pcie_tbl.items():
        lspci_lines = execute_command(["lspci", "-s", f"{pcie_no}",
                                       "-vvv"]).split("\n")
        for raw in lspci_lines:
            # Collapse whitespace so "NUMA node: N" matches the keyword.
            condensed = ''.join(raw.split())
            if not condensed.startswith(keyword):
                continue
            numa_id = int(condensed[len(keyword) + 1:])
            device_numa_tbl[device] = numa_id
            numa_devices_tbl.setdefault(numa_id, []).append(device)
            break
    return device_numa_tbl, numa_devices_tbl
def _get_numa_info_v2(
        devices: List[int],
        keyword="NUMAnode(s)") -> Tuple[Dict[int, int], Dict[int, List[int]]]:
    """
    Evenly distribute the given device list across all NUMA nodes and return
    both device-to-numa and numa-to-devices mappings.

    Fallback used when PCIe-based NUMA affinity detection yields nothing.
    Defaults to a single NUMA node if lscpu does not report the count.
    """
    numa_nodes = 1
    numa_info = execute_command(["lscpu"]).split("\n")
    for raw in numa_info:
        line = ''.join(raw.split())
        if keyword not in line:
            continue
        # Bug fix: parse the full value after the colon. The old code read
        # only the last character (int(line[-1])), which is wrong on hosts
        # with 10 or more NUMA nodes.
        numa_nodes = int(line.split(":")[-1])
        break
    # Split devices as evenly as possible: the first `tail_device` nodes
    # receive one extra device each.
    device_per_numa, tail_device = divmod(len(devices), numa_nodes)
    device_count_per_numa_list = [
        device_per_numa + (i < tail_device) for i in range(numa_nodes)
    ]
    ends = list(accumulate(device_count_per_numa_list))
    starts = [0] + ends[:-1]
    numa_devices_tbl = {
        ind: devices[start:end]
        for ind, (start, end) in enumerate(zip(starts, ends))
    }
    device_numa_tbl = {
        device: numa
        for numa, _devices in numa_devices_tbl.items()
        for device in _devices
    }
    return device_numa_tbl, numa_devices_tbl
def _get_cpu_info(numa_ids, keyword1="NUMAnode", keyword2="CPU(s)"):
    """
    Parse lscpu output to build a dict that maps each NUMA node ID to the
    list of CPU core IDs belonging to it.

    Accepts range lists in the forms lscpu actually emits, e.g.
    ``0-31``, ``0,2-5`` (isolated single cores), and an empty list for a
    CPU-less node. Raises Exception on anything else.
    """
    cpu_idx_tbl = dict()
    # Pre-compute the condensed prefixes, e.g. "NUMAnode0CPU(s)".
    numa_keywords = [keyword1 + str(idx) + keyword2 for idx in numa_ids]
    cpu_info = execute_command(["lscpu"]).split("\n")
    for raw in cpu_info:
        line = ''.join(raw.split())
        if not any(line.startswith(word) for word in numa_keywords):
            continue
        split_info = line.split(":")
        cpu_id_ranges = split_info[-1].split(",")
        ranges = list()
        for range_str in cpu_id_ranges:
            if not range_str:
                # CPU-less NUMA node: lscpu prints an empty value.
                continue
            endpoints = range_str.split("-")
            if len(endpoints) == 1:
                # Bug fix: a single isolated core (e.g. the "0" in "0,2-5")
                # previously raised; accept it.
                ranges.append(int(endpoints[0]))
            elif len(endpoints) == 2:
                ranges.extend(
                    range(int(endpoints[0]),
                          int(endpoints[1]) + 1))
            else:
                raise Exception(
                    "lscpu command output error, please check !")
        # Recover the node id from "NUMAnode<N>CPU(s)".
        numa_id = int(split_info[0].replace(keyword1,
                                            '').replace(keyword2, ''))
        cpu_idx_tbl[numa_id] = ranges
    return cpu_idx_tbl
def bind_cpus(rank_id, ratio=0.5):
    """Pin the current process to CPU cores on the NUMA node of its NPU.

    Args:
        rank_id: Index of this process's device within the visible devices.
        ratio: Fraction of the NUMA node's cores to divide among its NPUs
            when CPU_BINDING_NUM is not set.

    Raises:
        RuntimeError: when NUMA/CPU information cannot be resolved, or the
            requested per-device core count exceeds what the node has.
        ValueError: when CPU_BINDING_NUM is negative.
    """
    # Get all visible devices.
    visible_devices = ASCEND_RT_VISIBLE_DEVICES
    if visible_devices is None:
        devices = sorted(list(_get_device_map_info().keys()))
    else:
        devices = [int(x) for x in visible_devices.split(",")]
    # Query the NUMA affinity of each NPU via its PCIe address; if this fails,
    # fall back to evenly distributing the devices across NUMA nodes.
    device_pcie_tbl = _get_pcie_info(devices)
    device_numa_tbl, numa_devices_tbl = _get_numa_info(device_pcie_tbl)
    if not device_numa_tbl or not numa_devices_tbl:
        device_numa_tbl, numa_devices_tbl = _get_numa_info_v2(devices)
    # Obtain the complete list of CPU cores for each NUMA node.
    cpu_idx_tbl = _get_cpu_info(list(numa_devices_tbl.keys()))
    cur_device = devices[rank_id]
    numa_id = device_numa_tbl.get(cur_device)
    # Bug fix: fail with a clear message instead of an AttributeError on
    # None when the lookup tables are missing an entry.
    if numa_id is None or numa_id not in numa_devices_tbl:
        raise RuntimeError(
            f"Can not get NUMA node of device {cur_device}, "
            "you can use BIND_CPU=0 to skip.")
    # Within the NUMA node, evenly partition the CPU cores among all NPUs
    # (or use the amount specified by CPU_BINDING_NUM). Use a sorted copy so
    # the shared table is not mutated in place.
    shard_devices = sorted(numa_devices_tbl[numa_id])
    all_cpus = cpu_idx_tbl.get(numa_id)
    if not all_cpus:
        raise RuntimeError(
            f"Can not get CPU cores of NUMA node {numa_id}, "
            "you can use BIND_CPU=0 to skip.")
    logger.info(
        f"rank_id: {rank_id}, device_id: {cur_device}, "
        f"numa_id: {numa_id}, shard_devices: {shard_devices}, cpus: {all_cpus}"
    )
    cpu_nums = len(all_cpus)
    if CPU_BINDING_NUM is None:
        cpu_num_per_device = int(cpu_nums * ratio // len(shard_devices))
    else:
        cpu_num_per_device = int(CPU_BINDING_NUM)
        # Bug fix: validate the user-supplied value before using it.
        if cpu_num_per_device < 0:
            raise ValueError("CPU_BINDING_NUM should not be less than 0.")
    if len(shard_devices) * cpu_num_per_device > cpu_nums:
        raise RuntimeError(
            f"Cpu num in numa {numa_id} to assign {cpu_num_per_device} for every device is not enough, "
            f"please decrease the value of CPU_BINDING_NUM!")
    # This device's slice of the node's core list.
    idx = shard_devices.index(cur_device)
    binding_cpus = all_cpus[idx * cpu_num_per_device:(idx + 1) *
                            cpu_num_per_device]
    # CPU bind.
    p = psutil.Process()
    p.cpu_affinity(binding_cpus)
    new_affinity = p.cpu_affinity()
    logger.info(
        f"process {p.pid}, new_affinity is {new_affinity}, cpu count {cpu_num_per_device}"
    )