Files
xc-llm-ascend/vllm_ascend/cpu_binding.py
Frank Chen b771ca9a47 [CPU binding] Implement global CPU slicing and improve IRQ binding for Ascend NPUs (#6945)
### What this PR does / why we need it?

This PR introduces global CPU slicing for Ascend NPUs to ensure
non-overlapping CPU partitions, addresses IRQ binding logical errors on
A3, and enhances the logic for determining total NPUs in CPU allocation.
These changes are necessary to optimize CPU resource management and
improve system stability.

- **Global CPU Slicing**: Introduced a global CPU slicing mechanism for
Ascend NPUs to ensure non-overlapping CPU partitions across multiple
processes or data parallel groups, preventing resource contention.
- **Improved IRQ Binding for A3 Devices**: Refined the IRQ binding logic
specifically for Ascend A3 devices, correctly mapping logical NPU IDs to
physical card and chip IDs for accurate npu-smi queries and preventing
multi-process overwrite of IRQ settings.
- **Enhanced NPU Count Determination**: Improved the logic for
determining the total number of logical NPUs, prioritizing NPU mapping
information to ensure more accurate CPU allocation.
- **Minimum CPU Requirement**: Established a minimum requirement of 5
CPUs per NPU for binding, reserving specific cores for IRQ, main, ACL,
and release operations to ensure stable operation.

### Does this PR introduce _any_ user-facing change?

No user-facing changes are introduced.

### How was this patch tested?

CI passed with new added/existing tests.

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

---------

Signed-off-by: c00818886 <chenchuwei@huawei.com>
2026-03-03 17:20:52 +08:00

511 lines
21 KiB
Python

#!/usr/bin/env python3
import os
import platform
import shutil
import subprocess
from collections import defaultdict
import psutil
from vllm.logger import logger
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
# Tunables and environment knobs for CPU binding.
MASK_BIT = 32  # Number of bits in a CPU affinity mask group
MIN_CPUS_PER_NPU = 5  # 2(IRQ) + 1(main, at least 1 CPU) + 1(acl) + 1(release) = 5 CPUs per NPU
# procfs file exposing this process's cpuset via its "Cpus_allowed_list" line.
ALLOWED_CPUS_PATH = "/proc/self/status"
# Optional comma-separated list of logical NPU ids visible to this process.
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
# Binding strategies: carve CPUs by npu-smi topology affinity, or slice the
# whole allowed-cpu set globally by logical NPU id.
TOPO_AFFINITY_MODE = "topo_affinity"
GLOBAL_SLICE_MODE = "global_slice"
# Default binding strategy per Ascend device generation.
DEVICE_BINDING_MODE: dict["AscendDeviceType", str] = {
    AscendDeviceType.A2: TOPO_AFFINITY_MODE,
    AscendDeviceType.A3: GLOBAL_SLICE_MODE,
    AscendDeviceType._310P: TOPO_AFFINITY_MODE,
}
def is_arm_cpu() -> bool:
    """Return True when the host CPU architecture is ARM/AArch64.

    CPU binding is only applied on ARM hosts. Architectures that are neither
    a known x86 nor a known ARM variant are treated as unsupported, with a
    warning, rather than guessed at.
    """
    arch = platform.machine().lower()
    if arch in {"x86_64", "amd64", "i386", "i686"}:
        return False
    if arch in {"aarch64", "arm64"} or arch.startswith("arm"):
        return True
    # Lazy %-formatting: the message is only built if the record is emitted.
    logger.warning("Unknown CPU architecture '%s', CPU binding will be disabled.", arch)
    return False
def execute_command(cmd: list[str]) -> tuple[str, int]:
    """Run *cmd* without a shell and return (decoded stdout, return code).

    stderr is captured (and discarded) so external tools do not pollute the
    caller's output. A generous timeout guards against a hung tool; unlike
    the previous Popen/communicate version, subprocess.run kills and reaps
    the child when the timeout expires instead of leaving it running.
    """
    result = subprocess.run(
        cmd,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=1000,
    )
    return result.stdout.decode(), result.returncode
class DeviceInfo:
    """Read-only snapshot of NPU topology and allowed CPUs for this process.

    All fields are populated once in __init__ by shelling out to npu-smi
    and reading /proc/self/status.
    """

    def __init__(self):
        # Card/board id -> {chip id -> logical NPU id string}, from `npu-smi info -m`.
        self.npu_map_info: dict[str, dict[str, str]] = self.get_npu_map_info()
        # CPUs this process may run on (cgroup cpuset); empty on non-Linux.
        self.allowed_cpus: list[int] = self.parse_allowed_cpus()
        # Logical NPU ids that currently host a process, sorted ascending.
        self.running_npu_list: list[int] = self.get_running_npus()
        # Logical NPU id -> CPUs reported affine by `npu-smi info -t topo`.
        self.npu_affinity: dict[int, list[int]] = self.parse_topo_affinity()
        # Every logical NPU id known from the mapping, sorted ascending.
        self.all_logic_npus: list[int] = self.get_all_logic_npus()
        self.total_logic_npus: int = len(self.all_logic_npus)

    @staticmethod
    def expand_cpu_list(allowed_list_str: str) -> list[int]:
        """Expand a kernel-style CPU list string (e.g. "0-3,8,10-11") into ints."""
        allowed_cpus_list: list[int] = []
        for per_range in allowed_list_str.split(","):
            if "-" in per_range:
                # Inclusive range "a-b".
                start_cpu, end_cpu = map(int, per_range.split("-"))
                allowed_cpus_list.extend(range(start_cpu, end_cpu + 1))
            else:
                allowed_cpus_list.append(int(per_range))
        return allowed_cpus_list

    def get_all_logic_npus(self) -> list[int]:
        """Collect all logical NPU IDs from the NPU mapping.

        self.npu_map_info maps a board_id (A3) or npu_id (A2) to a per-chip map.
        The per-chip map uses chip_id as the key and the logical NPU ID string
        as the value.
        """
        logic_ids: set[int] = set()
        for _, chip_map in self.npu_map_info.items():
            for _, logic_str in chip_map.items():
                # Only numeric logic ids are real NPUs.
                if logic_str and logic_str.isdigit():
                    logic_ids.add(int(logic_str))
        return sorted(logic_ids)

    @staticmethod
    def get_npu_map_info() -> dict[str, dict[str, str]]:
        """Parse `npu-smi info -m` into {npu_id: {chip_id: chip_logic_id}}.

        Rows whose third column is not numeric carry no usable logical id
        and are skipped.
        """
        npu_map_info: dict[str, dict[str, str]] = {}
        npu_info, _ = execute_command(["npu-smi", "info", "-m"])
        # Drop the table header line; the remaining lines are data rows.
        npu_map = npu_info.strip().split("\n")[1:]
        for line in npu_map:
            npu_id, chip_id, chip_logic_id = line.strip().split()[:3]
            if not chip_logic_id.isdigit():
                continue
            if npu_id not in npu_map_info:
                npu_map_info[npu_id] = {}
            npu_map_info[npu_id][chip_id] = chip_logic_id
        return npu_map_info

    def get_running_npus(self) -> list[int]:
        """Return sorted logical NPU ids that currently have running processes.

        Parses the process table at the bottom of `npu-smi info` and maps each
        (npu_id, chip_id) pair to its logical id via npu_map_info. When
        ASCEND_RT_VISIBLE_DEVICES is set, the result is intersected with it.

        Raises:
            RuntimeError: if a process row cannot be mapped to a logical id,
                or if no running NPU remains after filtering.
        """
        npu_message, _ = execute_command(["npu-smi", "info"])
        in_proc_section = False
        running_npu_set = set()
        for line in npu_message.splitlines():
            line = line.strip()
            # The process table starts at a header like "| NPU ... Process id ... |".
            if line.startswith("| NPU") and "Process id" in line:
                in_proc_section = True
                continue
            if not in_proc_section:
                continue
            if line.startswith("| "):
                parts = [p.strip() for p in line.strip("|").split("|")]
                if len(parts) < 2:
                    continue
                # First cell is expected to be "npu_id chip_id ...".
                # NOTE(review): assumes at least two tokens in that cell — an
                # IndexError is possible on layouts with a single token; confirm
                # against all npu-smi output formats.
                npu_id = parts[0].split()[0]
                chip_id = parts[0].split()[1]
                if not npu_id.isdigit() or not chip_id.isdigit():
                    continue
                chip_logic_id = self.npu_map_info.get(npu_id, {}).get(chip_id)
                if not chip_logic_id or not chip_logic_id.isdigit():
                    raise RuntimeError("Failed to get correct chip_logic_id from command 'npu-smi info -m'.")
                running_npu_set.add(int(chip_logic_id))
        if ASCEND_RT_VISIBLE_DEVICES:
            # Keep only NPUs this process is allowed to see.
            devices_str = ASCEND_RT_VISIBLE_DEVICES
            devices_list = [int(x) for x in devices_str.split(",")]
            running_npu_set = set(devices_list) & running_npu_set
        if not running_npu_set:
            raise RuntimeError("Can not get running npu info.")
        return sorted(running_npu_set)

    def parse_allowed_cpus(self) -> list[int]:
        """Return the CPUs this process may use, per Cpus_allowed_list.

        Returns an empty list when /proc/self/status does not exist
        (e.g. non-Linux hosts).

        Raises:
            RuntimeError: if the status file lacks a Cpus_allowed_list line.
        """
        if not os.path.exists(ALLOWED_CPUS_PATH):
            return []
        with open(ALLOWED_CPUS_PATH) as f:
            for line in f:
                if line.startswith("Cpus_allowed_list"):
                    return self.expand_cpu_list(line.split()[1])
        raise RuntimeError("Can not found specific 'Cpus_allowed_list' in the '/proc/self/status' file.")

    def parse_topo_affinity(self) -> dict[int, list[int]]:
        """Parse `npu-smi info -t topo` into {logical NPU id: affine CPUs}.

        Logical ids are assigned by the order NPU rows appear in the output;
        the header row (whose last column is the literal "Affinity") is
        skipped without consuming an id.
        """
        chip_logic_id = 0
        affinity: dict[int, list[int]] = {}
        affinity_message, _ = execute_command(["npu-smi", "info", "-t", "topo"])
        for line in affinity_message.splitlines():
            if line.startswith("NPU"):
                parts = line.split()
                last_part = parts[-1]
                if last_part != "Affinity":
                    affinity[chip_logic_id] = self.expand_cpu_list(last_part)
                    chip_logic_id += 1
        return affinity
class CpuAlloc:
    """Plans and applies CPU, memory and IRQ bindings for one rank's NPU.

    Typical usage is run_all(): build per-NPU CPU pools, carve each pool
    into IRQ/main/acl/release slices, then apply taskset, migratepages and
    /proc/irq affinity settings for the NPU owned by this rank.
    """

    def __init__(self, rank_id: int):
        # Index of this process within device_info.running_npu_list.
        self.rank_id = rank_id
        self.device_info: DeviceInfo = DeviceInfo()
        # CPU id -> NUMA node id (filled by build_cpu_node_map).
        self.cpu_node: dict[int, int] = {}
        # NUMA node id -> CPUs on that node.
        self.numa_to_cpu_map: dict[int, list[int]] = defaultdict(list)
        # Logical NPU id -> CPUs reserved for that NPU (filled by build_cpu_pools).
        self.npu_cpu_pool: dict[int, list[int]] = {}
        # Per-NPU CPU slices for the main process, acl_thread and release_thread.
        self.assign_main: dict[int, list[int]] = {}
        self.assign_acl: dict[int, list[int]] = {}
        self.assign_rel: dict[int, list[int]] = {}

    @staticmethod
    def cpu_to_mask(cpu: int) -> str:
        """Build a /proc/irq/*/smp_affinity hex mask selecting only *cpu*.

        The kernel expects comma-separated 32-bit hex groups, most
        significant group first.
        """
        group = cpu // MASK_BIT
        bit = cpu % MASK_BIT
        value = 1 << bit
        mask = f"{value:08x}"
        # Pad with one zeroed 32-bit group (8 hex digits) per lower-order group.
        for _ in range(1, group + 1):
            mask = f"{mask},{'0' * (MASK_BIT // 4)}"
        return mask

    @staticmethod
    def get_threads_map(thread_message: str) -> dict[str, dict[str, list[str]]]:
        """Group acl/release thread ids by owning pid from `ps -Te` output.

        Returns {main_pid: {"acl_thread": [tid, ...], "release_thread": [...]}}.
        Lines naming neither thread kind are ignored.
        """
        threads_map: dict[str, dict[str, list[str]]] = {}
        for line in thread_message.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            # `ps -Te` rows start with "PID SPID ...".
            main_pid, sub_pid = parts[0], parts[1]
            if "acl_thread" in line:
                key = "acl_thread"
            elif "release_thread" in line:
                key = "release_thread"
            else:
                continue
            if main_pid not in threads_map:
                threads_map[main_pid] = {"acl_thread": [], "release_thread": []}
            threads_map[main_pid][key].append(sub_pid)
        return threads_map

    @staticmethod
    def bind(pid: str, cpus: list[int], bind_sub_thread: bool) -> None:
        """Pin *pid* to *cpus* via taskset; no-op when *cpus* is empty.

        bind_sub_thread=True uses `taskset -acp` so every thread of the
        process is re-pinned, not just the given task.

        Raises:
            RuntimeError: if taskset exits non-zero.
        """
        if cpus:
            cpu_list = ",".join(map(str, cpus))
            if bind_sub_thread:
                bind_result, return_code = execute_command(["taskset", "-acp", cpu_list, pid])
            else:
                bind_result, return_code = execute_command(["taskset", "-cp", cpu_list, pid])
            if return_code != 0:
                raise RuntimeError(f"Failed to bind {pid} to CPU {cpu_list}.")

    def average_distribute(self, groups: dict[str, list[int]]) -> dict[int, list[int]]:
        """Evenly split a shared CPU pool among the NPUs that share it.

        *groups* maps a stringified CPU list to the NPUs assigned that
        identical pool. The last NPU of each group absorbs any remainder.
        """
        result: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            cpu_list = sorted(self.npu_cpu_pool[npu_list[0]])
            cpu_num_per_npu = len(cpu_list) // len(npu_list)
            for i, npu in enumerate(npu_list):
                start_index = i * cpu_num_per_npu
                end_index = (i + 1) * cpu_num_per_npu if i < len(npu_list) - 1 else len(cpu_list)
                result[npu] = cpu_list[start_index:end_index]
        return result

    def extend_numa(self, cpu_list: list[int]) -> list[int]:
        """Widen a single-NUMA CPU list with allowed CPUs of the next node.

        Lists already spanning multiple NUMA nodes are returned unchanged;
        the result is sorted and de-duplicated.
        """
        if not cpu_list:
            return []
        nodes = {self.cpu_node[c] for c in cpu_list}
        if len(nodes) != 1:
            return cpu_list
        node = list(nodes)[0]
        # Wrap around to node 0 after the highest node.
        next_node = (node + 1) % len(self.numa_to_cpu_map)
        extended = cpu_list[:]
        for cpu in self.numa_to_cpu_map[next_node]:
            if cpu in self.device_info.allowed_cpus:
                extended.append(cpu)
        return sorted(set(extended))

    def build_cpu_node_map(self) -> None:
        """Populate cpu_node and numa_to_cpu_map from `lscpu -e=CPU,NODE`.

        Raises:
            RuntimeError: if lscpu yields no NUMA information.
        """
        cpu_numa_map, _ = execute_command(["lscpu", "-e=CPU,NODE"])
        for line in cpu_numa_map.splitlines():
            line = line.strip()
            # Skip the header and blank lines; data rows start with a digit.
            if not line or not line[0].isdigit():
                continue
            cpu_str, node_str = line.split()
            cpu = int(cpu_str)
            node = int(node_str)
            self.cpu_node[cpu] = node
            self.numa_to_cpu_map[node].append(cpu)
        if len(self.numa_to_cpu_map) == 0:
            raise RuntimeError("lscpu command output error, no NUMA node available. Please check!")

    def build_global_slice_cpu_pool(self) -> None:
        """
        Build per-NPU CPU pools by slicing allowed_cpus using GLOBAL logical NPU ids.

        Why:
        - Multiple processes/DP groups may share the SAME cpuset (same allowed_cpus).
        - If each process slices only its visible NPUs, CPU ranges overlap across processes.
        - Global slicing ensures deterministic, non-overlapping CPU partitions per logical NPU id.

        Notes:
        - This strategy does NOT rely on npu-smi topo affinity.
        - NUMA locality is achieved only if CPU numbering aligns with NUMA layout.
        - Requires per-NPU slice size >= 5 (IRQ(2) + main(>=1) + acl(1) + release(1)).
        """
        running = list(self.device_info.running_npu_list)
        if not running:
            return
        allowed = sorted(set(self.device_info.allowed_cpus))
        total_cpu = len(allowed)
        if total_cpu == 0:
            return
        # Prefer mapping info (npu-smi info -m), fallback to topo keys, then visible list
        if self.device_info.total_logic_npus > 0:
            total_npus = self.device_info.total_logic_npus
        elif self.device_info.npu_affinity:
            total_npus = len(self.device_info.npu_affinity)
        else:
            total_npus = len(running)
        if total_npus <= 0:
            return
        # Compute global per-NPU slicing
        base = total_cpu // total_npus
        extra = total_cpu % total_npus
        logger.debug(
            f"[cpu_global_slice] rank:{self.rank_id} ASCEND_RT_VISIBLE_DEVICES={ASCEND_RT_VISIBLE_DEVICES} "
            f"running_npu_list:{running} total_npus:{total_npus} allowed_cpus:{total_cpu} "
            f"base:{base} extra:{extra} allowed_cpus_head:{allowed[:16]} allowed_cpus_tail:{allowed[-16:]}"
        )
        # Enforce per-NPU slice length >= 5.
        # Because with remainder distribution, some NPUs may get 'base' cores and some get 'base+1'.
        # The minimum slice size is 'base'.
        if base < MIN_CPUS_PER_NPU:
            raise RuntimeError(
                "Insufficient CPUs for binding with IRQ/ACL/REL reservations: "
                f"total_allowed={total_cpu}, total_npus={total_npus}, "
                f"min_per_npu={base} (<{MIN_CPUS_PER_NPU}). "
                f"Need at least {total_npus * MIN_CPUS_PER_NPU} CPUs in cpuset."
            )

        def _slice_for_npu(global_npu_id: int) -> list[int]:
            # start = global_npu_id*base + min(global_npu_id, extra)
            start = global_npu_id * base + (global_npu_id if global_npu_id < extra else extra)
            take = base + (1 if global_npu_id < extra else 0)
            end = start + take
            return allowed[start:end]

        for npu in running:
            if npu < 0 or npu >= total_npus:
                raise RuntimeError(f"Invalid NPU id {npu}, total_npus={total_npus}.")
            cpus = _slice_for_npu(npu)
            # Extra safety: should always be >= base >= 5
            if len(cpus) < MIN_CPUS_PER_NPU:
                raise RuntimeError(
                    f"NPU{npu} got too few CPUs: {len(cpus)} (<5). "
                    f"total_allowed={total_cpu}, total_npus={total_npus}, base={base}, extra={extra}"
                )
            self.npu_cpu_pool[npu] = cpus

    @staticmethod
    def _binding_mode() -> str:
        """Return the binding strategy configured for the detected device type."""
        device_type = get_ascend_device_type()
        return DEVICE_BINDING_MODE.get(device_type, TOPO_AFFINITY_MODE)

    def build_cpu_pools(self) -> None:
        """Fill npu_cpu_pool according to the device's binding mode.

        global_slice mode slices allowed CPUs by logical NPU id. topo_affinity
        mode uses npu-smi topology affinity (falling back to global slicing
        when no affinity data exists) and then evenly re-distributes any pool
        shared by several NPUs.

        Raises:
            RuntimeError: when topo affinity conflicts with the allowed cpuset.
        """
        self.build_cpu_node_map()
        mode = self._binding_mode()
        logger.info(f"[cpu_bind_mode] mode={mode} rank={self.rank_id} visible_npus={self.device_info.running_npu_list}")
        if mode == GLOBAL_SLICE_MODE:
            self.build_global_slice_cpu_pool()
            return
        # topo_affinity mode
        if not self.device_info.npu_affinity:
            logger.warning("NPU topo affinity not found, fallback to global-slice CPU binding.")
            self.build_global_slice_cpu_pool()
            return
        for npu in self.device_info.running_npu_list:
            base_cpu_list = [
                cpu for cpu in self.device_info.npu_affinity.get(npu, []) if cpu in self.device_info.allowed_cpus
            ]
            if not base_cpu_list:
                raise RuntimeError("CPUs available in 'Cpus_allowed_list' conflict with NUMA affinity.")
            extra_cpu_list = self.extend_numa(base_cpu_list)
            self.npu_cpu_pool[npu] = extra_cpu_list
        # NPUs that ended up with identical pools must share them fairly.
        groups = defaultdict(list)
        for npu, cpus in self.npu_cpu_pool.items():
            groups[str(cpus)].append(npu)
        final: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            if len(npu_list) == 1:
                final[npu_list[0]] = self.npu_cpu_pool[npu_list[0]]
            else:
                final.update(self.average_distribute({key: npu_list}))
        self.npu_cpu_pool = final

    def allocate(self) -> None:
        """Split each NPU's pool into main/acl/release slices.

        Layout per pool: cpus[0:2] stay reserved for IRQ binding, cpus[2:-2]
        go to the main process, cpus[-2] to acl_thread, cpus[-1] to
        release_thread.

        Raises:
            RuntimeError: if any pool is smaller than MIN_CPUS_PER_NPU.
        """
        for npu, pool in self.npu_cpu_pool.items():
            if len(pool) >= MIN_CPUS_PER_NPU:
                main = pool[2:-2]
                acl = [pool[-2]]
                rel = [pool[-1]]
            else:
                raise RuntimeError(
                    f"The number of CPUs is insufficient. Each NPU requires at least {MIN_CPUS_PER_NPU} CPUs."
                )
            self.assign_main[npu] = main
            self.assign_acl[npu] = acl
            self.assign_rel[npu] = rel

    def print_plan(self) -> None:
        """Log the main/acl/release CPU assignment for this rank's NPU."""
        logger.info("The CPU allocation plan is as follows:")
        current_npu = self.device_info.running_npu_list[self.rank_id]
        main = " ".join(map(str, self.assign_main[current_npu]))
        acl = " ".join(map(str, self.assign_acl[current_npu]))
        # NOTE(review): str() on a list renders e.g. "[7]", so the log line
        # shows "release=[[7]]" — probably meant " ".join like main/acl; confirm.
        rel = str(self.assign_rel[current_npu]) if self.assign_rel[current_npu] else ""
        logger.info(f"NPU{current_npu}: main=[{main}] acl=[{acl}] release=[{rel}]")

    def bind_memory(self, pid: str, npu: int) -> None:
        """Migrate *pid*'s memory to the NUMA node hosting *npu*'s CPU pool.

        Best-effort: silently skips when migratepages is unavailable or the
        target node cannot be determined.
        """

        def _get_npu_numa_node(npu_id: int) -> int | None:
            # Use the first CPU of the pool as the NUMA anchor.
            cpu_pool = self.npu_cpu_pool.get(npu_id, [])
            if not cpu_pool:
                return None
            anchor_cpu = cpu_pool[0]
            return self.cpu_node.get(anchor_cpu)

        if not shutil.which("migratepages"):
            logger.info("The 'migratepages' command is not available, skipping memory binding.")
            return
        target_numa = _get_npu_numa_node(npu)
        if target_numa is None:
            logger.warning(f"[migrate] rank:{self.rank_id} -> NPU{npu} has no CPU pool, skip memory binding.")
            return
        all_numa_nodes = sorted(self.numa_to_cpu_map.keys())
        if target_numa not in all_numa_nodes:
            logger.warning(f"[migrate] NPU:{npu} -> NUMA {target_numa} not found, skip memory binding.")
            return
        # Bind memory to the NPU's NUMA node only to minimize cross-NUMA traffic.
        logger.info(f"[migrate] NPU:{npu} -> NUMA [{target_numa}]")
        execute_command(
            [
                "migratepages",
                pid,
                ",".join(map(str, all_numa_nodes)),
                str(target_numa),
            ]
        )

    def bind_threads(self) -> None:
        """Pin this rank's main process and its acl/release threads.

        The main process (all threads) gets assign_main; each acl_thread is
        pinned to assign_acl and has its memory migrated; each release_thread
        is pinned to assign_rel.
        """
        thread_message, _ = execute_command(["ps", "-Te"])
        threads_map = self.get_threads_map(thread_message)
        main_pid = str(psutil.Process().pid)
        current_npu = self.device_info.running_npu_list[self.rank_id]
        self.bind(main_pid, self.assign_main[current_npu], True)
        for acl_thread in threads_map.get(main_pid, {}).get("acl_thread", []):
            self.bind(acl_thread, self.assign_acl[current_npu], False)
            self.bind_memory(acl_thread, current_npu)
        for release_thread in threads_map.get(main_pid, {}).get("release_thread", []):
            self.bind(release_thread, self.assign_rel[current_npu], False)

    def bind_npu_irq(self) -> None:
        """Pin this NPU's sq/cq interrupts to the two reserved pool CPUs.

        Requires write access to /proc/irq. Stops irqbalance if it is active
        (it would otherwise undo the affinity), locates the NPU's PCI device
        via npu-smi, and writes smp_affinity masks for the sq/cq IRQs.
        """
        if not os.access("/proc/irq", os.W_OK):
            return
        # Only bind IRQ for current rank's NPU to avoid multi-process overwrite.
        current_npu = self.device_info.running_npu_list[self.rank_id]
        if current_npu not in self.npu_cpu_pool:
            logger.warning(f"[irq] rank:{self.rank_id} -> NPU{current_npu} has no cpu pool, skip irq binding.")
            return
        if shutil.which("systemctl"):
            output, _ = execute_command(["systemctl", "list-unit-files"])
            if "irqbalance.service" in output:
                _, return_code = execute_command(["systemctl", "is-active", "--quiet", "irqbalance"])
                if return_code == 0:
                    logger.warning(
                        "The irqbalance service is running and has been stopped. "
                        "You can run the systemctl start irqbalance command to restart it."
                    )
                    execute_command(["systemctl", "stop", "irqbalance"])
        # Collect every sq_send_trigger_irq number system-wide; the one owned
        # by this NPU's PCI device is selected below.
        sq_irqs = []
        with open("/proc/interrupts") as f:
            for line in f:
                if "sq_send_trigger_irq" in line:
                    irq = line.split(":")[0].strip()
                    sq_irqs.append(irq)
        npu = current_npu
        cpus = self.npu_cpu_pool[npu]
        if len(cpus) < 2:
            logger.warning(f"[irq] NPU{npu} cpu pool too small (<2), skip irq binding.")
            return
        sq_cpu, cq_cpu = cpus[0], cpus[1]  # Reserved for IRQ binding
        pci_addr = ""
        device_type = get_ascend_device_type()
        if device_type == AscendDeviceType.A3:
            # A3: logical npu_id = card_id*2 + chip_id
            card_id = npu // 2
            chip_id = npu % 2
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(card_id), "-c", str(chip_id)])
        else:
            # A2 / others: logical npu_id is card id
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(npu)])
        for line in info.splitlines():
            if "PCIe Bus Info" in line:
                pci_addr = line.split()[-1].lower()
                break
        if not pci_addr:
            logger.warning(f"Can't find pci address of NPU{npu} .")
            return
        try:
            # msi_irqs entries are the IRQ numbers owned by this PCI device.
            npu_irq_list = sorted(os.listdir(f"/sys/bus/pci/devices/{pci_addr}/msi_irqs/"), key=lambda x: int(x))
        except FileNotFoundError:
            logger.warning(f"The msi_irqs folder cannot be found under /sys/bus/pci/devices/{pci_addr} .")
            return
        sq_irq, cq_irq = "", ""
        for irq in sq_irqs:
            if irq in npu_irq_list:
                sq_irq = irq
                # Assumes the cq_update IRQ is the next IRQ number after the
                # sq IRQ — TODO confirm against the driver's IRQ allocation.
                cq_irq = str(int(irq) + 1)
                break
        if not sq_irq:
            logger.warning(f"The sq_send_trigger_irq of NPU{npu} is not found.")
            return
        logger.info(
            f"NPU{npu}(PCI {pci_addr}): sq_send_trigger_irq IRQ_ID={sq_irq} -> CPU{sq_cpu}, "
            f"cq_update_irq IRQ_ID={cq_irq} -> CPU{cq_cpu}"
        )
        with open(f"/proc/irq/{sq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(sq_cpu))
        with open(f"/proc/irq/{cq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(cq_cpu))

    def run_all(self) -> None:
        """Execute the full binding pipeline for this rank."""
        self.build_cpu_pools()
        self.allocate()
        self.print_plan()
        self.bind_threads()
        self.bind_npu_irq()
def bind_cpus(rank_id: int) -> None:
    """Bind CPUs, memory and IRQs for the process owning *rank_id*'s NPU.

    Binding is only performed on ARM hosts; on anything else the call is a
    logged no-op.
    """
    if is_arm_cpu():
        CpuAlloc(rank_id).run_all()
    else:
        logger.info("CPU binding skipped: non-ARM CPU detected.")