#!/usr/bin/env python3
"""Bind the current worker process, its helper threads and NPU IRQs to CPUs.

The module inspects the host through external tools (``npu-smi``, ``lscpu``,
``ps``, ``taskset``, ``migratepages``, ``systemctl``) and the ``/proc`` and
``/sys`` filesystems, builds a CPU pool for every running NPU, and pins the
process/threads/IRQs of the current rank onto its pool.
"""
import os
import platform
import shutil
import subprocess
from collections import defaultdict

import psutil
from vllm.logger import logger

from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

MASK_BIT = 32  # Number of bits in a CPU affinity mask group
MIN_CPUS_PER_NPU = 5  # 2(IRQ) + 1(main, at least 1 CPU) + 1(acl) + 1(release) = 5 CPUs per NPU
ALLOWED_CPUS_PATH = "/proc/self/status"
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
TOPO_AFFINITY_MODE = "topo_affinity"
GLOBAL_SLICE_MODE = "global_slice"
# Binding strategy per device type; unknown types fall back to TOPO_AFFINITY_MODE
# (see CpuAlloc._binding_mode).
DEVICE_BINDING_MODE: dict["AscendDeviceType", str] = {
    AscendDeviceType.A2: TOPO_AFFINITY_MODE,
    AscendDeviceType.A3: GLOBAL_SLICE_MODE,
    AscendDeviceType._310P: TOPO_AFFINITY_MODE,
}


def is_arm_cpu() -> bool:
    """Return True when ``platform.machine()`` reports an ARM architecture.

    x86 variants return False. Any unrecognised architecture logs a warning and
    also returns False, which disables CPU binding in ``bind_cpus``.
    """
    arch = platform.machine().lower()
    if arch in {"x86_64", "amd64", "i386", "i686"}:
        return False
    if arch in {"aarch64", "arm64"} or arch.startswith("arm"):
        return True
    logger.warning(f"Unknown CPU architecture '{arch}', CPU binding will be disabled.")
    return False


def execute_command(cmd: list[str]) -> tuple[str, int]:
    """Run ``cmd`` without a shell and return ``(decoded stdout, return code)``.

    stderr is captured and discarded.

    Raises:
        subprocess.TimeoutExpired: if the command does not finish within 1000
            seconds. The child is killed and reaped before the exception
            propagates so that it is not left running.
    """
    with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
        try:
            out, _ = p.communicate(timeout=1000)
        except subprocess.TimeoutExpired:
            # communicate() does not kill the child on timeout; do it explicitly,
            # otherwise leaving the ``with`` block would wait on it indefinitely.
            p.kill()
            p.communicate()
            raise
        return out.decode(), p.returncode


class DeviceInfo:
    """NPU/CPU topology information collected once at construction time."""

    def __init__(self):
        # Order matters: get_running_npus() reads self.npu_map_info and
        # get_all_logic_npus() reads it as well.
        self.npu_map_info: dict[str, dict[str, str]] = self.get_npu_map_info()
        self.allowed_cpus: list[int] = self.parse_allowed_cpus()
        self.running_npu_list: list[int] = self.get_running_npus()
        self.npu_affinity: dict[int, list[int]] = self.parse_topo_affinity()
        self.all_logic_npus: list[int] = self.get_all_logic_npus()
        self.total_logic_npus: int = len(self.all_logic_npus)

    @staticmethod
    def expand_cpu_list(allowed_list_str: str) -> list[int]:
        """Expand a CPU list string such as ``"0-3,8,10-11"`` into integers.

        Empty segments (e.g. produced by a trailing comma) are ignored.

        Raises:
            ValueError: if a non-empty segment is not an integer or an
                ``a-b`` range of integers.
        """
        allowed_cpus_list: list[int] = []
        for per_range in allowed_list_str.split(","):
            per_range = per_range.strip()
            if not per_range:
                continue
            if "-" in per_range:
                start_cpu, end_cpu = map(int, per_range.split("-"))
                allowed_cpus_list.extend(range(start_cpu, end_cpu + 1))
            else:
                allowed_cpus_list.append(int(per_range))
        return allowed_cpus_list

    def get_all_logic_npus(self) -> list[int]:
        """Collect all logical NPU IDs from the NPU mapping.

        self.npu_map_info maps a board_id (A3) or npu_id (A2) to a per-chip map.
        The per-chip map uses chip_id as the key and the logical NPU ID string
        as the value.

        Returns:
            The distinct logical NPU ids, sorted ascending.
        """
        logic_ids = {
            int(logic_str)
            for chip_map in self.npu_map_info.values()
            for logic_str in chip_map.values()
            if logic_str and logic_str.isdigit()
        }
        return sorted(logic_ids)

    @staticmethod
    def get_npu_map_info() -> dict[str, dict[str, str]]:
        """Parse ``npu-smi info -m`` into ``{npu_id: {chip_id: chip_logic_id}}``.

        The first output line is skipped (treated as a header) and the first
        three whitespace-separated fields of each following line are used.
        Lines with fewer than three fields, or whose third field is not
        numeric, are ignored.
        """
        npu_map_info: dict[str, dict[str, str]] = {}
        npu_info, _ = execute_command(["npu-smi", "info", "-m"])
        for line in npu_info.strip().split("\n")[1:]:
            fields = line.split()
            if len(fields) < 3:
                # Blank or malformed line: nothing to unpack.
                continue
            npu_id, chip_id, chip_logic_id = fields[:3]
            if not chip_logic_id.isdigit():
                continue
            npu_map_info.setdefault(npu_id, {})[chip_id] = chip_logic_id
        return npu_map_info

    def get_running_npus(self) -> list[int]:
        """Return the sorted logical ids of NPUs listed in the process table.

        The process section of ``npu-smi info`` is located by its header row
        (starts with ``| NPU`` and contains ``Process id``). For each following
        table row the first cell is expected to hold ``<npu_id> <chip_id>``,
        which is translated to a logical id through ``self.npu_map_info``.
        When ``ASCEND_RT_VISIBLE_DEVICES`` is set, the result is restricted to
        the ids listed there.

        Raises:
            RuntimeError: if a row cannot be mapped to a logical id, or if no
                running NPU remains.
        """
        npu_message, _ = execute_command(["npu-smi", "info"])
        in_proc_section = False
        running_npu_set: set[int] = set()
        for line in npu_message.splitlines():
            line = line.strip()
            if line.startswith("| NPU") and "Process id" in line:
                in_proc_section = True
                continue
            if not in_proc_section or not line.startswith("| "):
                continue
            parts = [p.strip() for p in line.strip("|").split("|")]
            if len(parts) < 2:
                continue
            id_fields = parts[0].split()
            if len(id_fields) < 2:
                # First cell does not carry both an NPU id and a chip id.
                continue
            npu_id, chip_id = id_fields[0], id_fields[1]
            if not npu_id.isdigit() or not chip_id.isdigit():
                continue
            chip_logic_id = self.npu_map_info.get(npu_id, {}).get(chip_id)
            if not chip_logic_id or not chip_logic_id.isdigit():
                raise RuntimeError("Failed to get correct chip_logic_id from command 'npu-smi info -m'.")
            running_npu_set.add(int(chip_logic_id))
        if ASCEND_RT_VISIBLE_DEVICES:
            # Tolerate stray commas/whitespace in the environment variable.
            devices_list = [int(x) for x in ASCEND_RT_VISIBLE_DEVICES.split(",") if x.strip()]
            running_npu_set = set(devices_list) & running_npu_set
        if not running_npu_set:
            raise RuntimeError("Can not get running npu info.")
        return sorted(running_npu_set)

    def parse_allowed_cpus(self) -> list[int]:
        """Read ``Cpus_allowed_list`` from ``/proc/self/status``.

        Returns:
            The expanded CPU ids, or an empty list when the status file does
            not exist.

        Raises:
            RuntimeError: if the file exists but has no ``Cpus_allowed_list``
                line.
        """
        if not os.path.exists(ALLOWED_CPUS_PATH):
            return []
        with open(ALLOWED_CPUS_PATH) as f:
            for line in f:
                if line.startswith("Cpus_allowed_list"):
                    return self.expand_cpu_list(line.split()[1])
        raise RuntimeError("Can not found specific 'Cpus_allowed_list' in the '/proc/self/status' file.")

    def parse_topo_affinity(self) -> dict[int, list[int]]:
        """Parse ``npu-smi info -t topo`` into ``{index: [cpu, ...]}``.

        Every line starting with ``NPU`` whose last field is not the literal
        ``Affinity`` contributes one entry; entries are numbered 0, 1, 2, ...
        in order of appearance, and the last field is expanded as a CPU list.
        NOTE(review): this assumes rows appear in logical-id order - confirm
        against the tool's output.
        """
        affinity: dict[int, list[int]] = {}
        affinity_message, _ = execute_command(["npu-smi", "info", "-t", "topo"])
        chip_logic_id = 0
        for line in affinity_message.splitlines():
            if not line.startswith("NPU"):
                continue
            last_part = line.split()[-1]
            if last_part != "Affinity":
                affinity[chip_logic_id] = self.expand_cpu_list(last_part)
                chip_logic_id += 1
        return affinity


class CpuAlloc:
    """Build per-NPU CPU pools and bind the current rank's threads/IRQs to them."""

    def __init__(self, rank_id: int):
        self.rank_id = rank_id
        self.device_info: DeviceInfo = DeviceInfo()
        # cpu id -> NUMA node id, and the reverse mapping (filled by build_cpu_node_map).
        self.cpu_node: dict[int, int] = {}
        self.numa_to_cpu_map: dict[int, list[int]] = defaultdict(list)
        # logical NPU id -> CPUs reserved for it.
        self.npu_cpu_pool: dict[int, list[int]] = {}
        # logical NPU id -> CPUs for the main process / acl thread / release thread.
        self.assign_main: dict[int, list[int]] = {}
        self.assign_acl: dict[int, list[int]] = {}
        self.assign_rel: dict[int, list[int]] = {}

    @staticmethod
    def cpu_to_mask(cpu: int) -> str:
        """Return the comma-separated hex affinity mask selecting only ``cpu``.

        The mask is made of ``MASK_BIT``-bit groups, most significant group
        first; e.g. CPU 33 yields ``"00000002,00000000"``.
        """
        hex_width = MASK_BIT // 4
        group, bit = divmod(cpu, MASK_BIT)
        # One group holding the bit, followed by `group` all-zero lower groups.
        chunks = [f"{1 << bit:0{hex_width}x}"] + ["0" * hex_width] * group
        return ",".join(chunks)

    @staticmethod
    def get_threads_map(thread_message: str) -> dict[str, dict[str, list[str]]]:
        """Group ``acl_thread`` / ``release_thread`` thread ids by process id.

        Args:
            thread_message: process/thread listing text; the first two
                whitespace-separated fields of each line are taken as the
                process id and the thread id (``bind_threads`` feeds it the
                output of ``ps -Te``).

        Returns:
            ``{pid: {"acl_thread": [tid, ...], "release_thread": [tid, ...]}}``
            containing only processes that own at least one such thread.
        """
        threads_map: dict[str, dict[str, list[str]]] = {}
        for line in thread_message.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            main_pid, sub_pid = parts[0], parts[1]
            if "acl_thread" in line:
                key = "acl_thread"
            elif "release_thread" in line:
                key = "release_thread"
            else:
                continue
            entry = threads_map.setdefault(main_pid, {"acl_thread": [], "release_thread": []})
            entry[key].append(sub_pid)
        return threads_map

    @staticmethod
    def bind(pid: str, cpus: list[int], bind_sub_thread: bool) -> None:
        """Pin ``pid`` to ``cpus`` with ``taskset``; do nothing if ``cpus`` is empty.

        Args:
            pid: process or thread id, as a string.
            cpus: CPU ids to bind to.
            bind_sub_thread: when True use ``taskset -acp`` (all threads of the
                process), otherwise ``taskset -cp``.

        Raises:
            RuntimeError: if ``taskset`` exits with a non-zero code.
        """
        if not cpus:
            return
        cpu_list = ",".join(map(str, cpus))
        flag = "-acp" if bind_sub_thread else "-cp"
        _, return_code = execute_command(["taskset", flag, cpu_list, pid])
        if return_code != 0:
            raise RuntimeError(f"Failed to bind {pid} to CPU {cpu_list}.")

    def average_distribute(self, groups: dict[str, list[int]]) -> dict[int, list[int]]:
        """Split a shared CPU pool evenly among the NPUs of each group.

        Each group is a list of NPUs that share the same pool; the pool of the
        group's first NPU is sorted and cut into contiguous slices of equal
        size, with the last NPU also receiving the remainder. Empty groups are
        skipped.

        Returns:
            ``{npu: [cpu, ...]}`` for every NPU of every group.
        """
        result: dict[int, list[int]] = {}
        for npu_list in groups.values():
            if not npu_list:
                continue
            cpu_list = sorted(self.npu_cpu_pool[npu_list[0]])
            cpu_num_per_npu = len(cpu_list) // len(npu_list)
            last_index = len(npu_list) - 1
            for i, npu in enumerate(npu_list):
                start_index = i * cpu_num_per_npu
                end_index = (i + 1) * cpu_num_per_npu if i < last_index else len(cpu_list)
                result[npu] = cpu_list[start_index:end_index]
        return result

    def extend_numa(self, cpu_list: list[int]) -> list[int]:
        """Extend a single-NUMA CPU list with the allowed CPUs of the next node.

        If ``cpu_list`` is empty an empty list is returned; if it already spans
        several NUMA nodes it is returned unchanged. Otherwise the allowed CPUs
        of node ``(node + 1) % number_of_nodes`` are added and the sorted,
        de-duplicated union is returned.
        """
        if not cpu_list:
            return []
        nodes = {self.cpu_node[c] for c in cpu_list}
        if len(nodes) != 1:
            return cpu_list
        (node,) = nodes
        next_node = (node + 1) % len(self.numa_to_cpu_map)
        allowed = set(self.device_info.allowed_cpus)
        # numa_to_cpu_map is a defaultdict: use .get() so that probing a node id
        # that does not exist does not insert an empty entry into the map.
        neighbours = [cpu for cpu in self.numa_to_cpu_map.get(next_node, []) if cpu in allowed]
        return sorted(set(cpu_list) | set(neighbours))

    def build_cpu_node_map(self) -> None:
        """Fill ``cpu_node`` and ``numa_to_cpu_map`` from ``lscpu -e=CPU,NODE``.

        Only rows consisting of exactly two non-negative integers are used;
        header rows and rows with a non-numeric node are skipped.

        Raises:
            RuntimeError: if no usable row is found.
        """
        cpu_numa_map, _ = execute_command(["lscpu", "-e=CPU,NODE"])
        for line in cpu_numa_map.splitlines():
            fields = line.split()
            if len(fields) != 2 or not fields[0].isdigit() or not fields[1].isdigit():
                continue
            cpu, node = int(fields[0]), int(fields[1])
            self.cpu_node[cpu] = node
            self.numa_to_cpu_map[node].append(cpu)
        if not self.numa_to_cpu_map:
            raise RuntimeError("lscpu command output error, no NUMA node available. Please check!")

    def build_global_slice_cpu_pool(self) -> None:
        """
        Build per-NPU CPU pools by slicing allowed_cpus using GLOBAL logical NPU ids.
        Why:
        - Multiple processes/DP groups may share the SAME cpuset (same allowed_cpus).
        - If each process slices only its visible NPUs, CPU ranges overlap across processes.
        - Global slicing ensures deterministic, non-overlapping CPU partitions per logical NPU id.
        Notes:
        - This strategy does NOT rely on npu-smi topo affinity.
        - NUMA locality is achieved only if CPU numbering aligns with NUMA layout.
        - Requires per-NPU slice size >= 5 (IRQ(2) + main(>=1) + acl(1) + release(1)).

        Returns without touching ``npu_cpu_pool`` when there is no running NPU
        or no allowed CPU.

        Raises:
            RuntimeError: if the per-NPU slice would be smaller than
                ``MIN_CPUS_PER_NPU`` or a running NPU id is outside
                ``[0, total_npus)``.
        """
        running = list(self.device_info.running_npu_list)
        if not running:
            return
        allowed = sorted(set(self.device_info.allowed_cpus))
        total_cpu = len(allowed)
        if total_cpu == 0:
            return
        # Prefer mapping info (npu-smi info -m), fallback to topo keys, then visible list
        if self.device_info.total_logic_npus > 0:
            total_npus = self.device_info.total_logic_npus
        elif self.device_info.npu_affinity:
            total_npus = len(self.device_info.npu_affinity)
        else:
            total_npus = len(running)
        if total_npus <= 0:
            return
        # Compute global per-NPU slicing
        base, extra = divmod(total_cpu, total_npus)
        logger.debug(
            f"[cpu_global_slice] rank:{self.rank_id} ASCEND_RT_VISIBLE_DEVICES={ASCEND_RT_VISIBLE_DEVICES} "
            f"running_npu_list:{running} total_npus:{total_npus} allowed_cpus:{total_cpu} "
            f"base:{base} extra:{extra} allowed_cpus_head:{allowed[:16]} allowed_cpus_tail:{allowed[-16:]}"
        )
        # Enforce per-NPU slice length >= 5.
        # Because with remainder distribution, some NPUs may get 'base' cores and some get 'base+1'.
        # The minimum slice size is 'base'.
        if base < MIN_CPUS_PER_NPU:
            raise RuntimeError(
                "Insufficient CPUs for binding with IRQ/ACL/REL reservations: "
                f"total_allowed={total_cpu}, total_npus={total_npus}, "
                f"min_per_npu={base} (<{MIN_CPUS_PER_NPU}). "
                f"Need at least {total_npus * MIN_CPUS_PER_NPU} CPUs in cpuset."
            )

        def _slice_for_npu(global_npu_id: int) -> list[int]:
            # The first `extra` NPUs get one additional CPU each:
            # start = global_npu_id*base + min(global_npu_id, extra)
            start = global_npu_id * base + min(global_npu_id, extra)
            take = base + (1 if global_npu_id < extra else 0)
            return allowed[start:start + take]

        for npu in running:
            if npu < 0 or npu >= total_npus:
                raise RuntimeError(f"Invalid NPU id {npu}, total_npus={total_npus}.")
            cpus = _slice_for_npu(npu)
            # Extra safety: should always be >= base >= 5
            if len(cpus) < MIN_CPUS_PER_NPU:
                raise RuntimeError(
                    f"NPU{npu} got too few CPUs: {len(cpus)} (<{MIN_CPUS_PER_NPU}). "
                    f"total_allowed={total_cpu}, total_npus={total_npus}, base={base}, extra={extra}"
                )
            self.npu_cpu_pool[npu] = cpus

    @staticmethod
    def _binding_mode() -> str:
        """Return the binding mode for the detected device type.

        Device types missing from ``DEVICE_BINDING_MODE`` use
        ``TOPO_AFFINITY_MODE``.
        """
        device_type = get_ascend_device_type()
        return DEVICE_BINDING_MODE.get(device_type, TOPO_AFFINITY_MODE)

    def build_cpu_pools(self) -> None:
        """Populate ``npu_cpu_pool`` for every running NPU.

        In global-slice mode, or when no topo affinity is available, the work
        is delegated to ``build_global_slice_cpu_pool``. In topo-affinity mode
        each NPU gets its affinity CPUs (restricted to the allowed CPUs and
        extended via ``extend_numa``); NPUs ending up with identical pools then
        share that pool evenly via ``average_distribute``.

        Raises:
            RuntimeError: if an NPU has no affinity CPU inside the allowed set.
        """
        self.build_cpu_node_map()
        mode = self._binding_mode()
        logger.info(f"[cpu_bind_mode] mode={mode} rank={self.rank_id} visible_npus={self.device_info.running_npu_list}")
        if mode == GLOBAL_SLICE_MODE:
            self.build_global_slice_cpu_pool()
            return
        # topo_affinity mode
        if not self.device_info.npu_affinity:
            logger.warning("NPU topo affinity not found, fallback to global-slice CPU binding.")
            self.build_global_slice_cpu_pool()
            return
        allowed = set(self.device_info.allowed_cpus)
        for npu in self.device_info.running_npu_list:
            base_cpu_list = [cpu for cpu in self.device_info.npu_affinity.get(npu, []) if cpu in allowed]
            if not base_cpu_list:
                raise RuntimeError("CPUs available in 'Cpus_allowed_list' conflict with NUMA affinity.")
            self.npu_cpu_pool[npu] = self.extend_numa(base_cpu_list)
        # Group NPUs by identical pool content (the string form is the group key).
        groups: dict[str, list[int]] = defaultdict(list)
        for npu, cpus in self.npu_cpu_pool.items():
            groups[str(cpus)].append(npu)
        final: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            if len(npu_list) == 1:
                final[npu_list[0]] = self.npu_cpu_pool[npu_list[0]]
            else:
                final.update(self.average_distribute({key: npu_list}))
        self.npu_cpu_pool = final

    def allocate(self) -> None:
        """Split every NPU pool into main / acl / release CPU assignments.

        The first two CPUs of a pool are left out here (``bind_npu_irq`` uses
        them for IRQs), the last two go to the acl and release threads, and
        everything in between goes to the main process.

        Raises:
            RuntimeError: if a pool has fewer than ``MIN_CPUS_PER_NPU`` CPUs.
        """
        for npu, pool in self.npu_cpu_pool.items():
            if len(pool) < MIN_CPUS_PER_NPU:
                raise RuntimeError(
                    f"The number of CPUs is insufficient. Each NPU requires at least {MIN_CPUS_PER_NPU} CPUs."
                )
            self.assign_main[npu] = pool[2:-2]
            self.assign_acl[npu] = [pool[-2]]
            self.assign_rel[npu] = [pool[-1]]

    def print_plan(self) -> None:
        """Log the CPU assignment of the NPU that belongs to the current rank."""
        logger.info("The CPU allocation plan is as follows:")
        current_npu = self.device_info.running_npu_list[self.rank_id]
        main = " ".join(map(str, self.assign_main[current_npu]))
        acl = " ".join(map(str, self.assign_acl[current_npu]))
        # Join like main/acl; str(list) would render as "release=[[63]]".
        rel = " ".join(map(str, self.assign_rel[current_npu]))
        logger.info(f"NPU{current_npu}: main=[{main}] acl=[{acl}] release=[{rel}]")

    def bind_memory(self, pid: str, npu: int) -> None:
        """Migrate the pages of ``pid`` to the NUMA node of ``npu``'s CPU pool.

        The target node is the NUMA node of the first CPU in the NPU's pool.
        This is best effort: when ``migratepages`` is missing, the NPU has no
        pool, the node is unknown, or the command fails, a message is logged
        and the method returns without raising.
        """

        def _get_npu_numa_node(npu_id: int) -> int | None:
            cpu_pool = self.npu_cpu_pool.get(npu_id, [])
            if not cpu_pool:
                return None
            return self.cpu_node.get(cpu_pool[0])

        if not shutil.which("migratepages"):
            logger.info("The 'migratepages' command is not available, skipping memory binding.")
            return
        target_numa = _get_npu_numa_node(npu)
        if target_numa is None:
            logger.warning(f"[migrate] rank:{self.rank_id} -> NPU{npu} has no CPU pool, skip memory binding.")
            return
        all_numa_nodes = sorted(self.numa_to_cpu_map.keys())
        if target_numa not in all_numa_nodes:
            logger.warning(f"[migrate] NPU:{npu} -> NUMA {target_numa} not found, skip memory binding.")
            return
        # Bind memory to the NPU's NUMA node only to minimize cross-NUMA traffic.
        logger.info(f"[migrate] NPU:{npu} -> NUMA [{target_numa}]")
        _, return_code = execute_command(
            [
                "migratepages",
                pid,
                ",".join(map(str, all_numa_nodes)),
                str(target_numa),
            ]
        )
        if return_code != 0:
            # Memory migration is an optimisation; report the failure but carry on.
            logger.warning(f"[migrate] NPU:{npu} -> migratepages failed for pid {pid} (exit code {return_code}).")

    def bind_threads(self) -> None:
        """Bind the current process and its acl/release threads to their CPUs.

        The whole process is bound to the main CPUs first; each ``acl_thread``
        is then bound to the acl CPU (followed by a memory migration), and each
        ``release_thread`` to the release CPU.

        Raises:
            RuntimeError: propagated from ``bind`` when ``taskset`` fails.
        """
        thread_message, _ = execute_command(["ps", "-Te"])
        threads_map = self.get_threads_map(thread_message)
        main_pid = str(psutil.Process().pid)
        current_npu = self.device_info.running_npu_list[self.rank_id]
        own_threads = threads_map.get(main_pid, {})
        self.bind(main_pid, self.assign_main[current_npu], True)
        for acl_thread in own_threads.get("acl_thread", []):
            self.bind(acl_thread, self.assign_acl[current_npu], False)
            self.bind_memory(acl_thread, current_npu)
        for release_thread in own_threads.get("release_thread", []):
            self.bind(release_thread, self.assign_rel[current_npu], False)

    @staticmethod
    def _stop_irqbalance_if_active() -> None:
        """Stop the irqbalance service through systemctl when it is active."""
        if not shutil.which("systemctl"):
            return
        output, _ = execute_command(["systemctl", "list-unit-files"])
        if "irqbalance.service" not in output:
            return
        _, return_code = execute_command(["systemctl", "is-active", "--quiet", "irqbalance"])
        if return_code == 0:
            logger.warning(
                "The irqbalance service is running and has been stopped. "
                "You can run the systemctl start irqbalance command to restart it."
            )
            execute_command(["systemctl", "stop", "irqbalance"])

    @staticmethod
    def _query_pci_address(npu: int) -> str:
        """Return the lower-cased PCIe bus address of ``npu``, or ``""`` if not found."""
        if get_ascend_device_type() == AscendDeviceType.A3:
            # A3: logical npu_id = card_id*2 + chip_id
            card_id, chip_id = divmod(npu, 2)
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(card_id), "-c", str(chip_id)])
        else:
            # A2 / others: logical npu_id is card id
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(npu)])
        for line in info.splitlines():
            if "PCIe Bus Info" in line:
                return line.split()[-1].lower()
        return ""

    def bind_npu_irq(self) -> None:
        """Pin the current rank's NPU IRQs to the first two CPUs of its pool.

        The ``sq_send_trigger_irq`` interrupt of the NPU is bound to the first
        pool CPU, and the IRQ with the next number (logged as
        ``cq_update_irq``) to the second one. The method returns silently or
        with a warning when ``/proc/irq`` is not writable, the NPU has no
        usable pool, or the PCI address / IRQ cannot be found. An active
        irqbalance service is stopped first.
        """
        if not os.access("/proc/irq", os.W_OK):
            return
        # Only bind IRQ for current rank's NPU to avoid multi-process overwrite.
        current_npu = self.device_info.running_npu_list[self.rank_id]
        if current_npu not in self.npu_cpu_pool:
            logger.warning(f"[irq] rank:{self.rank_id} -> NPU{current_npu} has no cpu pool, skip irq binding.")
            return
        self._stop_irqbalance_if_active()

        sq_irqs = []
        with open("/proc/interrupts") as f:
            for line in f:
                if "sq_send_trigger_irq" in line:
                    sq_irqs.append(line.split(":")[0].strip())

        npu = current_npu
        cpus = self.npu_cpu_pool[npu]
        if len(cpus) < 2:
            logger.warning(f"[irq] NPU{npu} cpu pool too small (<2), skip irq binding.")
            return
        sq_cpu, cq_cpu = cpus[0], cpus[1]  # Reserved for IRQ binding

        pci_addr = self._query_pci_address(npu)
        if not pci_addr:
            logger.warning(f"Can't find pci address of NPU{npu} .")
            return
        try:
            npu_irq_list = sorted(os.listdir(f"/sys/bus/pci/devices/{pci_addr}/msi_irqs/"), key=int)
        except FileNotFoundError:
            logger.warning(f"The msi_irqs folder cannot be found under /sys/bus/pci/devices/{pci_addr} .")
            return

        npu_irqs = set(npu_irq_list)
        # First sq_send_trigger_irq that belongs to this NPU's PCI device.
        sq_irq = next((irq for irq in sq_irqs if irq in npu_irqs), "")
        if not sq_irq:
            logger.warning(f"The sq_send_trigger_irq of NPU{npu} is not found.")
            return
        # The code takes the IRQ number right after sq as the cq IRQ.
        cq_irq = str(int(sq_irq) + 1)
        logger.info(
            f"NPU{npu}(PCI {pci_addr}): sq_send_trigger_irq IRQ_ID={sq_irq} -> CPU{sq_cpu}, "
            f"cq_update_irq IRQ_ID={cq_irq} -> CPU{cq_cpu}"
        )
        with open(f"/proc/irq/{sq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(sq_cpu))
        with open(f"/proc/irq/{cq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(cq_cpu))

    def run_all(self) -> None:
        """Run the full pipeline: build pools, allocate, log, bind threads, bind IRQs."""
        self.build_cpu_pools()
        self.allocate()
        self.print_plan()
        self.bind_threads()
        self.bind_npu_irq()


def bind_cpus(rank_id: int) -> None:
    """Bind the process of rank ``rank_id`` to CPUs; a no-op on non-ARM hosts."""
    if not is_arm_cpu():
        logger.info("CPU binding skipped: non-ARM CPU detected.")
        return
    binder = CpuAlloc(rank_id)
    binder.run_all()