### What this PR does / why we need it?
This PR introduces global CPU slicing for Ascend NPUs to guarantee
non-overlapping CPU partitions, fixes IRQ-binding logic errors on A3, and
improves how the total number of logical NPUs is determined during CPU
allocation. Together, these changes tighten CPU resource management and
improve system stability.
- **Global CPU Slicing**: Introduced a global CPU slicing mechanism for
Ascend NPUs that partitions the shared cpuset by global logical NPU ID,
ensuring non-overlapping CPU ranges across multiple processes or
data-parallel groups and preventing resource contention (see the sketch
after this list).
- **Improved IRQ Binding for A3 Devices**: Refined the IRQ binding logic
specifically for Ascend A3 devices, correctly mapping logical NPU IDs to
physical card and chip IDs for accurate npu-smi queries and preventing
multi-process overwrite of IRQ settings.
- **Enhanced NPU Count Determination**: Improved the logic for
determining the total number of logical NPUs, prioritizing NPU mapping
information to ensure more accurate CPU allocation.
- **Minimum CPU Requirement**: Established a minimum requirement of 5
CPUs per NPU for binding, reserving specific cores for IRQ, main, ACL,
and release operations to ensure stable operation.
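
For illustration, here is a minimal sketch of the slicing arithmetic that `build_global_slice_cpu_pool` applies (the helper name and numbers below are illustrative only):

```python
# Slice the shared cpuset into non-overlapping per-NPU ranges keyed by GLOBAL logical id,
# so every process that sees the same allowed CPUs computes the same partition.
def slice_for_npu(allowed: list[int], total_npus: int, npu_id: int) -> list[int]:
    base, extra = len(allowed) // total_npus, len(allowed) % total_npus
    start = npu_id * base + min(npu_id, extra)  # earlier NPUs absorb the remainder
    return allowed[start:start + base + (1 if npu_id < extra else 0)]

# e.g. 192 allowed CPUs and 16 logical NPUs -> 12 CPUs per NPU; NPU 3 gets CPUs 36-47,
# of which 2 are reserved for IRQs, 1 for acl_thread and 1 for release_thread (hence the
# 5-CPU minimum).
```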
### Does this PR introduce _any_ user-facing change?
No user-facing changes are introduced.
### How was this patch tested?
CI passed with newly added and existing tests.
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
---------
Signed-off-by: c00818886 <chenchuwei@huawei.com>
#!/usr/bin/env python3

import os
import platform
import shutil
import subprocess
from collections import defaultdict

import psutil
from vllm.logger import logger

from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

MASK_BIT = 32  # Number of bits in a CPU affinity mask group
MIN_CPUS_PER_NPU = 5  # 2(IRQ) + 1(main, at least 1 CPU) + 1(acl) + 1(release) = 5 CPUs per NPU
ALLOWED_CPUS_PATH = "/proc/self/status"
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")

TOPO_AFFINITY_MODE = "topo_affinity"
GLOBAL_SLICE_MODE = "global_slice"

DEVICE_BINDING_MODE: dict["AscendDeviceType", str] = {
    AscendDeviceType.A2: TOPO_AFFINITY_MODE,
    AscendDeviceType.A3: GLOBAL_SLICE_MODE,
    AscendDeviceType._310P: TOPO_AFFINITY_MODE,
}


def is_arm_cpu() -> bool:
    arch = platform.machine().lower()
    if arch in {"x86_64", "amd64", "i386", "i686"}:
        return False
    if arch in {"aarch64", "arm64"} or arch.startswith("arm"):
        return True
    logger.warning(f"Unknown CPU architecture '{arch}', CPU binding will be disabled.")
    return False


def execute_command(cmd: list[str]) -> tuple[str, int]:
    with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
        out, _ = p.communicate(timeout=1000)
        return out.decode(), p.returncode


class DeviceInfo:
    def __init__(self):
        self.npu_map_info: dict[str, dict[str, str]] = self.get_npu_map_info()
        self.allowed_cpus: list[int] = self.parse_allowed_cpus()
        self.running_npu_list: list[int] = self.get_running_npus()
        self.npu_affinity: dict[int, list[int]] = self.parse_topo_affinity()
        self.all_logic_npus: list[int] = self.get_all_logic_npus()
        self.total_logic_npus: int = len(self.all_logic_npus)

    @staticmethod
    def expand_cpu_list(allowed_list_str: str) -> list[int]:
        allowed_cpus_list: list[int] = []
        for per_range in allowed_list_str.split(","):
            if "-" in per_range:
                start_cpu, end_cpu = map(int, per_range.split("-"))
                allowed_cpus_list.extend(range(start_cpu, end_cpu + 1))
            else:
                allowed_cpus_list.append(int(per_range))
        return allowed_cpus_list
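    # Illustrative behavior of expand_cpu_list (hypothetical input, matching the
    # "Cpus_allowed_list" syntax in /proc/self/status):
    #   expand_cpu_list("0-3,8,10-11") -> [0, 1, 2, 3, 8, 10, 11]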

    def get_all_logic_npus(self) -> list[int]:
        """Collect all logical NPU IDs from the NPU mapping.

        self.npu_map_info maps a board_id (A3) or npu_id (A2) to a per-chip map.
        The per-chip map uses chip_id as the key and the logical NPU ID string
        as the value.
        """
        logic_ids: set[int] = set()
        for _, chip_map in self.npu_map_info.items():
            for _, logic_str in chip_map.items():
                if logic_str and logic_str.isdigit():
                    logic_ids.add(int(logic_str))
        return sorted(logic_ids)

    @staticmethod
    def get_npu_map_info() -> dict[str, dict[str, str]]:
        npu_map_info: dict[str, dict[str, str]] = {}
        npu_info, _ = execute_command(["npu-smi", "info", "-m"])
        npu_map = npu_info.strip().split("\n")[1:]
        for line in npu_map:
            npu_id, chip_id, chip_logic_id = line.strip().split()[:3]
            if not chip_logic_id.isdigit():
                continue
            if npu_id not in npu_map_info:
                npu_map_info[npu_id] = {}
            npu_map_info[npu_id][chip_id] = chip_logic_id
        return npu_map_info

    def get_running_npus(self) -> list[int]:
        npu_message, _ = execute_command(["npu-smi", "info"])
        in_proc_section = False
        running_npu_set = set()
        for line in npu_message.splitlines():
            line = line.strip()
            if line.startswith("| NPU") and "Process id" in line:
                in_proc_section = True
                continue
            if not in_proc_section:
                continue
            if line.startswith("| "):
                parts = [p.strip() for p in line.strip("|").split("|")]
                if len(parts) < 2:
                    continue
                npu_id = parts[0].split()[0]
                chip_id = parts[0].split()[1]
                if not npu_id.isdigit() or not chip_id.isdigit():
                    continue
                chip_logic_id = self.npu_map_info.get(npu_id, {}).get(chip_id)
                if not chip_logic_id or not chip_logic_id.isdigit():
                    raise RuntimeError("Failed to get correct chip_logic_id from command 'npu-smi info -m'.")
                running_npu_set.add(int(chip_logic_id))
        if ASCEND_RT_VISIBLE_DEVICES:
            devices_str = ASCEND_RT_VISIBLE_DEVICES
            devices_list = [int(x) for x in devices_str.split(",")]
            running_npu_set = set(devices_list) & running_npu_set
        if not running_npu_set:
            raise RuntimeError("Cannot get running NPU info.")
        return sorted(running_npu_set)

    def parse_allowed_cpus(self) -> list[int]:
        if not os.path.exists(ALLOWED_CPUS_PATH):
            return []
        with open(ALLOWED_CPUS_PATH) as f:
            for line in f:
                if line.startswith("Cpus_allowed_list"):
                    return self.expand_cpu_list(line.split()[1])
        raise RuntimeError("Cannot find 'Cpus_allowed_list' in the '/proc/self/status' file.")

    def parse_topo_affinity(self) -> dict[int, list[int]]:
        chip_logic_id = 0
        affinity: dict[int, list[int]] = {}
        affinity_message, _ = execute_command(["npu-smi", "info", "-t", "topo"])
        for line in affinity_message.splitlines():
            if line.startswith("NPU"):
                parts = line.split()
                last_part = parts[-1]
                if last_part != "Affinity":
                    affinity[chip_logic_id] = self.expand_cpu_list(last_part)
                    chip_logic_id += 1
        return affinity


class CpuAlloc:
    def __init__(self, rank_id: int):
        self.rank_id = rank_id
        self.device_info: DeviceInfo = DeviceInfo()
        self.cpu_node: dict[int, int] = {}
        self.numa_to_cpu_map: dict[int, list[int]] = defaultdict(list)
        self.npu_cpu_pool: dict[int, list[int]] = {}
        self.assign_main: dict[int, list[int]] = {}
        self.assign_acl: dict[int, list[int]] = {}
        self.assign_rel: dict[int, list[int]] = {}

    @staticmethod
    def cpu_to_mask(cpu: int) -> str:
        group = cpu // MASK_BIT
        bit = cpu % MASK_BIT
        value = 1 << bit
        mask = f"{value:08x}"
        for _ in range(1, group + 1):
            mask = f"{mask},{'0' * (MASK_BIT // 4)}"
        return mask
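    # Worked example (values derived from cpu_to_mask above): CPU 3 maps to
    # "00000008"; CPU 35 falls into the second 32-bit group and maps to
    # "00000008,00000000", i.e. the comma-separated hex format expected by
    # /proc/irq/<irq>/smp_affinity.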

    @staticmethod
    def get_threads_map(thread_message: str) -> dict[str, dict[str, list[str]]]:
        threads_map: dict[str, dict[str, list[str]]] = {}
        for line in thread_message.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            main_pid, sub_pid = parts[0], parts[1]
            if "acl_thread" in line:
                key = "acl_thread"
            elif "release_thread" in line:
                key = "release_thread"
            else:
                continue
            if main_pid not in threads_map:
                threads_map[main_pid] = {"acl_thread": [], "release_thread": []}
            threads_map[main_pid][key].append(sub_pid)
        return threads_map
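    # `ps -Te` output is assumed to contain one row per thread in the form
    # "PID SPID TTY TIME CMD"; acl_thread/release_thread rows are matched by
    # their thread names appearing in the CMD column.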

    @staticmethod
    def bind(pid: str, cpus: list[int], bind_sub_thread: bool) -> None:
        if cpus:
            cpu_list = ",".join(map(str, cpus))
            if bind_sub_thread:
                _, return_code = execute_command(["taskset", "-acp", cpu_list, pid])
            else:
                _, return_code = execute_command(["taskset", "-cp", cpu_list, pid])
            if return_code != 0:
                raise RuntimeError(f"Failed to bind {pid} to CPU {cpu_list}.")

    def average_distribute(self, groups: dict[str, list[int]]) -> dict[int, list[int]]:
        result: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            cpu_list = sorted(self.npu_cpu_pool[npu_list[0]])
            cpu_num_per_npu = len(cpu_list) // len(npu_list)
            for i, npu in enumerate(npu_list):
                start_index = i * cpu_num_per_npu
                end_index = (i + 1) * cpu_num_per_npu if i < len(npu_list) - 1 else len(cpu_list)
                result[npu] = cpu_list[start_index:end_index]
        return result
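    # Illustrative split (hypothetical values): if NPUs [0, 1] share the pool
    # [0..9], average_distribute gives NPU0 -> [0..4] and NPU1 -> [5..9]; any
    # remainder CPUs go to the last NPU in the group.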

    def extend_numa(self, cpu_list: list[int]) -> list[int]:
        if not cpu_list:
            return []
        nodes = {self.cpu_node[c] for c in cpu_list}
        if len(nodes) != 1:
            return cpu_list
        node = list(nodes)[0]
        next_node = (node + 1) % len(self.numa_to_cpu_map)
        extended = cpu_list[:]
        for cpu in self.numa_to_cpu_map[next_node]:
            if cpu in self.device_info.allowed_cpus:
                extended.append(cpu)
        return sorted(set(extended))

    def build_cpu_node_map(self) -> None:
        cpu_numa_map, _ = execute_command(["lscpu", "-e=CPU,NODE"])
        for line in cpu_numa_map.splitlines():
            line = line.strip()
            if not line or not line[0].isdigit():
                continue
            cpu_str, node_str = line.split()
            cpu = int(cpu_str)
            node = int(node_str)
            self.cpu_node[cpu] = node
            self.numa_to_cpu_map[node].append(cpu)
        if len(self.numa_to_cpu_map) == 0:
            raise RuntimeError("lscpu command output error, no NUMA node available. Please check!")

    def build_global_slice_cpu_pool(self) -> None:
        """
        Build per-NPU CPU pools by slicing allowed_cpus using GLOBAL logical NPU ids.

        Why:
        - Multiple processes/DP groups may share the SAME cpuset (same allowed_cpus).
        - If each process slices only its visible NPUs, CPU ranges overlap across processes.
        - Global slicing ensures deterministic, non-overlapping CPU partitions per logical NPU id.

        Notes:
        - This strategy does NOT rely on npu-smi topo affinity.
        - NUMA locality is achieved only if CPU numbering aligns with NUMA layout.
        - Requires per-NPU slice size >= 5 (IRQ(2) + main(>=1) + acl(1) + release(1)).
        """
        running = list(self.device_info.running_npu_list)
        if not running:
            return

        allowed = sorted(set(self.device_info.allowed_cpus))
        total_cpu = len(allowed)
        if total_cpu == 0:
            return

        # Prefer mapping info (npu-smi info -m), fallback to topo keys, then visible list
        if self.device_info.total_logic_npus > 0:
            total_npus = self.device_info.total_logic_npus
        elif self.device_info.npu_affinity:
            total_npus = len(self.device_info.npu_affinity)
        else:
            total_npus = len(running)

        if total_npus <= 0:
            return

        # Compute global per-NPU slicing
        base = total_cpu // total_npus
        extra = total_cpu % total_npus

        logger.debug(
            f"[cpu_global_slice] rank:{self.rank_id} ASCEND_RT_VISIBLE_DEVICES={ASCEND_RT_VISIBLE_DEVICES} "
            f"running_npu_list:{running} total_npus:{total_npus} allowed_cpus:{total_cpu} "
            f"base:{base} extra:{extra} allowed_cpus_head:{allowed[:16]} allowed_cpus_tail:{allowed[-16:]}"
        )

        # Enforce per-NPU slice length >= 5.
        # Because with remainder distribution, some NPUs may get 'base' cores and some get 'base+1'.
        # The minimum slice size is 'base'.
        if base < MIN_CPUS_PER_NPU:
            raise RuntimeError(
                "Insufficient CPUs for binding with IRQ/ACL/REL reservations: "
                f"total_allowed={total_cpu}, total_npus={total_npus}, "
                f"min_per_npu={base} (<{MIN_CPUS_PER_NPU}). "
                f"Need at least {total_npus * MIN_CPUS_PER_NPU} CPUs in cpuset."
            )

        def _slice_for_npu(global_npu_id: int) -> list[int]:
            # start = global_npu_id*base + min(global_npu_id, extra)
            start = global_npu_id * base + (global_npu_id if global_npu_id < extra else extra)
            take = base + (1 if global_npu_id < extra else 0)
            end = start + take
            return allowed[start:end]
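        # Worked example (hypothetical sizes): with 192 allowed CPUs and 16 logical
        # NPUs, base=12 and extra=0, so _slice_for_npu(3) returns allowed[36:48];
        # slices never overlap across ranks because they are indexed by the global id.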

        for npu in running:
            if npu < 0 or npu >= total_npus:
                raise RuntimeError(f"Invalid NPU id {npu}, total_npus={total_npus}.")
            cpus = _slice_for_npu(npu)
            # Extra safety: should always be >= base >= 5
            if len(cpus) < MIN_CPUS_PER_NPU:
                raise RuntimeError(
                    f"NPU{npu} got too few CPUs: {len(cpus)} (<{MIN_CPUS_PER_NPU}). "
                    f"total_allowed={total_cpu}, total_npus={total_npus}, base={base}, extra={extra}"
                )
            self.npu_cpu_pool[npu] = cpus

    @staticmethod
    def _binding_mode() -> str:
        device_type = get_ascend_device_type()
        return DEVICE_BINDING_MODE.get(device_type, TOPO_AFFINITY_MODE)

    def build_cpu_pools(self) -> None:
        self.build_cpu_node_map()

        mode = self._binding_mode()
        logger.info(
            f"[cpu_bind_mode] mode={mode} rank={self.rank_id} visible_npus={self.device_info.running_npu_list}"
        )
        if mode == GLOBAL_SLICE_MODE:
            self.build_global_slice_cpu_pool()
            return

        # topo_affinity mode
        if not self.device_info.npu_affinity:
            logger.warning("NPU topo affinity not found, falling back to global-slice CPU binding.")
            self.build_global_slice_cpu_pool()
            return

        for npu in self.device_info.running_npu_list:
            base_cpu_list = [
                cpu for cpu in self.device_info.npu_affinity.get(npu, []) if cpu in self.device_info.allowed_cpus
            ]
            if not base_cpu_list:
                raise RuntimeError("CPUs available in 'Cpus_allowed_list' conflict with NUMA affinity.")
            extra_cpu_list = self.extend_numa(base_cpu_list)
            self.npu_cpu_pool[npu] = extra_cpu_list

        groups = defaultdict(list)
        for npu, cpus in self.npu_cpu_pool.items():
            groups[str(cpus)].append(npu)

        final: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            if len(npu_list) == 1:
                final[npu_list[0]] = self.npu_cpu_pool[npu_list[0]]
            else:
                final.update(self.average_distribute({key: npu_list}))
        self.npu_cpu_pool = final

    def allocate(self) -> None:
        for npu, pool in self.npu_cpu_pool.items():
            if len(pool) >= MIN_CPUS_PER_NPU:
                main = pool[2:-2]
                acl = [pool[-2]]
                rel = [pool[-1]]
            else:
                raise RuntimeError(
                    f"The number of CPUs is insufficient. Each NPU requires at least {MIN_CPUS_PER_NPU} CPUs."
                )
            self.assign_main[npu] = main
            self.assign_acl[npu] = acl
            self.assign_rel[npu] = rel
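    # Layout of each per-NPU pool after allocate() (illustrative 12-CPU pool):
    #   pool[0:2]  -> reserved for IRQ binding (sq/cq interrupts, see bind_npu_irq)
    #   pool[2:-2] -> main process threads
    #   pool[-2]   -> acl_thread, pool[-1] -> release_thread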

    def print_plan(self) -> None:
        logger.info("The CPU allocation plan is as follows:")
        current_npu = self.device_info.running_npu_list[self.rank_id]
        main = " ".join(map(str, self.assign_main[current_npu]))
        acl = " ".join(map(str, self.assign_acl[current_npu]))
        rel = str(self.assign_rel[current_npu]) if self.assign_rel[current_npu] else ""
        logger.info(f"NPU{current_npu}: main=[{main}] acl=[{acl}] release=[{rel}]")

    def bind_memory(self, pid: str, npu: int) -> None:
        def _get_npu_numa_node(npu_id: int) -> int | None:
            cpu_pool = self.npu_cpu_pool.get(npu_id, [])
            if not cpu_pool:
                return None
            anchor_cpu = cpu_pool[0]
            return self.cpu_node.get(anchor_cpu)

        if not shutil.which("migratepages"):
            logger.info("The 'migratepages' command is not available, skipping memory binding.")
            return
        target_numa = _get_npu_numa_node(npu)
        if target_numa is None:
            logger.warning(f"[migrate] rank:{self.rank_id} -> NPU{npu} has no CPU pool, skip memory binding.")
            return
        all_numa_nodes = sorted(self.numa_to_cpu_map.keys())
        if target_numa not in all_numa_nodes:
            logger.warning(f"[migrate] NPU:{npu} -> NUMA {target_numa} not found, skip memory binding.")
            return
        # Bind memory to the NPU's NUMA node only to minimize cross-NUMA traffic.
        logger.info(f"[migrate] NPU:{npu} -> NUMA [{target_numa}]")
        execute_command(
            [
                "migratepages",
                pid,
                ",".join(map(str, all_numa_nodes)),
                str(target_numa),
            ]
        )

    def bind_threads(self) -> None:
        thread_message, _ = execute_command(["ps", "-Te"])
        threads_map = self.get_threads_map(thread_message)
        main_pid = str(psutil.Process().pid)
        current_npu = self.device_info.running_npu_list[self.rank_id]
        self.bind(main_pid, self.assign_main[current_npu], True)
        for acl_thread in threads_map.get(main_pid, {}).get("acl_thread", []):
            self.bind(acl_thread, self.assign_acl[current_npu], False)
            self.bind_memory(acl_thread, current_npu)
        for release_thread in threads_map.get(main_pid, {}).get("release_thread", []):
            self.bind(release_thread, self.assign_rel[current_npu], False)

    def bind_npu_irq(self) -> None:
        if not os.access("/proc/irq", os.W_OK):
            return

        # Only bind IRQ for current rank's NPU to avoid multi-process overwrite.
        current_npu = self.device_info.running_npu_list[self.rank_id]
        if current_npu not in self.npu_cpu_pool:
            logger.warning(f"[irq] rank:{self.rank_id} -> NPU{current_npu} has no cpu pool, skip irq binding.")
            return

        if shutil.which("systemctl"):
            output, _ = execute_command(["systemctl", "list-unit-files"])
            if "irqbalance.service" in output:
                _, return_code = execute_command(["systemctl", "is-active", "--quiet", "irqbalance"])
                if return_code == 0:
                    logger.warning(
                        "The irqbalance service was running and has been stopped. "
                        "You can run 'systemctl start irqbalance' to restart it."
                    )
                    execute_command(["systemctl", "stop", "irqbalance"])

        sq_irqs = []
        with open("/proc/interrupts") as f:
            for line in f:
                if "sq_send_trigger_irq" in line:
                    irq = line.split(":")[0].strip()
                    sq_irqs.append(irq)

        npu = current_npu
        cpus = self.npu_cpu_pool[npu]
        if len(cpus) < 2:
            logger.warning(f"[irq] NPU{npu} cpu pool too small (<2), skip irq binding.")
            return

        sq_cpu, cq_cpu = cpus[0], cpus[1]  # Reserved for IRQ binding
        pci_addr = ""

        device_type = get_ascend_device_type()
        if device_type == AscendDeviceType.A3:
            # A3: logical npu_id = card_id*2 + chip_id
            card_id = npu // 2
            chip_id = npu % 2
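            # e.g. logical NPU 5 -> card_id 2, chip_id 1 (two chips per A3 card)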
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(card_id), "-c", str(chip_id)])
        else:
            # A2 / others: logical npu_id is card id
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(npu)])

        for line in info.splitlines():
            if "PCIe Bus Info" in line:
                pci_addr = line.split()[-1].lower()
                break

        if not pci_addr:
            logger.warning(f"Cannot find the PCI address of NPU{npu}.")
            return

        try:
            npu_irq_list = sorted(os.listdir(f"/sys/bus/pci/devices/{pci_addr}/msi_irqs/"), key=lambda x: int(x))
        except FileNotFoundError:
            logger.warning(f"The msi_irqs folder cannot be found under /sys/bus/pci/devices/{pci_addr}.")
            return

        sq_irq, cq_irq = "", ""
        for irq in sq_irqs:
            if irq in npu_irq_list:
                sq_irq = irq
                cq_irq = str(int(irq) + 1)
                break
        if not sq_irq:
            logger.warning(f"The sq_send_trigger_irq of NPU{npu} is not found.")
            return

        logger.info(
            f"NPU{npu}(PCI {pci_addr}): sq_send_trigger_irq IRQ_ID={sq_irq} -> CPU{sq_cpu}, "
            f"cq_update_irq IRQ_ID={cq_irq} -> CPU{cq_cpu}"
        )
        with open(f"/proc/irq/{sq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(sq_cpu))
        with open(f"/proc/irq/{cq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(cq_cpu))

    def run_all(self) -> None:
        self.build_cpu_pools()
        self.allocate()
        self.print_plan()
        self.bind_threads()
        self.bind_npu_irq()


def bind_cpus(rank_id: int) -> None:
    if not is_arm_cpu():
        logger.info("CPU binding skipped: non-ARM CPU detected.")
        return
    binder = CpuAlloc(rank_id)
    binder.run_all()
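

# Example call site (hypothetical; not part of this module): each worker process would
# invoke bind_cpus() once with its rank, e.g. `bind_cpus(local_rank)`, after its ACL
# threads exist; is_arm_cpu() makes the call a no-op on non-ARM hosts.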