Files
xc-llm-ascend/vllm_ascend/cpu_binding.py
Frank Chen b771ca9a47 [CPU binding] Implement global CPU slicing and improve IRQ binding for Ascend NPUs (#6945)
### What this PR does / why we need it?

This PR introduces global CPU slicing for Ascend NPUs to ensure
non-overlapping CPU partitions, addresses IRQ binding logical errors on
A3, and enhances the logic for determining total NPUs in CPU allocation.
These changes are necessary to optimize CPU resource management and
improve system stability.

- **Global CPU Slicing**: Introduced a global CPU slicing mechanism for
Ascend NPUs to ensure non-overlapping CPU partitions across multiple
processes or data parallel groups, preventing resource contention.
- **Improved IRQ Binding for A3 Devices**: Refined the IRQ binding logic
specifically for Ascend A3 devices, correctly mapping logical NPU IDs to
physical card and chip IDs for accurate npu-smi queries and preventing
multi-process overwrite of IRQ settings.
- **Enhanced NPU Count Determination**: Improved the logic for
determining the total number of logical NPUs, prioritizing NPU mapping
information to ensure more accurate CPU allocation.
- **Minimum CPU Requirement**: Established a minimum requirement of 5
CPUs per NPU for binding, reserving specific cores for IRQ, main, ACL,
and release operations to ensure stable operation.

### Does this PR introduce _any_ user-facing change?

No user-facing changes are introduced.

### How was this patch tested?

CI passed with new added/existing tests.

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

---------

Signed-off-by: c00818886 <chenchuwei@huawei.com>
2026-03-03 17:20:52 +08:00

511 lines
21 KiB
Python

#!/usr/bin/env python3
import os
import platform
import shutil
import subprocess
from collections import defaultdict
import psutil
from vllm.logger import logger
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
# Tunables and environment knobs for CPU binding.
MASK_BIT = 32  # Number of bits in a CPU affinity mask group
MIN_CPUS_PER_NPU = 5  # 2(IRQ) + 1(main, at least 1 CPU) + 1(acl) + 1(release) = 5 CPUs per NPU
# procfs file exposing this process's cpuset via its "Cpus_allowed_list" line.
ALLOWED_CPUS_PATH = "/proc/self/status"
# Optional comma-separated list of logical NPU ids visible to this process.
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
# Binding strategies: carve CPUs by npu-smi topology affinity, or slice the
# whole allowed-cpu set globally by logical NPU id.
TOPO_AFFINITY_MODE = "topo_affinity"
GLOBAL_SLICE_MODE = "global_slice"
# Default binding strategy per Ascend device generation.
DEVICE_BINDING_MODE: dict["AscendDeviceType", str] = {
    AscendDeviceType.A2: TOPO_AFFINITY_MODE,
    AscendDeviceType.A3: GLOBAL_SLICE_MODE,
    AscendDeviceType._310P: TOPO_AFFINITY_MODE,
}
def is_arm_cpu() -> bool:
    """Return True when the host CPU architecture is ARM/AArch64.

    CPU binding is only applied on ARM hosts. Architectures that are neither
    a known x86 nor a known ARM variant are treated as unsupported, with a
    warning, rather than guessed at.
    """
    arch = platform.machine().lower()
    if arch in {"x86_64", "amd64", "i386", "i686"}:
        return False
    if arch in {"aarch64", "arm64"} or arch.startswith("arm"):
        return True
    # Lazy %-formatting: the message is only built if the record is emitted.
    logger.warning("Unknown CPU architecture '%s', CPU binding will be disabled.", arch)
    return False
def execute_command(cmd: list[str]) -> tuple[str, int]:
    """Run *cmd* without a shell and return (decoded stdout, return code).

    stderr is captured (and discarded) so external tools do not pollute the
    caller's output. A generous timeout guards against a hung tool; unlike
    the previous Popen/communicate version, subprocess.run kills and reaps
    the child when the timeout expires instead of leaving it running.
    """
    result = subprocess.run(
        cmd,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=1000,
    )
    return result.stdout.decode(), result.returncode
class DeviceInfo:
    """Read-only snapshot of NPU topology and allowed CPUs for this process.

    All fields are populated once in __init__ by shelling out to npu-smi
    and reading /proc/self/status.
    """

    def __init__(self):
        # Card/board id -> {chip id -> logical NPU id string}, from `npu-smi info -m`.
        self.npu_map_info: dict[str, dict[str, str]] = self.get_npu_map_info()
        # CPUs this process may run on (cgroup cpuset); empty on non-Linux.
        self.allowed_cpus: list[int] = self.parse_allowed_cpus()
        # Logical NPU ids that currently host a process, sorted ascending.
        self.running_npu_list: list[int] = self.get_running_npus()
        # Logical NPU id -> CPUs reported affine by `npu-smi info -t topo`.
        self.npu_affinity: dict[int, list[int]] = self.parse_topo_affinity()
        # Every logical NPU id known from the mapping, sorted ascending.
        self.all_logic_npus: list[int] = self.get_all_logic_npus()
        self.total_logic_npus: int = len(self.all_logic_npus)

    @staticmethod
    def expand_cpu_list(allowed_list_str: str) -> list[int]:
        """Expand a kernel-style CPU list string (e.g. "0-3,8,10-11") into ints."""
        allowed_cpus_list: list[int] = []
        for per_range in allowed_list_str.split(","):
            if "-" in per_range:
                # Inclusive range "a-b".
                start_cpu, end_cpu = map(int, per_range.split("-"))
                allowed_cpus_list.extend(range(start_cpu, end_cpu + 1))
            else:
                allowed_cpus_list.append(int(per_range))
        return allowed_cpus_list

    def get_all_logic_npus(self) -> list[int]:
        """Collect all logical NPU IDs from the NPU mapping.

        self.npu_map_info maps a board_id (A3) or npu_id (A2) to a per-chip map.
        The per-chip map uses chip_id as the key and the logical NPU ID string
        as the value.
        """
        logic_ids: set[int] = set()
        for _, chip_map in self.npu_map_info.items():
            for _, logic_str in chip_map.items():
                # Only numeric logic ids are real NPUs.
                if logic_str and logic_str.isdigit():
                    logic_ids.add(int(logic_str))
        return sorted(logic_ids)

    @staticmethod
    def get_npu_map_info() -> dict[str, dict[str, str]]:
        """Parse `npu-smi info -m` into {npu_id: {chip_id: chip_logic_id}}.

        Rows whose third column is not numeric carry no usable logical id
        and are skipped.
        """
        npu_map_info: dict[str, dict[str, str]] = {}
        npu_info, _ = execute_command(["npu-smi", "info", "-m"])
        # Drop the table header line; the remaining lines are data rows.
        npu_map = npu_info.strip().split("\n")[1:]
        for line in npu_map:
            npu_id, chip_id, chip_logic_id = line.strip().split()[:3]
            if not chip_logic_id.isdigit():
                continue
            if npu_id not in npu_map_info:
                npu_map_info[npu_id] = {}
            npu_map_info[npu_id][chip_id] = chip_logic_id
        return npu_map_info

    def get_running_npus(self) -> list[int]:
        """Return sorted logical NPU ids that currently have running processes.

        Parses the process table at the bottom of `npu-smi info` and maps each
        (npu_id, chip_id) pair to its logical id via npu_map_info. When
        ASCEND_RT_VISIBLE_DEVICES is set, the result is intersected with it.

        Raises:
            RuntimeError: if a process row cannot be mapped to a logical id,
                or if no running NPU remains after filtering.
        """
        npu_message, _ = execute_command(["npu-smi", "info"])
        in_proc_section = False
        running_npu_set = set()
        for line in npu_message.splitlines():
            line = line.strip()
            # The process table starts at a header like "| NPU ... Process id ... |".
            if line.startswith("| NPU") and "Process id" in line:
                in_proc_section = True
                continue
            if not in_proc_section:
                continue
            if line.startswith("| "):
                parts = [p.strip() for p in line.strip("|").split("|")]
                if len(parts) < 2:
                    continue
                # First cell is expected to be "npu_id chip_id ...".
                # NOTE(review): assumes at least two tokens in that cell — an
                # IndexError is possible on layouts with a single token; confirm
                # against all npu-smi output formats.
                npu_id = parts[0].split()[0]
                chip_id = parts[0].split()[1]
                if not npu_id.isdigit() or not chip_id.isdigit():
                    continue
                chip_logic_id = self.npu_map_info.get(npu_id, {}).get(chip_id)
                if not chip_logic_id or not chip_logic_id.isdigit():
                    raise RuntimeError("Failed to get correct chip_logic_id from command 'npu-smi info -m'.")
                running_npu_set.add(int(chip_logic_id))
        if ASCEND_RT_VISIBLE_DEVICES:
            # Keep only NPUs this process is allowed to see.
            devices_str = ASCEND_RT_VISIBLE_DEVICES
            devices_list = [int(x) for x in devices_str.split(",")]
            running_npu_set = set(devices_list) & running_npu_set
        if not running_npu_set:
            raise RuntimeError("Can not get running npu info.")
        return sorted(running_npu_set)

    def parse_allowed_cpus(self) -> list[int]:
        """Return the CPUs this process may use, per Cpus_allowed_list.

        Returns an empty list when /proc/self/status does not exist
        (e.g. non-Linux hosts).

        Raises:
            RuntimeError: if the status file lacks a Cpus_allowed_list line.
        """
        if not os.path.exists(ALLOWED_CPUS_PATH):
            return []
        with open(ALLOWED_CPUS_PATH) as f:
            for line in f:
                if line.startswith("Cpus_allowed_list"):
                    return self.expand_cpu_list(line.split()[1])
        raise RuntimeError("Can not found specific 'Cpus_allowed_list' in the '/proc/self/status' file.")

    def parse_topo_affinity(self) -> dict[int, list[int]]:
        """Parse `npu-smi info -t topo` into {logical NPU id: affine CPUs}.

        Logical ids are assigned by the order NPU rows appear in the output;
        the header row (whose last column is the literal "Affinity") is
        skipped without consuming an id.
        """
        chip_logic_id = 0
        affinity: dict[int, list[int]] = {}
        affinity_message, _ = execute_command(["npu-smi", "info", "-t", "topo"])
        for line in affinity_message.splitlines():
            if line.startswith("NPU"):
                parts = line.split()
                last_part = parts[-1]
                if last_part != "Affinity":
                    affinity[chip_logic_id] = self.expand_cpu_list(last_part)
                    chip_logic_id += 1
        return affinity
class CpuAlloc:
    """Plans and applies CPU, memory and IRQ bindings for one rank's NPU.

    Typical usage is run_all(): build per-NPU CPU pools, carve each pool
    into IRQ/main/acl/release slices, then apply taskset, migratepages and
    /proc/irq affinity settings for the NPU owned by this rank.
    """

    def __init__(self, rank_id: int):
        # Index of this process within device_info.running_npu_list.
        self.rank_id = rank_id
        self.device_info: DeviceInfo = DeviceInfo()
        # CPU id -> NUMA node id (filled by build_cpu_node_map).
        self.cpu_node: dict[int, int] = {}
        # NUMA node id -> CPUs on that node.
        self.numa_to_cpu_map: dict[int, list[int]] = defaultdict(list)
        # Logical NPU id -> CPUs reserved for that NPU (filled by build_cpu_pools).
        self.npu_cpu_pool: dict[int, list[int]] = {}
        # Per-NPU CPU slices for the main process, acl_thread and release_thread.
        self.assign_main: dict[int, list[int]] = {}
        self.assign_acl: dict[int, list[int]] = {}
        self.assign_rel: dict[int, list[int]] = {}

    @staticmethod
    def cpu_to_mask(cpu: int) -> str:
        """Build a /proc/irq/*/smp_affinity hex mask selecting only *cpu*.

        The kernel expects comma-separated 32-bit hex groups, most
        significant group first.
        """
        group = cpu // MASK_BIT
        bit = cpu % MASK_BIT
        value = 1 << bit
        mask = f"{value:08x}"
        # Pad with one zeroed 32-bit group (8 hex digits) per lower-order group.
        for _ in range(1, group + 1):
            mask = f"{mask},{'0' * (MASK_BIT // 4)}"
        return mask

    @staticmethod
    def get_threads_map(thread_message: str) -> dict[str, dict[str, list[str]]]:
        """Group acl/release thread ids by owning pid from `ps -Te` output.

        Returns {main_pid: {"acl_thread": [tid, ...], "release_thread": [...]}}.
        Lines naming neither thread kind are ignored.
        """
        threads_map: dict[str, dict[str, list[str]]] = {}
        for line in thread_message.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            # `ps -Te` rows start with "PID SPID ...".
            main_pid, sub_pid = parts[0], parts[1]
            if "acl_thread" in line:
                key = "acl_thread"
            elif "release_thread" in line:
                key = "release_thread"
            else:
                continue
            if main_pid not in threads_map:
                threads_map[main_pid] = {"acl_thread": [], "release_thread": []}
            threads_map[main_pid][key].append(sub_pid)
        return threads_map

    @staticmethod
    def bind(pid: str, cpus: list[int], bind_sub_thread: bool) -> None:
        """Pin *pid* to *cpus* via taskset; no-op when *cpus* is empty.

        bind_sub_thread=True uses `taskset -acp` so every thread of the
        process is re-pinned, not just the given task.

        Raises:
            RuntimeError: if taskset exits non-zero.
        """
        if cpus:
            cpu_list = ",".join(map(str, cpus))
            if bind_sub_thread:
                bind_result, return_code = execute_command(["taskset", "-acp", cpu_list, pid])
            else:
                bind_result, return_code = execute_command(["taskset", "-cp", cpu_list, pid])
            if return_code != 0:
                raise RuntimeError(f"Failed to bind {pid} to CPU {cpu_list}.")

    def average_distribute(self, groups: dict[str, list[int]]) -> dict[int, list[int]]:
        """Evenly split a shared CPU pool among the NPUs that share it.

        *groups* maps a stringified CPU list to the NPUs assigned that
        identical pool. The last NPU of each group absorbs any remainder.
        """
        result: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            cpu_list = sorted(self.npu_cpu_pool[npu_list[0]])
            cpu_num_per_npu = len(cpu_list) // len(npu_list)
            for i, npu in enumerate(npu_list):
                start_index = i * cpu_num_per_npu
                end_index = (i + 1) * cpu_num_per_npu if i < len(npu_list) - 1 else len(cpu_list)
                result[npu] = cpu_list[start_index:end_index]
        return result

    def extend_numa(self, cpu_list: list[int]) -> list[int]:
        """Widen a single-NUMA CPU list with allowed CPUs of the next node.

        Lists already spanning multiple NUMA nodes are returned unchanged;
        the result is sorted and de-duplicated.
        """
        if not cpu_list:
            return []
        nodes = {self.cpu_node[c] for c in cpu_list}
        if len(nodes) != 1:
            return cpu_list
        node = list(nodes)[0]
        # Wrap around to node 0 after the highest node.
        next_node = (node + 1) % len(self.numa_to_cpu_map)
        extended = cpu_list[:]
        for cpu in self.numa_to_cpu_map[next_node]:
            if cpu in self.device_info.allowed_cpus:
                extended.append(cpu)
        return sorted(set(extended))

    def build_cpu_node_map(self) -> None:
        """Populate cpu_node and numa_to_cpu_map from `lscpu -e=CPU,NODE`.

        Raises:
            RuntimeError: if lscpu yields no NUMA information.
        """
        cpu_numa_map, _ = execute_command(["lscpu", "-e=CPU,NODE"])
        for line in cpu_numa_map.splitlines():
            line = line.strip()
            # Skip the header and blank lines; data rows start with a digit.
            if not line or not line[0].isdigit():
                continue
            cpu_str, node_str = line.split()
            cpu = int(cpu_str)
            node = int(node_str)
            self.cpu_node[cpu] = node
            self.numa_to_cpu_map[node].append(cpu)
        if len(self.numa_to_cpu_map) == 0:
            raise RuntimeError("lscpu command output error, no NUMA node available. Please check!")

    def build_global_slice_cpu_pool(self) -> None:
        """
        Build per-NPU CPU pools by slicing allowed_cpus using GLOBAL logical NPU ids.

        Why:
        - Multiple processes/DP groups may share the SAME cpuset (same allowed_cpus).
        - If each process slices only its visible NPUs, CPU ranges overlap across processes.
        - Global slicing ensures deterministic, non-overlapping CPU partitions per logical NPU id.

        Notes:
        - This strategy does NOT rely on npu-smi topo affinity.
        - NUMA locality is achieved only if CPU numbering aligns with NUMA layout.
        - Requires per-NPU slice size >= 5 (IRQ(2) + main(>=1) + acl(1) + release(1)).
        """
        running = list(self.device_info.running_npu_list)
        if not running:
            return
        allowed = sorted(set(self.device_info.allowed_cpus))
        total_cpu = len(allowed)
        if total_cpu == 0:
            return
        # Prefer mapping info (npu-smi info -m), fallback to topo keys, then visible list
        if self.device_info.total_logic_npus > 0:
            total_npus = self.device_info.total_logic_npus
        elif self.device_info.npu_affinity:
            total_npus = len(self.device_info.npu_affinity)
        else:
            total_npus = len(running)
        if total_npus <= 0:
            return
        # Compute global per-NPU slicing
        base = total_cpu // total_npus
        extra = total_cpu % total_npus
        logger.debug(
            f"[cpu_global_slice] rank:{self.rank_id} ASCEND_RT_VISIBLE_DEVICES={ASCEND_RT_VISIBLE_DEVICES} "
            f"running_npu_list:{running} total_npus:{total_npus} allowed_cpus:{total_cpu} "
            f"base:{base} extra:{extra} allowed_cpus_head:{allowed[:16]} allowed_cpus_tail:{allowed[-16:]}"
        )
        # Enforce per-NPU slice length >= 5.
        # Because with remainder distribution, some NPUs may get 'base' cores and some get 'base+1'.
        # The minimum slice size is 'base'.
        if base < MIN_CPUS_PER_NPU:
            raise RuntimeError(
                "Insufficient CPUs for binding with IRQ/ACL/REL reservations: "
                f"total_allowed={total_cpu}, total_npus={total_npus}, "
                f"min_per_npu={base} (<{MIN_CPUS_PER_NPU}). "
                f"Need at least {total_npus * MIN_CPUS_PER_NPU} CPUs in cpuset."
            )

        def _slice_for_npu(global_npu_id: int) -> list[int]:
            # start = global_npu_id*base + min(global_npu_id, extra)
            start = global_npu_id * base + (global_npu_id if global_npu_id < extra else extra)
            take = base + (1 if global_npu_id < extra else 0)
            end = start + take
            return allowed[start:end]

        for npu in running:
            if npu < 0 or npu >= total_npus:
                raise RuntimeError(f"Invalid NPU id {npu}, total_npus={total_npus}.")
            cpus = _slice_for_npu(npu)
            # Extra safety: should always be >= base >= 5
            if len(cpus) < MIN_CPUS_PER_NPU:
                raise RuntimeError(
                    f"NPU{npu} got too few CPUs: {len(cpus)} (<5). "
                    f"total_allowed={total_cpu}, total_npus={total_npus}, base={base}, extra={extra}"
                )
            self.npu_cpu_pool[npu] = cpus

    @staticmethod
    def _binding_mode() -> str:
        """Return the binding strategy configured for the detected device type."""
        device_type = get_ascend_device_type()
        return DEVICE_BINDING_MODE.get(device_type, TOPO_AFFINITY_MODE)

    def build_cpu_pools(self) -> None:
        """Fill npu_cpu_pool according to the device's binding mode.

        global_slice mode slices allowed CPUs by logical NPU id. topo_affinity
        mode uses npu-smi topology affinity (falling back to global slicing
        when no affinity data exists) and then evenly re-distributes any pool
        shared by several NPUs.

        Raises:
            RuntimeError: when topo affinity conflicts with the allowed cpuset.
        """
        self.build_cpu_node_map()
        mode = self._binding_mode()
        logger.info(f"[cpu_bind_mode] mode={mode} rank={self.rank_id} visible_npus={self.device_info.running_npu_list}")
        if mode == GLOBAL_SLICE_MODE:
            self.build_global_slice_cpu_pool()
            return
        # topo_affinity mode
        if not self.device_info.npu_affinity:
            logger.warning("NPU topo affinity not found, fallback to global-slice CPU binding.")
            self.build_global_slice_cpu_pool()
            return
        for npu in self.device_info.running_npu_list:
            base_cpu_list = [
                cpu for cpu in self.device_info.npu_affinity.get(npu, []) if cpu in self.device_info.allowed_cpus
            ]
            if not base_cpu_list:
                raise RuntimeError("CPUs available in 'Cpus_allowed_list' conflict with NUMA affinity.")
            extra_cpu_list = self.extend_numa(base_cpu_list)
            self.npu_cpu_pool[npu] = extra_cpu_list
        # NPUs that ended up with identical pools must share them fairly.
        groups = defaultdict(list)
        for npu, cpus in self.npu_cpu_pool.items():
            groups[str(cpus)].append(npu)
        final: dict[int, list[int]] = {}
        for key, npu_list in groups.items():
            if len(npu_list) == 1:
                final[npu_list[0]] = self.npu_cpu_pool[npu_list[0]]
            else:
                final.update(self.average_distribute({key: npu_list}))
        self.npu_cpu_pool = final

    def allocate(self) -> None:
        """Split each NPU's pool into main/acl/release slices.

        Layout per pool: cpus[0:2] stay reserved for IRQ binding, cpus[2:-2]
        go to the main process, cpus[-2] to acl_thread, cpus[-1] to
        release_thread.

        Raises:
            RuntimeError: if any pool is smaller than MIN_CPUS_PER_NPU.
        """
        for npu, pool in self.npu_cpu_pool.items():
            if len(pool) >= MIN_CPUS_PER_NPU:
                main = pool[2:-2]
                acl = [pool[-2]]
                rel = [pool[-1]]
            else:
                raise RuntimeError(
                    f"The number of CPUs is insufficient. Each NPU requires at least {MIN_CPUS_PER_NPU} CPUs."
                )
            self.assign_main[npu] = main
            self.assign_acl[npu] = acl
            self.assign_rel[npu] = rel

    def print_plan(self) -> None:
        """Log the main/acl/release CPU assignment for this rank's NPU."""
        logger.info("The CPU allocation plan is as follows:")
        current_npu = self.device_info.running_npu_list[self.rank_id]
        main = " ".join(map(str, self.assign_main[current_npu]))
        acl = " ".join(map(str, self.assign_acl[current_npu]))
        # NOTE(review): str() on a list renders e.g. "[7]", so the log line
        # shows "release=[[7]]" — probably meant " ".join like main/acl; confirm.
        rel = str(self.assign_rel[current_npu]) if self.assign_rel[current_npu] else ""
        logger.info(f"NPU{current_npu}: main=[{main}] acl=[{acl}] release=[{rel}]")

    def bind_memory(self, pid: str, npu: int) -> None:
        """Migrate *pid*'s memory to the NUMA node hosting *npu*'s CPU pool.

        Best-effort: silently skips when migratepages is unavailable or the
        target node cannot be determined.
        """

        def _get_npu_numa_node(npu_id: int) -> int | None:
            # Use the first CPU of the pool as the NUMA anchor.
            cpu_pool = self.npu_cpu_pool.get(npu_id, [])
            if not cpu_pool:
                return None
            anchor_cpu = cpu_pool[0]
            return self.cpu_node.get(anchor_cpu)

        if not shutil.which("migratepages"):
            logger.info("The 'migratepages' command is not available, skipping memory binding.")
            return
        target_numa = _get_npu_numa_node(npu)
        if target_numa is None:
            logger.warning(f"[migrate] rank:{self.rank_id} -> NPU{npu} has no CPU pool, skip memory binding.")
            return
        all_numa_nodes = sorted(self.numa_to_cpu_map.keys())
        if target_numa not in all_numa_nodes:
            logger.warning(f"[migrate] NPU:{npu} -> NUMA {target_numa} not found, skip memory binding.")
            return
        # Bind memory to the NPU's NUMA node only to minimize cross-NUMA traffic.
        logger.info(f"[migrate] NPU:{npu} -> NUMA [{target_numa}]")
        execute_command(
            [
                "migratepages",
                pid,
                ",".join(map(str, all_numa_nodes)),
                str(target_numa),
            ]
        )

    def bind_threads(self) -> None:
        """Pin this rank's main process and its acl/release threads.

        The main process (all threads) gets assign_main; each acl_thread is
        pinned to assign_acl and has its memory migrated; each release_thread
        is pinned to assign_rel.
        """
        thread_message, _ = execute_command(["ps", "-Te"])
        threads_map = self.get_threads_map(thread_message)
        main_pid = str(psutil.Process().pid)
        current_npu = self.device_info.running_npu_list[self.rank_id]
        self.bind(main_pid, self.assign_main[current_npu], True)
        for acl_thread in threads_map.get(main_pid, {}).get("acl_thread", []):
            self.bind(acl_thread, self.assign_acl[current_npu], False)
            self.bind_memory(acl_thread, current_npu)
        for release_thread in threads_map.get(main_pid, {}).get("release_thread", []):
            self.bind(release_thread, self.assign_rel[current_npu], False)

    def bind_npu_irq(self) -> None:
        """Pin this NPU's sq/cq interrupts to the two reserved pool CPUs.

        Requires write access to /proc/irq. Stops irqbalance if it is active
        (it would otherwise undo the affinity), locates the NPU's PCI device
        via npu-smi, and writes smp_affinity masks for the sq/cq IRQs.
        """
        if not os.access("/proc/irq", os.W_OK):
            return
        # Only bind IRQ for current rank's NPU to avoid multi-process overwrite.
        current_npu = self.device_info.running_npu_list[self.rank_id]
        if current_npu not in self.npu_cpu_pool:
            logger.warning(f"[irq] rank:{self.rank_id} -> NPU{current_npu} has no cpu pool, skip irq binding.")
            return
        if shutil.which("systemctl"):
            output, _ = execute_command(["systemctl", "list-unit-files"])
            if "irqbalance.service" in output:
                _, return_code = execute_command(["systemctl", "is-active", "--quiet", "irqbalance"])
                if return_code == 0:
                    logger.warning(
                        "The irqbalance service is running and has been stopped. "
                        "You can run the systemctl start irqbalance command to restart it."
                    )
                    execute_command(["systemctl", "stop", "irqbalance"])
        # Collect every sq_send_trigger_irq number system-wide; the one owned
        # by this NPU's PCI device is selected below.
        sq_irqs = []
        with open("/proc/interrupts") as f:
            for line in f:
                if "sq_send_trigger_irq" in line:
                    irq = line.split(":")[0].strip()
                    sq_irqs.append(irq)
        npu = current_npu
        cpus = self.npu_cpu_pool[npu]
        if len(cpus) < 2:
            logger.warning(f"[irq] NPU{npu} cpu pool too small (<2), skip irq binding.")
            return
        sq_cpu, cq_cpu = cpus[0], cpus[1]  # Reserved for IRQ binding
        pci_addr = ""
        device_type = get_ascend_device_type()
        if device_type == AscendDeviceType.A3:
            # A3: logical npu_id = card_id*2 + chip_id
            card_id = npu // 2
            chip_id = npu % 2
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(card_id), "-c", str(chip_id)])
        else:
            # A2 / others: logical npu_id is card id
            info, _ = execute_command(["npu-smi", "info", "-t", "board", "-i", str(npu)])
        for line in info.splitlines():
            if "PCIe Bus Info" in line:
                pci_addr = line.split()[-1].lower()
                break
        if not pci_addr:
            logger.warning(f"Can't find pci address of NPU{npu} .")
            return
        try:
            # msi_irqs entries are the IRQ numbers owned by this PCI device.
            npu_irq_list = sorted(os.listdir(f"/sys/bus/pci/devices/{pci_addr}/msi_irqs/"), key=lambda x: int(x))
        except FileNotFoundError:
            logger.warning(f"The msi_irqs folder cannot be found under /sys/bus/pci/devices/{pci_addr} .")
            return
        sq_irq, cq_irq = "", ""
        for irq in sq_irqs:
            if irq in npu_irq_list:
                sq_irq = irq
                # Assumes the cq_update IRQ is the next IRQ number after the
                # sq IRQ — TODO confirm against the driver's IRQ allocation.
                cq_irq = str(int(irq) + 1)
                break
        if not sq_irq:
            logger.warning(f"The sq_send_trigger_irq of NPU{npu} is not found.")
            return
        logger.info(
            f"NPU{npu}(PCI {pci_addr}): sq_send_trigger_irq IRQ_ID={sq_irq} -> CPU{sq_cpu}, "
            f"cq_update_irq IRQ_ID={cq_irq} -> CPU{cq_cpu}"
        )
        with open(f"/proc/irq/{sq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(sq_cpu))
        with open(f"/proc/irq/{cq_irq}/smp_affinity", "w") as f:
            f.write(self.cpu_to_mask(cq_cpu))

    def run_all(self) -> None:
        """Execute the full binding pipeline for this rank."""
        self.build_cpu_pools()
        self.allocate()
        self.print_plan()
        self.bind_threads()
        self.bind_npu_irq()
def bind_cpus(rank_id: int) -> None:
    """Bind CPUs, memory and IRQs for the process owning *rank_id*'s NPU.

    Binding is only performed on ARM hosts; on anything else the call is a
    logged no-op.
    """
    if is_arm_cpu():
        CpuAlloc(rank_id).run_all()
    else:
        logger.info("CPU binding skipped: non-ARM CPU detected.")