[Refactor] Modify the binding logic to allocate CPU cores for each NPU card (#5555)
[Refactor] Modify the binding logic to allocate CPU cores for each NPU card ### What this PR does / why we need it? Modify the binding logic to allocate CPU cores for each NPU card based on NUMA affinity, while isolating acl_thread/release_thread and other processes to prevent mutual interference. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested?c85cc045f8Signed-off-by: rowzwel_dx <1392851715@qq.com> - vLLM version: v0.13.0 - vLLM main:7157596103Signed-off-by: Rozwel-dx <1392851715@qq.com>
This commit is contained in:
167
tests/ut/device_allocator/test_cpu_binding.py
Normal file
167
tests/ut/device_allocator/test_cpu_binding.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
import unittest
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeviceInfo(unittest.TestCase):
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def setUp(self, mock_execute_command):
|
||||||
|
mock_execute_command.side_effect = [
|
||||||
|
("NPU ID Chip ID Chip Logic ID Chip Name\n0 0 0 Ascend\n0 1 - Mcu\n1 0 1 Ascend",
|
||||||
|
0),
|
||||||
|
("| NPU Chip | Process id |\n| 0 0 | 1234 | vllm | 56000 |\n| 1 0 | 1235 | vllm | 56000 |",
|
||||||
|
0), ("", 0)
|
||||||
|
]
|
||||||
|
self.device_info = DeviceInfo()
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_get_npu_map_info(self, mock_execute_command):
|
||||||
|
execute_result_list = [
|
||||||
|
("NPU ID Chip ID Chip Logic ID Chip Phy-ID Chip Name\n0 0 0 0 Ascend\n0 1 1 1 Ascend\n0 2 - - Mcu",
|
||||||
|
0),
|
||||||
|
("NPU ID Chip ID Chip Logic ID Chip Name\n8 0 0 Ascend\n8 1 - Mcu\n9 0 1 Ascend",
|
||||||
|
0),
|
||||||
|
]
|
||||||
|
result_list = [{
|
||||||
|
'0': {
|
||||||
|
'0': '0',
|
||||||
|
'1': '1'
|
||||||
|
}
|
||||||
|
}, {
|
||||||
|
'8': {
|
||||||
|
'0': '0'
|
||||||
|
},
|
||||||
|
'9': {
|
||||||
|
'0': '1'
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
for result in execute_result_list:
|
||||||
|
mock_execute_command.return_value = result
|
||||||
|
npu_map_info = self.device_info.get_npu_map_info()
|
||||||
|
expected = result_list.pop(0)
|
||||||
|
self.assertEqual(npu_map_info, expected)
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_get_running_npus(self, mock_execute_command):
|
||||||
|
mock_execute_command.side_effect = [
|
||||||
|
("| NPU Chip | Process id |\n| 0 1 | 1236 | vllm | 56000 |", 0),
|
||||||
|
("", 0),
|
||||||
|
("| NPU Chip | Process id |\n| 1 0 | 1236 | vllm | 56000 |", 0)
|
||||||
|
]
|
||||||
|
with self.assertRaises(RuntimeError):
|
||||||
|
self.device_info.get_running_npus()
|
||||||
|
with self.assertRaises(RuntimeError):
|
||||||
|
self.device_info.get_running_npus()
|
||||||
|
running_npus = self.device_info.get_running_npus()
|
||||||
|
self.assertEqual(len(running_npus), 1)
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_parse_topo_affinity(self, mock_execute_command):
|
||||||
|
mock_execute_command.return_value = (
|
||||||
|
"NPU0 X HCCS HCCS HCCS HCCS HCCS HCCS HCCS 0-3", 0)
|
||||||
|
affinity = self.device_info.parse_topo_affinity()
|
||||||
|
expected = {0: [0, 1, 2, 3]}
|
||||||
|
self.assertEqual(affinity, expected)
|
||||||
|
|
||||||
|
def test_expand_cpu_list(self):
|
||||||
|
result = self.device_info.expand_cpu_list("0-2, 4, 6-8")
|
||||||
|
self.assertEqual(result, [0, 1, 2, 4, 6, 7, 8])
|
||||||
|
|
||||||
|
|
||||||
|
class TestCpuAlloc(unittest.TestCase):
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def setUp(self, mock_execute_command):
|
||||||
|
mock_execute_command.side_effect = [
|
||||||
|
("NPU ID Chip ID Chip Logic ID Chip Name\n0 0 0 Ascend\n0 1 - Mcu\n1 0 1 Ascend",
|
||||||
|
0),
|
||||||
|
("| NPU Chip | Process id |\n| 0 0 | 1234 | vllm | 56000 |\n| 1 0 | 1235 | vllm | 56000 |",
|
||||||
|
0), ("", 0)
|
||||||
|
]
|
||||||
|
self.cpu_alloc = CpuAlloc(0)
|
||||||
|
|
||||||
|
def test_average_distribute(self):
|
||||||
|
self.cpu_alloc.npu_cpu_pool = {
|
||||||
|
0: [10, 11, 12, 13],
|
||||||
|
1: [10, 11, 12, 13]
|
||||||
|
}
|
||||||
|
groups = {"[10, 11, 12, 13]": [0, 1]}
|
||||||
|
result = self.cpu_alloc.average_distribute(groups)
|
||||||
|
self.assertEqual(result, {0: [10, 11], 1: [12, 13]})
|
||||||
|
self.cpu_alloc.npu_cpu_pool = {
|
||||||
|
0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
|
||||||
|
1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
|
||||||
|
2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
|
||||||
|
}
|
||||||
|
groups = {"[0, 1, 2, 3, 4, 5]": [0, 1, 2]}
|
||||||
|
result = self.cpu_alloc.average_distribute(groups)
|
||||||
|
self.assertEqual(result, {
|
||||||
|
0: [0, 1, 2, 3],
|
||||||
|
1: [4, 5, 6, 7],
|
||||||
|
2: [8, 9, 10, 11, 12, 13]
|
||||||
|
})
|
||||||
|
|
||||||
|
def test_extend_numa(self):
|
||||||
|
result = self.cpu_alloc.extend_numa([])
|
||||||
|
self.assertEqual(result, [])
|
||||||
|
self.cpu_alloc.cpu_node = {0: 0, 1: 0, 2: 1, 3: 1}
|
||||||
|
self.cpu_alloc.numa_to_cpu_map = {0: [0, 1], 1: [2, 3]}
|
||||||
|
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
|
||||||
|
result = self.cpu_alloc.extend_numa([0, 1])
|
||||||
|
self.assertEqual(result, [0, 1, 2, 3])
|
||||||
|
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 3]
|
||||||
|
result = self.cpu_alloc.extend_numa([0, 1])
|
||||||
|
self.assertEqual(result, [0, 1, 3])
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_build_cpu_node_map(self, mock_execute_command):
|
||||||
|
mock_execute_command.return_value = ("", 0)
|
||||||
|
with self.assertRaises(RuntimeError):
|
||||||
|
self.cpu_alloc.build_cpu_node_map()
|
||||||
|
mock_execute_command.return_value = ("0 0\n1 1\n2 0\n3 1", 0)
|
||||||
|
self.cpu_alloc.build_cpu_node_map()
|
||||||
|
expected_cpu_node = {0: 0, 1: 1, 2: 0, 3: 1}
|
||||||
|
expected_numa_to_cpu_map = {0: [0, 2], 1: [1, 3]}
|
||||||
|
self.assertEqual(self.cpu_alloc.cpu_node, expected_cpu_node)
|
||||||
|
self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
|
||||||
|
expected_numa_to_cpu_map)
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_handle_no_affinity(self, mock_execute_command):
|
||||||
|
mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)]
|
||||||
|
self.cpu_alloc.device_info.running_npu_list = [0, 1]
|
||||||
|
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
|
||||||
|
self.cpu_alloc.device_info.affinity = {}
|
||||||
|
self.assertEqual(self.cpu_alloc.npu_cpu_pool, {})
|
||||||
|
self.cpu_alloc.device_info.affinity = {0: [0, 1], 1: [2, 3]}
|
||||||
|
self.cpu_alloc.build_cpu_pools()
|
||||||
|
self.assertEqual(len(self.cpu_alloc.npu_cpu_pool), 2)
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_allocate(self, mock_execute_command):
|
||||||
|
self.cpu_alloc.device_info.running_npu_list = [0]
|
||||||
|
self.cpu_alloc.npu_cpu_pool = {0: [0, 1, 2]}
|
||||||
|
self.cpu_alloc.allocate()
|
||||||
|
self.assertEqual(self.cpu_alloc.assign_main[0], [0])
|
||||||
|
self.assertEqual(self.cpu_alloc.assign_acl[0], [1])
|
||||||
|
self.assertEqual(self.cpu_alloc.assign_rel[0], [2])
|
||||||
|
self.cpu_alloc.npu_cpu_pool = {0: [0, 1]}
|
||||||
|
with self.assertRaises(RuntimeError):
|
||||||
|
self.cpu_alloc.allocate()
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
|
def test_bind_threads(self, mock_execute_command):
|
||||||
|
thread_message = "1234 1234 ? 00:00:03 acl_thread\n4567 4567 ? 00:00:03 release_thread"
|
||||||
|
mock_execute_command.return_value = (thread_message, 0)
|
||||||
|
self.cpu_alloc.device_info.running_npu_list = [0]
|
||||||
|
self.cpu_alloc.assign_main = {0: [0, 1]}
|
||||||
|
self.cpu_alloc.assign_acl = {0: [2]}
|
||||||
|
self.cpu_alloc.assign_rel = {0: [3]}
|
||||||
|
self.cpu_alloc.bind_threads()
|
||||||
|
mock_execute_command.assert_called()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
@@ -1,330 +1,319 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
from dataclasses import dataclass
|
from collections import defaultdict
|
||||||
from itertools import accumulate
|
from typing import Dict, List, Tuple
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
import torch_npu
|
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
|
ALLOWED_CPUS_PATH = "/proc/self/status"
|
||||||
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
|
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
|
||||||
CPU_BINDING_NUM = os.getenv("CPU_BINDING_NUM")
|
|
||||||
|
|
||||||
|
|
||||||
def execute_command(cmd_list):
|
def execute_command(cmd: List[str]) -> Tuple[str, int]:
|
||||||
with subprocess.Popen(cmd_list,
|
with subprocess.Popen(cmd,
|
||||||
shell=False,
|
shell=False,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE) as p:
|
stderr=subprocess.PIPE) as p:
|
||||||
out, err = p.communicate(timeout=1000)
|
out, _ = p.communicate(timeout=1000)
|
||||||
res = out.decode()
|
return out.decode(), p.returncode
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class DeviceInfo:
|
class DeviceInfo:
|
||||||
"""
|
|
||||||
Parse a single line of device information into structured data.
|
|
||||||
"""
|
|
||||||
_info_line: str = ""
|
|
||||||
npu_id: int = 0
|
|
||||||
chip_id: int = 0
|
|
||||||
chip_logic_id: Union[int, str] = 0
|
|
||||||
chip_name: str = ""
|
|
||||||
|
|
||||||
def __post_init__(self):
|
def __init__(self):
|
||||||
npu_id_str, chip_id_str, chip_logic_id_str, self.chip_name = self._info_line.strip(
|
self.npu_map_info: Dict[str, Dict[str, str]] = self.get_npu_map_info()
|
||||||
).split(None, 3)
|
self.allowed_cpus: List[int] = self.parse_allowed_cpus()
|
||||||
self.npu_id = int(npu_id_str)
|
self.running_npu_list: List[int] = self.get_running_npus()
|
||||||
self.chip_id = int(chip_id_str)
|
self.npu_affinity: Dict[int, List[int]] = self.parse_topo_affinity()
|
||||||
if chip_logic_id_str.isnumeric():
|
|
||||||
self.chip_logic_id = int(chip_logic_id_str)
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def expand_cpu_list(allowed_list_str: str) -> List[int]:
|
||||||
|
allowed_cpus_list: List[int] = []
|
||||||
|
for per_range in allowed_list_str.split(","):
|
||||||
|
if "-" in per_range:
|
||||||
|
start_cpu, end_cpu = map(int, per_range.split("-"))
|
||||||
|
allowed_cpus_list.extend(range(start_cpu, end_cpu + 1))
|
||||||
|
else:
|
||||||
|
allowed_cpus_list.append(int(per_range))
|
||||||
|
return allowed_cpus_list
|
||||||
|
|
||||||
class NpuHbmInfo:
|
@staticmethod
|
||||||
visible_npu_ids: Optional[List[int]] = None
|
def get_npu_map_info() -> Dict[str, Dict[str, str]]:
|
||||||
hbm_capacity: Optional[int] = None
|
npu_map_info: Dict[str, Dict[str, str]] = {}
|
||||||
hbm_usage: Optional[int] = None
|
npu_info, _ = execute_command(["npu-smi", "info", "-m"])
|
||||||
|
npu_map = npu_info.strip().split("\n")[1:]
|
||||||
|
for line in npu_map:
|
||||||
|
npu_id, chip_id, chip_logic_id = line.strip().split()[:3]
|
||||||
|
if not chip_logic_id.isdigit():
|
||||||
|
continue
|
||||||
|
if npu_id not in npu_map_info:
|
||||||
|
npu_map_info[npu_id] = {}
|
||||||
|
npu_map_info[npu_id][chip_id] = chip_logic_id
|
||||||
|
return npu_map_info
|
||||||
|
|
||||||
@classmethod
|
def get_running_npus(self) -> List[int]:
|
||||||
def set_visible_devices(cls, world_size):
|
npu_message, _ = execute_command(["npu-smi", "info"])
|
||||||
"""
|
in_proc_section = False
|
||||||
Determine which NPUs are visible to the current process and cache their
|
running_npu_set = set()
|
||||||
logical NPU IDs in `cls.visible_npu_ids`.
|
for line in npu_message.splitlines():
|
||||||
"""
|
line = line.strip()
|
||||||
if cls.visible_npu_ids:
|
if line.startswith("| NPU") and "Process id" in line:
|
||||||
return
|
in_proc_section = True
|
||||||
if ASCEND_RT_VISIBLE_DEVICES is None:
|
continue
|
||||||
devices = sorted(list(_get_device_map_info().keys()))
|
if not in_proc_section:
|
||||||
else:
|
continue
|
||||||
|
if line.startswith("| "):
|
||||||
|
parts = [p.strip() for p in line.strip("|").split("|")]
|
||||||
|
if len(parts) < 2:
|
||||||
|
continue
|
||||||
|
npu_id = parts[0].split()[0]
|
||||||
|
chip_id = parts[0].split()[1]
|
||||||
|
if not npu_id.isdigit() or not chip_id.isdigit():
|
||||||
|
continue
|
||||||
|
chip_logic_id = self.npu_map_info.get(npu_id, {}).get(chip_id)
|
||||||
|
if not chip_logic_id or not chip_logic_id.isdigit():
|
||||||
|
raise RuntimeError(
|
||||||
|
"Failed to get correct chip_logic_id from command 'npu-smi info -m'."
|
||||||
|
)
|
||||||
|
running_npu_set.add(int(chip_logic_id))
|
||||||
|
if ASCEND_RT_VISIBLE_DEVICES:
|
||||||
devices_str = ASCEND_RT_VISIBLE_DEVICES
|
devices_str = ASCEND_RT_VISIBLE_DEVICES
|
||||||
devices = [int(x) for x in devices_str.split(",")]
|
devices_list = [int(x) for x in devices_str.split(",")]
|
||||||
device_map_info = _get_device_map_info()
|
running_npu_set = set(devices_list) & running_npu_set
|
||||||
npu_ids = []
|
if not running_npu_set:
|
||||||
for device in devices:
|
raise RuntimeError(
|
||||||
device_info = device_map_info.get(device)
|
"Can not get running npu info, you can use BIND_CPU=0 to skip."
|
||||||
if device_info is None:
|
)
|
||||||
|
return sorted(running_npu_set)
|
||||||
|
|
||||||
|
def parse_allowed_cpus(self) -> List[int]:
|
||||||
|
if not os.path.exists(ALLOWED_CPUS_PATH):
|
||||||
|
return []
|
||||||
|
with open(ALLOWED_CPUS_PATH) as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith("Cpus_allowed_list"):
|
||||||
|
return self.expand_cpu_list(line.split()[1])
|
||||||
|
raise RuntimeError(
|
||||||
|
"Can not found specific 'Cpus_allowed_list' in the '/proc/self/status' file."
|
||||||
|
)
|
||||||
|
|
||||||
|
def parse_topo_affinity(self) -> Dict[int, List[int]]:
|
||||||
|
chip_logic_id = 0
|
||||||
|
affinity: Dict[int, List[int]] = {}
|
||||||
|
affinity_message, _ = execute_command(
|
||||||
|
["npu-smi", "info", "-t", "topo"])
|
||||||
|
for line in affinity_message.splitlines():
|
||||||
|
if line.startswith("NPU"):
|
||||||
|
parts = line.split()
|
||||||
|
last_part = parts[-1]
|
||||||
|
if last_part != "Affinity":
|
||||||
|
affinity[chip_logic_id] = self.expand_cpu_list(last_part)
|
||||||
|
chip_logic_id += 1
|
||||||
|
return affinity
|
||||||
|
|
||||||
|
|
||||||
|
class CpuAlloc:
|
||||||
|
|
||||||
|
def __init__(self, rank_id: int):
|
||||||
|
self.rank_id = rank_id
|
||||||
|
self.device_info: DeviceInfo = DeviceInfo()
|
||||||
|
self.cpu_node: Dict[int, int] = {}
|
||||||
|
self.numa_to_cpu_map: Dict[int, List[int]] = defaultdict(list)
|
||||||
|
self.npu_cpu_pool: Dict[int, List[int]] = {}
|
||||||
|
self.assign_main: Dict[int, List[int]] = {}
|
||||||
|
self.assign_acl: Dict[int, List[int]] = {}
|
||||||
|
self.assign_rel: Dict[int, List[int]] = {}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_threads_map(
|
||||||
|
thread_message: str) -> Dict[str, Dict[str, List[str]]]:
|
||||||
|
threads_map: Dict[str, Dict[str, List[str]]] = {}
|
||||||
|
for line in thread_message.splitlines():
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) < 2:
|
||||||
|
continue
|
||||||
|
main_pid, sub_pid = parts[0], parts[1]
|
||||||
|
if "acl_thread" in line:
|
||||||
|
key = "acl_thread"
|
||||||
|
elif "release_thread" in line:
|
||||||
|
key = "release_thread"
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
if main_pid not in threads_map:
|
||||||
|
threads_map[main_pid] = {
|
||||||
|
"acl_thread": [],
|
||||||
|
"release_thread": []
|
||||||
|
}
|
||||||
|
threads_map[main_pid][key].append(sub_pid)
|
||||||
|
return threads_map
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def bind(pid: str, cpus: List[int], bind_sub_thread: bool) -> None:
|
||||||
|
if cpus:
|
||||||
|
cpu_list = ",".join(map(str, cpus))
|
||||||
|
if bind_sub_thread:
|
||||||
|
bind_result, return_code = execute_command(
|
||||||
|
["taskset", "-acp", cpu_list, pid])
|
||||||
|
else:
|
||||||
|
bind_result, return_code = execute_command(
|
||||||
|
["taskset", "-cp", cpu_list, pid])
|
||||||
|
if return_code != 0:
|
||||||
|
raise RuntimeError(f"Failed to bind {pid} to CPU {cpu_list}.")
|
||||||
|
|
||||||
|
def average_distribute(
|
||||||
|
self, groups: Dict[str, List[int]]) -> Dict[int, List[int]]:
|
||||||
|
result: Dict[int, List[int]] = {}
|
||||||
|
for key, npu_list in groups.items():
|
||||||
|
cpu_list = sorted(self.npu_cpu_pool[npu_list[0]])
|
||||||
|
cpu_num_per_npu = len(cpu_list) // len(npu_list)
|
||||||
|
for i, npu in enumerate(npu_list):
|
||||||
|
start_index = i * cpu_num_per_npu
|
||||||
|
end_index = (i + 1) * cpu_num_per_npu if i < len(
|
||||||
|
npu_list) - 1 else len(cpu_list)
|
||||||
|
result[npu] = cpu_list[start_index:end_index]
|
||||||
|
return result
|
||||||
|
|
||||||
|
def extend_numa(self, cpu_list: List[int]) -> List[int]:
|
||||||
|
if not cpu_list:
|
||||||
|
return []
|
||||||
|
nodes = {self.cpu_node[c] for c in cpu_list}
|
||||||
|
if len(nodes) != 1:
|
||||||
|
return cpu_list
|
||||||
|
node = list(nodes)[0]
|
||||||
|
next_node = (node + 1) % len(self.numa_to_cpu_map)
|
||||||
|
extended = cpu_list[:]
|
||||||
|
for cpu in self.numa_to_cpu_map[next_node]:
|
||||||
|
if cpu in self.device_info.allowed_cpus:
|
||||||
|
extended.append(cpu)
|
||||||
|
return sorted(set(extended))
|
||||||
|
|
||||||
|
def build_cpu_node_map(self) -> None:
|
||||||
|
cpu_numa_map, _ = execute_command(["lscpu", "-e=CPU,NODE"])
|
||||||
|
for line in cpu_numa_map.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line or not line[0].isdigit():
|
||||||
|
continue
|
||||||
|
cpu_str, node_str = line.split()
|
||||||
|
cpu = int(cpu_str)
|
||||||
|
node = int(node_str)
|
||||||
|
self.cpu_node[cpu] = node
|
||||||
|
self.numa_to_cpu_map[node].append(cpu)
|
||||||
|
if len(self.numa_to_cpu_map) == 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
"lscpu command output error, no NUMA node available. Please check!"
|
||||||
|
)
|
||||||
|
|
||||||
|
def handle_no_affinity(self) -> None:
|
||||||
|
num_running_npu = len(self.device_info.running_npu_list)
|
||||||
|
num_numa_node = len(self.numa_to_cpu_map)
|
||||||
|
if num_numa_node == 0 or num_running_npu == 0:
|
||||||
|
return
|
||||||
|
if num_running_npu % num_numa_node != 0:
|
||||||
|
npu_num_per_node = num_running_npu // num_numa_node + 1
|
||||||
|
else:
|
||||||
|
npu_num_per_node = num_running_npu // num_numa_node
|
||||||
|
index = 0
|
||||||
|
for node in sorted(self.numa_to_cpu_map):
|
||||||
|
# Available CPUs on this NUMA (constrained by allowed_cpus)
|
||||||
|
cpus = [
|
||||||
|
c for c in self.numa_to_cpu_map[node]
|
||||||
|
if c in self.device_info.allowed_cpus
|
||||||
|
]
|
||||||
|
if not cpus:
|
||||||
|
continue
|
||||||
|
# The actual number of NPUs to be allocated on this NUMA.
|
||||||
|
npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
|
||||||
|
if npu_num_this_node <= 0:
|
||||||
|
break
|
||||||
|
# Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
|
||||||
|
total_cpu_num = len(cpus)
|
||||||
|
base_cpu_num = total_cpu_num // npu_num_this_node
|
||||||
|
extra_cpu_num = total_cpu_num % npu_num_this_node
|
||||||
|
start_index = 0
|
||||||
|
for i in range(npu_num_this_node):
|
||||||
|
take_cpu_num = base_cpu_num + (1 if i < extra_cpu_num else 0)
|
||||||
|
end_index = start_index + take_cpu_num
|
||||||
|
select_cpus_list = cpus[start_index:end_index]
|
||||||
|
if index < num_running_npu:
|
||||||
|
npu = self.device_info.running_npu_list[index]
|
||||||
|
self.npu_cpu_pool[npu] = select_cpus_list
|
||||||
|
index += 1
|
||||||
|
start_index = end_index
|
||||||
|
|
||||||
|
def build_cpu_pools(self) -> None:
|
||||||
|
self.build_cpu_node_map()
|
||||||
|
if not self.device_info.npu_affinity:
|
||||||
|
self.handle_no_affinity()
|
||||||
|
return
|
||||||
|
for npu in self.device_info.running_npu_list:
|
||||||
|
base_cpu_list = [
|
||||||
|
cpu for cpu in self.device_info.npu_affinity.get(npu, [])
|
||||||
|
if cpu in self.device_info.allowed_cpus
|
||||||
|
]
|
||||||
|
if not base_cpu_list:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Device {device} not found in device_map_info")
|
"CPUs available in 'Cpus_allowed_list' conflict with NUMA affinity."
|
||||||
npu_ids.append(device_info.npu_id)
|
)
|
||||||
cls.visible_npu_ids = npu_ids
|
extra_cpu_list = self.extend_numa(base_cpu_list)
|
||||||
|
self.npu_cpu_pool[npu] = extra_cpu_list
|
||||||
|
groups = defaultdict(list)
|
||||||
|
for npu, cpus in self.npu_cpu_pool.items():
|
||||||
|
groups[str(cpus)].append(npu)
|
||||||
|
final: Dict[int, List[int]] = {}
|
||||||
|
for key, npu_list in groups.items():
|
||||||
|
if len(npu_list) == 1:
|
||||||
|
final[npu_list[0]] = self.npu_cpu_pool[npu_list[0]]
|
||||||
|
else:
|
||||||
|
final.update(self.average_distribute({key: npu_list}))
|
||||||
|
self.npu_cpu_pool = final
|
||||||
|
|
||||||
@classmethod
|
def allocate(self) -> None:
|
||||||
def get_hbm_capacity(cls, rank, world_size, need_nz):
|
for npu, pool in self.npu_cpu_pool.items():
|
||||||
"""
|
if len(pool) >= 3:
|
||||||
Query and cache the HBM (or DDR) capacity in **bytes** for the NPU assigned
|
main = pool[:-2]
|
||||||
to the current process.
|
acl = [pool[-2]]
|
||||||
"""
|
rel = [pool[-1]]
|
||||||
soc_version = torch_npu._C._npu_get_soc_version()
|
else:
|
||||||
if cls.hbm_capacity:
|
raise RuntimeError(
|
||||||
return cls.hbm_capacity
|
"The number of CPUs is insufficient to bind to the NPUs. "
|
||||||
if not cls.visible_npu_ids:
|
"Each NPU requires at least 3 CPUs.")
|
||||||
cls.set_visible_devices(world_size)
|
self.assign_main[npu] = main
|
||||||
assert cls.visible_npu_ids is not None
|
self.assign_acl[npu] = acl
|
||||||
npu_id = cls.visible_npu_ids[rank]
|
self.assign_rel[npu] = rel
|
||||||
memory_info = execute_command(
|
|
||||||
["npu-smi", "info", "-i", f"{npu_id}", "-t",
|
|
||||||
"memory"]).split("\n")[1:]
|
|
||||||
if soc_version == 240:
|
|
||||||
hbm_capacity_key = 'Capacity(MB)'
|
|
||||||
elif not need_nz:
|
|
||||||
hbm_capacity_key = 'HBM Capacity(MB)'
|
|
||||||
else:
|
|
||||||
hbm_capacity_key = 'DDR Capacity(MB)'
|
|
||||||
for line in memory_info:
|
|
||||||
try:
|
|
||||||
key, value = line.strip().split(':', 2)
|
|
||||||
if key.strip() == hbm_capacity_key:
|
|
||||||
cls.hbm_capacity = int(value.strip()) * 1024 * 1024
|
|
||||||
return cls.hbm_capacity
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
raise ValueError('not found valid hbm capactiy')
|
|
||||||
|
|
||||||
@classmethod
|
def print_plan(self) -> None:
|
||||||
def get_hbm_usage(cls, rank, world_size, need_nz):
|
logger.info("The CPU allocation plan is as follows:")
|
||||||
"""
|
current_npu = self.device_info.running_npu_list[self.rank_id]
|
||||||
Return the current HBM or DDR usage
|
main = " ".join(map(str, self.assign_main[current_npu]))
|
||||||
ratio (0-1) for the NPU assigned to the given rank.
|
acl = " ".join(map(str, self.assign_acl[current_npu]))
|
||||||
"""
|
rel = str(self.assign_rel[current_npu]
|
||||||
if cls.hbm_usage:
|
) if self.assign_rel[current_npu] else ""
|
||||||
return cls.hbm_usage
|
logger.info(
|
||||||
if not cls.visible_npu_ids:
|
f"NPU{current_npu}: main=[{main}] acl=[{acl}] release=[{rel}]")
|
||||||
cls.set_visible_devices(world_size)
|
|
||||||
assert cls.visible_npu_ids is not None
|
def bind_threads(self) -> None:
|
||||||
npu_id = cls.visible_npu_ids[rank]
|
thread_message, _ = execute_command(["ps", "-Te"])
|
||||||
usage_info = execute_command(
|
threads_map = self.get_threads_map(thread_message)
|
||||||
["npu-smi", "info", "-i", f"{npu_id}", "-t",
|
main_pid = str(psutil.Process().pid)
|
||||||
"usages"]).split("\n")[1:]
|
current_npu = self.device_info.running_npu_list[self.rank_id]
|
||||||
soc_version = torch_npu._C._npu_get_soc_version()
|
self.bind(main_pid, self.assign_main[current_npu], True)
|
||||||
if soc_version == 240:
|
for acl_thread in threads_map.get(main_pid, {}).get("acl_thread", []):
|
||||||
hbm_capacity_key = 'Memory Usage Rate(%)'
|
self.bind(acl_thread, self.assign_acl[current_npu], False)
|
||||||
elif not need_nz:
|
for release_thread in threads_map.get(main_pid,
|
||||||
hbm_capacity_key = 'HBM Usage Rate(%)'
|
{}).get("release_thread", []):
|
||||||
else:
|
self.bind(release_thread, self.assign_rel[current_npu], False)
|
||||||
hbm_capacity_key = 'DDR Usage Rate(%)'
|
|
||||||
for line in usage_info:
|
def run_all(self) -> None:
|
||||||
try:
|
self.build_cpu_pools()
|
||||||
key, value = line.strip().split(':', 2)
|
self.allocate()
|
||||||
if key.strip() == hbm_capacity_key:
|
self.print_plan()
|
||||||
hbm_usage = (float(value.strip()) + 1) / 100
|
self.bind_threads()
|
||||||
return hbm_usage
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
raise ValueError('not found valid hbm usage')
|
|
||||||
|
|
||||||
|
|
||||||
def _get_device_map_info() -> Dict[int, DeviceInfo]:
|
def bind_cpus(rank_id: int) -> None:
|
||||||
"""
|
binder = CpuAlloc(rank_id)
|
||||||
Build and return a mapping from logical chip ID (int) to its DeviceInfo object.
|
binder.run_all()
|
||||||
"""
|
|
||||||
device_map_info = {}
|
|
||||||
device_map = execute_command(["npu-smi", "info",
|
|
||||||
"-m"]).strip().split("\n")[1:]
|
|
||||||
for line in device_map:
|
|
||||||
device_info = DeviceInfo(line.strip())
|
|
||||||
if isinstance(device_info.chip_logic_id, int):
|
|
||||||
device_map_info[device_info.chip_logic_id] = device_info
|
|
||||||
return device_map_info
|
|
||||||
|
|
||||||
|
|
||||||
def _get_pcie_info(devices: List[int], keyword="PCIeBusInfo"):
|
|
||||||
"""
|
|
||||||
Query each NPU in the given device list and return a mapping
|
|
||||||
from logical device ID to its PCIe bus address.
|
|
||||||
"""
|
|
||||||
device_map_info = _get_device_map_info()
|
|
||||||
device_pcie_tbl = {}
|
|
||||||
for device in devices:
|
|
||||||
device_info = device_map_info.get(device)
|
|
||||||
if not device_info:
|
|
||||||
raise RuntimeError(
|
|
||||||
"Can not get device info, you can use BIND_CPU=0 to skip.")
|
|
||||||
pcie_info = execute_command([
|
|
||||||
"npu-smi", "info", "-t", "board", "-i", f"{device_info.npu_id}",
|
|
||||||
"-c", f"{device_info.chip_id}"
|
|
||||||
]).strip().split("\n")
|
|
||||||
for _ in pcie_info:
|
|
||||||
line = ''.join(_.split())
|
|
||||||
if line.startswith(keyword):
|
|
||||||
device_pcie_tbl[device] = line[len(keyword) + 1:]
|
|
||||||
break
|
|
||||||
|
|
||||||
return device_pcie_tbl
|
|
||||||
|
|
||||||
|
|
||||||
def _get_numa_info(pcie_tbl, keyword="NUMAnode"):
|
|
||||||
"""
|
|
||||||
Build two mappings: device → NUMA node, and NUMA node → [devices].
|
|
||||||
"""
|
|
||||||
device_numa_tbl: Dict[int, int] = {} # device id -> numa id
|
|
||||||
numa_devices_tbl: Dict[int, List[int]] = {} # numa id -> device ids
|
|
||||||
|
|
||||||
for device, pcie_no in pcie_tbl.items():
|
|
||||||
numa_info = execute_command(["lspci", "-s", f"{pcie_no}",
|
|
||||||
"-vvv"]).split("\n")
|
|
||||||
for _ in numa_info:
|
|
||||||
line = ''.join(_.split())
|
|
||||||
if line.startswith(keyword):
|
|
||||||
numa_id = int(line[len(keyword) + 1:])
|
|
||||||
device_numa_tbl[device] = numa_id
|
|
||||||
|
|
||||||
devices = numa_devices_tbl.get(numa_id, None)
|
|
||||||
if devices is None:
|
|
||||||
numa_devices_tbl[numa_id] = list()
|
|
||||||
|
|
||||||
numa_devices_tbl[numa_id].append(device)
|
|
||||||
break
|
|
||||||
|
|
||||||
return device_numa_tbl, numa_devices_tbl
|
|
||||||
|
|
||||||
|
|
||||||
def _get_numa_info_v2(
|
|
||||||
devices: List[int],
|
|
||||||
keyword="NUMAnode(s)") -> Tuple[Dict[int, int], Dict[int, List[int]]]:
|
|
||||||
"""
|
|
||||||
Evenly distribute the given device list across all NUMA nodes and return
|
|
||||||
both device-to-numa and numa-to-devices mappings.
|
|
||||||
"""
|
|
||||||
numa_nodes = 1
|
|
||||||
numa_info = execute_command(["lscpu"]).split("\n")
|
|
||||||
for _ in numa_info:
|
|
||||||
line = ''.join(_.split())
|
|
||||||
if keyword not in line:
|
|
||||||
continue
|
|
||||||
numa_nodes = int(line[-1])
|
|
||||||
break
|
|
||||||
|
|
||||||
device_per_numa, tail_device = divmod(len(devices), numa_nodes)
|
|
||||||
device_count_per_numa_list = [
|
|
||||||
device_per_numa + (i < tail_device) for i in range(numa_nodes)
|
|
||||||
]
|
|
||||||
|
|
||||||
ends = list(accumulate(device_count_per_numa_list))
|
|
||||||
starts = [0] + ends[:-1]
|
|
||||||
|
|
||||||
numa_devices_tbl = {
|
|
||||||
ind: devices[start:end]
|
|
||||||
for ind, (start, end) in enumerate(zip(starts, ends))
|
|
||||||
}
|
|
||||||
|
|
||||||
device_numa_tbl = {
|
|
||||||
device: numa
|
|
||||||
for numa, _devices in numa_devices_tbl.items()
|
|
||||||
for device in _devices
|
|
||||||
}
|
|
||||||
|
|
||||||
return device_numa_tbl, numa_devices_tbl
|
|
||||||
|
|
||||||
|
|
||||||
def _get_cpu_info(numa_ids, keyword1="NUMAnode", keyword2="CPU(s)"):
|
|
||||||
"""
|
|
||||||
Parse lscpu output to build a dict that maps each NUMA
|
|
||||||
node ID to the list of CPU core IDs belonging to it.
|
|
||||||
"""
|
|
||||||
cpu_idx_tbl = dict()
|
|
||||||
numa_keywords = [keyword1 + str(idx) + keyword2 for idx in numa_ids]
|
|
||||||
cpu_info = execute_command(["lscpu"]).split("\n")
|
|
||||||
for _ in cpu_info:
|
|
||||||
line = ''.join(_.split())
|
|
||||||
if any(line.startswith(word) for word in numa_keywords):
|
|
||||||
split_info = line.split(":")
|
|
||||||
cpu_id_ranges = split_info[-1].split(",")
|
|
||||||
|
|
||||||
ranges = list()
|
|
||||||
for range_str in cpu_id_ranges:
|
|
||||||
endpoints = range_str.split("-")
|
|
||||||
if len(endpoints) != 2:
|
|
||||||
raise Exception(
|
|
||||||
"lscpu command output error, please check !")
|
|
||||||
|
|
||||||
ranges += [
|
|
||||||
cid for cid in range(int(endpoints[0]),
|
|
||||||
int(endpoints[1]) + 1)
|
|
||||||
]
|
|
||||||
|
|
||||||
numa_id = int(split_info[0].replace(keyword1,
|
|
||||||
'').replace(keyword2, ''))
|
|
||||||
cpu_idx_tbl[numa_id] = ranges
|
|
||||||
return cpu_idx_tbl
|
|
||||||
|
|
||||||
|
|
||||||
def bind_cpus(rank_id, ratio=0.5):
|
|
||||||
# get all visible devices
|
|
||||||
visible_devices = ASCEND_RT_VISIBLE_DEVICES
|
|
||||||
|
|
||||||
if visible_devices is None:
|
|
||||||
devices = sorted(list(_get_device_map_info().keys()))
|
|
||||||
else:
|
|
||||||
devices = [int(x) for x in visible_devices.split(",")]
|
|
||||||
|
|
||||||
# Query the NUMA affinity of each NPU via its PCIe address; if this fails,
|
|
||||||
# fall back to evenly distributing the devices across NUMA nodes.
|
|
||||||
device_pcie_tbl = _get_pcie_info(devices)
|
|
||||||
device_numa_tbl, numa_devices_tbl = _get_numa_info(device_pcie_tbl)
|
|
||||||
if not device_numa_tbl or not numa_devices_tbl:
|
|
||||||
device_numa_tbl, numa_devices_tbl = _get_numa_info_v2(devices)
|
|
||||||
|
|
||||||
# Obtain the complete list of CPU cores for each NUMA node.
|
|
||||||
cpu_idx_tbl = _get_cpu_info(list(numa_devices_tbl.keys()))
|
|
||||||
|
|
||||||
cur_device = devices[rank_id]
|
|
||||||
numa_id = device_numa_tbl.get(cur_device)
|
|
||||||
|
|
||||||
# Within the NUMA node, evenly partition the CPU cores
|
|
||||||
# among all NPUs (or use the amount specified by CPU_BINDING_NUM)
|
|
||||||
shard_devices = numa_devices_tbl.get(numa_id)
|
|
||||||
shard_devices.sort()
|
|
||||||
|
|
||||||
all_cpus = cpu_idx_tbl.get(numa_id)
|
|
||||||
logger.info(
|
|
||||||
f"rank_id: {rank_id}, device_id: {cur_device}, "
|
|
||||||
f"numa_id: {numa_id}, shard_devices: {shard_devices}, cpus: {all_cpus}"
|
|
||||||
)
|
|
||||||
|
|
||||||
cpu_nums = len(all_cpus)
|
|
||||||
if CPU_BINDING_NUM is None:
|
|
||||||
cpu_num_per_device = int(cpu_nums * ratio // len(shard_devices))
|
|
||||||
else:
|
|
||||||
cpu_num_per_device = int(CPU_BINDING_NUM)
|
|
||||||
if len(shard_devices) * cpu_num_per_device > cpu_nums:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Cpu num in numa {numa_id} to assign {cpu_num_per_device} for every device is not enough, "
|
|
||||||
f"please decrease the value of CPU_BINDING_NUM!")
|
|
||||||
if cpu_num_per_device < 0:
|
|
||||||
raise ValueError("CPU_BINDING_NUM should not be less than 0.")
|
|
||||||
|
|
||||||
idx = shard_devices.index(cur_device)
|
|
||||||
binding_cpus = [
|
|
||||||
all_cpus[_] for _ in range(idx * cpu_num_per_device, (idx + 1) *
|
|
||||||
cpu_num_per_device)
|
|
||||||
]
|
|
||||||
|
|
||||||
# cpu bind
|
|
||||||
p = psutil.Process()
|
|
||||||
p.cpu_affinity(binding_cpus)
|
|
||||||
new_affinity = p.cpu_affinity()
|
|
||||||
logger.info(
|
|
||||||
f"process {p.pid}, new_affinity is {new_affinity}, cpu count {cpu_num_per_device}"
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -115,17 +115,6 @@ class NPUWorker(WorkerBase):
|
|||||||
distributed_init_method=distributed_init_method,
|
distributed_init_method=distributed_init_method,
|
||||||
is_driver_worker=is_driver_worker)
|
is_driver_worker=is_driver_worker)
|
||||||
|
|
||||||
# binding cpu
|
|
||||||
if get_ascend_config().enable_cpu_binding:
|
|
||||||
try:
|
|
||||||
bind_cpus(self.local_rank, ratio=1.0)
|
|
||||||
except RuntimeError as e:
|
|
||||||
logger.error(f"{e} in {self.local_rank}")
|
|
||||||
except ValueError as e:
|
|
||||||
logger.error(f"{e} in {self.local_rank}")
|
|
||||||
except Exception:
|
|
||||||
logger.info("Skip binding cpu.")
|
|
||||||
|
|
||||||
if self.cache_config.cache_dtype == "auto":
|
if self.cache_config.cache_dtype == "auto":
|
||||||
self.cache_dtype = self.model_config.dtype
|
self.cache_dtype = self.model_config.dtype
|
||||||
else:
|
else:
|
||||||
@@ -238,6 +227,15 @@ class NPUWorker(WorkerBase):
|
|||||||
set_random_seed(self.model_config.seed)
|
set_random_seed(self.model_config.seed)
|
||||||
# Initialize device properties used by triton kernels.
|
# Initialize device properties used by triton kernels.
|
||||||
init_device_properties_triton()
|
init_device_properties_triton()
|
||||||
|
|
||||||
|
# binding cpu
|
||||||
|
if get_ascend_config().enable_cpu_binding:
|
||||||
|
try:
|
||||||
|
bind_cpus(self.local_rank)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Bind cpus failed in rank{self.local_rank}: {e} Skip binding cpu."
|
||||||
|
)
|
||||||
return device
|
return device
|
||||||
|
|
||||||
def init_device(self):
|
def init_device(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user