[Refactor] Modify the binding logic to allocate CPU cores for each NPU card (#5555)

[Refactor] Modify the binding logic to allocate CPU cores for each NPU card ### What this PR does / why we need it? Modify the binding logic to allocate CPU cores for each NPU card based on NUMA affinity, while isolating acl_thread/release_thread and other processes to prevent mutual interference. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? c85cc045f8 Signed-off-by: rowzwel_dx <1392851715@qq.com> - vLLM version: v0.13.0 - vLLM main: 7157596103 Signed-off-by: Rozwel-dx <1392851715@qq.com>
2026-01-13 09:21:28 +08:00
parent d886b81971
commit 8d571286dd
3 changed files with 470 additions and 316 deletions
--- a/tests/ut/device_allocator/test_cpu_binding.py
+++ b/tests/ut/device_allocator/test_cpu_binding.py
@@ -0,0 +1,167 @@
 import unittest
 from unittest.mock import patch
 from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo
 class TestDeviceInfo(unittest.TestCase):
    """Exercise the ``DeviceInfo`` parsers with ``execute_command`` mocked out."""
    @patch('vllm_ascend.cpu_binding.execute_command')
    def setUp(self, mock_execute_command):
        # Canned replies consumed, in order, while DeviceInfo() is constructed:
        # chip map, process table, then an empty topology report.
        chip_map_reply = (
            "NPU ID  Chip ID  Chip Logic ID  Chip Name\n0 0 0 Ascend\n0 1 - Mcu\n1 0 1 Ascend",
            0)
        process_reply = (
            "| NPU Chip | Process id |\n| 0 0 | 1234 | vllm | 56000 |\n| 1 0 | 1235 | vllm | 56000 |",
            0)
        topo_reply = ("", 0)
        mock_execute_command.side_effect = [
            chip_map_reply, process_reply, topo_reply
        ]
        self.device_info = DeviceInfo()
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_get_npu_map_info(self, mock_execute_command):
        # Each case pairs a command reply with the expected
        # npu_id -> chip_id -> chip_logic_id mapping; rows whose logic id is
        # not numeric (the Mcu rows) must be left out.
        cases = [
            (("NPU ID  Chip ID  Chip Logic ID  Chip Phy-ID Chip Name\n0 0 0 0 Ascend\n0 1 1 1 Ascend\n0 2 - - Mcu",
              0), {
                  '0': {
                      '0': '0',
                      '1': '1'
                  }
              }),
            (("NPU ID  Chip ID  Chip Logic ID  Chip Name\n8 0 0 Ascend\n8 1 - Mcu\n9 0 1 Ascend",
              0), {
                  '8': {
                      '0': '0'
                  },
                  '9': {
                      '0': '1'
                  }
              }),
        ]
        for reply, expected in cases:
            mock_execute_command.return_value = reply
            self.assertEqual(self.device_info.get_npu_map_info(), expected)
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_get_running_npus(self, mock_execute_command):
        unmapped_chip_reply = (
            "| NPU Chip | Process id |\n| 0 1 | 1236 | vllm | 56000 |", 0)
        empty_reply = ("", 0)
        mapped_chip_reply = (
            "| NPU Chip | Process id |\n| 1 0 | 1236 | vllm | 56000 |", 0)
        mock_execute_command.side_effect = [
            unmapped_chip_reply, empty_reply, mapped_chip_reply
        ]
        # First reply names a chip without a logic id in the map built by
        # setUp; the second lists no process at all. Both must be rejected.
        for _ in range(2):
            with self.assertRaises(RuntimeError):
                self.device_info.get_running_npus()
        self.assertEqual(len(self.device_info.get_running_npus()), 1)
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_parse_topo_affinity(self, mock_execute_command):
        topo_row = "NPU0 X HCCS HCCS HCCS HCCS HCCS HCCS HCCS 0-3"
        mock_execute_command.return_value = (topo_row, 0)
        self.assertEqual(self.device_info.parse_topo_affinity(),
                         {0: [0, 1, 2, 3]})
    def test_expand_cpu_list(self):
        # Mixes ranges, single ids and blanks after the commas.
        expanded = self.device_info.expand_cpu_list("0-2, 4, 6-8")
        self.assertEqual(expanded, [0, 1, 2, 4, 6, 7, 8])
 class TestCpuAlloc(unittest.TestCase):
    """Unit tests for ``CpuAlloc``; external commands go through a mocked ``execute_command``."""
    @patch('vllm_ascend.cpu_binding.execute_command')
    def setUp(self, mock_execute_command):
        # Replies consumed in order while CpuAlloc(0) builds its DeviceInfo:
        # chip map, process table, then an empty topology report.
        mock_execute_command.side_effect = [
            ("NPU ID  Chip ID  Chip Logic ID  Chip Name\n0 0 0 Ascend\n0 1 - Mcu\n1 0 1 Ascend",
             0),
            ("| NPU Chip | Process id |\n| 0 0 | 1234 | vllm | 56000 |\n| 1 0 | 1235 | vllm | 56000 |",
             0), ("", 0)
        ]
        self.cpu_alloc = CpuAlloc(0)
    def test_average_distribute(self):
        # Two NPUs sharing four CPUs get two CPUs each.
        self.cpu_alloc.npu_cpu_pool = {
            0: [10, 11, 12, 13],
            1: [10, 11, 12, 13]
        }
        groups = {"[10, 11, 12, 13]": [0, 1]}
        result = self.cpu_alloc.average_distribute(groups)
        self.assertEqual(result, {0: [10, 11], 1: [12, 13]})
        # 14 CPUs over three NPUs: 4 each, the last NPU also takes the rest.
        self.cpu_alloc.npu_cpu_pool = {
            0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
        }
        # The group key is not inspected by average_distribute, only the
        # NPU list is.
        groups = {"[0, 1, 2, 3, 4, 5]": [0, 1, 2]}
        result = self.cpu_alloc.average_distribute(groups)
        self.assertEqual(result, {
            0: [0, 1, 2, 3],
            1: [4, 5, 6, 7],
            2: [8, 9, 10, 11, 12, 13]
        })
    def test_extend_numa(self):
        result = self.cpu_alloc.extend_numa([])
        self.assertEqual(result, [])
        self.cpu_alloc.cpu_node = {0: 0, 1: 0, 2: 1, 3: 1}
        self.cpu_alloc.numa_to_cpu_map = {0: [0, 1], 1: [2, 3]}
        self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
        result = self.cpu_alloc.extend_numa([0, 1])
        self.assertEqual(result, [0, 1, 2, 3])
        # CPU 2 is not allowed, so only CPU 3 is borrowed from the next node.
        self.cpu_alloc.device_info.allowed_cpus = [0, 1, 3]
        result = self.cpu_alloc.extend_numa([0, 1])
        self.assertEqual(result, [0, 1, 3])
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_build_cpu_node_map(self, mock_execute_command):
        # Empty lscpu output yields no NUMA node and must be rejected.
        mock_execute_command.return_value = ("", 0)
        with self.assertRaises(RuntimeError):
            self.cpu_alloc.build_cpu_node_map()
        mock_execute_command.return_value = ("0 0\n1 1\n2 0\n3 1", 0)
        self.cpu_alloc.build_cpu_node_map()
        expected_cpu_node = {0: 0, 1: 1, 2: 0, 3: 1}
        expected_numa_to_cpu_map = {0: [0, 2], 1: [1, 3]}
        self.assertEqual(self.cpu_alloc.cpu_node, expected_cpu_node)
        self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
                         expected_numa_to_cpu_map)
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_handle_no_affinity(self, mock_execute_command):
        # lscpu reply used by every build_cpu_pools() call below:
        # CPUs 0-1 on NUMA node 0, CPUs 2-3 on NUMA node 1.
        mock_execute_command.return_value = ("0 0\n1 0\n2 1\n3 1", 0)
        self.cpu_alloc.device_info.running_npu_list = [0, 1]
        self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
        # Phase 1: no topology affinity. build_cpu_pools() reads
        # `npu_affinity`, so that is the attribute that has to be cleared.
        self.cpu_alloc.device_info.npu_affinity = {}
        self.assertEqual(self.cpu_alloc.npu_cpu_pool, {})
        self.cpu_alloc.build_cpu_pools()
        # One NPU per NUMA node, each taking all allowed CPUs of its node.
        self.assertEqual(self.cpu_alloc.npu_cpu_pool, {
            0: [0, 1],
            1: [2, 3]
        })
        # Phase 2: with affinity. Reset the state built by phase 1 first.
        self.cpu_alloc.cpu_node.clear()
        self.cpu_alloc.numa_to_cpu_map.clear()
        self.cpu_alloc.npu_cpu_pool = {}
        self.cpu_alloc.device_info.npu_affinity = {0: [0, 1], 1: [2, 3]}
        self.cpu_alloc.build_cpu_pools()
        # Both NPUs are extended to the same four CPUs, which are then split
        # evenly between them.
        self.assertEqual(self.cpu_alloc.npu_cpu_pool, {
            0: [0, 1],
            1: [2, 3]
        })
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_allocate(self, mock_execute_command):
        self.cpu_alloc.device_info.running_npu_list = [0]
        # Three CPUs is the minimum: main, acl and release get one each.
        self.cpu_alloc.npu_cpu_pool = {0: [0, 1, 2]}
        self.cpu_alloc.allocate()
        self.assertEqual(self.cpu_alloc.assign_main[0], [0])
        self.assertEqual(self.cpu_alloc.assign_acl[0], [1])
        self.assertEqual(self.cpu_alloc.assign_rel[0], [2])
        self.cpu_alloc.npu_cpu_pool = {0: [0, 1]}
        with self.assertRaises(RuntimeError):
            self.cpu_alloc.allocate()
    @patch('vllm_ascend.cpu_binding.execute_command')
    def test_bind_threads(self, mock_execute_command):
        thread_message = "1234 1234 ? 00:00:03 acl_thread\n4567 4567 ? 00:00:03 release_thread"
        # The same reply (return code 0) serves the thread listing and every
        # taskset invocation.
        mock_execute_command.return_value = (thread_message, 0)
        self.cpu_alloc.device_info.running_npu_list = [0]
        self.cpu_alloc.assign_main = {0: [0, 1]}
        self.cpu_alloc.assign_acl = {0: [2]}
        self.cpu_alloc.assign_rel = {0: [3]}
        self.cpu_alloc.bind_threads()
        mock_execute_command.assert_called()
 if __name__ == '__main__':
    unittest.main()
--- a/vllm_ascend/cpu_binding.py
+++ b/vllm_ascend/cpu_binding.py
@@ -1,330 +1,319 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
 import subprocess
-from dataclasses import dataclass
+from collections import defaultdict
-from itertools import accumulate
+from typing import Dict, List, Tuple
 from typing import Dict, List, Optional, Tuple, Union
 import psutil
 import torch_npu
 from vllm.logger import logger
 ALLOWED_CPUS_PATH = "/proc/self/status"
 ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
 CPU_BINDING_NUM = os.getenv("CPU_BINDING_NUM")
-def execute_command(cmd_list):
+def execute_command(cmd: List[str]) -> Tuple[str, int]:
-    with subprocess.Popen(cmd_list,
+    with subprocess.Popen(cmd,
                          shell=False,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE) as p:
-        out, err = p.communicate(timeout=1000)
+        out, _ = p.communicate(timeout=1000)
-    res = out.decode()
+    return out.decode(), p.returncode
    return res
@dataclass
 class DeviceInfo:
    """
    Parse a single line of device information into structured data.
    """
    _info_line: str = ""
    npu_id: int = 0
    chip_id: int = 0
    chip_logic_id: Union[int, str] = 0
    chip_name: str = ""
-    def __post_init__(self):
+    def __init__(self):
-        npu_id_str, chip_id_str, chip_logic_id_str, self.chip_name = self._info_line.strip(
+        self.npu_map_info: Dict[str, Dict[str, str]] = self.get_npu_map_info()
-        ).split(None, 3)
+        self.allowed_cpus: List[int] = self.parse_allowed_cpus()
-        self.npu_id = int(npu_id_str)
+        self.running_npu_list: List[int] = self.get_running_npus()
-        self.chip_id = int(chip_id_str)
+        self.npu_affinity: Dict[int, List[int]] = self.parse_topo_affinity()
        if chip_logic_id_str.isnumeric():
            self.chip_logic_id = int(chip_logic_id_str)
    @staticmethod
    def expand_cpu_list(allowed_list_str: str) -> List[int]:
        allowed_cpus_list: List[int] = []
        for per_range in allowed_list_str.split(","):
            if "-" in per_range:
                start_cpu, end_cpu = map(int, per_range.split("-"))
                allowed_cpus_list.extend(range(start_cpu, end_cpu + 1))
            else:
                allowed_cpus_list.append(int(per_range))
        return allowed_cpus_list
-class NpuHbmInfo:
+    @staticmethod
-    visible_npu_ids: Optional[List[int]] = None
+    def get_npu_map_info() -> Dict[str, Dict[str, str]]:
-    hbm_capacity: Optional[int] = None
+        npu_map_info: Dict[str, Dict[str, str]] = {}
-    hbm_usage: Optional[int] = None
+        npu_info, _ = execute_command(["npu-smi", "info", "-m"])
        npu_map = npu_info.strip().split("\n")[1:]
        for line in npu_map:
            npu_id, chip_id, chip_logic_id = line.strip().split()[:3]
            if not chip_logic_id.isdigit():
                continue
            if npu_id not in npu_map_info:
                npu_map_info[npu_id] = {}
            npu_map_info[npu_id][chip_id] = chip_logic_id
        return npu_map_info
-    @classmethod
+    def get_running_npus(self) -> List[int]:
-    def set_visible_devices(cls, world_size):
+        npu_message, _ = execute_command(["npu-smi", "info"])
-        """
+        in_proc_section = False
-        Determine which NPUs are visible to the current process and cache their
+        running_npu_set = set()
-        logical NPU IDs in `cls.visible_npu_ids`.
+        for line in npu_message.splitlines():
-        """
+            line = line.strip()
-        if cls.visible_npu_ids:
+            if line.startswith("| NPU") and "Process id" in line:
-            return
+                in_proc_section = True
-        if ASCEND_RT_VISIBLE_DEVICES is None:
+                continue
-            devices = sorted(list(_get_device_map_info().keys()))
+            if not in_proc_section:
-        else:
+                continue
            if line.startswith("| "):
                parts = [p.strip() for p in line.strip("|").split("|")]
                if len(parts) < 2:
                    continue
                npu_id = parts[0].split()[0]
                chip_id = parts[0].split()[1]
                if not npu_id.isdigit() or not chip_id.isdigit():
                    continue
                chip_logic_id = self.npu_map_info.get(npu_id, {}).get(chip_id)
                if not chip_logic_id or not chip_logic_id.isdigit():
                    raise RuntimeError(
                        "Failed to get correct chip_logic_id from command 'npu-smi info -m'."
                    )
                running_npu_set.add(int(chip_logic_id))
        if ASCEND_RT_VISIBLE_DEVICES:
            devices_str = ASCEND_RT_VISIBLE_DEVICES
-            devices = [int(x) for x in devices_str.split(",")]
+            devices_list = [int(x) for x in devices_str.split(",")]
-        device_map_info = _get_device_map_info()
+            running_npu_set = set(devices_list) & running_npu_set
-        npu_ids = []
+        if not running_npu_set:
-        for device in devices:
+            raise RuntimeError(
-            device_info = device_map_info.get(device)
+                "Can not get running npu info, you can use BIND_CPU=0 to skip."
-            if device_info is None:
+            )
        return sorted(running_npu_set)
    def parse_allowed_cpus(self) -> List[int]:
        """Return the CPU ids listed under ``Cpus_allowed_list`` in ``ALLOWED_CPUS_PATH``.

        Returns:
            The expanded CPU ids, or an empty list when the status file does
            not exist.

        Raises:
            RuntimeError: If the file exists but contains no
                ``Cpus_allowed_list`` line.
        """
        if os.path.exists(ALLOWED_CPUS_PATH):
            with open(ALLOWED_CPUS_PATH) as status_file:
                for status_line in status_file:
                    if not status_line.startswith("Cpus_allowed_list"):
                        continue
                    # The second whitespace-separated field is the CPU list.
                    cpu_field = status_line.split()[1]
                    return self.expand_cpu_list(cpu_field)
            raise RuntimeError(
                "Can not found specific 'Cpus_allowed_list' in the '/proc/self/status' file."
            )
        return []
    def parse_topo_affinity(self) -> Dict[int, List[int]]:
        """Read the CPU affinity of each NPU from ``npu-smi info -t topo``.

        Every output line that starts with ``NPU`` and does not end in
        ``Affinity`` is treated as one device row; its last column is expanded
        as a CPU list. Rows are numbered 0, 1, 2, ... in order of appearance
        and that number is used as the chip logic id.

        Returns:
            Mapping from chip logic id to the CPU ids in that row's last
            column. Empty when the command prints no device row.

        Raises:
            ValueError: If the last column of a device row is not a CPU list.
        """
        chip_logic_id = 0
        affinity: Dict[int, List[int]] = {}
        affinity_message, _ = execute_command(
            ["npu-smi", "info", "-t", "topo"])
        for line in affinity_message.splitlines():
            if not line.startswith("NPU"):
                continue
            last_part = line.split()[-1]
            if last_part == "Affinity":
                # Header row: it describes no device, so it must not consume
                # a logic id (otherwise every following id is shifted by one).
                continue
            affinity[chip_logic_id] = self.expand_cpu_list(last_part)
            chip_logic_id += 1
        return affinity
 class CpuAlloc:
    def __init__(self, rank_id: int):
        """Gather device information and create empty allocation tables.

        Args:
            rank_id: Index into ``device_info.running_npu_list`` selecting the
                NPU this process works with.
        """
        self.rank_id = rank_id
        # Constructing DeviceInfo runs its own queries.
        self.device_info: DeviceInfo = DeviceInfo()
        # CPU id -> NUMA node, and NUMA node -> CPU ids.
        self.cpu_node: Dict[int, int] = dict()
        self.numa_to_cpu_map: Dict[int, List[int]] = defaultdict(list)
        # NPU -> candidate CPUs, filled by build_cpu_pools().
        self.npu_cpu_pool: Dict[int, List[int]] = dict()
        # NPU -> CPUs for the main process, acl threads and release threads.
        self.assign_main: Dict[int, List[int]] = dict()
        self.assign_acl: Dict[int, List[int]] = dict()
        self.assign_rel: Dict[int, List[int]] = dict()
    @staticmethod
    def get_threads_map(
            thread_message: str) -> Dict[str, Dict[str, List[str]]]:
        threads_map: Dict[str, Dict[str, List[str]]] = {}
        for line in thread_message.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            main_pid, sub_pid = parts[0], parts[1]
            if "acl_thread" in line:
                key = "acl_thread"
            elif "release_thread" in line:
                key = "release_thread"
            else:
                continue
            if main_pid not in threads_map:
                threads_map[main_pid] = {
                    "acl_thread": [],
                    "release_thread": []
                }
            threads_map[main_pid][key].append(sub_pid)
        return threads_map
    @staticmethod
    def bind(pid: str, cpus: List[int], bind_sub_thread: bool) -> None:
        """Pin ``pid`` to ``cpus`` with ``taskset``; do nothing when ``cpus`` is empty.

        Args:
            pid: Process or thread id, passed to ``taskset`` as text.
            cpus: CPU ids to bind to.
            bind_sub_thread: When true ``taskset -acp`` is used, otherwise
                ``taskset -cp``.

        Raises:
            RuntimeError: If ``taskset`` exits with a non-zero return code.
        """
        if not cpus:
            return
        cpu_list = ",".join(str(cpu) for cpu in cpus)
        taskset_flags = "-acp" if bind_sub_thread else "-cp"
        _, return_code = execute_command(
            ["taskset", taskset_flags, cpu_list, pid])
        if return_code != 0:
            raise RuntimeError(f"Failed to bind {pid} to CPU {cpu_list}.")
    def average_distribute(
            self, groups: Dict[str, List[int]]) -> Dict[int, List[int]]:
        result: Dict[int, List[int]] = {}
        for key, npu_list in groups.items():
            cpu_list = sorted(self.npu_cpu_pool[npu_list[0]])
            cpu_num_per_npu = len(cpu_list) // len(npu_list)
            for i, npu in enumerate(npu_list):
                start_index = i * cpu_num_per_npu
                end_index = (i + 1) * cpu_num_per_npu if i < len(
                    npu_list) - 1 else len(cpu_list)
                result[npu] = cpu_list[start_index:end_index]
        return result
    def extend_numa(self, cpu_list: List[int]) -> List[int]:
        if not cpu_list:
            return []
        nodes = {self.cpu_node[c] for c in cpu_list}
        if len(nodes) != 1:
            return cpu_list
        node = list(nodes)[0]
        next_node = (node + 1) % len(self.numa_to_cpu_map)
        extended = cpu_list[:]
        for cpu in self.numa_to_cpu_map[next_node]:
            if cpu in self.device_info.allowed_cpus:
                extended.append(cpu)
        return sorted(set(extended))
    def build_cpu_node_map(self) -> None:
        """Rebuild ``cpu_node`` and ``numa_to_cpu_map`` from ``lscpu -e=CPU,NODE``.

        Lines that are blank or do not start with a digit (such as a column
        header) are ignored; every other line must hold exactly a CPU id and
        a NUMA node id.

        Raises:
            RuntimeError: If the output yields no NUMA node.
            ValueError: If a data line is not two integers.
        """
        # Start from a clean slate so that calling this again does not append
        # every CPU to its node a second time.
        self.cpu_node.clear()
        self.numa_to_cpu_map.clear()
        cpu_numa_map, _ = execute_command(["lscpu", "-e=CPU,NODE"])
        for line in cpu_numa_map.splitlines():
            line = line.strip()
            if not line or not line[0].isdigit():
                continue
            cpu_str, node_str = line.split()
            cpu = int(cpu_str)
            node = int(node_str)
            self.cpu_node[cpu] = node
            self.numa_to_cpu_map[node].append(cpu)
        if len(self.numa_to_cpu_map) == 0:
            raise RuntimeError(
                "lscpu command output error, no NUMA node available. Please check!"
            )
    def handle_no_affinity(self) -> None:
        num_running_npu = len(self.device_info.running_npu_list)
        num_numa_node = len(self.numa_to_cpu_map)
        if num_numa_node == 0 or num_running_npu == 0:
            return
        if num_running_npu % num_numa_node != 0:
            npu_num_per_node = num_running_npu // num_numa_node + 1
        else:
            npu_num_per_node = num_running_npu // num_numa_node
        index = 0
        for node in sorted(self.numa_to_cpu_map):
            # Available CPUs on this NUMA (constrained by allowed_cpus)
            cpus = [
                c for c in self.numa_to_cpu_map[node]
                if c in self.device_info.allowed_cpus
            ]
            if not cpus:
                continue
            # The actual number of NPUs to be allocated on this NUMA.
            npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
            if npu_num_this_node <= 0:
                break
            # Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
            total_cpu_num = len(cpus)
            base_cpu_num = total_cpu_num // npu_num_this_node
            extra_cpu_num = total_cpu_num % npu_num_this_node
            start_index = 0
            for i in range(npu_num_this_node):
                take_cpu_num = base_cpu_num + (1 if i < extra_cpu_num else 0)
                end_index = start_index + take_cpu_num
                select_cpus_list = cpus[start_index:end_index]
                if index < num_running_npu:
                    npu = self.device_info.running_npu_list[index]
                    self.npu_cpu_pool[npu] = select_cpus_list
                    index += 1
                start_index = end_index
    def build_cpu_pools(self) -> None:
        self.build_cpu_node_map()
        if not self.device_info.npu_affinity:
            self.handle_no_affinity()
            return
        for npu in self.device_info.running_npu_list:
            base_cpu_list = [
                cpu for cpu in self.device_info.npu_affinity.get(npu, [])
                if cpu in self.device_info.allowed_cpus
            ]
            if not base_cpu_list:
                raise RuntimeError(
-                    f"Device {device} not found in device_map_info")
+                    "CPUs available in 'Cpus_allowed_list' conflict with NUMA affinity."
-            npu_ids.append(device_info.npu_id)
+                )
-        cls.visible_npu_ids = npu_ids
+            extra_cpu_list = self.extend_numa(base_cpu_list)
            self.npu_cpu_pool[npu] = extra_cpu_list
        groups = defaultdict(list)
        for npu, cpus in self.npu_cpu_pool.items():
            groups[str(cpus)].append(npu)
        final: Dict[int, List[int]] = {}
        for key, npu_list in groups.items():
            if len(npu_list) == 1:
                final[npu_list[0]] = self.npu_cpu_pool[npu_list[0]]
            else:
                final.update(self.average_distribute({key: npu_list}))
        self.npu_cpu_pool = final
-    @classmethod
+    def allocate(self) -> None:
-    def get_hbm_capacity(cls, rank, world_size, need_nz):
+        for npu, pool in self.npu_cpu_pool.items():
-        """
+            if len(pool) >= 3:
-        Query and cache the HBM (or DDR) capacity in **bytes** for the NPU assigned
+                main = pool[:-2]
-        to the current process.
+                acl = [pool[-2]]
-        """
+                rel = [pool[-1]]
-        soc_version = torch_npu._C._npu_get_soc_version()
+            else:
-        if cls.hbm_capacity:
+                raise RuntimeError(
-            return cls.hbm_capacity
+                    "The number of CPUs is insufficient to bind to the NPUs. "
-        if not cls.visible_npu_ids:
+                    "Each NPU requires at least 3 CPUs.")
-            cls.set_visible_devices(world_size)
+            self.assign_main[npu] = main
-        assert cls.visible_npu_ids is not None
+            self.assign_acl[npu] = acl
-        npu_id = cls.visible_npu_ids[rank]
+            self.assign_rel[npu] = rel
        memory_info = execute_command(
            ["npu-smi", "info", "-i", f"{npu_id}", "-t",
             "memory"]).split("\n")[1:]
        if soc_version == 240:
            hbm_capacity_key = 'Capacity(MB)'
        elif not need_nz:
            hbm_capacity_key = 'HBM Capacity(MB)'
        else:
            hbm_capacity_key = 'DDR Capacity(MB)'
        for line in memory_info:
            try:
                key, value = line.strip().split(':', 2)
                if key.strip() == hbm_capacity_key:
                    cls.hbm_capacity = int(value.strip()) * 1024 * 1024
                    return cls.hbm_capacity
            except ValueError:
                pass
        raise ValueError('not found valid hbm capactiy')
-    @classmethod
+    def print_plan(self) -> None:
-    def get_hbm_usage(cls, rank, world_size, need_nz):
+        logger.info("The CPU allocation plan is as follows:")
-        """
+        current_npu = self.device_info.running_npu_list[self.rank_id]
-        Return the current HBM or DDR usage 
+        main = " ".join(map(str, self.assign_main[current_npu]))
-        ratio (0-1) for the NPU assigned to the given rank.
+        acl = " ".join(map(str, self.assign_acl[current_npu]))
-        """
+        rel = str(self.assign_rel[current_npu]
-        if cls.hbm_usage:
+                  ) if self.assign_rel[current_npu] else ""
-            return cls.hbm_usage
+        logger.info(
-        if not cls.visible_npu_ids:
+            f"NPU{current_npu}: main=[{main}]  acl=[{acl}]  release=[{rel}]")
-            cls.set_visible_devices(world_size)
+
-        assert cls.visible_npu_ids is not None
+    def bind_threads(self) -> None:
-        npu_id = cls.visible_npu_ids[rank]
+        thread_message, _ = execute_command(["ps", "-Te"])
-        usage_info = execute_command(
+        threads_map = self.get_threads_map(thread_message)
-            ["npu-smi", "info", "-i", f"{npu_id}", "-t",
+        main_pid = str(psutil.Process().pid)
-             "usages"]).split("\n")[1:]
+        current_npu = self.device_info.running_npu_list[self.rank_id]
-        soc_version = torch_npu._C._npu_get_soc_version()
+        self.bind(main_pid, self.assign_main[current_npu], True)
-        if soc_version == 240:
+        for acl_thread in threads_map.get(main_pid, {}).get("acl_thread", []):
-            hbm_capacity_key = 'Memory Usage Rate(%)'
+            self.bind(acl_thread, self.assign_acl[current_npu], False)
-        elif not need_nz:
+        for release_thread in threads_map.get(main_pid,
-            hbm_capacity_key = 'HBM Usage Rate(%)'
+                                              {}).get("release_thread", []):
-        else:
+            self.bind(release_thread, self.assign_rel[current_npu], False)
-            hbm_capacity_key = 'DDR Usage Rate(%)'
+
-        for line in usage_info:
+    def run_all(self) -> None:
-            try:
+        self.build_cpu_pools()
-                key, value = line.strip().split(':', 2)
+        self.allocate()
-                if key.strip() == hbm_capacity_key:
+        self.print_plan()
-                    hbm_usage = (float(value.strip()) + 1) / 100
+        self.bind_threads()
                    return hbm_usage
            except ValueError:
                pass
        raise ValueError('not found valid hbm usage')
-def _get_device_map_info() -> Dict[int, DeviceInfo]:
+def bind_cpus(rank_id: int) -> None:
-    """
+    binder = CpuAlloc(rank_id)
-    Build and return a mapping from logical chip ID (int) to its DeviceInfo object.
+    binder.run_all()
    """
    device_map_info = {}
    device_map = execute_command(["npu-smi", "info",
                                  "-m"]).strip().split("\n")[1:]
    for line in device_map:
        device_info = DeviceInfo(line.strip())
        if isinstance(device_info.chip_logic_id, int):
            device_map_info[device_info.chip_logic_id] = device_info
    return device_map_info
 def _get_pcie_info(devices: List[int], keyword="PCIeBusInfo"):
    """
    Look up the PCIe bus address of every device in ``devices``.

    For each device the board information reported by ``npu-smi`` is scanned
    for the first line that, with all whitespace removed, starts with
    ``keyword``; the text after the keyword and one separator character is
    stored. Devices without such a line are left out of the result.

    Raises RuntimeError when a device is missing from the device map.
    """
    known_devices = _get_device_map_info()
    pcie_by_device = {}
    for device in devices:
        info = known_devices.get(device)
        if not info:
            raise RuntimeError(
                "Can not get device info, you can use BIND_CPU=0 to skip.")
        board_cmd = [
            "npu-smi", "info", "-t", "board", "-i", f"{info.npu_id}", "-c",
            f"{info.chip_id}"
        ]
        board_lines = execute_command(board_cmd).strip().split("\n")
        compact_lines = (''.join(raw.split()) for raw in board_lines)
        hit = next(
            (text for text in compact_lines if text.startswith(keyword)),
            None)
        if hit is not None:
            pcie_by_device[device] = hit[len(keyword) + 1:]
    return pcie_by_device
 def _get_numa_info(pcie_tbl, keyword="NUMAnode"):
    """
    Resolve the NUMA node of each device from its PCIe address.

    ``lspci -s <address> -vvv`` is run per device and the first output line
    that, with all whitespace removed, starts with ``keyword`` provides the
    node id. Returns ``(device -> numa id, numa id -> [devices])``; devices
    without such a line appear in neither mapping.
    """
    numa_by_device: Dict[int, int] = {}
    devices_by_numa: Dict[int, List[int]] = {}
    for device, pcie_no in pcie_tbl.items():
        lspci_lines = execute_command(["lspci", "-s", f"{pcie_no}",
                                       "-vvv"]).split("\n")
        compact_lines = (''.join(raw.split()) for raw in lspci_lines)
        hit = next(
            (text for text in compact_lines if text.startswith(keyword)),
            None)
        if hit is None:
            continue
        # Skip the keyword and the one separator character after it.
        numa_id = int(hit[len(keyword) + 1:])
        numa_by_device[device] = numa_id
        devices_by_numa.setdefault(numa_id, []).append(device)
    return numa_by_device, devices_by_numa
 def _get_numa_info_v2(
        devices: List[int],
        keyword="NUMAnode(s)") -> Tuple[Dict[int, int], Dict[int, List[int]]]:
    """
    Evenly distribute the given device list across all NUMA nodes and return
    both device-to-numa and numa-to-devices mappings.

    The NUMA node count is taken from the first ``lscpu`` line that contains
    ``keyword`` once whitespace is removed; it defaults to 1 when no such line
    exists. Devices are handed out as contiguous slices in list order and the
    first ``len(devices) % numa_nodes`` nodes receive one extra device.
    """
    numa_nodes = 1
    numa_info = execute_command(["lscpu"]).split("\n")
    for _ in numa_info:
        line = ''.join(_.split())
        if keyword not in line:
            continue
        # Use the whole trailing run of digits. Reading only the last
        # character would turn a count such as 16 into 6.
        numa_nodes = int(line[len(line.rstrip("0123456789")):])
        break
    device_per_numa, tail_device = divmod(len(devices), numa_nodes)
    device_count_per_numa_list = [
        device_per_numa + (i < tail_device) for i in range(numa_nodes)
    ]
    # Running totals give the end offset of each node's slice.
    ends = list(accumulate(device_count_per_numa_list))
    starts = [0] + ends[:-1]
    numa_devices_tbl = {
        ind: devices[start:end]
        for ind, (start, end) in enumerate(zip(starts, ends))
    }
    device_numa_tbl = {
        device: numa
        for numa, _devices in numa_devices_tbl.items()
        for device in _devices
    }
    return device_numa_tbl, numa_devices_tbl
 def _get_cpu_info(numa_ids, keyword1="NUMAnode", keyword2="CPU(s)"):
    """
    Parse lscpu output to build a dict that maps each NUMA
    node ID to the list of CPU core IDs belonging to it.

    Only the nodes named in ``numa_ids`` are collected. A node's CPU list may
    mix inclusive ranges and single ids, e.g. ``0-23,48``.

    Raises Exception when a list item is empty or has more than one ``-``.
    """
    cpu_idx_tbl = dict()
    numa_keywords = [keyword1 + str(idx) + keyword2 for idx in numa_ids]
    cpu_info = execute_command(["lscpu"]).split("\n")
    for _ in cpu_info:
        # Whitespace is removed so the line can be matched against the
        # concatenated keywords.
        line = ''.join(_.split())
        if any(line.startswith(word) for word in numa_keywords):
            split_info = line.split(":")
            cpu_id_ranges = split_info[-1].split(",")
            ranges = list()
            for range_str in cpu_id_ranges:
                endpoints = range_str.split("-")
                if not range_str or len(endpoints) > 2:
                    raise Exception(
                        "lscpu command output error, please check !")
                # A single id "n" is the one-element range n..n.
                first_cpu = int(endpoints[0])
                last_cpu = int(endpoints[-1])
                ranges.extend(range(first_cpu, last_cpu + 1))
            numa_id = int(split_info[0].replace(keyword1,
                                                '').replace(keyword2, ''))
            cpu_idx_tbl[numa_id] = ranges
    return cpu_idx_tbl
 def bind_cpus(rank_id, ratio=0.5):
    """
    Bind the current process to a slice of the CPUs of its device's NUMA node.

    rank_id indexes the visible device list to pick the device of this
    process. ratio is the fraction of the NUMA node's CPUs that is shared out
    among the devices on that node; it is only used when CPU_BINDING_NUM is
    not set.

    Raises RuntimeError when CPU_BINDING_NUM asks for more CPUs than the NUMA
    node has, and ValueError when CPU_BINDING_NUM is negative.
    """
    # get all visible devices
    visible_devices = ASCEND_RT_VISIBLE_DEVICES
    if visible_devices is None:
        devices = sorted(list(_get_device_map_info().keys()))
    else:
        devices = [int(x) for x in visible_devices.split(",")]
    # Query the NUMA affinity of each NPU via its PCIe address; if this fails,
    # fall back to evenly distributing the devices across NUMA nodes.
    device_pcie_tbl = _get_pcie_info(devices)
    device_numa_tbl, numa_devices_tbl = _get_numa_info(device_pcie_tbl)
    if not device_numa_tbl or not numa_devices_tbl:
        device_numa_tbl, numa_devices_tbl = _get_numa_info_v2(devices)
    # Obtain the complete list of CPU cores for each NUMA node.
    cpu_idx_tbl = _get_cpu_info(list(numa_devices_tbl.keys()))
    cur_device = devices[rank_id]
    numa_id = device_numa_tbl.get(cur_device)
    # Within the NUMA node, evenly partition the CPU cores
    # among all NPUs (or use the amount specified by CPU_BINDING_NUM)
    shard_devices = numa_devices_tbl.get(numa_id)
    # Sorted in place so the slice index computed below is stable.
    shard_devices.sort()
    all_cpus = cpu_idx_tbl.get(numa_id)
    logger.info(
        f"rank_id: {rank_id}, device_id: {cur_device}, "
        f"numa_id: {numa_id}, shard_devices: {shard_devices}, cpus: {all_cpus}"
    )
    cpu_nums = len(all_cpus)
    if CPU_BINDING_NUM is None:
        cpu_num_per_device = int(cpu_nums * ratio // len(shard_devices))
    else:
        cpu_num_per_device = int(CPU_BINDING_NUM)
        if len(shard_devices) * cpu_num_per_device > cpu_nums:
            raise RuntimeError(
                f"Cpu num in numa {numa_id} to assign {cpu_num_per_device} for every device is not enough, "
                f"please decrease the value of CPU_BINDING_NUM!")
        if cpu_num_per_device < 0:
            raise ValueError("CPU_BINDING_NUM should not be less than 0.")
    # Position of this device among the devices sharing the NUMA node; it
    # selects which consecutive slice of the node's CPUs is taken.
    idx = shard_devices.index(cur_device)
    binding_cpus = [
        all_cpus[_] for _ in range(idx * cpu_num_per_device, (idx + 1) *
                                   cpu_num_per_device)
    ]
    # cpu bind
    p = psutil.Process()
    p.cpu_affinity(binding_cpus)
    # Read the affinity back so the log shows what was actually applied.
    new_affinity = p.cpu_affinity()
    logger.info(
        f"process {p.pid}, new_affinity is {new_affinity}, cpu count {cpu_num_per_device}"
    )
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -115,17 +115,6 @@ class NPUWorker(WorkerBase):
                         distributed_init_method=distributed_init_method,
                         is_driver_worker=is_driver_worker)
        # binding cpu
        if get_ascend_config().enable_cpu_binding:
            try:
                bind_cpus(self.local_rank, ratio=1.0)
            except RuntimeError as e:
                logger.error(f"{e} in {self.local_rank}")
            except ValueError as e:
                logger.error(f"{e} in {self.local_rank}")
            except Exception:
                logger.info("Skip binding cpu.")
        if self.cache_config.cache_dtype == "auto":
            self.cache_dtype = self.model_config.dtype
        else:
@@ -238,6 +227,15 @@ class NPUWorker(WorkerBase):
        set_random_seed(self.model_config.seed)
        # Initialize device properties used by triton kernels.
        init_device_properties_triton()
        # binding cpu
        if get_ascend_config().enable_cpu_binding:
            try:
                bind_cpus(self.local_rank)
            except Exception as e:
                logger.warning(
                    f"Bind cpus failed in rank{self.local_rank}: {e} Skip binding cpu."
                )
        return device
    def init_device(self):