[Platform] Enable ARM-only CPU binding with NUMA-balanced A3 policy and update docs/tests (#6686)

### What this PR does / why we need it?

- Keeps enable_cpu_binding default on, but skips binding on non‑ARM CPUs
inside bind_cpus, with a clear log.
- Uses a table-driven binding policy: A3 uses NUMA‑balanced binding;
other device types use NUMA‑affinity binding.
- Updates docs to reflect the exact behavior and adds/updates unit tests
for the new logic.

### Does this PR introduce _any_ user-facing change?

- Yes. CPU binding is now enabled by default via additional_config, and
documented in the user guide.
- CPU binding behavior differs by device type (A3 vs. others).

### How was this patch tested?

Added/updated unit tests:

test_cpu_binding.py
1. test_binding_mode_table covers the A2 vs. A3 binding mode mapping.
2. test_build_cpu_pools_fallback_to_numa_balanced covers the fallback when
affinity info is missing.
3. TestBindingSwitch.test_is_arm_cpu covers ARM/x86/unknown arch
detection.
4. test_bind_cpus_skip_non_arm covers the non‑ARM skip path in bind_cpus.

test_worker_v1.py
1. Updated mocks for enable_cpu_binding default True to align with new
config default.

- vLLM version: v0.14.1
- vLLM main: d7de043

---------

Signed-off-by: chenchuw886 <chenchuw@huawei.com>
Co-authored-by: chenchuw886 <chenchuw@huawei.com>
This commit is contained in:
Frank Chen
2026-02-25 11:15:14 +08:00
committed by GitHub
parent ac9a7d1301
commit 3da2ba22eb
6 changed files with 80 additions and 9 deletions

View File

@@ -39,7 +39,7 @@ The following table lists additional configuration options available in vLLM Asc
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. | | `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. |
| `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. | | `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. |
| `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. | | `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. |
| `enable_cpu_binding` | bool | `False` | Whether to enable CPU Binding. | | `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. Only takes effect on ARM CPUs; when enabled, A3 uses the NUMA-balanced binding strategy and other device types use the NUMA-affinity binding strategy. |
| `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is new scheduler to support dynamic batch feature | | `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is new scheduler to support dynamic batch feature |
| `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. | | `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. |
| `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. | | `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. |

View File

@@ -9,6 +9,7 @@ pybind11
pyyaml pyyaml
scipy scipy
pandas pandas
psutil
setuptools>=64 setuptools>=64
setuptools-scm>=8 setuptools-scm>=8
torch==2.9.0 torch==2.9.0

View File

@@ -1,7 +1,8 @@
import unittest import unittest
from unittest.mock import patch from unittest.mock import patch
from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo, bind_cpus, is_arm_cpu
from vllm_ascend.utils import AscendDeviceType
class TestDeviceInfo(unittest.TestCase): class TestDeviceInfo(unittest.TestCase):
@@ -103,6 +104,23 @@ class TestCpuAlloc(unittest.TestCase):
2: [8, 9, 10, 11, 12, 13] 2: [8, 9, 10, 11, 12, 13]
}) })
@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_binding_mode_table(self, mock_get_device_type):
    """Check the device-type -> binding-mode lookup for A2 and A3."""
    expected_modes = (
        (AscendDeviceType.A2, "affinity"),
        (AscendDeviceType.A3, "numa_balanced"),
    )
    for device_type, mode in expected_modes:
        # The patched getter decides which device type _binding_mode() sees.
        mock_get_device_type.return_value = device_type
        self.assertEqual(self.cpu_alloc._binding_mode(), mode)
@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_build_cpu_pools_fallback_to_numa_balanced(self, mock_get_device_type):
    """With A2 and no NPU affinity info, build_cpu_pools must use handle_no_affinity."""
    mock_get_device_type.return_value = AscendDeviceType.A2
    # Empty affinity info is the condition under test.
    self.cpu_alloc.device_info.npu_affinity = {}
    with patch.object(self.cpu_alloc, "build_cpu_node_map") as mock_build_cpu_node_map:
        with patch.object(self.cpu_alloc, "handle_no_affinity") as mock_handle_no_affinity:
            self.cpu_alloc.build_cpu_pools()
    mock_build_cpu_node_map.assert_called_once()
    mock_handle_no_affinity.assert_called_once()
def test_extend_numa(self): def test_extend_numa(self):
result = self.cpu_alloc.extend_numa([]) result = self.cpu_alloc.extend_numa([])
self.assertEqual(result, []) self.assertEqual(result, [])
@@ -128,8 +146,10 @@ class TestCpuAlloc(unittest.TestCase):
self.assertEqual(self.cpu_alloc.numa_to_cpu_map, self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
expected_numa_to_cpu_map) expected_numa_to_cpu_map)
@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
@patch('vllm_ascend.cpu_binding.execute_command') @patch('vllm_ascend.cpu_binding.execute_command')
def test_handle_no_affinity(self, mock_execute_command): def test_handle_no_affinity(self, mock_execute_command, mock_get_device_type):
mock_get_device_type.return_value = AscendDeviceType.A3
mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)] mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)]
self.cpu_alloc.device_info.running_npu_list = [0, 1] self.cpu_alloc.device_info.running_npu_list = [0, 1]
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3] self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
@@ -163,5 +183,26 @@ class TestCpuAlloc(unittest.TestCase):
mock_execute_command.assert_called() mock_execute_command.assert_called()
class TestBindingSwitch(unittest.TestCase):
    """Tests for CPU architecture detection and the non-ARM skip in bind_cpus."""

    @patch('vllm_ascend.cpu_binding.platform.machine')
    def test_is_arm_cpu(self, mock_machine):
        """is_arm_cpu() is truthy for aarch64/armv8 and falsy for x86_64/mips64."""
        cases = (
            ("x86_64", self.assertFalse),
            ("aarch64", self.assertTrue),
            ("armv8", self.assertTrue),
            ("mips64", self.assertFalse),
        )
        for machine_name, check in cases:
            mock_machine.return_value = machine_name
            check(is_arm_cpu())

    @patch('vllm_ascend.cpu_binding.CpuAlloc')
    @patch('vllm_ascend.cpu_binding.is_arm_cpu')
    def test_bind_cpus_skip_non_arm(self, mock_is_arm_cpu, mock_cpu_alloc):
        """bind_cpus() must not construct CpuAlloc when the CPU is not ARM."""
        mock_is_arm_cpu.return_value = False
        bind_cpus(0)
        mock_cpu_alloc.assert_not_called()
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@@ -70,7 +70,7 @@ class TestNPUWorker(TestBase):
# Setup mock behavior # Setup mock behavior
mock_ops.register_dummy_fusion_op.return_value = None mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock() mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config mock_get_ascend_config.return_value = mock_ascend_config
# Import and create NPUWorker instance # Import and create NPUWorker instance
@@ -125,7 +125,7 @@ class TestNPUWorker(TestBase):
self.model_config_mock.trust_remote_code = True self.model_config_mock.trust_remote_code = True
mock_ops.register_dummy_fusion_op.return_value = None mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock() mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config mock_get_ascend_config.return_value = mock_ascend_config
# Create NPUWorker instance # Create NPUWorker instance
@@ -168,7 +168,7 @@ class TestNPUWorker(TestBase):
self.cache_config_mock.cache_dtype = "float32" self.cache_config_mock.cache_dtype = "float32"
mock_ops.register_dummy_fusion_op.return_value = None mock_ops.register_dummy_fusion_op.return_value = None
mock_ascend_config = MagicMock() mock_ascend_config = MagicMock()
mock_ascend_config.enable_cpu_binding = False mock_ascend_config.enable_cpu_binding = True
mock_get_ascend_config.return_value = mock_ascend_config mock_get_ascend_config.return_value = mock_ascend_config
# Create NPUWorker instance # Create NPUWorker instance

View File

@@ -84,7 +84,7 @@ class AscendConfig:
self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False) self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False)
self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False) self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False)
self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False) self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False)
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", False) self.enable_cpu_binding = additional_config.get("enable_cpu_binding", True)
self.pd_tp_ratio = 1 self.pd_tp_ratio = 1
self.pd_head_ratio = 1 self.pd_head_ratio = 1

View File

@@ -1,16 +1,29 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import platform
import subprocess import subprocess
from collections import defaultdict from collections import defaultdict
import psutil import psutil
from vllm.logger import logger from vllm.logger import logger
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
ALLOWED_CPUS_PATH = "/proc/self/status" ALLOWED_CPUS_PATH = "/proc/self/status"
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES") ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
def is_arm_cpu() -> bool:
    """Return True when ``platform.machine()`` names an ARM architecture.

    The machine string is lower-cased before comparison. ``aarch64``,
    ``arm64`` and anything starting with ``arm`` count as ARM. The known
    x86 names return False silently; any other value returns False and
    logs a warning.
    """
    arch = platform.machine().lower()
    looks_like_arm = arch.startswith("arm") or arch in {"aarch64", "arm64"}
    if looks_like_arm:
        return True
    # Only architectures outside the known x86 family warrant a warning.
    if arch not in {"x86_64", "amd64", "i386", "i686"}:
        logger.warning(f"Unknown CPU architecture '{arch}', CPU binding will be disabled.")
    return False
def execute_command(cmd: list[str]) -> tuple[str, int]: def execute_command(cmd: list[str]) -> tuple[str, int]:
with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p: with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
out, _ = p.communicate(timeout=1000) out, _ = p.communicate(timeout=1000)
@@ -77,7 +90,7 @@ class DeviceInfo:
devices_list = [int(x) for x in devices_str.split(",")] devices_list = [int(x) for x in devices_str.split(",")]
running_npu_set = set(devices_list) & running_npu_set running_npu_set = set(devices_list) & running_npu_set
if not running_npu_set: if not running_npu_set:
raise RuntimeError("Can not get running npu info, you can use BIND_CPU=0 to skip.") raise RuntimeError("Can not get running npu info.")
return sorted(running_npu_set) return sorted(running_npu_set)
def parse_allowed_cpus(self) -> list[int]: def parse_allowed_cpus(self) -> list[int]:
@@ -202,7 +215,7 @@ class CpuAlloc:
npu_num_this_node = min(npu_num_per_node, num_running_npu - index) npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
if npu_num_this_node <= 0: if npu_num_this_node <= 0:
break break
# Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs. # NUMA-balanced distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
total_cpu_num = len(cpus) total_cpu_num = len(cpus)
base_cpu_num = total_cpu_num // npu_num_this_node base_cpu_num = total_cpu_num // npu_num_this_node
extra_cpu_num = total_cpu_num % npu_num_this_node extra_cpu_num = total_cpu_num % npu_num_this_node
@@ -217,9 +230,22 @@ class CpuAlloc:
index += 1 index += 1
start_index = end_index start_index = end_index
DEVICE_BINDING_MODE = {
AscendDeviceType.A3: "numa_balanced",
}
@classmethod
def _binding_mode(cls) -> str:
    """Return the binding mode name for the current Ascend device type.

    Looks the device type up in ``DEVICE_BINDING_MODE``; device types not
    listed there get ``"affinity"``.
    """
    return cls.DEVICE_BINDING_MODE.get(get_ascend_device_type(), "affinity")
def build_cpu_pools(self) -> None: def build_cpu_pools(self) -> None:
self.build_cpu_node_map() self.build_cpu_node_map()
if self._binding_mode() == "numa_balanced":
self.handle_no_affinity()
return
if not self.device_info.npu_affinity: if not self.device_info.npu_affinity:
logger.warning("NPU affinity info not found, fallback to NUMA-balanced CPU binding.")
self.handle_no_affinity() self.handle_no_affinity()
return return
for npu in self.device_info.running_npu_list: for npu in self.device_info.running_npu_list:
@@ -282,5 +308,8 @@ class CpuAlloc:
def bind_cpus(rank_id: int) -> None:
    """Run CPU binding for ``rank_id``, or skip it with a log on non-ARM CPUs.

    Args:
        rank_id: Passed unchanged to ``CpuAlloc``.
    """
    if is_arm_cpu():
        CpuAlloc(rank_id).run_all()
        return
    logger.info("CPU binding skipped: non-ARM CPU detected.")