From 3da2ba22ebeef10ed31782488edb8120e3935bf7 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 25 Feb 2026 11:15:14 +0800 Subject: [PATCH] [Platform] Enable ARM-only CPU binding with NUMA-balanced A3 policy and update docs/tests (#6686) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? - Keeps enable_cpu_binding default on, but skips binding on non‑ARM CPUs inside bind_cpus, with a clear log. - Uses a table-driven binding policy: A3 uses NUMA‑balanced binding; other device types use NUMA‑affinity binding. - Updates docs to reflect the exact behavior and adds/updates unit tests for the new logic. ### Does this PR introduce _any_ user-facing change? - Yes. CPU binding is now enabled by default via additional_config, and documented in the user guide. - CPU binding behavior differs by device type (A3 vs. others). ### How was this patch tested? Added/updated unit tests: test_cpu_binding.py 1. test_binding_mode_table covers A2 vs A3 binding mode mapping. 2. test_build_cpu_pools_fallback_to_numa_balanced covers fallback when affinity info is missing. 3. TestBindingSwitch.test_is_arm_cpu covers ARM/x86/unknown arch detection. 4. test_bind_cpus_skip_non_arm covers non‑ARM skip path in bind_cpus. test_worker_v1.py 1. Updated mocks for enable_cpu_binding default True to align with new config default. 
- vLLM version: v0.14.1 - vLLM main: d7de043 --------- Signed-off-by: chenchuw886 Co-authored-by: chenchuw886 --- .../configuration/additional_config.md | 2 +- requirements.txt | 1 + tests/ut/device_allocator/test_cpu_binding.py | 45 ++++++++++++++++++- tests/ut/worker/test_worker_v1.py | 6 +-- vllm_ascend/ascend_config.py | 2 +- vllm_ascend/cpu_binding.py | 33 +++++++++++++- 6 files changed, 80 insertions(+), 9 deletions(-) diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md index b08c2015..4a7a2380 100644 --- a/docs/source/user_guide/configuration/additional_config.md +++ b/docs/source/user_guide/configuration/additional_config.md @@ -39,7 +39,7 @@ The following table lists additional configuration options available in vLLM Asc | `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. | | `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. | | `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. | -| `enable_cpu_binding` | bool | `False` | Whether to enable CPU Binding. | +| `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. Only takes effect on ARM CPUs; when enabled, A3 uses the NUMA-balanced binding strategy and other device types use the NUMA-affinity binding strategy. | | `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is new scheduler to support dynamic batch feature | | `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. | | `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. 
| diff --git a/requirements.txt b/requirements.txt index ef617fd4..b307c62e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ pybind11 pyyaml scipy pandas +psutil setuptools>=64 setuptools-scm>=8 torch==2.9.0 diff --git a/tests/ut/device_allocator/test_cpu_binding.py b/tests/ut/device_allocator/test_cpu_binding.py index 4f5cb114..2fd16f0f 100644 --- a/tests/ut/device_allocator/test_cpu_binding.py +++ b/tests/ut/device_allocator/test_cpu_binding.py @@ -1,7 +1,8 @@ import unittest from unittest.mock import patch -from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo +from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo, bind_cpus, is_arm_cpu +from vllm_ascend.utils import AscendDeviceType class TestDeviceInfo(unittest.TestCase): @@ -103,6 +104,23 @@ class TestCpuAlloc(unittest.TestCase): 2: [8, 9, 10, 11, 12, 13] }) + @patch('vllm_ascend.cpu_binding.get_ascend_device_type') + def test_binding_mode_table(self, mock_get_device_type): + mock_get_device_type.return_value = AscendDeviceType.A2 + self.assertEqual(self.cpu_alloc._binding_mode(), "affinity") + mock_get_device_type.return_value = AscendDeviceType.A3 + self.assertEqual(self.cpu_alloc._binding_mode(), "numa_balanced") + + @patch('vllm_ascend.cpu_binding.get_ascend_device_type') + def test_build_cpu_pools_fallback_to_numa_balanced(self, mock_get_device_type): + mock_get_device_type.return_value = AscendDeviceType.A2 + self.cpu_alloc.device_info.npu_affinity = {} + with patch.object(self.cpu_alloc, "build_cpu_node_map") as mock_build_cpu_node_map, \ + patch.object(self.cpu_alloc, "handle_no_affinity") as mock_handle_no_affinity: + self.cpu_alloc.build_cpu_pools() + mock_build_cpu_node_map.assert_called_once() + mock_handle_no_affinity.assert_called_once() + def test_extend_numa(self): result = self.cpu_alloc.extend_numa([]) self.assertEqual(result, []) @@ -128,8 +146,10 @@ class TestCpuAlloc(unittest.TestCase): self.assertEqual(self.cpu_alloc.numa_to_cpu_map, expected_numa_to_cpu_map) + 
@patch('vllm_ascend.cpu_binding.get_ascend_device_type') @patch('vllm_ascend.cpu_binding.execute_command') - def test_handle_no_affinity(self, mock_execute_command): + def test_handle_no_affinity(self, mock_execute_command, mock_get_device_type): + mock_get_device_type.return_value = AscendDeviceType.A3 mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)] self.cpu_alloc.device_info.running_npu_list = [0, 1] self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3] @@ -163,5 +183,26 @@ class TestCpuAlloc(unittest.TestCase): mock_execute_command.assert_called() +class TestBindingSwitch(unittest.TestCase): + + @patch('vllm_ascend.cpu_binding.platform.machine') + def test_is_arm_cpu(self, mock_machine): + mock_machine.return_value = "x86_64" + self.assertFalse(is_arm_cpu()) + mock_machine.return_value = "aarch64" + self.assertTrue(is_arm_cpu()) + mock_machine.return_value = "armv8" + self.assertTrue(is_arm_cpu()) + mock_machine.return_value = "mips64" + self.assertFalse(is_arm_cpu()) + + @patch('vllm_ascend.cpu_binding.CpuAlloc') + @patch('vllm_ascend.cpu_binding.is_arm_cpu') + def test_bind_cpus_skip_non_arm(self, mock_is_arm_cpu, mock_cpu_alloc): + mock_is_arm_cpu.return_value = False + bind_cpus(0) + mock_cpu_alloc.assert_not_called() + + if __name__ == '__main__': unittest.main() diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index afb9d42a..fb6d66c5 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -70,7 +70,7 @@ class TestNPUWorker(TestBase): # Setup mock behavior mock_ops.register_dummy_fusion_op.return_value = None mock_ascend_config = MagicMock() - mock_ascend_config.enable_cpu_binding = False + mock_ascend_config.enable_cpu_binding = True mock_get_ascend_config.return_value = mock_ascend_config # Import and create NPUWorker instance @@ -125,7 +125,7 @@ class TestNPUWorker(TestBase): self.model_config_mock.trust_remote_code = True 
mock_ops.register_dummy_fusion_op.return_value = None mock_ascend_config = MagicMock() - mock_ascend_config.enable_cpu_binding = False + mock_ascend_config.enable_cpu_binding = True mock_get_ascend_config.return_value = mock_ascend_config # Create NPUWorker instance @@ -168,7 +168,7 @@ class TestNPUWorker(TestBase): self.cache_config_mock.cache_dtype = "float32" mock_ops.register_dummy_fusion_op.return_value = None mock_ascend_config = MagicMock() - mock_ascend_config.enable_cpu_binding = False + mock_ascend_config.enable_cpu_binding = True mock_get_ascend_config.return_value = mock_ascend_config # Create NPUWorker instance diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 843c7da6..3bc55be3 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -84,7 +84,7 @@ class AscendConfig: self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False) self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False) self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False) - self.enable_cpu_binding = additional_config.get("enable_cpu_binding", False) + self.enable_cpu_binding = additional_config.get("enable_cpu_binding", True) self.pd_tp_ratio = 1 self.pd_head_ratio = 1 diff --git a/vllm_ascend/cpu_binding.py b/vllm_ascend/cpu_binding.py index 062ff4f9..fd8717de 100644 --- a/vllm_ascend/cpu_binding.py +++ b/vllm_ascend/cpu_binding.py @@ -1,16 +1,29 @@ #!/usr/bin/env python3 import os +import platform import subprocess from collections import defaultdict import psutil from vllm.logger import logger +from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type + ALLOWED_CPUS_PATH = "/proc/self/status" ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES") +def is_arm_cpu() -> bool: + arch = platform.machine().lower() + if arch in {"x86_64", "amd64", "i386", "i686"}: + return False + if arch in 
{"aarch64", "arm64"} or arch.startswith("arm"): + return True + logger.warning(f"Unknown CPU architecture '{arch}', CPU binding will be disabled.") + return False + + def execute_command(cmd: list[str]) -> tuple[str, int]: with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p: out, _ = p.communicate(timeout=1000) @@ -77,7 +90,7 @@ class DeviceInfo: devices_list = [int(x) for x in devices_str.split(",")] running_npu_set = set(devices_list) & running_npu_set if not running_npu_set: - raise RuntimeError("Can not get running npu info, you can use BIND_CPU=0 to skip.") + raise RuntimeError("Can not get running npu info.") return sorted(running_npu_set) def parse_allowed_cpus(self) -> list[int]: @@ -202,7 +215,7 @@ class CpuAlloc: npu_num_this_node = min(npu_num_per_node, num_running_npu - index) if npu_num_this_node <= 0: break - # Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs. + # NUMA-balanced distribute the CPUs of this NUMA node among npu_num_this_node NPUs. 
total_cpu_num = len(cpus) base_cpu_num = total_cpu_num // npu_num_this_node extra_cpu_num = total_cpu_num % npu_num_this_node @@ -217,9 +230,22 @@ class CpuAlloc: index += 1 start_index = end_index + DEVICE_BINDING_MODE = { + AscendDeviceType.A3: "numa_balanced", + } + + @classmethod + def _binding_mode(cls) -> str: + device_type = get_ascend_device_type() + return cls.DEVICE_BINDING_MODE.get(device_type, "affinity") + def build_cpu_pools(self) -> None: self.build_cpu_node_map() + if self._binding_mode() == "numa_balanced": + self.handle_no_affinity() + return if not self.device_info.npu_affinity: + logger.warning("NPU affinity info not found, fallback to NUMA-balanced CPU binding.") self.handle_no_affinity() return for npu in self.device_info.running_npu_list: @@ -282,5 +308,8 @@ class CpuAlloc: def bind_cpus(rank_id: int) -> None: + if not is_arm_cpu(): + logger.info("CPU binding skipped: non-ARM CPU detected.") + return binder = CpuAlloc(rank_id) binder.run_all()