[Platform] Enable ARM-only CPU binding with NUMA-balanced A3 policy and update docs/tests (#6686)
### What this PR does / why we need it?

- Keeps `enable_cpu_binding` on by default, but skips binding on non-ARM CPUs inside `bind_cpus`, with a clear log message.
- Uses a table-driven binding policy: A3 uses NUMA-balanced binding; other device types use NUMA-affinity binding.
- Updates the docs to reflect the exact behavior and adds/updates unit tests for the new logic.

### Does this PR introduce _any_ user-facing change?

- Yes. CPU binding is now enabled by default via `additional_config`, and this is documented in the user guide.
- CPU binding behavior differs by device type (A3 vs. others).

### How was this patch tested?

Added/updated unit tests:

`test_cpu_binding.py`

1. `test_binding_mode_table` covers the A2 vs. A3 binding mode mapping.
2. `test_build_cpu_pools_fallback_to_numa_balanced` covers the fallback when affinity info is missing.
3. `TestBindingSwitch.test_is_arm_cpu` covers ARM/x86/unknown architecture detection.
4. `test_bind_cpus_skip_non_arm` covers the non-ARM skip path in `bind_cpus`.

`test_worker_v1.py`

1. Updated mocks to use `enable_cpu_binding = True`, aligning with the new config default.

- vLLM version: v0.14.1
- vLLM main: d7de043

---------

Signed-off-by: chenchuw886 <chenchuw@huawei.com>
Co-authored-by: chenchuw886 <chenchuw@huawei.com>
This commit is contained in:
@@ -39,7 +39,7 @@ The following table lists additional configuration options available in vLLM Asc
|
|||||||
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. |
|
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multi-stream shared expert. This option only takes effect on MoE models with shared experts. |
|
||||||
| `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. |
|
| `multistream_overlap_gate` | bool | `False` | Whether to enable multi-stream overlap gate. This option only takes effect on MoE models with shared experts. |
|
||||||
| `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. |
|
| `recompute_scheduler_enable` | bool | `False` | Whether to enable recompute scheduler. |
|
||||||
| `enable_cpu_binding` | bool | `False` | Whether to enable CPU Binding. |
|
| `enable_cpu_binding` | bool | `True` | Whether to enable CPU binding. Only takes effect on ARM CPUs; when enabled, A3 uses the NUMA-balanced binding strategy and other device types use the NUMA-affinity binding strategy. |
|
||||||
| `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is new scheduler to support dynamic batch feature |
|
| `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This is new scheduler to support dynamic batch feature |
|
||||||
| `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. |
|
| `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph_ex graph mode. |
|
||||||
| `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. |
|
| `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. |
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ pybind11
|
|||||||
pyyaml
|
pyyaml
|
||||||
scipy
|
scipy
|
||||||
pandas
|
pandas
|
||||||
|
psutil
|
||||||
setuptools>=64
|
setuptools>=64
|
||||||
setuptools-scm>=8
|
setuptools-scm>=8
|
||||||
torch==2.9.0
|
torch==2.9.0
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
import unittest
|
import unittest
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo
|
from vllm_ascend.cpu_binding import CpuAlloc, DeviceInfo, bind_cpus, is_arm_cpu
|
||||||
|
from vllm_ascend.utils import AscendDeviceType
|
||||||
|
|
||||||
|
|
||||||
class TestDeviceInfo(unittest.TestCase):
|
class TestDeviceInfo(unittest.TestCase):
|
||||||
@@ -103,6 +104,23 @@ class TestCpuAlloc(unittest.TestCase):
|
|||||||
2: [8, 9, 10, 11, 12, 13]
|
2: [8, 9, 10, 11, 12, 13]
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_binding_mode_table(self, mock_get_device_type):
    """Check the device-type to binding-mode mapping for A2 and A3."""
    # (device type reported by the mocked lookup, expected binding mode)
    expected_modes = (
        (AscendDeviceType.A2, "affinity"),
        (AscendDeviceType.A3, "numa_balanced"),
    )
    for device_type, expected_mode in expected_modes:
        mock_get_device_type.return_value = device_type
        self.assertEqual(self.cpu_alloc._binding_mode(), expected_mode)
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
def test_build_cpu_pools_fallback_to_numa_balanced(self, mock_get_device_type):
    """With empty NPU affinity info on A2, build_cpu_pools calls handle_no_affinity."""
    mock_get_device_type.return_value = AscendDeviceType.A2
    allocator = self.cpu_alloc
    allocator.device_info.npu_affinity = {}

    # Stub both collaborators so only the dispatch in build_cpu_pools runs.
    node_map_patch = patch.object(allocator, "build_cpu_node_map")
    no_affinity_patch = patch.object(allocator, "handle_no_affinity")
    with node_map_patch as mock_build_cpu_node_map, no_affinity_patch as mock_handle_no_affinity:
        allocator.build_cpu_pools()

    mock_build_cpu_node_map.assert_called_once()
    mock_handle_no_affinity.assert_called_once()
|
||||||
|
|
||||||
def test_extend_numa(self):
|
def test_extend_numa(self):
|
||||||
result = self.cpu_alloc.extend_numa([])
|
result = self.cpu_alloc.extend_numa([])
|
||||||
self.assertEqual(result, [])
|
self.assertEqual(result, [])
|
||||||
@@ -128,8 +146,10 @@ class TestCpuAlloc(unittest.TestCase):
|
|||||||
self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
|
self.assertEqual(self.cpu_alloc.numa_to_cpu_map,
|
||||||
expected_numa_to_cpu_map)
|
expected_numa_to_cpu_map)
|
||||||
|
|
||||||
|
@patch('vllm_ascend.cpu_binding.get_ascend_device_type')
|
||||||
@patch('vllm_ascend.cpu_binding.execute_command')
|
@patch('vllm_ascend.cpu_binding.execute_command')
|
||||||
def test_handle_no_affinity(self, mock_execute_command):
|
def test_handle_no_affinity(self, mock_execute_command, mock_get_device_type):
|
||||||
|
mock_get_device_type.return_value = AscendDeviceType.A3
|
||||||
mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)]
|
mock_execute_command.side_effect = [("0 0\n1 1", 0), ("0 0\n1 1", 0)]
|
||||||
self.cpu_alloc.device_info.running_npu_list = [0, 1]
|
self.cpu_alloc.device_info.running_npu_list = [0, 1]
|
||||||
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
|
self.cpu_alloc.device_info.allowed_cpus = [0, 1, 2, 3]
|
||||||
@@ -163,5 +183,26 @@ class TestCpuAlloc(unittest.TestCase):
|
|||||||
mock_execute_command.assert_called()
|
mock_execute_command.assert_called()
|
||||||
|
|
||||||
|
|
||||||
|
class TestBindingSwitch(unittest.TestCase):
    """Tests for ARM detection and the non-ARM skip path of ``bind_cpus``."""

    @patch('vllm_ascend.cpu_binding.platform.machine')
    def test_is_arm_cpu(self, mock_machine):
        """is_arm_cpu is true for ARM machine names and false otherwise."""
        # (value returned by platform.machine(), expected is_arm_cpu() result)
        cases = (
            ("x86_64", False),
            ("aarch64", True),
            ("armv8", True),
            ("mips64", False),
        )
        for machine_name, expect_arm in cases:
            mock_machine.return_value = machine_name
            check = self.assertTrue if expect_arm else self.assertFalse
            check(is_arm_cpu())

    @patch('vllm_ascend.cpu_binding.CpuAlloc')
    @patch('vllm_ascend.cpu_binding.is_arm_cpu')
    def test_bind_cpus_skip_non_arm(self, mock_is_arm_cpu, mock_cpu_alloc):
        """bind_cpus must not construct a CpuAlloc when the CPU is not ARM."""
        mock_is_arm_cpu.return_value = False

        bind_cpus(0)

        mock_cpu_alloc.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ class TestNPUWorker(TestBase):
|
|||||||
# Setup mock behavior
|
# Setup mock behavior
|
||||||
mock_ops.register_dummy_fusion_op.return_value = None
|
mock_ops.register_dummy_fusion_op.return_value = None
|
||||||
mock_ascend_config = MagicMock()
|
mock_ascend_config = MagicMock()
|
||||||
mock_ascend_config.enable_cpu_binding = False
|
mock_ascend_config.enable_cpu_binding = True
|
||||||
mock_get_ascend_config.return_value = mock_ascend_config
|
mock_get_ascend_config.return_value = mock_ascend_config
|
||||||
|
|
||||||
# Import and create NPUWorker instance
|
# Import and create NPUWorker instance
|
||||||
@@ -125,7 +125,7 @@ class TestNPUWorker(TestBase):
|
|||||||
self.model_config_mock.trust_remote_code = True
|
self.model_config_mock.trust_remote_code = True
|
||||||
mock_ops.register_dummy_fusion_op.return_value = None
|
mock_ops.register_dummy_fusion_op.return_value = None
|
||||||
mock_ascend_config = MagicMock()
|
mock_ascend_config = MagicMock()
|
||||||
mock_ascend_config.enable_cpu_binding = False
|
mock_ascend_config.enable_cpu_binding = True
|
||||||
mock_get_ascend_config.return_value = mock_ascend_config
|
mock_get_ascend_config.return_value = mock_ascend_config
|
||||||
|
|
||||||
# Create NPUWorker instance
|
# Create NPUWorker instance
|
||||||
@@ -168,7 +168,7 @@ class TestNPUWorker(TestBase):
|
|||||||
self.cache_config_mock.cache_dtype = "float32"
|
self.cache_config_mock.cache_dtype = "float32"
|
||||||
mock_ops.register_dummy_fusion_op.return_value = None
|
mock_ops.register_dummy_fusion_op.return_value = None
|
||||||
mock_ascend_config = MagicMock()
|
mock_ascend_config = MagicMock()
|
||||||
mock_ascend_config.enable_cpu_binding = False
|
mock_ascend_config.enable_cpu_binding = True
|
||||||
mock_get_ascend_config.return_value = mock_ascend_config
|
mock_get_ascend_config.return_value = mock_ascend_config
|
||||||
|
|
||||||
# Create NPUWorker instance
|
# Create NPUWorker instance
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ class AscendConfig:
|
|||||||
self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False)
|
self.multistream_overlap_shared_expert = additional_config.get("multistream_overlap_shared_expert", False)
|
||||||
self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False)
|
self.multistream_overlap_gate = additional_config.get("multistream_overlap_gate", False)
|
||||||
self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False)
|
self.recompute_scheduler_enable = additional_config.get("recompute_scheduler_enable", False)
|
||||||
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", False)
|
self.enable_cpu_binding = additional_config.get("enable_cpu_binding", True)
|
||||||
|
|
||||||
self.pd_tp_ratio = 1
|
self.pd_tp_ratio = 1
|
||||||
self.pd_head_ratio = 1
|
self.pd_head_ratio = 1
|
||||||
|
|||||||
@@ -1,16 +1,29 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import platform
|
||||||
import subprocess
|
import subprocess
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
|
||||||
|
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
||||||
|
|
||||||
ALLOWED_CPUS_PATH = "/proc/self/status"
|
ALLOWED_CPUS_PATH = "/proc/self/status"
|
||||||
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
|
ASCEND_RT_VISIBLE_DEVICES = os.getenv("ASCEND_RT_VISIBLE_DEVICES")
|
||||||
|
|
||||||
|
|
||||||
|
def is_arm_cpu() -> bool:
    """Report whether the host CPU architecture is ARM.

    The lower-cased value of ``platform.machine()`` is classified as x86
    (``x86_64``, ``amd64``, ``i386``, ``i686``) or ARM (``aarch64``,
    ``arm64``, or any name starting with ``arm``). A name in neither group
    is logged as unknown and reported as non-ARM.

    Returns:
        ``True`` for an ARM machine name, ``False`` for x86 or unknown names.
    """
    arch = platform.machine().lower()
    looks_x86 = arch in ("x86_64", "amd64", "i386", "i686")
    looks_arm = arch.startswith("arm") or arch in ("aarch64", "arm64")

    # Only names that match neither family are worth a warning.
    if not (looks_x86 or looks_arm):
        logger.warning(f"Unknown CPU architecture '{arch}', CPU binding will be disabled.")

    # The two families are disjoint, so the ARM flag alone is the answer.
    return looks_arm
|
||||||
|
|
||||||
|
|
||||||
def execute_command(cmd: list[str]) -> tuple[str, int]:
|
def execute_command(cmd: list[str]) -> tuple[str, int]:
|
||||||
with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
|
with subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
|
||||||
out, _ = p.communicate(timeout=1000)
|
out, _ = p.communicate(timeout=1000)
|
||||||
@@ -77,7 +90,7 @@ class DeviceInfo:
|
|||||||
devices_list = [int(x) for x in devices_str.split(",")]
|
devices_list = [int(x) for x in devices_str.split(",")]
|
||||||
running_npu_set = set(devices_list) & running_npu_set
|
running_npu_set = set(devices_list) & running_npu_set
|
||||||
if not running_npu_set:
|
if not running_npu_set:
|
||||||
raise RuntimeError("Can not get running npu info, you can use BIND_CPU=0 to skip.")
|
raise RuntimeError("Can not get running npu info.")
|
||||||
return sorted(running_npu_set)
|
return sorted(running_npu_set)
|
||||||
|
|
||||||
def parse_allowed_cpus(self) -> list[int]:
|
def parse_allowed_cpus(self) -> list[int]:
|
||||||
@@ -202,7 +215,7 @@ class CpuAlloc:
|
|||||||
npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
|
npu_num_this_node = min(npu_num_per_node, num_running_npu - index)
|
||||||
if npu_num_this_node <= 0:
|
if npu_num_this_node <= 0:
|
||||||
break
|
break
|
||||||
# Evenly distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
|
# NUMA-balanced distribute the CPUs of this NUMA node among npu_num_this_node NPUs.
|
||||||
total_cpu_num = len(cpus)
|
total_cpu_num = len(cpus)
|
||||||
base_cpu_num = total_cpu_num // npu_num_this_node
|
base_cpu_num = total_cpu_num // npu_num_this_node
|
||||||
extra_cpu_num = total_cpu_num % npu_num_this_node
|
extra_cpu_num = total_cpu_num % npu_num_this_node
|
||||||
@@ -217,9 +230,22 @@ class CpuAlloc:
|
|||||||
index += 1
|
index += 1
|
||||||
start_index = end_index
|
start_index = end_index
|
||||||
|
|
||||||
|
# Binding mode per Ascend device type; only A3 has an explicit entry here.
DEVICE_BINDING_MODE = {AscendDeviceType.A3: "numa_balanced"}
|
||||||
|
|
||||||
|
@classmethod
def _binding_mode(cls) -> str:
    """Return the CPU binding mode for the current Ascend device type.

    The device type reported by ``get_ascend_device_type()`` is looked up in
    ``DEVICE_BINDING_MODE``; a device type without an entry yields
    ``"affinity"``.
    """
    current_device = get_ascend_device_type()
    mode_table = cls.DEVICE_BINDING_MODE
    if current_device in mode_table:
        return mode_table[current_device]
    return "affinity"
|
||||||
|
|
||||||
def build_cpu_pools(self) -> None:
|
def build_cpu_pools(self) -> None:
|
||||||
self.build_cpu_node_map()
|
self.build_cpu_node_map()
|
||||||
|
if self._binding_mode() == "numa_balanced":
|
||||||
|
self.handle_no_affinity()
|
||||||
|
return
|
||||||
if not self.device_info.npu_affinity:
|
if not self.device_info.npu_affinity:
|
||||||
|
logger.warning("NPU affinity info not found, fallback to NUMA-balanced CPU binding.")
|
||||||
self.handle_no_affinity()
|
self.handle_no_affinity()
|
||||||
return
|
return
|
||||||
for npu in self.device_info.running_npu_list:
|
for npu in self.device_info.running_npu_list:
|
||||||
@@ -282,5 +308,8 @@ class CpuAlloc:
|
|||||||
|
|
||||||
|
|
||||||
def bind_cpus(rank_id: int) -> None:
|
def bind_cpus(rank_id: int) -> None:
|
||||||
|
if not is_arm_cpu():
|
||||||
|
logger.info("CPU binding skipped: non-ARM CPU detected.")
|
||||||
|
return
|
||||||
binder = CpuAlloc(rank_id)
|
binder = CpuAlloc(rank_id)
|
||||||
binder.run_all()
|
binder.run_all()
|
||||||
|
|||||||
Reference in New Issue
Block a user