[main] support cpu binding (#3546)

### What this PR does / why we need it? Currently, in the piecewise of aclgraph, the model will be in eagle mode in attention, which will cause abnormal allreduce latency of O matrix. The reason is that cpu resources will be preempted in eagle mode. So I hope to temporarily add cpu binding to vllm-ascend. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed with new existing test. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: GDzhu1 <809721801@qq.com>
2025-10-21 09:17:03 +08:00
parent 274b708e0c
commit 4a849df6fa
3 changed files with 345 additions and 1 deletions
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -43,7 +43,8 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
 from vllm.v1.worker.worker_base import WorkerBase

 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.ascend_config import init_ascend_config
+from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
+from vllm_ascend.cpu_binding import bind_cpus
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
@@ -110,6 +111,17 @@ class NPUWorker(WorkerBase):
                         distributed_init_method=distributed_init_method,
                         is_driver_worker=is_driver_worker)

+        # binding cpu
+        if get_ascend_config().enable_cpu_binding:
+            try:
+                bind_cpus(self.local_rank, ratio=1.0)
+            except RuntimeError as e:
+                logger.error(f"{e} in {self.local_rank}")
+            except ValueError as e:
+                logger.error(f"{e} in {self.local_rank}")
+            except Exception:
+                logger.info("Skip binding cpu.")
+
        # Try to import mindie_turbo to accelerate vLLM inference.
        try_register_lib(
            "mindie_turbo",