[main] support cpu binding (#3546)
### What this PR does / why we need it?

Currently, in the piecewise mode of aclgraph, the attention part of the model runs in eager mode, which causes abnormal allreduce latency for the O matrix. The reason is that CPU resources are preempted in eager mode. So this PR temporarily adds CPU binding to vllm-ascend.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI passed with new and existing tests.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: GDzhu1 <809721801@qq.com>
This commit is contained in:
@@ -43,7 +43,8 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
|
||||
from vllm.v1.worker.worker_base import WorkerBase
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.ascend_config import init_ascend_config
|
||||
from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
|
||||
from vllm_ascend.cpu_binding import bind_cpus
|
||||
from vllm_ascend.device_allocator.camem import CaMemAllocator
|
||||
from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
|
||||
from vllm_ascend.platform import NPUPlatform
|
||||
@@ -110,6 +111,17 @@ class NPUWorker(WorkerBase):
|
||||
distributed_init_method=distributed_init_method,
|
||||
is_driver_worker=is_driver_worker)
|
||||
|
||||
# binding cpu
|
||||
if get_ascend_config().enable_cpu_binding:
|
||||
try:
|
||||
bind_cpus(self.local_rank, ratio=1.0)
|
||||
except RuntimeError as e:
|
||||
logger.error(f"{e} in {self.local_rank}")
|
||||
except ValueError as e:
|
||||
logger.error(f"{e} in {self.local_rank}")
|
||||
except Exception:
|
||||
logger.info("Skip binding cpu.")
|
||||
|
||||
# Try to import mindie_turbo to accelerate vLLM inference.
|
||||
try_register_lib(
|
||||
"mindie_turbo",
|
||||
|
||||
Reference in New Issue
Block a user