From f244f3c4a97c96ba5db20e0a5db9d1ddc67d4af3 Mon Sep 17 00:00:00 2001
From: Hexiang Wang <56632993+whx-sjtu@users.noreply.github.com>
Date: Thu, 12 Mar 2026 15:59:03 +0800
Subject: [PATCH] [BugFix] Fix problem of extra processes on rank0 device
 (#7107)

### What this PR does / why we need it?
Currently, when tp > 1, extra processes appear on the tp rank 0 device and consume extra HBM memory. This is caused by executing `import torch_npu._inductor` before `set_device`, which triggers an extra initialization of the device.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All CI passed.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
---
 vllm_ascend/worker/worker.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 39f4f646..ebf2cdf7 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -95,12 +95,7 @@ class NPUWorker(WorkerBase):
         from vllm_ascend.utils import adapt_patch
         adapt_patch()
 
-        # Import _inductor for graph mode execution with triton
-        # This lazy import avoids torch_npu re-initialization in patch
-        from vllm.triton_utils import HAS_TRITON
-        if HAS_TRITON:
-            import torch_npu._inductor  # noqa: F401
 
         # Register ops when worker init.
         from vllm_ascend import ops
 
@@ -253,6 +248,15 @@ class NPUWorker(WorkerBase):
 
         device = torch.device(f"npu:{self.local_rank}")
         torch.npu.set_device(device)
 
+        # Import _inductor for graph mode execution with triton
+        # This lazy import avoids torch_npu re-initialization in patch
+        # Note that this should be imported after torch.npu.set_device
+        # to avoid repeated set_device in extra processes
+        from vllm.triton_utils import HAS_TRITON
+
+        if HAS_TRITON:
+            import torch_npu._inductor  # noqa: F401
+
         gc.collect()
         torch.npu.empty_cache()
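
### Why the import order matters (illustrative sketch)
The pattern this patch relies on can be summarized outside the diff. Below is a minimal, hypothetical sketch of the patched flow: `init_worker_device` is an invented helper name (the real logic lives in `NPUWorker` in `vllm_ascend/worker/worker.py`), and the sketch assumes `torch_npu` and vLLM are installed.

```python
import gc

import torch
import torch_npu  # noqa: F401  # registers the torch.npu device backend


def init_worker_device(local_rank: int) -> None:
    """Hypothetical helper mirroring the patched NPUWorker flow."""
    # Bind this worker process to its own NPU first.
    device = torch.device(f"npu:{local_rank}")
    torch.npu.set_device(device)

    # Only now import _inductor: importing it initializes the current
    # device, so importing it before set_device would initialize npu:0
    # in every worker process, leaving extra processes and HBM usage
    # on the rank 0 device.
    from vllm.triton_utils import HAS_TRITON
    if HAS_TRITON:
        import torch_npu._inductor  # noqa: F401

    gc.collect()
    torch.npu.empty_cache()
```

The design point is simply that any import with device-initializing side effects must run after the worker has selected its device, not at module import time.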