From f244f3c4a97c96ba5db20e0a5db9d1ddc67d4af3 Mon Sep 17 00:00:00 2001
From: Hexiang Wang <56632993+whx-sjtu@users.noreply.github.com>
Date: Thu, 12 Mar 2026 15:59:03 +0800
Subject: [PATCH] [BugFix] Fix problem of extra processes on rank0 device
 (#7107)

### What this PR does / why we need it?
Currently, when tp > 1, extra processes appear on the tp rank 0 device and consume extra HBM memory. This is caused by executing `import torch_npu._inductor` before `set_device`, which triggers an extra initialization of the device.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All CI passed.

- vLLM version: v0.16.0
- vLLM main: https://github.com/vllm-project/vllm/commit/4034c3d32e30d01639459edd3ab486f56993876d

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
---
 vllm_ascend/worker/worker.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 39f4f646..ebf2cdf7 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -95,12 +95,7 @@ class NPUWorker(WorkerBase):
         from vllm_ascend.utils import adapt_patch
         adapt_patch()
 
-        # Import _inductor for graph mode execution with triton
-        # This lazy import avoids torch_npu re-initialization in patch
-        from vllm.triton_utils import HAS_TRITON
-        if HAS_TRITON:
-            import torch_npu._inductor  # noqa: F401
 
         # Register ops when worker init.
         from vllm_ascend import ops
 
@@ -253,6 +248,15 @@ class NPUWorker(WorkerBase):
 
         device = torch.device(f"npu:{self.local_rank}")
         torch.npu.set_device(device)
 
+        # Import _inductor for graph mode execution with triton
+        # This lazy import avoids torch_npu re-initialization in patch
+        # Note that this should be imported after torch.npu.set_device
+        # to avoid repeated set_device in extra processes
+        from vllm.triton_utils import HAS_TRITON
+
+        if HAS_TRITON:
+            import torch_npu._inductor  # noqa: F401
+
         gc.collect()
         torch.npu.empty_cache()
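
### Why the import order matters (illustrative sketch)
The pattern this patch relies on can be summarized outside the diff. Below is a minimal, hypothetical sketch of the patched flow: `init_worker_device` is an invented helper name (the real logic lives in `NPUWorker` in `vllm_ascend/worker/worker.py`), and the sketch assumes `torch_npu` and vLLM are installed.

```python
import gc

import torch
import torch_npu  # noqa: F401  # registers the torch.npu device backend


def init_worker_device(local_rank: int) -> None:
    """Hypothetical helper mirroring the patched NPUWorker flow."""
    # Bind this worker process to its own NPU first.
    device = torch.device(f"npu:{local_rank}")
    torch.npu.set_device(device)

    # Only now import _inductor: importing it initializes the current
    # device, so importing it before set_device would initialize npu:0
    # in every worker process, leaving extra processes and HBM usage
    # on the rank 0 device.
    from vllm.triton_utils import HAS_TRITON
    if HAS_TRITON:
        import torch_npu._inductor  # noqa: F401

    gc.collect()
    torch.npu.empty_cache()
```

The design point is simply that any import with device-initializing side effects must run after the worker has selected its device, not at module import time.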