From b64ee7d346511b6ea7a64b09db58c17aa1c915ef Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Mon, 3 Mar 2025 09:23:13 +0800
Subject: [PATCH] [Dist] Set device as rank (#202)

### What this PR does / why we need it?
The rank returned by `torch.distributed.get_rank(device_group)` is the local
rank, but the rank (i.e., the rank in the process group (PG)) is expected.
Thus we change to use `torch.npu.current_device()` to set the device.

```python
# difference between `local_rank` and `rank_in_group`:
# if we have a group of size 4 across two nodes:
# Process | Node | Rank | Local Rank | Rank in Group
#   0     |  0   |  0   |     0      |      0
#   1     |  0   |  1   |     1      |      1
#   2     |  1   |  2   |     0      |      2
#   3     |  1   |  3   |     1      |      3
```

Tested by @wwfu109 with
`vllm/tests/distributed/test_customops::test_multi_process_tensor_parallel_pipeline_parallel`

Signed-off-by: MengqingCao
---
 vllm_ascend/communicator.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/communicator.py b/vllm_ascend/communicator.py
index 543b639..0c43f1f 100644
--- a/vllm_ascend/communicator.py
+++ b/vllm_ascend/communicator.py
@@ -17,7 +17,6 @@
 from typing import Optional
 
 import torch
-import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from vllm.distributed.device_communicators.base_device_communicator import \
     DeviceCommunicatorBase
@@ -31,6 +30,5 @@ class NPUCommunicator(DeviceCommunicatorBase):
                  device_group: Optional[ProcessGroup] = None,
                  unique_name: str = ""):
         super().__init__(cpu_group, device, device_group, unique_name)
-        # init device according to local rank
-        local_rank = dist.get_rank(device_group)
-        self.device = torch.device(f"npu:{local_rank}")
+        # init device according to rank
+        self.device = torch.npu.current_device()