[Dist] Set device as rank (#202)
### What this PR does / why we need it?
The rank returned by `torch.distributed.get_rank(device_group)` is the
rank within that group, but the global rank (i.e., the rank in the process
group (PG)) is expected. Thus, we change to using `torch.npu.current_device()`
to set the device:
```python
# difference between `local_rank` and `rank_in_group`:
# if we have a group of size 4 across two nodes:
# Process | Node | Rank | Local Rank | Rank in Group
# 0 | 0 | 0 | 0 | 0
# 1 | 0 | 1 | 1 | 1
# 2 | 1 | 2 | 0 | 2
# 3 | 1 | 3 | 1 | 3
```
Tested by @wwfu109 with
`vllm/tests/distributed/test_customops::test_multi_process_tensor_parallel_pipeline_parallel`
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
```diff
@@ -17,7 +17,6 @@
 from typing import Optional
 
 import torch
-import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from vllm.distributed.device_communicators.base_device_communicator import \
     DeviceCommunicatorBase
```
```diff
@@ -31,6 +30,5 @@ class NPUCommunicator(DeviceCommunicatorBase):
                  device_group: Optional[ProcessGroup] = None,
                  unique_name: str = ""):
         super().__init__(cpu_group, device, device_group, unique_name)
-        # init device according to local rank
-        local_rank = dist.get_rank(device_group)
-        self.device = torch.device(f"npu:{local_rank}")
+        # init device according to rank
+        self.device = torch.npu.current_device()
```
|||||||
Reference in New Issue
Block a user