From b64ee7d346511b6ea7a64b09db58c17aa1c915ef Mon Sep 17 00:00:00 2001
From: Mengqing Cao
Date: Mon, 3 Mar 2025 09:23:13 +0800
Subject: [PATCH] [Dist] Set device as rank (#202)

### What this PR does / why we need it?
The rank returned by `torch.distributed.get_rank(device_group)` is the local
rank, but the rank (i.e., the rank in the process group (PG)) is expected.
Thus we change to use `torch.npu.current_device()` to set the device.

```python
# difference between `local_rank` and `rank_in_group`:
# if we have a group of size 4 across two nodes:
# Process | Node | Rank | Local Rank | Rank in Group
#   0     |  0   |  0   |     0      |      0
#   1     |  0   |  1   |     1      |      1
#   2     |  1   |  2   |     0      |      2
#   3     |  1   |  3   |     1      |      3
```

Tested by @wwfu109 with
`vllm/tests/distributed/test_customops::test_multi_process_tensor_parallel_pipeline_parallel`

Signed-off-by: MengqingCao
---
 vllm_ascend/communicator.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/communicator.py b/vllm_ascend/communicator.py
index 543b639..0c43f1f 100644
--- a/vllm_ascend/communicator.py
+++ b/vllm_ascend/communicator.py
@@ -17,7 +17,6 @@
 from typing import Optional
 
 import torch
-import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from vllm.distributed.device_communicators.base_device_communicator import \
     DeviceCommunicatorBase
@@ -31,6 +30,5 @@ class NPUCommunicator(DeviceCommunicatorBase):
                  device_group: Optional[ProcessGroup] = None,
                  unique_name: str = ""):
         super().__init__(cpu_group, device, device_group, unique_name)
-        # init device according to local rank
-        local_rank = dist.get_rank(device_group)
-        self.device = torch.device(f"npu:{local_rank}")
+        # init device according to rank
+        self.device = torch.npu.current_device()