[Dist] Set device as rank (#202)
### What this PR does / why we need it?
The rank returned by `torch.distributed.get_rank(device_group)` is the
rank within that group, but the global rank (i.e., the rank in the process
group (PG)) is expected. Thus, we change to using `torch.npu.current_device()`
to set the device:
```python
# difference between `local_rank` and `rank_in_group`:
# if we have a group of size 4 across two nodes:
# Process | Node | Rank | Local Rank | Rank in Group
# 0 | 0 | 0 | 0 | 0
# 1 | 0 | 1 | 1 | 1
# 2 | 1 | 2 | 0 | 2
# 3 | 1 | 3 | 1 | 3
```
Tested by @wwfu109 with
`vllm/tests/distributed/test_customops::test_multi_process_tensor_parallel_pipeline_parallel`
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
```diff
@@ -17,7 +17,6 @@
 from typing import Optional
 
 import torch
-import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from vllm.distributed.device_communicators.base_device_communicator import \
     DeviceCommunicatorBase
```
```diff
@@ -31,6 +30,5 @@ class NPUCommunicator(DeviceCommunicatorBase):
                  device_group: Optional[ProcessGroup] = None,
                  unique_name: str = ""):
         super().__init__(cpu_group, device, device_group, unique_name)
-        # init device according to local rank
-        local_rank = dist.get_rank(device_group)
-        self.device = torch.device(f"npu:{local_rank}")
+        # init device according to rank
+        self.device = torch.npu.current_device()
```
|||||||
Reference in New Issue
Block a user