Use device_id in dist init to reduce NCCL communicator warmup & creation overhead (#5728)

This commit is contained in:
Wenxuan Tan
2025-04-26 20:11:09 -05:00
committed by GitHub
parent 63c13a2c73
commit dfb322642f

View File

@@ -1055,6 +1055,11 @@ def init_distributed_environment(
world_size=world_size,
rank=rank,
timeout=timeout,
device_id=torch.device(
f"cuda:{torch.cuda.current_device()}"
if hasattr(torch, "cuda") and torch.cuda.is_available()
else None
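), # Passing device_id lets NCCL initialize its communicator eagerly
# NOTE(review): when CUDA is unavailable the expression above evaluates torch.device(None), which appears to raise TypeError rather than yield device_id=None - confirm, and consider moving the conditional outside torch.device(...).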
)
# set the local rank