From dfb322642fe6346e286fae7be20e75d3a8899e76 Mon Sep 17 00:00:00 2001
From: Wenxuan Tan
Date: Sat, 26 Apr 2025 20:11:09 -0500
Subject: [PATCH] Use device_id in dist init to reduce NCCL communicator warmup
 & creation overhead (#5728)

---
 python/sglang/srt/distributed/parallel_state.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py
index e43bc0000..fdde7dde8 100644
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -1055,6 +1055,11 @@ def init_distributed_environment(
         world_size=world_size,
         rank=rank,
         timeout=timeout,
+        device_id=(
+            torch.device(f"cuda:{torch.cuda.current_device()}")
+            if hasattr(torch, "cuda") and torch.cuda.is_available()
+            else None
+        ),  # Allow NCCL to eagerly init communicator
     )
     # set the local rank