From dfb322642fe6346e286fae7be20e75d3a8899e76 Mon Sep 17 00:00:00 2001
From: Wenxuan Tan
Date: Sat, 26 Apr 2025 20:11:09 -0500
Subject: [PATCH] Use device_id in dist init to reduce NCCL communicator warmup
 & creation overhead (#5728)

---
 python/sglang/srt/distributed/parallel_state.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py
index e43bc0000..fdde7dde8 100644
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -1055,6 +1055,11 @@ def init_distributed_environment(
         world_size=world_size,
         rank=rank,
         timeout=timeout,
+        device_id=(
+            torch.device(f"cuda:{torch.cuda.current_device()}")
+            if hasattr(torch, "cuda") and torch.cuda.is_available()
+            else None
+        ),  # Allow NCCL to eagerly init communicator
     )
     # set the local rank