From 01d66ae2e8aa08a3fa2b93f8023063f8798477f0 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Tue, 23 Jul 2024 21:53:36 -0700
Subject: [PATCH] Fix multi-node deadlock (#709)

---
 python/sglang/srt/server.py | 1 +
 python/sglang/srt/utils.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index e93727cc6..b0fca27f6 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -187,6 +187,7 @@ def launch_server(
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     os.environ["NCCL_CUMEM_ENABLE"] = "0"
     os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     set_ulimit()
     if server_args.show_time_cost:
         enable_show_time_cost()
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 8aaf5c3fb..6ada031d2 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -312,6 +312,9 @@ def suppress_other_loggers():
     logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
         logging.WARN
     )
+    logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)