Fix multi-node deadlock (#709)

This commit is contained in:
Lianmin Zheng
2024-07-23 21:53:36 -07:00
committed by GitHub
parent a523a3c13a
commit 01d66ae2e8
2 changed files with 4 additions and 0 deletions

View File

@@ -187,6 +187,7 @@ def launch_server(
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["NCCL_CUMEM_ENABLE"] = "0"
os.environ["NCCL_NVLS_ENABLE"] = "0"
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
set_ulimit()
if server_args.show_time_cost:
enable_show_time_cost()