Fix multi-node deadlock (#709)

This commit is contained in:
Lianmin Zheng
2024-07-23 21:53:36 -07:00
committed by GitHub
parent a523a3c13a
commit 01d66ae2e8
2 changed files with 4 additions and 0 deletions

View File

@@ -187,6 +187,7 @@ def launch_server(
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["NCCL_CUMEM_ENABLE"] = "0"
os.environ["NCCL_NVLS_ENABLE"] = "0"
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
set_ulimit()
if server_args.show_time_cost:
enable_show_time_cost()

View File

@@ -312,6 +312,9 @@ def suppress_other_loggers():
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
logging.WARN
)
logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
logging.WARN
)
logging.getLogger("vllm.selector").setLevel(logging.WARN)
logging.getLogger("vllm.utils").setLevel(logging.WARN)