Fix multi-node deadlock (#709)
This commit is contained in:
@@ -187,6 +187,7 @@ def launch_server(
|
|||||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
||||||
os.environ["NCCL_CUMEM_ENABLE"] = "0"
|
os.environ["NCCL_CUMEM_ENABLE"] = "0"
|
||||||
os.environ["NCCL_NVLS_ENABLE"] = "0"
|
os.environ["NCCL_NVLS_ENABLE"] = "0"
|
||||||
|
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
|
||||||
set_ulimit()
|
set_ulimit()
|
||||||
if server_args.show_time_cost:
|
if server_args.show_time_cost:
|
||||||
enable_show_time_cost()
|
enable_show_time_cost()
|
||||||
|
|||||||
@@ -312,6 +312,9 @@ def suppress_other_loggers():
|
|||||||
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
||||||
logging.WARN
|
logging.WARN
|
||||||
)
|
)
|
||||||
|
logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
|
||||||
|
logging.WARN
|
||||||
|
)
|
||||||
logging.getLogger("vllm.selector").setLevel(logging.WARN)
|
logging.getLogger("vllm.selector").setLevel(logging.WARN)
|
||||||
logging.getLogger("vllm.utils").setLevel(logging.WARN)
|
logging.getLogger("vllm.utils").setLevel(logging.WARN)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user