Launch a thread to overlap CPU and GPU (#1687)

This commit is contained in:
Lianmin Zheng
2024-10-16 11:20:17 -07:00
committed by GitHub
parent e4b367baa8
commit dbec2f1847
3 changed files with 142 additions and 20 deletions

View File

@@ -447,7 +447,7 @@ def _set_envs_and_config(server_args: ServerArgs):
os.environ["NCCL_CUMEM_ENABLE"] = "0"
os.environ["NCCL_NVLS_ENABLE"] = "0"
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
# Set ulimit
set_ulimit()
@@ -528,7 +528,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid):
kill_child_process(pid, including_parent=False)
return
-# print(f"{res.json()=}")
+print(f"{res.json()=}")
logger.info("The server is fired up and ready to roll!")
if pipe_finish_writer is not None: