[CI] Fix nightly test and raise better error message (#2626)

Co-authored-by: Sangbin <rkooo567@gmail.com>
This commit is contained in:
Lianmin Zheng
2024-12-27 22:16:39 -08:00
committed by GitHub
parent 9254a33ad4
commit 855d0ba381
3 changed files with 16 additions and 2 deletions

View File

@@ -484,7 +484,16 @@ def launch_engine(
# Wait for model to finish loading
scheduler_infos = []
for i in range(len(scheduler_pipe_readers)):
data = scheduler_pipe_readers[i].recv()
try:
data = scheduler_pipe_readers[i].recv()
except EOFError as e:
logger.exception(e)
logger.error(
f"Rank {i} scheduler is dead. Please check if there are relevant logs."
)
scheduler_procs[i].join()
logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
raise
if data["status"] != "ready":
raise RuntimeError(