From 855d0ba381f6bfa69f906797e33efcd0708797b9 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 27 Dec 2024 22:16:39 -0800 Subject: [PATCH] [CI] Fix nightly test and raise better error message (#2626) Co-authored-by: Sangbin --- .github/workflows/nightly-test.yml | 2 +- python/sglang/srt/server.py | 11 ++++++++++- test/srt/run_suite.py | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml index 04a109f23..17b4e402d 100644 --- a/.github/workflows/nightly-test.yml +++ b/.github/workflows/nightly-test.yml @@ -30,5 +30,5 @@ jobs: - name: Run test timeout-minutes: 10 run: | - cd test/lang + cd test/srt python3 run_suite.py --suite nightly --timeout-per-file 2400 diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 4814c8c6f..0b51a0636 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -484,7 +484,16 @@ def launch_engine( # Wait for model to finish loading scheduler_infos = [] for i in range(len(scheduler_pipe_readers)): - data = scheduler_pipe_readers[i].recv() + try: + data = scheduler_pipe_readers[i].recv() + except EOFError as e: + logger.exception(e) + logger.error( + f"Rank {i} scheduler is dead. Please check if there are relevant logs." + ) + scheduler_procs[i].join() + logger.error(f"Exit code: {scheduler_procs[i].exitcode}") + raise if data["status"] != "ready": raise RuntimeError( diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index a0ca5fabb..df0d41476 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -44,11 +44,16 @@ suites = { "test_vision_openai_server.py", "test_session_control.py", ], + "nightly": [ + "test_nightly_gsm8k_eval.py", + "test_nightly_human_eval.py", + ], "sampling/penaltylib": glob.glob( "sampling/penaltylib/**/test_*.py", recursive=True ), } +# Expand suite for target_suite_name, target_tests in suites.items(): for suite_name, tests in suites.items(): if suite_name == target_suite_name: