diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index cd21c896a..4121deb17 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" -DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 83d2e90a4..2c1750d36 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -49,8 +49,7 @@ suites = { ], "nightly": [ "test_nightly_gsm8k_eval.py", - "test_nightly_human_eval.py", - # Disable temporarly + # Disable temporarily # "test_nightly_math_eval.py", ], "sampling/penaltylib": glob.glob( diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 7e23b721e..7820f6825 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -1,6 +1,5 @@ import json import os -import subprocess import unittest import warnings from datetime import datetime @@ -16,24 +15,26 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, + is_in_ci, popen_launch_server, + write_github_step_summary, ) MODEL_SCORE_THRESHOLDS = { - "meta-llama/Llama-3.1-8B-Instruct": 0.83, + "meta-llama/Llama-3.1-8B-Instruct": 0.82, "mistralai/Mistral-7B-Instruct-v0.3": 0.58, - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84, + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, "google/gemma-2-27b-it": 0.92, - "meta-llama/Llama-3.1-70B-Instruct": 0.96, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63, - "Qwen/Qwen2-57B-A14B-Instruct": 0.87, - "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84, + "meta-llama/Llama-3.1-70B-Instruct": 0.95, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, + "Qwen/Qwen2-57B-A14B-Instruct": 0.88, + "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, - "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83, + "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84, "neuralmagic/gemma-2-2b-it-FP8": 0.60, - "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95, - "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61, - "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95, + "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94, + "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62, + "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94, "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82, "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84, "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83, @@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2): base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=other_args, - return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL), ) return process @@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"): def check_model_scores(results): failed_models = [] + summary = " | model | score | threshold |\n" + summary += "| ----- | ----- | --------- |\n" + for model, score in results: threshold = MODEL_SCORE_THRESHOLDS.get(model) if threshold is None: @@ -111,11 +114,19 @@ def check_model_scores(results): f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})" ) + line = f"| {model} | {score} | {threshold} |\n" + summary += line + + print(summary) + + if is_in_ci(): + write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}") + if failed_models: raise AssertionError("\n".join(failed_models)) -class TestEvalAccuracyLarge(unittest.TestCase): +class TestNightlyGsm8KEval(unittest.TestCase): @classmethod def setUpClass(cls): cls.model_groups = [ @@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase): ] cls.base_url = DEFAULT_URL_FOR_TEST - def setUp(self): - self.process = None - - def tearDown(self): - if self.process: - kill_process_tree(self.process.pid) - def test_mgsm_en_all_models(self): warnings.filterwarnings( "ignore", category=ResourceWarning, message="unclosed.*socket" @@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase): for model_group, is_fp8, is_tp2 in self.model_groups: for model in model_group: with self.subTest(model=model): - self.process = launch_server(self.base_url, model, is_fp8, is_tp2) + process = launch_server(self.base_url, model, is_fp8, is_tp2) args = SimpleNamespace( base_url=self.base_url, @@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase): is_first = False all_results.append((model, metrics["score"])) - - self.tearDown() + kill_process_tree(process.pid) try: with open("results.json", "r") as f: diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py index bffe214b5..0b682937a 100644 --- a/test/srt/test_nightly_human_eval.py +++ b/test/srt/test_nightly_human_eval.py @@ -18,7 +18,7 @@ from sglang.test.test_utils import ( ) -class TestEvalAccuracyLarge(unittest.TestCase): +class TestNightlyHumanEval(unittest.TestCase): @classmethod def setUpClass(cls): if is_in_ci(): diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index bc99b23ad..eef033ea9 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase): print(json.dumps(ret)) def assert_one_item(item): - assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"] - assert len(item["token_ids"]) == max_new_tokens + self.assertEqual( + len(item["token_ids"]), item["meta_info"]["completion_tokens"] + ) + self.assertEqual(len(item["token_ids"]), max_new_tokens) assert item["meta_info"]["prompt_tokens"] == len(input_ids) if return_logprob: