From a4d6d6f1ddc9f15bfa904e7e286e3f5ba4ba5a50 Mon Sep 17 00:00:00 2001
From: Xiaotong Jiang
Date: Wed, 1 Jan 2025 15:29:35 -0800
Subject: [PATCH] [feat]: Add math eval to CI nightly run (#2663)

Co-authored-by: Chayenne
---
 test/srt/run_suite.py               |  1 +
 test/srt/test_nightly_gsm8k_eval.py |  4 +--
 test/srt/test_nightly_human_eval.py | 21 +++++++++----
 test/srt/test_nightly_math_eval.py  | 46 +++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 8 deletions(-)
 create mode 100644 test/srt/test_nightly_math_eval.py

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 02fe8032e..d670a2d35 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -49,6 +49,7 @@ suites = {
     "nightly": [
         "test_nightly_gsm8k_eval.py",
         "test_nightly_human_eval.py",
+        "test_nightly_math_eval.py",
     ],
     "sampling/penaltylib": glob.glob(
         "sampling/penaltylib/**/test_*.py", recursive=True
diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py
index 8466c2c64..7e23b721e 100644
--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -25,7 +25,7 @@ MODEL_SCORE_THRESHOLDS = {
     "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
     "google/gemma-2-27b-it": 0.92,
     "meta-llama/Llama-3.1-70B-Instruct": 0.96,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63,
     "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
     "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
@@ -36,7 +36,7 @@ MODEL_SCORE_THRESHOLDS = {
     "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
     "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
 }
 
 
diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py
index 626e6fb15..bffe214b5 100644
--- a/test/srt/test_nightly_human_eval.py
+++ b/test/srt/test_nightly_human_eval.py
@@ -12,19 +12,28 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
+    DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_URL_FOR_TEST,
+    is_in_ci,
 )
 
 
 class TestEvalAccuracyLarge(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
+        if is_in_ci():
+            cls.model_groups = [([DEFAULT_MODEL_NAME_FOR_TEST], False, False)]
+        else:
+            cls.model_groups = [
+                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+                (
+                    parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1),
+                    True,
+                    False,
+                ),
+                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+            ]
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = None
         cls.eval_process = None
diff --git a/test/srt/test_nightly_math_eval.py b/test/srt/test_nightly_math_eval.py
new file mode 100644
index 000000000..3a4eb0adf
--- /dev/null
+++ b/test/srt/test_nightly_math_eval.py
@@ -0,0 +1,46 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLarge(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--log-level-http", "warning"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_math(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="math",
+            num_examples=5000,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(
+            metrics["score"], 0.519 - 0.02
+        )  # -2% to account for sampling variance
+
+
+if __name__ == "__main__":
+    unittest.main()