diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py
index 49ef46169..ede25b1d4 100644
--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -1,4 +1,7 @@
+import json
+import os
 import unittest
+from datetime import datetime
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_child_process
@@ -14,6 +17,26 @@ from sglang.test.test_utils import (
     popen_launch_server,
 )
 
+MODEL_SCORE_THRESHOLDS = {
+    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
+    "google/gemma-2-27b-it": 0.9227,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
+    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+}
+
 
 def parse_models(model_string):
     return [model.strip() for model in model_string.split(",") if model.strip()]
@@ -23,10 +46,8 @@ def launch_server(base_url, model, is_fp8, is_tp2):
     other_args = ["--log-level-http", "warning", "--trust-remote-code"]
     if is_fp8:
         if "Llama-3" in model or "gemma-2" in model:
-            # compressed-tensors
             other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
         elif "Qwen2-72B-Instruct-FP8" in model:
-            # bug
             other_args.extend(["--quantization", "fp8"])
         else:
             other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"])
@@ -48,6 +69,49 @@ def launch_server(base_url, model, is_fp8, is_tp2):
     return process
 
 
+def write_results_to_json(model, metrics, mode="a"):
+    result = {
+        "timestamp": datetime.now().isoformat(),
+        "model": model,
+        "metrics": metrics,
+        "score": metrics["score"],
+    }
+
+    existing_results = []
+    if mode == "a" and os.path.exists("results.json"):
+        try:
+            with open("results.json", "r") as f:
+                existing_results = json.load(f)
+        except json.JSONDecodeError:
+            existing_results = []
+
+    if isinstance(existing_results, list):
+        existing_results.append(result)
+    else:
+        existing_results = [result]
+
+    with open("results.json", "w") as f:
+        json.dump(existing_results, f, indent=2)
+
+
+def check_model_scores(results):
+    failed_models = []
+    for model, score in results:
+        threshold = MODEL_SCORE_THRESHOLDS.get(model)
+        if threshold is None:
+            print(f"Warning: No threshold defined for model {model}")
+            continue
+
+        if score < threshold:
+            failed_models.append(
+                f"\nScore Check Failed: {model}\n"
+                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
+            )
+
+    if failed_models:
+        raise AssertionError("\n".join(failed_models))
+
+
 class TestEvalAccuracyLarge(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -68,6 +132,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         kill_child_process(self.process.pid, include_self=True)
 
     def test_mgsm_en_all_models(self):
+        is_first = True
+        all_results = []
+
         for model_group, is_fp8, is_tp2 in self.model_groups:
             for model in model_group:
                 with self.subTest(model=model):
@@ -85,11 +152,24 @@ class TestEvalAccuracyLarge(unittest.TestCase):
                     print(
                         f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                     )
-                    # loosely threshold
-                    assert metrics["score"] > 0.5, f"score={metrics['score']} <= 0.5"
+
+                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    is_first = False
+
+                    all_results.append((model, metrics["score"]))
 
                     self.tearDown()
 
+        try:
+            with open("results.json", "r") as f:
+                print("\nFinal Results from results.json:")
+                print(json.dumps(json.load(f), indent=2))
+        except Exception as e:
+            print(f"Error reading results.json: {e}")
+
+        # Check all scores after collecting all results
+        check_model_scores(all_results)
+
 
 if __name__ == "__main__":
     unittest.main()