From d86a2d6562840455281bfb7bd4a9a0bcc9461992 Mon Sep 17 00:00:00 2001
From: Yineng Zhang
Date: Fri, 1 Nov 2024 14:29:20 +0800
Subject: [PATCH] minor: add human eval (#1754)

---
 .github/workflows/nightly-eval.yml  |   7 +-
 test/srt/test_nightly_human_eval.py | 124 ++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+), 3 deletions(-)
 create mode 100644 test/srt/test_nightly_human_eval.py

diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-eval.yml
index afc29beee..13911f989 100644
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-eval.yml
@@ -25,9 +25,10 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
+          pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
 
-      - name: Nightly gsm8k Accuracy
-        timeout-minutes: 60
+      - name: Nightly gsm8k and human eval Accuracy
+        timeout-minutes: 120
         run: |
           cd test/srt
-          python3 test_nightly_gsm8k_eval.py
+          python3 test_nightly_gsm8k_eval.py && python3 test_nightly_human_eval.py
diff --git a/test/srt/test_nightly_human_eval.py b/test/srt/test_nightly_human_eval.py
new file mode 100644
index 000000000..6d2ecee50
--- /dev/null
+++ b/test/srt/test_nightly_human_eval.py
@@ -0,0 +1,124 @@
+import os
+import shutil
+import signal
+import subprocess
+import unittest
+
+from test_nightly_gsm8k_eval import parse_models
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLarge(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_groups = [
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+        ]
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = None
+        cls.eval_process = None
+
+    @classmethod
+    def tearDownClass(cls):
+        if cls.process:
+            kill_child_process(cls.process.pid)
+        if cls.eval_process:
+            kill_child_process(cls.eval_process.pid)
+
+    def launch_server(self, model, is_fp8, is_tp2):
+        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
+        if is_fp8:
+            if "Llama-3" in model or "gemma-2" in model:
+                # weights are pre-quantized (compressed-tensors); only set the KV cache dtype
+                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
+            elif "Qwen2-72B-Instruct-FP8" in model:
+                # fp8_e5m2 KV cache currently fails for this model; quantize weights only
+                other_args.extend(["--quantization", "fp8"])
+            else:
+                other_args.extend(
+                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
+                )
+        if is_tp2:
+            other_args.extend(["--tp", "2"])
+        if "DeepSeek" in model:
+            other_args.extend(["--mem-frac", "0.85"])
+        if "AWQ" in model:
+            other_args.extend(["--quantization", "awq"])
+        elif "GPTQ" in model:
+            other_args.extend(["--quantization", "gptq"])
+
+        self.process = popen_launch_server(
+            model,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    def run_evalplus(self, model):
+        print("Removing old evalplus results")
+        shutil.rmtree("evalplus_results", ignore_errors=True)
+        cmd = [
+            "evalplus.evaluate",
+            "--model",
+            model,
+            "--dataset",
+            "humaneval",
+            "--backend",
+            "openai",
+            "--base-url",
+            f"{self.base_url}/v1",
+            "--greedy",
+        ]
+
+        try:
+            self.eval_process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                preexec_fn=os.setsid,
+            )
+
+            stdout, stderr = self.eval_process.communicate(timeout=600)
+
+            if self.eval_process.returncode != 0:
+                print(f"Failed to run human eval for model={model} err={stderr}")
+
+            print("=" * 42)
+            print(stdout)
+            print("=" * 42)
+        except subprocess.TimeoutExpired:
+            if self.eval_process:
+                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
+            print(f"Timeout during evaluation for model={model}")
+        except Exception as e:
+            print(f"Error running evalplus for model={model}: {str(e)}")
+            if self.eval_process:
+                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
+
+    def test_human_eval_all_models(self):
+        for model_group, is_fp8, is_tp2 in self.model_groups:
+            for model in model_group:
+                # NOTE: only Llama for now
+                if "Llama" in model:
+                    with self.subTest(model=model):
+                        self.launch_server(model, is_fp8, is_tp2)
+                        self.run_evalplus(model)
+                    self.tearDownClass()  # stop the server before the next model launches
+
+
+if __name__ == "__main__":
+    unittest.main()
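
A note on the `preexec_fn=os.setsid` / `os.killpg` pairing in `run_evalplus`: evalplus may fork its own worker processes, and killing only the direct child on timeout would leave those workers running. Starting the child as a session leader puts the whole tree into one process group, so a single `os.killpg` reaps everything. Below is a minimal standalone sketch of the same pattern, with `sleep 600` standing in for the long-running `evalplus.evaluate` command:

import os
import signal
import subprocess

# Run the child in a new session so it leads its own process group;
# any grandchildren it forks inherit that group id.
proc = subprocess.Popen(
    ["sleep", "600"],  # placeholder for evalplus.evaluate
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    preexec_fn=os.setsid,
)

try:
    stdout, stderr = proc.communicate(timeout=5)
except subprocess.TimeoutExpired:
    # Signal the entire group, not just the direct child, so helper
    # processes spawned by the tool are cleaned up as well.
    os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
    proc.wait()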
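Since each evalplus run gets a 600-second budget per model, a one-request smoke test against the freshly launched server can save a wasted nightly cycle. This is a sketch, not part of the test: the port mirrors the 6157 default used here, and "default" is the placeholder model name sglang's OpenAI-compatible server accepts in its examples; adjust both for your deployment.

import requests

BASE_URL = "http://127.0.0.1:6157"  # assumed to match DEFAULT_URL_FOR_TEST

# Hit the OpenAI-compatible completions route on the launched server.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={
        "model": "default",        # served-model alias; adjust if needed
        "prompt": "def add(a, b):",
        "max_tokens": 32,
        "temperature": 0,
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])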