minor: add human eval (#1754)
This commit is contained in:
7
.github/workflows/nightly-eval.yml
vendored
7
.github/workflows/nightly-eval.yml
vendored
@@ -25,9 +25,10 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
|
||||
|
||||
- name: Nightly gsm8k Accuracy
|
||||
timeout-minutes: 60
|
||||
- name: Nightly gsm8k and human eval Accuracy
|
||||
timeout-minutes: 120
|
||||
run: |
|
||||
cd test/srt
|
||||
python3 test_nightly_gsm8k_eval.py
|
||||
python3 test_nightly_human_eval.py
|
||||
|
||||
125
test/srt/test_nightly_human_eval.py
Normal file
125
test/srt/test_nightly_human_eval.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import os
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from test_nightly_gsm8k_eval import parse_models
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestEvalAccuracyLarge(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
cls.model_groups = [
|
||||
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
|
||||
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
|
||||
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
|
||||
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
|
||||
]
|
||||
cls.base_url = DEFAULT_URL_FOR_TEST
|
||||
cls.process = None
|
||||
cls.eval_process = None
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
if cls.process:
|
||||
kill_child_process(cls.process.pid)
|
||||
if cls.eval_process:
|
||||
kill_child_process(cls.eval_process.pid)
|
||||
|
||||
def launch_server(self, model, is_fp8, is_tp2):
|
||||
other_args = ["--log-level-http", "warning", "--trust-remote-code"]
|
||||
if is_fp8:
|
||||
if "Llama-3" in model or "gemma-2" in model:
|
||||
# compressed-tensors
|
||||
other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
|
||||
elif "Qwen2-72B-Instruct-FP8" in model:
|
||||
# bug
|
||||
other_args.extend(["--quantization", "fp8"])
|
||||
else:
|
||||
other_args.extend(
|
||||
["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
|
||||
)
|
||||
if is_tp2:
|
||||
other_args.extend(["--tp", "2"])
|
||||
if "DeepSeek" in model:
|
||||
other_args.extend(["--mem-frac", "0.85"])
|
||||
if "AWQ" in model:
|
||||
other_args.extend(["--quantization", "awq"])
|
||||
elif "GPTQ" in model:
|
||||
other_args.extend(["--quantization", "gptq"])
|
||||
|
||||
self.process = popen_launch_server(
|
||||
model,
|
||||
self.base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=other_args,
|
||||
)
|
||||
|
||||
def run_evalplus(self, model):
|
||||
print("Delete evalplus results")
|
||||
shutil.rmtree("evalplus_results", ignore_errors=True)
|
||||
cmd = [
|
||||
"evalplus.evaluate",
|
||||
"--model",
|
||||
model,
|
||||
"--dataset",
|
||||
"humaneval",
|
||||
"--backend",
|
||||
"openai",
|
||||
"--base-url",
|
||||
"http://localhost:6157/v1",
|
||||
"--greedy",
|
||||
]
|
||||
|
||||
try:
|
||||
self.eval_process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
preexec_fn=os.setsid,
|
||||
)
|
||||
|
||||
stdout, stderr = self.eval_process.communicate(timeout=600)
|
||||
|
||||
if self.eval_process.returncode != 0:
|
||||
print(f"Fail to human eval model={model} err={stderr}")
|
||||
|
||||
print("=" * 42)
|
||||
print(stdout)
|
||||
print("=" * 42)
|
||||
except subprocess.TimeoutExpired:
|
||||
if self.eval_process:
|
||||
os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
|
||||
print(f"Timeout during evaluation for model={model}")
|
||||
except Exception as e:
|
||||
print(f"Error running evalplus for model={model} {str(e)}")
|
||||
if self.eval_process:
|
||||
os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
|
||||
|
||||
def test_human_eval_all_models(self):
|
||||
for model_group, is_fp8, is_tp2 in self.model_groups:
|
||||
for model in model_group:
|
||||
# NOTE: only Llama for now
|
||||
if "Llama" in model:
|
||||
with self.subTest(model=model):
|
||||
self.launch_server(model, is_fp8, is_tp2)
|
||||
self.run_evalplus(model)
|
||||
self.tearDownClass()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user