From 0c1c72a0b409f255a1fcea666705af8140da5f1e Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 12 Aug 2024 02:48:40 -0700
Subject: [PATCH] Fix accuracy test (#1051)

---
 python/sglang/test/run_eval.py              |  3 ++-
 python/sglang/test/simple_eval_humaneval.py | 10 ++--------
 test/srt/test_eval_accuracy_large.py        | 14 +++++++-------
 test/srt/test_serving_throughput.py         |  8 ++++----
 4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py
index 3d13d475b..51b32ca01 100644
--- a/python/sglang/test/run_eval.py
+++ b/python/sglang/test/run_eval.py
@@ -16,6 +16,8 @@ from sglang.test.simple_eval_common import (
 
 
 def run_eval(args):
+    set_ulimit()
+
     if "OPENAI_API_KEY" not in os.environ:
         os.environ["OPENAI_API_KEY"] = "EMPTY"
 
@@ -117,7 +119,6 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
-    set_ulimit()
     args = parser.parse_args()
 
     run_eval(args)
diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py
index 7a0f90c46..efb0d0bd6 100644
--- a/python/sglang/test/simple_eval_humaneval.py
+++ b/python/sglang/test/simple_eval_humaneval.py
@@ -6,21 +6,15 @@ Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de
 https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 """
 
-import json
-import logging
-import multiprocessing
 import random
 import re
-from collections import Counter, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from io import BytesIO
-from typing import Any, Dict, List, Tuple
+from typing import Dict, List
 
-import blobfile as bf
 import tqdm
 
 try:
-    from human_eval.data import HUMAN_EVAL, read_problems
+    from human_eval.data import read_problems
     from human_eval.evaluation import estimate_pass_at_k
     from human_eval.execution import check_correctness  # , unsafe_execute
 except (ImportError, ModuleNotFoundError):
diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py
index 84a60dbe9..556954331 100644
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -32,12 +32,12 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             base_url=self.base_url,
             model=self.model,
             eval_name="mmlu",
-            num_examples=None,
-            num_threads=2048,
+            num_examples=3000,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.70
+        assert metrics["score"] >= 0.71, f"{metrics}"
 
     def test_human_eval(self):
         args = SimpleNamespace(
@@ -45,11 +45,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="humaneval",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65
+        assert metrics["score"] >= 0.65, f"{metrics}"
 
     def test_mgsm_en(self):
         args = SimpleNamespace(
@@ -57,11 +57,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="mgsm_en",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85
+        assert metrics["score"] >= 0.85, f"{metrics}"
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py
index 25b07d881..0066d01cb 100644
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -66,8 +66,8 @@ class TestServingThroughput(unittest.TestCase):
         )
 
os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 performance - assert res["output_throughput"] >= 1300 + # A100 (PCIE) performance + assert res["output_throughput"] >= 1400 def test_default_without_radix_cache(self): res = self.run_test( @@ -77,8 +77,8 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 performance - assert res["output_throughput"] >= 1400 + # A100 (PCIE) performance + assert res["output_throughput"] >= 1450 def test_default_without_flashinfer(self): self.run_test(