Fix accuracy test (#1051)
@@ -16,6 +16,8 @@ from sglang.test.simple_eval_common import (
 
 
 def run_eval(args):
+    set_ulimit()
+
     if "OPENAI_API_KEY" not in os.environ:
         os.environ["OPENAI_API_KEY"] = "EMPTY"
 
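The diff moves set_ulimit() from the CLI entry point (see the next hunk) into run_eval itself, so programmatic callers such as the CI tests below also get a raised file-descriptor limit. The helper's body is not shown here; a minimal sketch of what such a helper typically does, assuming it raises the soft RLIMIT_NOFILE via the standard resource module (Unix-only):

import resource


def set_ulimit(target_soft_limit: int = 65535) -> None:
    # Sketch only; the real helper lives in sglang and may differ.
    # Raise the soft open-file limit so a large thread pool can hold
    # many client sockets at once.
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    if soft < target_soft_limit:
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (target_soft_limit, hard))
        except ValueError:
            # The soft limit cannot exceed the hard limit without privileges.
            pass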
@@ -117,7 +119,6 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
-    set_ulimit()
     args = parser.parse_args()
 
     run_eval(args)
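With the limit raised inside run_eval, the CLI path drops its own call, and in-process callers get it for free. A hypothetical minimal invocation, assuming run_eval is importable from sglang.test.run_eval and accepts the same fields the tests below pass:

from types import SimpleNamespace

from sglang.test.run_eval import run_eval  # assumed module path

args = SimpleNamespace(
    base_url="http://127.0.0.1:30000",  # assumed local server URL
    model="default",
    eval_name="mmlu",
    num_examples=64,
    num_threads=32,
)
metrics = run_eval(args)  # set_ulimit() now happens inside
print(metrics["score"])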
@@ -6,21 +6,15 @@ Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de
 https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 """
 
-import json
-import logging
-import multiprocessing
 import random
 import re
-from collections import Counter, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from io import BytesIO
-from typing import Any, Dict, List, Tuple
+from typing import Dict, List
 
-import blobfile as bf
 import tqdm
 
 try:
-    from human_eval.data import HUMAN_EVAL, read_problems
+    from human_eval.data import read_problems
     from human_eval.evaluation import estimate_pass_at_k
     from human_eval.execution import check_correctness  # , unsafe_execute
 except (ImportError, ModuleNotFoundError):
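The hunk above is pure import pruning: json, logging, multiprocessing, Counter/defaultdict, BytesIO, blobfile, and the unused HUMAN_EVAL, Any, and Tuple names all go. Dead imports like these can be found mechanically; a small sketch using pyflakes, assuming it is installed:

import sys

from pyflakes.api import checkPath
from pyflakes.reporter import Reporter

# Prints warnings such as "'json' imported but unused" for the file.
checkPath("simple_eval_humaneval.py", Reporter(sys.stdout, sys.stderr))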
@@ -32,12 +32,12 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             base_url=self.base_url,
             model=self.model,
             eval_name="mmlu",
-            num_examples=None,
-            num_threads=2048,
+            num_examples=3000,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.70
+        assert metrics["score"] >= 0.71, f"{metrics}"
 
     def test_human_eval(self):
         args = SimpleNamespace(
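Beyond tightening the MMLU threshold from 0.70 to 0.71 (and pinning num_examples to 3000 for a reproducible sample), each assert now carries an f"{metrics}" message, so a failing run prints the full metrics dict instead of a bare AssertionError. The same pattern in isolation:

metrics = {"score": 0.73}  # illustrative value; real metrics come from run_eval
# On failure the whole dict lands in the CI log, which makes triage faster.
assert metrics["score"] >= 0.71, f"{metrics}"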
@@ -45,11 +45,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="humaneval",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65
+        assert metrics["score"] >= 0.65, f"{metrics}"
 
     def test_mgsm_en(self):
         args = SimpleNamespace(
@@ -57,11 +57,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="mgsm_en",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85
+        assert metrics["score"] >= 0.85, f"{metrics}"
 
 
 if __name__ == "__main__":
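All three eval tests drop num_threads from 2048 to 1024. A plausible reason (an assumption, not stated in the diff) is file-descriptor pressure: each worker thread can hold an open socket, so the thread count must stay well under the soft nofile limit that set_ulimit raises. A quick sanity check in the same spirit:

import resource

num_threads = 1024
soft, _hard = resource.getrlimit(resource.RLIMIT_NOFILE)
# Each in-flight request holds at least one socket, so the soft limit
# should comfortably exceed the worker count.
assert soft > num_threads, f"soft nofile limit {soft} too low for {num_threads} threads"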
@@ -66,8 +66,8 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 performance
-            assert res["output_throughput"] >= 1300
+            # A100 (PCIE) performance
+            assert res["output_throughput"] >= 1400
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -77,8 +77,8 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 performance
-            assert res["output_throughput"] >= 1400
+            # A100 (PCIE) performance
+            assert res["output_throughput"] >= 1450
 
     def test_default_without_flashinfer(self):
         self.run_test(
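The raised throughput floors (1300 to 1400, and 1400 to 1450 tokens/s) apply only when SGLANG_IS_IN_CI is set, so local runs on slower hardware do not fail spuriously; the comments now pin the numbers to an A100 (PCIE). The guard pattern, extracted as a sketch:

import os


def is_in_ci() -> bool:
    # The CI workflow exports SGLANG_IS_IN_CI=true; locally it defaults off.
    return os.getenv("SGLANG_IS_IN_CI", "false") == "true"


res = {"output_throughput": 1500.0}  # illustrative benchmark result
if is_in_ci():
    # Thresholds are hardware-specific: these assume an A100 (PCIE) host.
    assert res["output_throughput"] >= 1450, f"{res}"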