From 0c1c72a0b409f255a1fcea666705af8140da5f1e Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 12 Aug 2024 02:48:40 -0700
Subject: [PATCH] Fix accuracy test (#1051)

---
 python/sglang/test/run_eval.py              |  3 ++-
 python/sglang/test/simple_eval_humaneval.py | 10 ++--------
 test/srt/test_eval_accuracy_large.py        | 14 +++++++-------
 test/srt/test_serving_throughput.py         |  8 ++++----
 4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py
index 3d13d475b..51b32ca01 100644
--- a/python/sglang/test/run_eval.py
+++ b/python/sglang/test/run_eval.py
@@ -16,6 +16,8 @@ from sglang.test.simple_eval_common import (
 
 
 def run_eval(args):
+    set_ulimit()
+
     if "OPENAI_API_KEY" not in os.environ:
         os.environ["OPENAI_API_KEY"] = "EMPTY"
 
@@ -117,7 +119,6 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
-    set_ulimit()
     args = parser.parse_args()
 
     run_eval(args)
diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py
index 7a0f90c46..efb0d0bd6 100644
--- a/python/sglang/test/simple_eval_humaneval.py
+++ b/python/sglang/test/simple_eval_humaneval.py
@@ -6,21 +6,15 @@ Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de
 https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 """
 
-import json
-import logging
-import multiprocessing
 import random
 import re
-from collections import Counter, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from io import BytesIO
-from typing import Any, Dict, List, Tuple
+from typing import Dict, List
 
-import blobfile as bf
 import tqdm
 
 try:
-    from human_eval.data import HUMAN_EVAL, read_problems
+    from human_eval.data import read_problems
     from human_eval.evaluation import estimate_pass_at_k
     from human_eval.execution import check_correctness  # , unsafe_execute
 except (ImportError, ModuleNotFoundError):
diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py
index 84a60dbe9..556954331 100644
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -32,12 +32,12 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             base_url=self.base_url,
             model=self.model,
             eval_name="mmlu",
-            num_examples=None,
-            num_threads=2048,
+            num_examples=3000,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.70
+        assert metrics["score"] >= 0.71, f"{metrics}"
 
     def test_human_eval(self):
         args = SimpleNamespace(
@@ -45,11 +45,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="humaneval",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65
+        assert metrics["score"] >= 0.65, f"{metrics}"
 
     def test_mgsm_en(self):
         args = SimpleNamespace(
@@ -57,11 +57,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="mgsm_en",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )
 
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85
+        assert metrics["score"] >= 0.85, f"{metrics}"
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py
index 25b07d881..0066d01cb 100644
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -66,8 +66,8 @@ class TestServingThroughput(unittest.TestCase):
         )
 
os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 performance - assert res["output_throughput"] >= 1300 + # A100 (PCIE) performance + assert res["output_throughput"] >= 1400 def test_default_without_radix_cache(self): res = self.run_test( @@ -77,8 +77,8 @@ class TestServingThroughput(unittest.TestCase): ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": - # A100 performance - assert res["output_throughput"] >= 1400 + # A100 (PCIE) performance + assert res["output_throughput"] >= 1450 def test_default_without_flashinfer(self): self.run_test(