Add longer accuracy test on CI (#1049)

2024-08-12 02:21:38 -07:00
parent 89f23a5178
commit 41598e0d8e
13 changed files with 385 additions and 44 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -7,7 +7,7 @@ suites = {
    "minimal": [
        "test_chunked_prefill.py",
        "test_embedding_openai_server.py",
-        "test_eval_accuracy.py",
+        "test_eval_accuracy_mini.py",
        "test_large_max_new_tokens.py",
        "test_openai_server.py",
        "test_skip_tokenizer_init.py",
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -10,34 +10,41 @@ from sglang.test.test_utils import (
 )


-class TestAccuracy(unittest.TestCase):
+class TestChunkedPrefill(unittest.TestCase):  # MMLU check with --chunked-prefill-size 32, with and without --disable-radix-cache

-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
+    def run_mmlu(self, disable_radix_cache):  # shared helper used by both test_* methods below
+        other_args = ["--chunked-prefill-size", "32"]
+        if disable_radix_cache:
+            other_args += ["--disable-radix-cache"]
+
+        model = DEFAULT_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(  # launched per run_mmlu call (no longer once per class) so the flags can differ
+            model,
+            base_url,
            timeout=300,
-            other_args=["--chunked-prefill-size", "32"],
+            other_args=other_args,
        )

-    @classmethod
-    def tearDownClass(cls):
-        kill_child_process(cls.process.pid)
-
-    def test_mmlu(self):
        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
+            base_url=base_url,
+            model=model,
            eval_name="mmlu",
-            num_examples=20,
-            num_threads=20,
+            num_examples=32,
+            num_threads=32,
        )

-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        try:  # the finally clause kills the launched process even if the eval or the assertion fails
+            metrics = run_eval(args)
+            assert metrics["score"] >= 0.6
+        finally:
+            kill_child_process(process.pid)
+
+    def test_chunked_prefill(self):
+        self.run_mmlu(disable_radix_cache=False)
+
+    def test_chunked_prefill_without_radix_cache(self):
+        self.run_mmlu(disable_radix_cache=True)


 if __name__ == "__main__":
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -0,0 +1,68 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLarge(unittest.TestCase):  # runs the mmlu, humaneval and mgsm_en evals against one shared server
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = "http://127.0.0.1:7157"  # NOTE(review): hard-coded; DEFAULT_URL_FOR_TEST is imported in this file but unused
+        cls.process = popen_launch_server(  # started once for the class; killed in tearDownClass
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            other_args=["--log-level-http", "warning"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=None,  # presumably "no cap", i.e. the full eval set; confirm against run_eval
+            num_threads=2048,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.70
+
+    def test_human_eval(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="humaneval",
+            num_examples=None,  # presumably "no cap"; confirm against run_eval
+            num_threads=2048,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.65
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,  # presumably "no cap"; confirm against run_eval
+            num_threads=2048,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.85
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_eval_accuracy_mini.py
+++ b/test/srt/test_eval_accuracy_mini.py
@@ -10,7 +10,7 @@ from sglang.test.test_utils import (
 )


-class TestAccuracy(unittest.TestCase):
+class TestEvalAccuracyMini(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
@@ -27,12 +27,12 @@ class TestAccuracy(unittest.TestCase):
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
-            num_examples=20,
-            num_threads=20,
+            num_examples=32,
+            num_threads=32,
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        assert metrics["score"] >= 0.6


 if __name__ == "__main__":
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -1,3 +1,4 @@
+import os
 import unittest
 from types import SimpleNamespace

@@ -55,21 +56,30 @@ class TestServingThroughput(unittest.TestCase):
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts
+        return res

    def test_default(self):
-        self.run_test(
+        res = self.run_test(  # run_test now returns its result so throughput can be checked below
            disable_radix_cache=False,
            disable_flashinfer=False,
            chunked_prefill_size=-1,
        )

+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 performance floor; enforced only when SGLANG_IS_IN_CI is exactly "true"
+            assert res["output_throughput"] >= 1300
+
    def test_default_without_radix_cache(self):
-        self.run_test(
+        res = self.run_test(  # run_test now returns its result so throughput can be checked below
            disable_radix_cache=True,
            disable_flashinfer=False,
            chunked_prefill_size=-1,
        )

+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 performance floor; enforced only when SGLANG_IS_IN_CI is exactly "true"
+            assert res["output_throughput"] >= 1400
+
    def test_default_without_flashinfer(self):
        self.run_test(
            disable_radix_cache=False,
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -10,7 +10,7 @@ from sglang.test.test_utils import (
 )


-class TestAccuracy(unittest.TestCase):
+class TestTorchCompile(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
@@ -29,12 +29,12 @@ class TestAccuracy(unittest.TestCase):
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
-            num_examples=20,
-            num_threads=20,
+            num_examples=32,
+            num_threads=32,
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        assert metrics["score"] >= 0.6


 if __name__ == "__main__":