[CI] Add more multi-gpu tests (#1280)

2024-09-01 00:27:25 -07:00
parent d134c139a1
commit 1b5d56f7f8
11 changed files with 271 additions and 128 deletions
--- a/test/srt/test_moe_eval_accuracy_large.py
+++ b/test/srt/test_moe_eval_accuracy_large.py
@@ -0,0 +1,73 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalAccuracyLarge(unittest.TestCase):
+    """Accuracy checks run against one shared server launched with --tp 2."""
+
+    @classmethod
+    def setUpClass(cls):
+        # One server is started for the whole class and reused by every test.
+        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        launch_flags = ["--log-level-http", "warning", "--tp", "2"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=launch_flags,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        # Hand the launched process's pid to kill_child_process once all
+        # tests in the class have finished.
+        kill_child_process(cls.process.pid)
+
+    def _evaluate(self, eval_name, num_examples):
+        """Run one evaluation against the shared server.
+
+        Args:
+            eval_name: Value passed to run_eval as ``eval_name``.
+            num_examples: Value passed as ``num_examples``; may be None.
+
+        Returns:
+            The object returned by run_eval; tests read its "score" entry.
+        """
+        # num_threads is fixed at 1024 for every evaluation in this class.
+        eval_args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name=eval_name,
+            num_examples=num_examples,
+            num_threads=1024,
+        )
+        return run_eval(eval_args)
+
+    def test_mmlu(self):
+        # 3000 examples; the score must be at least 0.63.
+        metrics = self._evaluate("mmlu", 3000)
+        assert metrics["score"] >= 0.63, f"{metrics}"
+
+    def test_human_eval(self):
+        # num_examples=None; the score must be at least 0.43.
+        metrics = self._evaluate("humaneval", None)
+        assert metrics["score"] >= 0.43, f"{metrics}"
+
+    def test_mgsm_en(self):
+        # num_examples=None; the score must be at least 0.64.
+        metrics = self._evaluate("mgsm_en", None)
+        assert metrics["score"] >= 0.64, f"{metrics}"
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_moe_serving_latency.py
+++ b/test/srt/test_moe_serving_latency.py
@@ -0,0 +1,45 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
+
+
+class TestServingLatency(unittest.TestCase):
+    """Smoke check of the sglang.bench_latency command-line entry point."""
+
+    def test_default(self):
+        # Runs the benchmark once (batch size 1, input 128, output 8, --tp 2)
+        # and, in CI only, checks a number parsed from its stdout.
+        command = ["python3", "-m", "sglang.bench_latency"]
+        command += ["--model", DEFAULT_MOE_MODEL_NAME_FOR_TEST]
+        command += ["--batch-size", "1", "--input", "128", "--output", "8"]
+        command += ["--tp", "2"]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        # Everything after the launch sits in try/finally so the cleanup call
+        # still runs when parsing raises or the threshold assertion fails.
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+
+            # The figure is the second-to-last space-separated token of the
+            # third item from the end of stdout split on newlines.
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                # NOTE(review): unit of this figure is not visible here.
+                assert value > 125, f"parsed value {value} is not above 125"
+        finally:
+            # Previously skipped whenever an exception was raised above.
+            kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")

        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1750
+            assert res["output_throughput"] > 1850

    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1850
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 1950


 if __name__ == "__main__":
--- a/test/srt/test_serving_latency.py
+++ b/test/srt/test_serving_latency.py
@@ -0,0 +1,43 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
+
+
+class TestServingLatency(unittest.TestCase):
+    """Smoke check of the sglang.bench_latency command-line entry point."""
+
+    def test_default(self):
+        # Runs the benchmark once (batch size 1, input 128, output 8) and,
+        # in CI only, checks a number parsed from its stdout.
+        command = ["python3", "-m", "sglang.bench_latency"]
+        command += ["--model", DEFAULT_MODEL_NAME_FOR_TEST]
+        command += ["--batch-size", "1", "--input", "128", "--output", "8"]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        # try/finally so the cleanup call runs even if parsing or the assert fails.
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+
+            # The figure is the second-to-last space-separated token of the
+            # third item from the end of stdout split on newlines.
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                # NOTE(review): unit of this figure is not visible here.
+                assert value > 130, f"parsed value {value} is not above 130"
+        finally:
+            # Previously skipped whenever an exception was raised above.
+            kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
+            assert res["output_throughput"] > 2400

    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
            assert res["output_throughput"] > 2800

    def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 2400


 if __name__ == "__main__":