[CI] Include triton backend and online serving benchmark into CI (#1408)
This commit is contained in:
83
test/srt/test_bench_latency.py
Normal file
83
test/srt/test_bench_latency.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import os
|
||||
import subprocess
|
||||
import unittest
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
)
|
||||
|
||||
|
||||
class TestBenchLatency(unittest.TestCase):
|
||||
def test_default(self):
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
"sglang.bench_latency",
|
||||
"--model-path",
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
"--batch-size",
|
||||
"1",
|
||||
"--input",
|
||||
"128",
|
||||
"--output",
|
||||
"8",
|
||||
]
|
||||
process = subprocess.Popen(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
|
||||
try:
|
||||
stdout, stderr = process.communicate()
|
||||
output = stdout.decode()
|
||||
error = stderr.decode()
|
||||
print(f"Output: {output}")
|
||||
print(f"Error: {error}")
|
||||
|
||||
lastline = output.split("\n")[-3]
|
||||
value = float(lastline.split(" ")[-2])
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert value > 130
|
||||
finally:
|
||||
kill_child_process(process.pid)
|
||||
|
||||
def test_moe_default(self):
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
"sglang.bench_latency",
|
||||
"--model",
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
"--batch-size",
|
||||
"1",
|
||||
"--input",
|
||||
"128",
|
||||
"--output",
|
||||
"8",
|
||||
"--tp",
|
||||
"2",
|
||||
]
|
||||
process = subprocess.Popen(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
|
||||
try:
|
||||
stdout, stderr = process.communicate()
|
||||
output = stdout.decode()
|
||||
error = stderr.decode()
|
||||
print(f"Output: {output}")
|
||||
print(f"Error: {error}")
|
||||
|
||||
lastline = output.split("\n")[-3]
|
||||
value = float(lastline.split(" ")[-2])
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert value > 125
|
||||
finally:
|
||||
kill_child_process(process.pid)
|
||||
|
||||
|
||||
# Allow running this test file directly: python3 test_bench_latency.py
if __name__ == "__main__":
    unittest.main()
|
||||
99
test/srt/test_bench_serving.py
Normal file
99
test/srt/test_bench_serving.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
run_bench_serving,
|
||||
)
|
||||
|
||||
|
||||
class TestBenchServing(unittest.TestCase):
|
||||
|
||||
def test_offline_throughput_default(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=500,
|
||||
request_rate=float("inf"),
|
||||
other_server_args=[],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2600
|
||||
|
||||
def test_offline_throughput_without_radix_cache(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=500,
|
||||
request_rate=float("inf"),
|
||||
other_server_args=["--disable-radix-cache"],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2800
|
||||
|
||||
def test_offline_throughput_without_chunked_prefill(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=500,
|
||||
request_rate=float("inf"),
|
||||
other_server_args=["--chunked-prefill-size", "-1"],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2600
|
||||
|
||||
def test_offline_throughput_with_triton_attention_backend(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=500,
|
||||
request_rate=float("inf"),
|
||||
other_server_args=[
|
||||
"--attention-backend",
|
||||
"triton",
|
||||
"--context-length",
|
||||
"8192",
|
||||
],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2600
|
||||
|
||||
def test_online_latency_default(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=100,
|
||||
request_rate=1,
|
||||
other_server_args=[],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["median_e2e_latency_ms"] < 12000
|
||||
assert res["median_ttft_ms"] < 78
|
||||
assert res["median_itl_ms"] < 12
|
||||
|
||||
def test_moe_offline_throughput_default(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=300,
|
||||
request_rate=float("inf"),
|
||||
other_server_args=["--tp", "2"],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 1850
|
||||
|
||||
def test_moe_offline_throughput_without_radix_cache(self):
|
||||
res = run_bench_serving(
|
||||
model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
num_prompts=300,
|
||||
request_rate=float("inf"),
|
||||
other_server_args=["--tp", "2", "--disable-radix-cache"],
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 1950
|
||||
|
||||
|
||||
# Allow running this test file directly: python3 test_bench_serving.py
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -1,45 +0,0 @@
|
||||
import os
|
||||
import subprocess
|
||||
import unittest
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
|
||||
|
||||
|
||||
class TestServingLatency(unittest.TestCase):
|
||||
def test_default(self):
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
"sglang.bench_latency",
|
||||
"--model",
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
"--batch-size",
|
||||
"1",
|
||||
"--input",
|
||||
"128",
|
||||
"--output",
|
||||
"8",
|
||||
"--tp",
|
||||
"2",
|
||||
]
|
||||
process = subprocess.Popen(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
output = stdout.decode()
|
||||
error = stderr.decode()
|
||||
print(f"Output: {output}")
|
||||
print(f"Error: {error}")
|
||||
|
||||
lastline = output.split("\n")[-3]
|
||||
value = float(lastline.split(" ")[-2])
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert value > 125
|
||||
|
||||
kill_child_process(process.pid)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,92 +0,0 @@
|
||||
import os
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.bench_serving import run_benchmark
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestServingThroughput(unittest.TestCase):
|
||||
def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
|
||||
# Launch the server
|
||||
other_args = []
|
||||
if disable_radix_cache:
|
||||
other_args.append("--disable-radix-cache")
|
||||
if attention_backend:
|
||||
other_args.extend(["--attention-backend", attention_backend])
|
||||
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
|
||||
other_args.extend(["--tensor-parallel-size", "2"])
|
||||
|
||||
model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
|
||||
base_url = DEFAULT_URL_FOR_TEST
|
||||
process = popen_launch_server(
|
||||
model,
|
||||
base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=other_args,
|
||||
)
|
||||
|
||||
# Run benchmark
|
||||
num_prompts = 300
|
||||
args = SimpleNamespace(
|
||||
backend="sglang",
|
||||
base_url=base_url,
|
||||
host=None,
|
||||
port=None,
|
||||
dataset_name="random",
|
||||
dataset_path="",
|
||||
model=None,
|
||||
tokenizer=None,
|
||||
num_prompts=num_prompts,
|
||||
sharegpt_output_len=None,
|
||||
random_input_len=4096,
|
||||
random_output_len=2048,
|
||||
random_range_ratio=0.0,
|
||||
request_rate=float("inf"),
|
||||
multi=None,
|
||||
seed=0,
|
||||
output_file=None,
|
||||
disable_tqdm=False,
|
||||
disable_stream=False,
|
||||
disable_ignore_eos=False,
|
||||
extra_request_body=None,
|
||||
)
|
||||
|
||||
try:
|
||||
res = run_benchmark(args)
|
||||
finally:
|
||||
kill_child_process(process.pid)
|
||||
|
||||
assert res["completed"] == num_prompts
|
||||
return res
|
||||
|
||||
def test_default(self):
|
||||
res = self.run_test(
|
||||
disable_radix_cache=ServerArgs.disable_radix_cache,
|
||||
attention_backend=ServerArgs.attention_backend,
|
||||
chunked_prefill_size=ServerArgs.chunked_prefill_size,
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 1800
|
||||
|
||||
def test_default_without_radix_cache(self):
|
||||
res = self.run_test(
|
||||
disable_radix_cache=True,
|
||||
attention_backend=ServerArgs.attention_backend,
|
||||
chunked_prefill_size=ServerArgs.chunked_prefill_size,
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 1950
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,43 +0,0 @@
|
||||
import os
|
||||
import subprocess
|
||||
import unittest
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
|
||||
|
||||
|
||||
class TestServingLatency(unittest.TestCase):
|
||||
def test_default(self):
|
||||
command = [
|
||||
"python3",
|
||||
"-m",
|
||||
"sglang.bench_latency",
|
||||
"--model-path",
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
"--batch-size",
|
||||
"1",
|
||||
"--input",
|
||||
"128",
|
||||
"--output",
|
||||
"8",
|
||||
]
|
||||
process = subprocess.Popen(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = process.communicate()
|
||||
output = stdout.decode()
|
||||
error = stderr.decode()
|
||||
print(f"Output: {output}")
|
||||
print(f"Error: {error}")
|
||||
|
||||
lastline = output.split("\n")[-3]
|
||||
value = float(lastline.split(" ")[-2])
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert value > 130
|
||||
|
||||
kill_child_process(process.pid)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -1,111 +0,0 @@
|
||||
import os
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.bench_serving import run_benchmark
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestServingThroughput(unittest.TestCase):
|
||||
def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
|
||||
# Launch the server
|
||||
other_args = []
|
||||
if disable_radix_cache:
|
||||
other_args.append("--disable-radix-cache")
|
||||
if attention_backend:
|
||||
other_args.extend(["--attention-backend", attention_backend])
|
||||
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
|
||||
|
||||
model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||
base_url = DEFAULT_URL_FOR_TEST
|
||||
process = popen_launch_server(
|
||||
model,
|
||||
base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=other_args,
|
||||
)
|
||||
|
||||
# Run benchmark
|
||||
num_prompts = 500
|
||||
args = SimpleNamespace(
|
||||
backend="sglang",
|
||||
base_url=base_url,
|
||||
host=None,
|
||||
port=None,
|
||||
dataset_name="random",
|
||||
dataset_path="",
|
||||
model=None,
|
||||
tokenizer=None,
|
||||
num_prompts=num_prompts,
|
||||
sharegpt_output_len=None,
|
||||
random_input_len=4096,
|
||||
random_output_len=2048,
|
||||
random_range_ratio=0.0,
|
||||
request_rate=float("inf"),
|
||||
multi=None,
|
||||
seed=0,
|
||||
output_file=None,
|
||||
disable_tqdm=False,
|
||||
disable_stream=False,
|
||||
disable_ignore_eos=False,
|
||||
extra_request_body=None,
|
||||
)
|
||||
|
||||
try:
|
||||
res = run_benchmark(args)
|
||||
finally:
|
||||
kill_child_process(process.pid)
|
||||
|
||||
assert res["completed"] == num_prompts
|
||||
return res
|
||||
|
||||
def test_default(self):
|
||||
res = self.run_test(
|
||||
disable_radix_cache=ServerArgs.disable_radix_cache,
|
||||
attention_backend=ServerArgs.attention_backend,
|
||||
chunked_prefill_size=ServerArgs.chunked_prefill_size,
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2400
|
||||
|
||||
def test_default_without_radix_cache(self):
|
||||
res = self.run_test(
|
||||
disable_radix_cache=True,
|
||||
attention_backend=ServerArgs.attention_backend,
|
||||
chunked_prefill_size=ServerArgs.chunked_prefill_size,
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2800
|
||||
|
||||
def test_default_without_chunked_prefill(self):
|
||||
res = self.run_test(
|
||||
disable_radix_cache=ServerArgs.disable_radix_cache,
|
||||
attention_backend=ServerArgs.attention_backend,
|
||||
chunked_prefill_size=-1,
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2400
|
||||
|
||||
def test_default_with_triton_attention_backend(self):
|
||||
res = self.run_test(
|
||||
disable_radix_cache=ServerArgs.disable_radix_cache,
|
||||
attention_backend="triton",
|
||||
chunked_prefill_size=-1,
|
||||
)
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
assert res["output_throughput"] > 2400
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user