[CI] Include triton backend and online serving benchmark into CI (#1408)

2024-09-12 21:36:41 -07:00
parent b912de11b0
commit 68be2f6d3b
8 changed files with 270 additions and 307 deletions
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -75,7 +75,7 @@ jobs:
          cd test/srt
          python3 run_suite.py --suite minimal --range-begin 8

-  performance-test-1-gpu:
+  performance-test-1-gpu-part-1:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    steps:
@@ -88,29 +88,54 @@ jobs:
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

-      - name: Benchmark Serving Throughput
+      - name: Benchmark Offline Throughput
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

-      - name: Benchmark Serving Latency
+      - name: Benchmark Offline Throughput (w/o RadixAttention)
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_serving_latency.TestServingLatency.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

-      - name: Benchmark Serving Throughput (w/o RadixAttention)
+      - name: Benchmark Offline Throughput (w/o ChunkedPrefill)
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_chunked_prefill

-      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
+      - name: Benchmark Offline Throughput (w/ Triton)
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+
+  performance-test-1-gpu-part-2:  # latency benchmarks; part-1 holds the offline throughput ones
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Benchmark Single Latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_latency.TestBenchLatency.test_default
+
+      - name: Benchmark Online Latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

  performance-test-2-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -125,23 +150,24 @@ jobs:
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

-      - name: Benchmark Serving Throughput (TP=2)
+      - name: Benchmark Offline Throughput (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

-      - name: Benchmark Serving Latency (TP=2)
+      - name: Benchmark Offline Throughput (w/o RadixAttention) (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

-      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      - name: Benchmark Single Latency (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+          python3 -m unittest test_bench_latency.TestBenchLatency.test_moe_default
+

  accuracy-test-1-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -192,7 +218,7 @@ jobs:
  finish:
    needs: [
      unit-test-frontend, unit-test-backend-part-0, unit-test-backend-part-1,
-      performance-test-1-gpu, performance-test-2-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
      accuracy-test-1-gpu, accuracy-test-2-gpu
    ]
    runs-on: ubuntu-latest
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -7,6 +7,7 @@ import subprocess
 import threading
 import time
 from functools import partial
+from types import SimpleNamespace
 from typing import Callable, List, Optional

 import numpy as np
@@ -14,6 +15,7 @@ import requests
 import torch
 import torch.nn.functional as F

+from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -501,3 +503,47 @@ def run_unittest_files(files: List[str], timeout_per_file: float):

 def get_similarities(vec1, vec2):
    return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+def run_bench_serving(model, num_prompts, request_rate, other_server_args):
+    """Launch a server for `model`, benchmark it, and return the benchmark result."""
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # Benchmark settings handed to run_benchmark (random dataset, fixed seed).
+    args = SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name="random",
+        dataset_path="",
+        model=None,
+        tokenizer=None,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        random_input_len=4096,
+        random_output_len=2048,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        seed=0,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=False,
+        disable_ignore_eos=False,
+        extra_request_body=None,
+    )
+
+    try:
+        res = run_benchmark(args)
+    finally:
+        kill_child_process(process.pid)
+    if res["completed"] != num_prompts:  # explicit raise also runs under python -O
+        raise AssertionError(f"{res['completed']}/{num_prompts} requests completed")
+    return res
--- a/test/srt/test_bench_latency.py
+++ b/test/srt/test_bench_latency.py
@@ -0,0 +1,83 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+)
+
+
+class TestBenchLatency(unittest.TestCase):
+    """Run `sglang.bench_latency` as a subprocess; thresholds apply only in CI."""
+
+    def test_default(self):
+        command = [
+            "python3",
+            "-m",
+            "sglang.bench_latency",
+            "--model-path",
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            "--batch-size",
+            "1",
+            "--input",
+            "128",
+            "--output",
+            "8",
+        ]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+            # Parse the metric: 2nd-to-last field of the 3rd-from-last stdout line.
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                self.assertGreater(value, 130)
+        finally:
+            kill_child_process(process.pid)
+
+    def test_moe_default(self):
+        command = [
+            "python3",
+            "-m",
+            "sglang.bench_latency",
+            "--model-path",
+            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            "--batch-size",
+            "1",
+            "--input",
+            "128",
+            "--output",
+            "8",
+            "--tp",
+            "2",
+        ]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+            # Parse the metric: 2nd-to-last field of the 3rd-from-last stdout line.
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                self.assertGreater(value, 125)
+        finally:
+            kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -0,0 +1,99 @@
+import os
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    run_bench_serving,
+)
+
+
+class TestBenchServing(unittest.TestCase):
+    # Thresholds are checked only when SGLANG_IS_IN_CI=true.
+    def test_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertGreater(res["output_throughput"], 2600)
+
+    def test_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--disable-radix-cache"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertGreater(res["output_throughput"], 2800)
+
+    def test_offline_throughput_without_chunked_prefill(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--chunked-prefill-size", "-1"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertGreater(res["output_throughput"], 2600)
+
+    def test_offline_throughput_with_triton_attention_backend(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--attention-backend",
+                "triton",
+                "--context-length",
+                "8192",
+            ],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertGreater(res["output_throughput"], 2600)
+
+    def test_online_latency_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=100,
+            request_rate=1,  # finite request rate; the offline tests use float("inf")
+            other_server_args=[],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertLess(res["median_e2e_latency_ms"], 12000)
+            self.assertLess(res["median_ttft_ms"], 78)
+            self.assertLess(res["median_itl_ms"], 12)
+
+    def test_moe_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertGreater(res["output_throughput"], 1850)
+
+    def test_moe_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2", "--disable-radix-cache"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            self.assertGreater(res["output_throughput"], 1950)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_moe_serving_latency.py
+++ b/test/srt/test_moe_serving_latency.py
@@ -1,45 +0,0 @@
-import os
-import subprocess
-import unittest
-
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
-
-
-class TestServingLatency(unittest.TestCase):
-    def test_default(self):
-        command = [
-            "python3",
-            "-m",
-            "sglang.bench_latency",
-            "--model",
-            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-            "--batch-size",
-            "1",
-            "--input",
-            "128",
-            "--output",
-            "8",
-            "--tp",
-            "2",
-        ]
-        process = subprocess.Popen(
-            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = process.communicate()
-        output = stdout.decode()
-        error = stderr.decode()
-        print(f"Output: {output}")
-        print(f"Error: {error}")
-
-        lastline = output.split("\n")[-3]
-        value = float(lastline.split(" ")[-2])
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert value > 125
-
-        kill_child_process(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -1,92 +0,0 @@
-import os
-import unittest
-from types import SimpleNamespace
-
-from sglang.bench_serving import run_benchmark
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import (
-    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestServingThroughput(unittest.TestCase):
-    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
-        # Launch the server
-        other_args = []
-        if disable_radix_cache:
-            other_args.append("--disable-radix-cache")
-        if attention_backend:
-            other_args.extend(["--attention-backend", attention_backend])
-        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
-        other_args.extend(["--tensor-parallel-size", "2"])
-
-        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-        # Run benchmark
-        num_prompts = 300
-        args = SimpleNamespace(
-            backend="sglang",
-            base_url=base_url,
-            host=None,
-            port=None,
-            dataset_name="random",
-            dataset_path="",
-            model=None,
-            tokenizer=None,
-            num_prompts=num_prompts,
-            sharegpt_output_len=None,
-            random_input_len=4096,
-            random_output_len=2048,
-            random_range_ratio=0.0,
-            request_rate=float("inf"),
-            multi=None,
-            seed=0,
-            output_file=None,
-            disable_tqdm=False,
-            disable_stream=False,
-            disable_ignore_eos=False,
-            extra_request_body=None,
-        )
-
-        try:
-            res = run_benchmark(args)
-        finally:
-            kill_child_process(process.pid)
-
-        assert res["completed"] == num_prompts
-        return res
-
-    def test_default(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 1800
-
-    def test_default_without_radix_cache(self):
-        res = self.run_test(
-            disable_radix_cache=True,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 1950
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/test/srt/test_serving_latency.py
+++ b/test/srt/test_serving_latency.py
@@ -1,43 +0,0 @@
-import os
-import subprocess
-import unittest
-
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
-
-
-class TestServingLatency(unittest.TestCase):
-    def test_default(self):
-        command = [
-            "python3",
-            "-m",
-            "sglang.bench_latency",
-            "--model-path",
-            DEFAULT_MODEL_NAME_FOR_TEST,
-            "--batch-size",
-            "1",
-            "--input",
-            "128",
-            "--output",
-            "8",
-        ]
-        process = subprocess.Popen(
-            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = process.communicate()
-        output = stdout.decode()
-        error = stderr.decode()
-        print(f"Output: {output}")
-        print(f"Error: {error}")
-
-        lastline = output.split("\n")[-3]
-        value = float(lastline.split(" ")[-2])
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert value > 130
-
-        kill_child_process(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -1,111 +0,0 @@
-import os
-import unittest
-from types import SimpleNamespace
-
-from sglang.bench_serving import run_benchmark
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestServingThroughput(unittest.TestCase):
-    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
-        # Launch the server
-        other_args = []
-        if disable_radix_cache:
-            other_args.append("--disable-radix-cache")
-        if attention_backend:
-            other_args.extend(["--attention-backend", attention_backend])
-        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
-
-        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-        # Run benchmark
-        num_prompts = 500
-        args = SimpleNamespace(
-            backend="sglang",
-            base_url=base_url,
-            host=None,
-            port=None,
-            dataset_name="random",
-            dataset_path="",
-            model=None,
-            tokenizer=None,
-            num_prompts=num_prompts,
-            sharegpt_output_len=None,
-            random_input_len=4096,
-            random_output_len=2048,
-            random_range_ratio=0.0,
-            request_rate=float("inf"),
-            multi=None,
-            seed=0,
-            output_file=None,
-            disable_tqdm=False,
-            disable_stream=False,
-            disable_ignore_eos=False,
-            extra_request_body=None,
-        )
-
-        try:
-            res = run_benchmark(args)
-        finally:
-            kill_child_process(process.pid)
-
-        assert res["completed"] == num_prompts
-        return res
-
-    def test_default(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2400
-
-    def test_default_without_radix_cache(self):
-        res = self.run_test(
-            disable_radix_cache=True,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2800
-
-    def test_default_without_chunked_prefill(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=-1,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2400
-
-    def test_default_with_triton_attention_backend(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend="triton",
-            chunked_prefill_size=-1,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2400
-
-
-if __name__ == "__main__":
-    unittest.main()