From 68be2f6d3b8df28ee0e3553c528c8842987c18f2 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Thu, 12 Sep 2024 21:36:41 -0700
Subject: [PATCH] [CI] Include triton backend and online serving benchmark
 into CI (#1408)

---
 .github/workflows/pr-test.yml           |  58 +++++++++----
 python/sglang/test/test_utils.py        |  46 ++++++++++
 test/srt/test_bench_latency.py          |  83 ++++++++++++++++++
 test/srt/test_bench_serving.py          |  99 +++++++++++++++++++++
 test/srt/test_moe_serving_latency.py    |  45 ----------
 test/srt/test_moe_serving_throughput.py |  92 --------------------
 test/srt/test_serving_latency.py        |  43 ---------
 test/srt/test_serving_throughput.py     | 111 ------------------------
 8 files changed, 270 insertions(+), 307 deletions(-)
 create mode 100644 test/srt/test_bench_latency.py
 create mode 100644 test/srt/test_bench_serving.py
 delete mode 100644 test/srt/test_moe_serving_latency.py
 delete mode 100644 test/srt/test_moe_serving_throughput.py
 delete mode 100644 test/srt/test_serving_latency.py
 delete mode 100644 test/srt/test_serving_throughput.py

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 5784a0975..6c536afc8 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -75,7 +75,7 @@ jobs:
         cd test/srt
         python3 run_suite.py --suite minimal --range-begin 8

-  performance-test-1-gpu:
+  performance-test-1-gpu-part-1:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     steps:
@@ -88,29 +88,54 @@ jobs:
           pip install -e "python[all]"
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

-      - name: Benchmark Serving Throughput
+      - name: Benchmark Offline Throughput
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

-      - name: Benchmark Serving Latency
+      - name: Benchmark Offline Throughput (w/o RadixAttention)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_serving_latency.TestServingLatency.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

-      - name: Benchmark Serving Throughput (w/o RadixAttention)
+      - name: Benchmark Offline Throughput (w/o ChunkedPrefill)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_chunked_prefill

-      - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
+      - name: Benchmark Offline Throughput (w/ Triton)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+
+  performance-test-1-gpu-part-2:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[all]"
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Benchmark Single Latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_latency.TestBenchLatency.test_default
+
+      - name: Benchmark Online Latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

   performance-test-2-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -125,23 +150,24 @@ jobs:
     runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v3

       - name: Install dependencies
         run: |
           pip install --upgrade pip
           pip install -e "python[all]"
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

-      - name: Benchmark Serving Throughput (TP=2)
+      - name: Benchmark Offline Throughput (TP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

-      - name: Benchmark Serving Latency (TP=2)
+      - name: Benchmark Offline Throughput (w/o RadixAttention) (TP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

-      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      - name: Benchmark Single Latency (TP=2)
         timeout-minutes: 10
         run: |
           cd test/srt
-          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+          python3 -m unittest test_bench_latency.TestBenchLatency.test_moe_default
+

   accuracy-test-1-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -192,7 +218,7 @@ jobs:
   finish:
     needs: [
       unit-test-frontend, unit-test-backend-part-0, unit-test-backend-part-1,
-      performance-test-1-gpu, performance-test-2-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
       accuracy-test-1-gpu, accuracy-test-2-gpu
     ]
     runs-on: ubuntu-latest
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 1b9b63e88..a816bb7fa 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -7,6 +7,7 @@ import subprocess
 import threading
 import time
 from functools import partial
+from types import SimpleNamespace
 from typing import Callable, List, Optional

 import numpy as np
@@ -14,6 +15,7 @@ import requests
 import torch
 import torch.nn.functional as F

+from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -501,3 +503,47 @@ def run_unittest_files(files: List[str], timeout_per_file: float):

 def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+def run_bench_serving(model, num_prompts, request_rate, other_server_args):
+    # Launch the server
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # Run benchmark
+    args = SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name="random",
+        dataset_path="",
+        model=None,
+        tokenizer=None,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        random_input_len=4096,
+        random_output_len=2048,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        seed=0,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=False,
+        disable_ignore_eos=False,
+        extra_request_body=None,
+    )
+
+    try:
+        res = run_benchmark(args)
+    finally:
+        kill_child_process(process.pid)
+
+    assert res["completed"] == num_prompts
+    return res
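The run_bench_serving helper added above is the single entry point the rewritten benchmark tests call. As a minimal usage sketch (assuming a working sglang install and a supported GPU; the small num_prompts value here is a hypothetical choice for a quick local smoke run, not the CI setting):

    from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, run_bench_serving

    # Launches a server for the model, sends num_prompts random requests through
    # sglang.bench_serving, shuts the server down, and returns the metrics dict.
    res = run_bench_serving(
        model=DEFAULT_MODEL_NAME_FOR_TEST,
        num_prompts=10,             # assumption: small count for a fast local check
        request_rate=float("inf"),  # inf = offline mode, submit all requests at once
        other_server_args=[],       # extra server flags, e.g. ["--tp", "2"]
    )
    print(res["output_throughput"], res["median_e2e_latency_ms"])

A finite request_rate instead models the online-serving case, which is what test_online_latency_default below uses.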
diff --git a/test/srt/test_bench_latency.py b/test/srt/test_bench_latency.py
new file mode 100644
index 000000000..2c893ee66
--- /dev/null
+++ b/test/srt/test_bench_latency.py
@@ -0,0 +1,83 @@
+import os
+import subprocess
+import unittest
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+)
+
+
+class TestBenchLatency(unittest.TestCase):
+    def test_default(self):
+        command = [
+            "python3",
+            "-m",
+            "sglang.bench_latency",
+            "--model-path",
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            "--batch-size",
+            "1",
+            "--input",
+            "128",
+            "--output",
+            "8",
+        ]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                assert value > 130
+        finally:
+            kill_child_process(process.pid)
+
+    def test_moe_default(self):
+        command = [
+            "python3",
+            "-m",
+            "sglang.bench_latency",
+            "--model",
+            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            "--batch-size",
+            "1",
+            "--input",
+            "128",
+            "--output",
+            "8",
+            "--tp",
+            "2",
+        ]
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+
+        try:
+            stdout, stderr = process.communicate()
+            output = stdout.decode()
+            error = stderr.decode()
+            print(f"Output: {output}")
+            print(f"Error: {error}")
+
+            lastline = output.split("\n")[-3]
+            value = float(lastline.split(" ")[-2])
+
+            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+                assert value > 125
+        finally:
+            kill_child_process(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
new file mode 100644
index 000000000..d2275e5a2
--- /dev/null
+++ b/test/srt/test_bench_serving.py
@@ -0,0 +1,99 @@
+import os
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    run_bench_serving,
+)
+
+
+class TestBenchServing(unittest.TestCase):
+
+    def test_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 2600
+
+    def test_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--disable-radix-cache"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 2800
+
+    def test_offline_throughput_without_chunked_prefill(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--chunked-prefill-size", "-1"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 2600
+
+    def test_offline_throughput_with_triton_attention_backend(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--attention-backend",
+                "triton",
+                "--context-length",
+                "8192",
+            ],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 2600
+
+    def test_online_latency_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=100,
+            request_rate=1,
+            other_server_args=[],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["median_e2e_latency_ms"] < 12000
+            assert res["median_ttft_ms"] < 78
+            assert res["median_itl_ms"] < 12
+
+    def test_moe_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 1850
+
+    def test_moe_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2", "--disable-radix-cache"],
+        )
+
+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            assert res["output_throughput"] > 1950
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_moe_serving_latency.py b/test/srt/test_moe_serving_latency.py
deleted file mode 100644
index 9d5215323..000000000
--- a/test/srt/test_moe_serving_latency.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import os
-import subprocess
-import unittest
-
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
-
-
-class TestServingLatency(unittest.TestCase):
-    def test_default(self):
-        command = [
-            "python3",
-            "-m",
-            "sglang.bench_latency",
-            "--model",
-            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-            "--batch-size",
-            "1",
-            "--input",
-            "128",
-            "--output",
-            "8",
-            "--tp",
-            "2",
-        ]
-        process = subprocess.Popen(
-            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = process.communicate()
-        output = stdout.decode()
-        error = stderr.decode()
-        print(f"Output: {output}")
-        print(f"Error: {error}")
-
-        lastline = output.split("\n")[-3]
-        value = float(lastline.split(" ")[-2])
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert value > 125
-
-        kill_child_process(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py
deleted file mode 100644
index 65b0b55b9..000000000
--- a/test/srt/test_moe_serving_throughput.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os
-import unittest
-from types import SimpleNamespace
-
-from sglang.bench_serving import run_benchmark
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import (
-    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestServingThroughput(unittest.TestCase):
-    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
-        # Launch the server
-        other_args = []
-        if disable_radix_cache:
-            other_args.append("--disable-radix-cache")
-        if attention_backend:
-            other_args.extend(["--attention-backend", attention_backend])
-        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
-        other_args.extend(["--tensor-parallel-size", "2"])
-
-        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-        # Run benchmark
-        num_prompts = 300
-        args = SimpleNamespace(
-            backend="sglang",
-            base_url=base_url,
-            host=None,
-            port=None,
-            dataset_name="random",
-            dataset_path="",
-            model=None,
-            tokenizer=None,
-            num_prompts=num_prompts,
-            sharegpt_output_len=None,
-            random_input_len=4096,
-            random_output_len=2048,
-            random_range_ratio=0.0,
-            request_rate=float("inf"),
-            multi=None,
-            seed=0,
-            output_file=None,
-            disable_tqdm=False,
-            disable_stream=False,
-            disable_ignore_eos=False,
-            extra_request_body=None,
-        )
-
-        try:
-            res = run_benchmark(args)
-        finally:
-            kill_child_process(process.pid)
-
-        assert res["completed"] == num_prompts
-        return res
-
-    def test_default(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 1800
-
-    def test_default_without_radix_cache(self):
-        res = self.run_test(
-            disable_radix_cache=True,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 1950
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_serving_latency.py b/test/srt/test_serving_latency.py
deleted file mode 100644
index 3dae4541a..000000000
--- a/test/srt/test_serving_latency.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-import subprocess
-import unittest
-
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
-
-
-class TestServingLatency(unittest.TestCase):
-    def test_default(self):
-        command = [
-            "python3",
-            "-m",
-            "sglang.bench_latency",
-            "--model-path",
-            DEFAULT_MODEL_NAME_FOR_TEST,
-            "--batch-size",
-            "1",
-            "--input",
-            "128",
-            "--output",
-            "8",
-        ]
-        process = subprocess.Popen(
-            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = process.communicate()
-        output = stdout.decode()
-        error = stderr.decode()
-        print(f"Output: {output}")
-        print(f"Error: {error}")
-
-        lastline = output.split("\n")[-3]
-        value = float(lastline.split(" ")[-2])
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert value > 130
-
-        kill_child_process(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py
deleted file mode 100644
index 16da1d963..000000000
--- a/test/srt/test_serving_throughput.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import os
-import unittest
-from types import SimpleNamespace
-
-from sglang.bench_serving import run_benchmark
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestServingThroughput(unittest.TestCase):
-    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
-        # Launch the server
-        other_args = []
-        if disable_radix_cache:
-            other_args.append("--disable-radix-cache")
-        if attention_backend:
-            other_args.extend(["--attention-backend", attention_backend])
-        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
-
-        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-        # Run benchmark
-        num_prompts = 500
-        args = SimpleNamespace(
-            backend="sglang",
-            base_url=base_url,
-            host=None,
-            port=None,
-            dataset_name="random",
-            dataset_path="",
-            model=None,
-            tokenizer=None,
-            num_prompts=num_prompts,
-            sharegpt_output_len=None,
-            random_input_len=4096,
-            random_output_len=2048,
-            random_range_ratio=0.0,
-            request_rate=float("inf"),
-            multi=None,
-            seed=0,
-            output_file=None,
-            disable_tqdm=False,
-            disable_stream=False,
-            disable_ignore_eos=False,
-            extra_request_body=None,
-        )
-
-        try:
-            res = run_benchmark(args)
-        finally:
-            kill_child_process(process.pid)
-
-        assert res["completed"] == num_prompts
-        return res
-
-    def test_default(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2400
-
-    def test_default_without_radix_cache(self):
-        res = self.run_test(
-            disable_radix_cache=True,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=ServerArgs.chunked_prefill_size,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2800
-
-    def test_default_without_chunked_prefill(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend=ServerArgs.attention_backend,
-            chunked_prefill_size=-1,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2400
-
-    def test_default_with_triton_attention_backend(self):
-        res = self.run_test(
-            disable_radix_cache=ServerArgs.disable_radix_cache,
-            attention_backend="triton",
-            chunked_prefill_size=-1,
-        )
-
-        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            assert res["output_throughput"] > 2400
-
-
-if __name__ == "__main__":
-    unittest.main()
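As a usage note, every CI step in this patch is a plain unittest invocation, so the same benchmarks can be reproduced locally (assuming an sglang install with a suitable GPU):

    cd test/srt
    python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
    python3 -m unittest test_bench_latency.TestBenchLatency.test_default

The numeric thresholds (e.g. output_throughput > 2600) are only asserted when the SGLANG_IS_IN_CI environment variable is set to "true", so runs on other hardware print the measured metrics without failing.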