From 8207637029082563cab74951fe8d5f86b574b85e Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 11 Aug 2024 18:27:33 -0700 Subject: [PATCH] Improve end-to-end throughput test and its coverage (#1039) --- .github/workflows/e2e-test.yml | 27 +++---- python/sglang/bench_serving.py | 25 ++++--- python/sglang/test/test_utils.py | 1 + test/srt/test_chunked_prefill.py | 8 ++- test/srt/test_embedding_openai_server.py | 4 +- test/srt/test_eval_accuracy.py | 8 ++- test/srt/test_large_max_new_tokens.py | 8 ++- test/srt/test_openai_server.py | 8 ++- test/srt/test_serving_throughput.py | 92 ++++++++++++++++++++++++ test/srt/test_skip_tokenizer_init.py | 8 ++- test/srt/test_srt_endpoint.py | 8 ++- test/srt/test_throughput.py | 61 ++++++++++++++++ test/srt/test_torch_compile.py | 8 ++- test/srt/test_vision_openai_server.py | 4 +- 14 files changed, 224 insertions(+), 46 deletions(-) create mode 100644 test/srt/test_serving_throughput.py create mode 100644 test/srt/test_throughput.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 6e8984763..78ac4d9ec 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -37,23 +37,16 @@ jobs: - name: Benchmark Serving Throughput run: | - python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache & - SERVER_PID=$! + cd test/srt + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default - echo "Waiting for server to start..." - for i in {1..120}; do - if curl -s http://127.0.0.1:8413/health; then - echo "Server is up!" - break - fi - if [ $i -eq 120 ]; then - echo "Server failed to start within 120 seconds" - exit 1 - fi - sleep 1 - done + - name: Benchmark Serving Throughput (w/o RadixAttention) + run: | + cd test/srt + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache - cd $HOME && python3 -m sglang.bench_serving --backend sglang --port 8413 --dataset-name random --num-prompts 500 --random-input 4096 --random-output 2048 + - name: Benchmark Serving Throughput (w/o FlashInfer) + run: | + cd test/srt + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer - echo "Stopping server..." - kill -9 $SERVER_PID diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 4d733e959..e3a2ad0a2 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -39,6 +39,8 @@ from transformers import ( AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +global args + @dataclass class RequestFuncInput: @@ -749,7 +751,11 @@ def check_chat_template(model_path): return False -def fire(args: argparse.Namespace): +def run_benchmark(args_: argparse.Namespace): + global args + args = args_ + + set_ulimit() random.seed(args.seed) np.random.seed(args.seed) @@ -853,7 +859,7 @@ def fire(args: argparse.Namespace): ) ) else: - asyncio.run( + return asyncio.run( benchmark( backend=backend, api_url=api_url, @@ -962,11 +968,6 @@ if __name__ == "__main__": "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.", ) parser.add_argument("--seed", type=int, default=0, help="Default is 0.") - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) parser.add_argument( "--multi", action="store_true", @@ -979,6 +980,11 @@ if __name__ == "__main__": help="Range of request rates in the format start,stop,step. Default is 2,34,2. 
It also supports a list of request rates, requiring the parameters to not equal three.", ) parser.add_argument("--output-file", type=str, help="Output JSONL file name.") + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) parser.add_argument( "--disable-stream", action="store_true", @@ -996,8 +1002,5 @@ if __name__ == "__main__": help="Append given JSON object to the request payload. You can use this to specify" "additional generate params like sampling params.", ) - - set_ulimit() - args = parser.parse_args() - fire(args) + run_benchmark(args) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 22aa597f5..7243ff2ec 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -21,6 +21,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.utils import get_exception_traceback DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" +DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157" def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None): diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 271b73fab..3a9423bc5 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -3,7 +3,11 @@ from types import SimpleNamespace from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestAccuracy(unittest.TestCase): @@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py index ed7db6643..45580feda 100644 --- a/test/srt/test_embedding_openai_server.py +++ b/test/srt/test_embedding_openai_server.py @@ -4,7 +4,7 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import popen_launch_server +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server class TestOpenAIServer(unittest.TestCase): @@ -12,7 +12,7 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = "intfloat/e5-mistral-7b-instruct" - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy.py index da9a4f9c6..a3f16f857 100644 --- a/test/srt/test_eval_accuracy.py +++ b/test/srt/test_eval_accuracy.py @@ -3,7 +3,11 @@ from types import SimpleNamespace from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestAccuracy(unittest.TestCase): @@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = 
DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300) @classmethod diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py index 3b3212209..58f82b351 100644 --- a/test/srt/test_large_max_new_tokens.py +++ b/test/srt/test_large_max_new_tokens.py @@ -8,7 +8,11 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestOpenAIServer(unittest.TestCase): @@ -16,7 +20,7 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 95486d70e..b66c35f01 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -6,7 +6,11 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestOpenAIServer(unittest.TestCase): @@ -14,7 +18,7 @@ class TestOpenAIServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, api_key=cls.api_key diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py new file mode 100644 index 000000000..808bc833e --- /dev/null +++ b/test/srt/test_serving_throughput.py @@ -0,0 +1,92 @@ +import unittest +from types import SimpleNamespace + +from sglang.bench_serving import run_benchmark +from sglang.srt.utils import kill_child_process +from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server + + +class TestServingThroughput(unittest.TestCase): + + def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size): + # Launch the server + other_args = [] + if disable_radix_cache: + other_args.append("--disable-radix-cache") + if disable_flashinfer: + other_args.append("--disable-flashinfer") + other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)]) + + model = DEFAULT_MODEL_NAME_FOR_TEST + base_url = "http://127.0.0.1:9157" + process = popen_launch_server( + model, base_url, timeout=300, other_args=other_args + ) + + # Run benchmark + num_prompts = 400 + args = SimpleNamespace( + backend="sglang", + base_url=base_url, + host=None, + port=None, + dataset_name="random", + dataset_path="", + model=None, + tokenizer=None, + num_prompts=num_prompts, + sharegpt_output_len=None, + random_input_len=4096, + random_output_len=2048, + random_range_ratio=0.0, + request_rate=float("inf"), + multi=None, + seed=0, + output_file=None, + disable_tqdm=False, + disable_stream=False, + disable_ignore_eos=False, + extra_request_body=None, + ) + + try: + res = run_benchmark(args) + finally: + 
+            kill_child_process(process.pid)
+
+        assert res["completed"] == num_prompts
+
+    def test_default(self):
+        self.run_test(
+            disable_radix_cache=False,
+            disable_flashinfer=False,
+            chunked_prefill_size=-1,
+        )
+
+    def test_default_without_radix_cache(self):
+        self.run_test(
+            disable_radix_cache=True,
+            disable_flashinfer=False,
+            chunked_prefill_size=-1,
+        )
+
+    def test_default_without_flashinfer(self):
+        self.run_test(
+            disable_radix_cache=False,
+            disable_flashinfer=True,
+            chunked_prefill_size=-1,
+        )
+
+    def test_all_cases(self):
+        for disable_radix_cache in [False, True]:
+            for disable_flashinfer in [False, True]:
+                for chunked_prefill_size in [-1, 2048]:
+                    self.run_test(
+                        disable_radix_cache=disable_radix_cache,
+                        disable_flashinfer=disable_flashinfer,
+                        chunked_prefill_size=chunked_prefill_size,
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py
index 7417783f6..01bfdb96a 100644
--- a/test/srt/test_skip_tokenizer_init.py
+++ b/test/srt/test_skip_tokenizer_init.py
@@ -4,7 +4,11 @@ import unittest
 import requests
 
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestSkipTokenizerInit(unittest.TestCase):
@@ -12,7 +16,7 @@ class TestSkipTokenizerInit(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"]
         )
diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py
index 8948e22d7..2c40f5360 100644
--- a/test/srt/test_srt_endpoint.py
+++ b/test/srt/test_srt_endpoint.py
@@ -4,7 +4,11 @@ import unittest
 import requests
 
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestSRTEndpoint(unittest.TestCase):
@@ -12,7 +16,7 @@ class TestSRTEndpoint(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
diff --git a/test/srt/test_throughput.py b/test/srt/test_throughput.py
new file mode 100644
index 000000000..8948e22d7
--- /dev/null
+++ b/test/srt/test_throughput.py
@@ -0,0 +1,61 @@
+import json
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+
+
+class TestSRTEndpoint(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = "http://127.0.0.1:8157"
+        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_child_process(cls.process.pid)
+
+    def run_decode(
+        self, return_logprob=False, top_logprobs_num=0, return_text=False, n=1
+    ):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0 if n == 1 else 0.5,
+                    "max_new_tokens": 32,
+                    "n": n,
+                },
+                "stream": False,
+ "return_logprob": return_logprob, + "top_logprobs_num": top_logprobs_num, + "return_text_in_logprobs": return_text, + "logprob_start_len": 0, + }, + ) + print(json.dumps(response.json())) + print("=" * 100) + + def test_simple_decode(self): + self.run_decode() + + def test_parallel_sample(self): + self.run_decode(n=3) + + def test_logprob(self): + for top_logprobs_num in [0, 3]: + for return_text in [True, False]: + self.run_decode( + return_logprob=True, + top_logprobs_num=top_logprobs_num, + return_text=return_text, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 7b4664563..c8869a9cc 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -3,7 +3,11 @@ from types import SimpleNamespace from sglang.srt.utils import kill_child_process from sglang.test.run_eval import run_eval -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) class TestAccuracy(unittest.TestCase): @@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"] ) diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 52764b6b4..0449e33f1 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -5,7 +5,7 @@ import openai from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import popen_launch_server +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server class TestOpenAIVisionServer(unittest.TestCase): @@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = "liuhaotian/llava-v1.6-vicuna-7b" - cls.base_url = "http://127.0.0.1:8157" + cls.base_url = DEFAULT_URL_FOR_TEST cls.api_key = "sk-123456" cls.process = popen_launch_server( cls.model,