From c877292cc12a61011694d7d0ea53c05f247003f6 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 12 Aug 2024 03:39:01 -0700 Subject: [PATCH] Re-organize CI tests (#1052) --- .github/workflows/e2e-test.yml | 5 +- .../sglang/srt/constrained/base_tool_cache.py | 2 +- python/sglang/srt/managers/tp_worker.py | 17 +++-- python/sglang/srt/mem_cache/chunk_cache.py | 2 +- python/sglang/srt/server.py | 9 +++ test/srt/run_suite.py | 1 + ...est_eval_accuracy_large_chunked_prefill.py | 68 +++++++++++++++++++ test/srt/test_serving_throughput.py | 25 ++++--- test/srt/test_triton_attn_backend.py | 41 +++++++++++ 9 files changed, 148 insertions(+), 22 deletions(-) create mode 100644 test/srt/test_eval_accuracy_large_chunked_prefill.py create mode 100644 test/srt/test_triton_attn_backend.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 336f6a14f..455594bd7 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -45,8 +45,7 @@ jobs: cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache - - name: Benchmark Serving Throughput (w/o FlashInfer) + - name: Benchmark Serving Throughput (w/ ChunkedPrefill) run: | cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer - + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill diff --git a/python/sglang/srt/constrained/base_tool_cache.py b/python/sglang/srt/constrained/base_tool_cache.py index 4cbb6bd22..fa1aff5ea 100644 --- a/python/sglang/srt/constrained/base_tool_cache.py +++ b/python/sglang/srt/constrained/base_tool_cache.py @@ -54,7 +54,7 @@ class BaseToolCache: return val def init_value(self, key): - raise NotImplementedError + raise NotImplementedError() def get_cache_hit_rate(self): if self.metrics["total"] == 0: diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index f14885263..a8b952361 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -410,13 +410,16 @@ class ModelTpServer: # Print stats if self.tp_rank == 0: - self.tree_cache_metrics["total"] += ( - adder.log_input_tokens + adder.log_hit_tokens - ) / 10**9 - self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 - tree_cache_hit_rate = ( - self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] - ) + if isinstance(self.tree_cache, RadixCache): + self.tree_cache_metrics["total"] += ( + adder.log_input_tokens + adder.log_hit_tokens + ) / 10**9 + self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 + tree_cache_hit_rate = ( + self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] + ) + else: + tree_cache_hit_rate = 0.0 logger.info( f"[gpu={self.gpu_id}] Prefill batch. " f"#new-seq: {len(can_run_list)}, " diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py index 35b9171e5..e7e48ecee 100644 --- a/python/sglang/srt/mem_cache/chunk_cache.py +++ b/python/sglang/srt/mem_cache/chunk_cache.py @@ -68,7 +68,7 @@ class ChunkCache(BasePrefixCache): req.last_node = entry def insert(self): - raise NotImplementedError + raise NotImplementedError() def evict(self, num_tokens: int, evict_callback: Callable): pass diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 8b6766335..7331425fa 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -447,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer): print(f"Initialization failed. warmup error: {last_traceback}", flush=True) sys.exit(1) + # Print warnings here + if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None: + logger.warning( + "You set both `--disable-radix-cache` and `--chunked-prefill-size`. " + "This combination is an experimental feature and we noticed it can lead to " + "wrong generation results. If you want to use chunked prefill, it is recommended " + "not using `--disable-radix-cache`." + ) + logger.info("The server is fired up and ready to roll!") if pipe_finish_writer is not None: pipe_finish_writer.send("init ok") diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index c99b6a60b..4d3f7de30 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -12,6 +12,7 @@ suites = { "test_openai_server.py", "test_skip_tokenizer_init.py", "test_torch_compile.py", + "test_triton_attn_backend.py", "test_vision_openai_server.py", "test_large_max_new_tokens.py", "models/test_generation_models.py", diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py new file mode 100644 index 000000000..297fc22e1 --- /dev/null +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -0,0 +1,68 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = "http://127.0.0.1:7157" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=300, + other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=3000, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.71, f"{metrics}" + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.65, f"{metrics}" + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.85, f"{metrics}" + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 0066d01cb..c733163f5 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -3,6 +3,7 @@ import unittest from types import SimpleNamespace from sglang.bench_serving import run_benchmark +from sglang.srt.server_args import ServerArgs from sglang.srt.utils import kill_child_process from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server @@ -60,9 +61,9 @@ class TestServingThroughput(unittest.TestCase): def test_default(self): res = self.run_test( - disable_radix_cache=False, - disable_flashinfer=False, - chunked_prefill_size=-1, + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": @@ -72,21 +73,25 @@ class TestServingThroughput(unittest.TestCase): def test_default_without_radix_cache(self): res = self.run_test( disable_radix_cache=True, - disable_flashinfer=False, - chunked_prefill_size=-1, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=ServerArgs.chunked_prefill_size, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance assert res["output_throughput"] >= 1450 - def test_default_without_flashinfer(self): - self.run_test( - disable_radix_cache=False, - disable_flashinfer=True, - chunked_prefill_size=-1, + def test_default_with_chunked_prefill(self): + res = self.run_test( + disable_radix_cache=ServerArgs.disable_radix_cache, + disable_flashinfer=ServerArgs.disable_flashinfer, + chunked_prefill_size=8192, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 (PCIE) performance + assert res["output_throughput"] >= 1400 + def test_all_cases(self): for disable_radix_cache in [False, True]: for disable_flashinfer in [False, True]: diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py new file mode 100644 index 000000000..67cbc623c --- /dev/null +++ b/test/srt/test_triton_attn_backend.py @@ -0,0 +1,41 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestTritonAttnBackend(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"] + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=32, + num_threads=32, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.6 + + +if __name__ == "__main__": + unittest.main()