diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index cb11e0db5..ad271c37e 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -47,8 +47,8 @@ jobs: python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache timeout-minutes: 10 - - name: Benchmark Serving Throughput (w/ ChunkedPrefill) + - name: Benchmark Serving Throughput (w/o ChunkedPrefill) run: | cd test/srt - python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill + python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill timeout-minutes: 10 diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 8ed66960b..6512e1b6e 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -49,7 +49,7 @@ class ServerArgs: max_running_requests: Optional[int] = None max_num_reqs: Optional[int] = None max_total_tokens: Optional[int] = None - chunked_prefill_size: int = -1 + chunked_prefill_size: int = 8192 max_prefill_tokens: int = 16384 schedule_policy: str = "lpm" schedule_conservativeness: float = 1.0 diff --git a/test/srt/test_moe_serving_throughput.py b/test/srt/test_moe_serving_throughput.py index 80b445f49..bbcd51227 100644 --- a/test/srt/test_moe_serving_throughput.py +++ b/test/srt/test_moe_serving_throughput.py @@ -86,11 +86,11 @@ class TestServingThroughput(unittest.TestCase): # A100 (PCIE) performance assert res["output_throughput"] > 930 - def test_default_with_chunked_prefill(self): + def test_default_without_chunked_prefill(self): res = self.run_test( disable_radix_cache=ServerArgs.disable_radix_cache, disable_flashinfer=ServerArgs.disable_flashinfer, - chunked_prefill_size=8192, + chunked_prefill_size=-1, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index c99d2e07e..261ac6ec5 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -71,7 +71,7 @@ class TestServingThroughput(unittest.TestCase): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] >= 1400 + assert res["output_throughput"] > 1400 def test_default_without_radix_cache(self): res = self.run_test( @@ -82,18 +82,18 @@ class TestServingThroughput(unittest.TestCase): if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] >= 1450 + assert res["output_throughput"] > 1450 - def test_default_with_chunked_prefill(self): + def test_default_without_chunked_prefill(self): res = self.run_test( disable_radix_cache=ServerArgs.disable_radix_cache, disable_flashinfer=ServerArgs.disable_flashinfer, - chunked_prefill_size=8192, + chunked_prefill_size=-1, ) if os.getenv("SGLANG_IS_IN_CI", "false") == "true": # A100 (PCIE) performance - assert res["output_throughput"] >= 1400 + assert res["output_throughput"] > 1400 def test_all_cases(self): for disable_radix_cache in [False, True]: