diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py index f8ec2a778..1068250c1 100644 --- a/python/sglang/srt/managers/schedule_policy.py +++ b/python/sglang/srt/managers/schedule_policy.py @@ -249,7 +249,10 @@ class PrefillAdder: return AddReqResult.NO_TOKEN tokens_freed += tokens_occupied - if req.extend_input_len <= self.rem_chunk_tokens: + if ( + self.rem_chunk_tokens is None + or req.extend_input_len <= self.rem_chunk_tokens + ): self.can_run_list.append(req) self._prefill_one_req( 0, diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 6f6824416..8e9010395 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -8,6 +8,7 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, popen_launch_server, + run_bench_serving, ) @@ -62,6 +63,16 @@ class TestChunkedPrefill(unittest.TestCase): disable_radix_cache=False, enable_mixed_chunk=False, chunked_prefill_size=-1 ) + def test_no_chunked_prefill_without_radix_cache(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=10, + request_rate=float("inf"), + other_server_args=["--disable-radix-cache", "--chunked-prefill-size", "-1"], + ) + + assert res["completed"] == 10 + if __name__ == "__main__": unittest.main()