diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 3e1f4e836..48f62d28b 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -168,7 +168,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): capture_bs += [model_runner.req_to_token_pool.size] if server_args.enable_two_batch_overlap: - capture_bs = [bs for bs in capture_bs if bs >= 2] + capture_bs = [bs for bs in capture_bs if bs % 2 == 0] if server_args.cuda_graph_max_bs: capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]