From 71e2a27753fa6908eeaa0151ad27df0b05fd407a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 5 Dec 2024 13:42:47 -0800 Subject: [PATCH] Fix the cuda graph capture range for small #max-running-requests (#2359) --- .../sglang/srt/model_executor/cuda_graph_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 3aac4965a..27043cc9a 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -130,6 +130,20 @@ class CudaGraphRunner: self.capture_bs = list(range(1, 32)) + [64, 128] else: self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)] + + if max(self.capture_bs) > model_runner.req_to_token_pool.size: + # In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests + # is very small. We add more values here to make sure we capture the maximum bs. + self.capture_bs = list( + sorted( + set( + self.capture_bs + + [model_runner.req_to_token_pool.size - 1] + + [model_runner.req_to_token_pool.size] + ) + ) + ) + self.capture_bs = [ bs for bs in self.capture_bs