From 4606e2a3fe9f4199451f83aa7bf43b79875228f8 Mon Sep 17 00:00:00 2001
From: who who who <fsx950223@outlook.com>
Date: Wed, 26 Feb 2025 00:40:35 +0800
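Subject: [PATCH] Bug: fix capture_bs (#3857)

On HIP, the extra capture batch sizes ([i * 8 for i in range(21, 33)],
i.e. 168..256) were appended after capture_bs had already been filtered
by req_to_token_pool.size and cuda_graph_max_bs, so they were not subject
to those limits. Append them before the max()/filter logic instead so the
HIP-specific sizes go through the same checks as every other batch size.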
---
 python/sglang/srt/model_executor/cuda_graph_runner.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index db103162f..d3f2e5146 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -114,6 +114,10 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
             capture_bs = list(range(1, 33)) + [64, 128]
         else:
             capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+    if is_hip_:
+        capture_bs += [i * 8 for i in range(21, 33)]
+
     if max(capture_bs) > model_runner.req_to_token_pool.size:
         # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
         # is very samll. We add more values here to make sure we capture the maximum bs.
@@ -132,8 +136,6 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         if bs <= model_runner.req_to_token_pool.size
         and bs <= server_args.cuda_graph_max_bs
     ]
-    if is_hip_:
-        capture_bs += [i * 8 for i in range(21, 33)]
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile