diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index f5401bc62..b07528141 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -103,6 +103,10 @@ class ModelTpServer:
             if server_args.max_running_requests is None
             else server_args.max_running_requests
         )
+        self.max_running_requests = min(
+            self.max_running_requests,
+            self.model_runner.req_to_token_pool.size - 1
+        )
         self.int_token_logit_bias = torch.tensor(
             get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
         )
diff --git a/python/sglang/srt/memory_pool.py b/python/sglang/srt/memory_pool.py
index a6335797c..f7cc7bec4 100644
--- a/python/sglang/srt/memory_pool.py
+++ b/python/sglang/srt/memory_pool.py
@@ -11,6 +11,7 @@ class ReqToTokenPool:
     """A memory pool that maps a request to its token locations."""
 
     def __init__(self, size: int, max_context_len: int):
+        self.size = size
         self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
         self.req_to_token = torch.empty(
             (size, max_context_len), dtype=torch.int32, device="cuda"