Fix prefill size (#711)
This commit is contained in:
@@ -103,6 +103,10 @@ class ModelTpServer:
|
|||||||
if server_args.max_running_requests is None
|
if server_args.max_running_requests is None
|
||||||
else server_args.max_running_requests
|
else server_args.max_running_requests
|
||||||
)
|
)
|
||||||
|
self.max_running_requests = min(
|
||||||
|
self.max_running_requests,
|
||||||
|
self.model_runner.req_to_token_pool.size - 1
|
||||||
|
)
|
||||||
self.int_token_logit_bias = torch.tensor(
|
self.int_token_logit_bias = torch.tensor(
|
||||||
get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
|
get_int_token_logit_bias(self.tokenizer, self.model_config.vocab_size)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ class ReqToTokenPool:
|
|||||||
"""A memory pool that maps a request to its token locations."""
|
"""A memory pool that maps a request to its token locations."""
|
||||||
|
|
||||||
def __init__(self, size: int, max_context_len: int):
|
def __init__(self, size: int, max_context_len: int):
|
||||||
|
self.size = size
|
||||||
self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
|
self.mem_state = torch.ones((size,), dtype=torch.bool, device="cuda")
|
||||||
self.req_to_token = torch.empty(
|
self.req_to_token = torch.empty(
|
||||||
(size, max_context_len), dtype=torch.int32, device="cuda"
|
(size, max_context_len), dtype=torch.int32, device="cuda"
|
||||||
|
|||||||
Reference in New Issue
Block a user