refactor model loader: initial refactor (#664)
This commit is contained in:
@@ -304,6 +304,12 @@ class ModelTpServer:
|
||||
self.model_config.context_len - 1 - len(req.origin_input_ids),
|
||||
self.max_total_num_tokens - 128 - len(req.origin_input_ids),
|
||||
)
|
||||
if req.sampling_params.max_new_tokens < 0:
|
||||
req.origin_input_ids = req.origin_input_ids[
|
||||
: self.max_total_num_tokens - 128
|
||||
]
|
||||
logger.error("Request longer than memory pool size, truncated!!!")
|
||||
|
||||
self.forward_queue.append(req)
|
||||
|
||||
def get_new_prefill_batch(self) -> Optional[Batch]:
|
||||
|
||||
Reference in New Issue
Block a user