Fix max_new_tokens for limited memory

2024-01-24 10:44:32 +00:00
parent bef0b35902
commit fa7a696d04
1 changed file with 1 addition and 0 deletions
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -229,6 +229,7 @@ class ModelRpcServer(rpyc.Service):
        req.sampling_params.max_new_tokens = min(
            req.sampling_params.max_new_tokens,
            self.model_config.context_len - 1 - len(req.input_ids),
+            self.max_total_num_token - 128 - len(req.input_ids),
        )
        self.forward_queue.append(req)