Auto adjust new ratio (#708)

This commit is contained in:
Liangsheng Yin
2024-07-23 22:06:02 -07:00
committed by GitHub
parent 01d66ae2e8
commit 4cd64b8ee6
3 changed files with 31 additions and 7 deletions

View File

@@ -228,6 +228,7 @@ class ModelTpServer:
break
else:
self.check_memory()
+self.new_token_ratio = global_config.init_new_token_ratio
def print_stats(self):
num_used = self.max_total_num_tokens - (
@@ -536,9 +537,10 @@ class ModelTpServer:
# Check if decode out of memory
if not batch.check_decode_mem():
old_ratio = self.new_token_ratio
-self.new_token_ratio = min(old_ratio + self.new_token_ratio_recovery, 1.0)
-retracted_reqs = batch.retract_decode()
+retracted_reqs, new_token_ratio = batch.retract_decode()
+self.new_token_ratio = new_token_ratio
logger.info(
"decode out of memory happened, "
f"#retracted_reqs: {len(retracted_reqs)}, "