Expose max total num tokens from Runtime & Engine API (#2092)

This commit is contained in:
Henry Hyeonmok Ko
2024-11-22 15:10:10 -08:00
committed by GitHub
parent 72f87b723b
commit c35cd1f8c7
4 changed files with 81 additions and 7 deletions

View File

@@ -1400,7 +1400,9 @@ def run_scheduler_process(
try:
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
-pipe_writer.send("ready")
+pipe_writer.send(
+{"status": "ready", "max_total_num_tokens": scheduler.max_total_num_tokens}
+)
if scheduler.enable_overlap:
scheduler.event_loop_overlap()
else: