Expose max total num tokens from Runtime & Engine API (#2092)
This commit is contained in:
committed by
GitHub
parent
72f87b723b
commit
c35cd1f8c7
@@ -1400,7 +1400,9 @@ def run_scheduler_process(
|
||||
|
||||
try:
|
||||
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
|
||||
- pipe_writer.send("ready")
+ pipe_writer.send(
+     {"status": "ready", "max_total_num_tokens": scheduler.max_total_num_tokens}
+ )
|
||||
if scheduler.enable_overlap:
|
||||
scheduler.event_loop_overlap()
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user