Multi-Stage Awake: Support Resume and Pause KV Cache and Weights separately (#7099)
This commit is contained in:
@@ -479,17 +479,15 @@ class Engine(EngineBase):
|
||||
self.tokenizer_manager.get_weights_by_name(obj, None)
|
||||
)
|
||||
|
||||
def release_memory_occupation(self):
|
||||
"""Release GPU occupation temporarily."""
|
||||
obj = ReleaseMemoryOccupationReqInput()
|
||||
def release_memory_occupation(self, tags: Optional[List[str]] = None):
|
||||
obj = ReleaseMemoryOccupationReqInput(tags=tags)
|
||||
loop = asyncio.get_event_loop()
|
||||
return loop.run_until_complete(
|
||||
self.tokenizer_manager.release_memory_occupation(obj, None)
|
||||
)
|
||||
|
||||
def resume_memory_occupation(self):
|
||||
"""Resume GPU occupation."""
|
||||
obj = ResumeMemoryOccupationReqInput()
|
||||
def resume_memory_occupation(self, tags: Optional[List[str]] = None):
|
||||
obj = ResumeMemoryOccupationReqInput(tags=tags)
|
||||
loop = asyncio.get_event_loop()
|
||||
return loop.run_until_complete(
|
||||
self.tokenizer_manager.resume_memory_occupation(obj, None)
|
||||
@@ -670,11 +668,9 @@ def _launch_subprocesses(
|
||||
|
||||
scheduler_procs = []
|
||||
if server_args.dp_size == 1:
|
||||
# Launch tensor parallel scheduler processes
|
||||
memory_saver_adapter = TorchMemorySaverAdapter.create(
|
||||
enable=server_args.enable_memory_saver
|
||||
)
|
||||
|
||||
scheduler_pipe_readers = []
|
||||
|
||||
nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
|
||||
@@ -710,6 +706,7 @@ def _launch_subprocesses(
|
||||
writer,
|
||||
),
|
||||
)
|
||||
|
||||
with memory_saver_adapter.configure_subprocess():
|
||||
proc.start()
|
||||
scheduler_procs.append(proc)
|
||||
|
||||
Reference in New Issue
Block a user