Partially support multiple NPUs
This commit is contained in:
@@ -26,12 +26,21 @@ def reload_vram(self) -> bool:
|
||||
logger.warning("Executor is not offloaded.")
|
||||
return True
|
||||
|
||||
time_before_reload = time.perf_counter()
|
||||
prev_is_self = self.collective_rpc("reload_vram")
|
||||
time_after_reload = time.perf_counter()
|
||||
self.is_offloaded = False
|
||||
logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
|
||||
return all(prev_is_self)
|
||||
while True:
|
||||
time_before_reload = time.perf_counter()
|
||||
res = self.collective_rpc("try_reload_vram")
|
||||
time_after_reload = time.perf_counter()
|
||||
|
||||
succ = all(x[0] for x in res)
|
||||
if succ:
|
||||
self.is_offloaded = False
|
||||
logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
|
||||
prev_is_self = all(x[1] for x in res)
|
||||
return prev_is_self
|
||||
else:
|
||||
# Some workers did not get the lock: issue an unlock via collective RPC, then retry.
|
||||
self.collective_rpc("vnpu_unlock_gpu")
|
||||
time.sleep(0.001)
|
||||
|
||||
|
||||
def determine_available_memory_idle_offload_mode(self) -> int:
|
||||
|
||||
Reference in New Issue
Block a user