Add partial support for multiple NPUs

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -26,12 +26,21 @@ def reload_vram(self) -> bool:
logger.warning("Executor is not offloaded.")
return True
time_before_reload = time.perf_counter()
prev_is_self = self.collective_rpc("reload_vram")
time_after_reload = time.perf_counter()
self.is_offloaded = False
logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
return all(prev_is_self)
while True:
time_before_reload = time.perf_counter()
res = self.collective_rpc("try_reload_vram")
time_after_reload = time.perf_counter()
succ = all(x[0] for x in res)
if succ:
self.is_offloaded = False
logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
prev_is_self = all(x[1] for x in res)
return prev_is_self
else:
# Some workers did not get the lock: call vnpu_unlock_gpu on all workers, then retry.
self.collective_rpc("vnpu_unlock_gpu")
time.sleep(0.001)
def determine_available_memory_idle_offload_mode(self) -> int: