import time

from vllm.v1.executor.abstract import Executor, logger

# Pause between retries while waiting for all workers to acquire the GPU lock.
_RELOAD_RETRY_INTERVAL_S = 0.001


def is_offloaded(self) -> bool:
    """Return True if this executor's VRAM is currently offloaded.

    Defaults to False when the flag has never been set. Uses getattr
    instead of hasattr + assignment so this query has no side effects.
    """
    return getattr(self, "_is_offloaded", False)


def offload_vram(self) -> None:
    """Offload VRAM on every worker via collective RPC.

    No-op (with a warning) if the executor is already offloaded.
    """
    if self.is_offloaded():
        logger.warning("Executor is already offloaded.")
        return
    start = time.perf_counter()
    self.collective_rpc("offload_vram")
    elapsed = time.perf_counter() - start
    self._is_offloaded = True
    # Lazy %-args: the message is only rendered when INFO is enabled.
    logger.info("Offloading VRAM costs %.6f seconds.", elapsed)


def reload_vram(self) -> bool:
    """Reload VRAM on every worker, retrying until all of them succeed.

    Returns:
        True if the executor was not offloaded, or if every worker
        reported the previous holder as this process (second element of
        each RPC result); False otherwise.
    """
    if not self.is_offloaded():
        logger.warning("Executor is not offloaded.")
        return True
    while True:
        start = time.perf_counter()
        res = self.collective_rpc("try_reload_vram")
        elapsed = time.perf_counter() - start
        # Each worker appears to report (reload_succeeded, prev_is_self);
        # x[0] gates success, x[1] feeds the return value.
        if all(x[0] for x in res):
            self._is_offloaded = False
            logger.info("Reloading VRAM costs %.6f seconds.", elapsed)
            return all(x[1] for x in res)
        # Some workers did not get the lock: release any partial
        # acquisitions, back off briefly, and retry.
        self.collective_rpc("vnpu_unlock_gpu")
        time.sleep(_RELOAD_RETRY_INTERVAL_S)


# Attach the methods to Executor so all executor instances gain them.
Executor.is_offloaded = is_offloaded
Executor.offload_vram = offload_vram
Executor.reload_vram = reload_vram