xc-llm-ascend/vllm_ascend/patch/platform/patch_executor.py

"""Patch vLLM's Executor with VRAM offload/reload helpers for vllm-ascend."""
import time

from vllm.v1.executor.abstract import Executor, logger


def is_offloaded(self) -> bool:
    """Return whether this executor's device memory is currently offloaded."""
    if not hasattr(self, "_is_offloaded"):
        self._is_offloaded = False
    return self._is_offloaded


def offload_vram(self) -> None:
    """Offload device memory on all workers via a collective RPC."""
    if self.is_offloaded():
        logger.warning("Executor is already offloaded.")
        return
    time_before_offload = time.perf_counter()
    self.collective_rpc("offload_vram")
    time_after_offload = time.perf_counter()
    self._is_offloaded = True
    logger.info(f"Offloading VRAM took {time_after_offload - time_before_offload:.6f} seconds.")


def reload_vram(self) -> bool:
    """Reload device memory on all workers, retrying until every worker succeeds.

    Returns the aggregated prev_is_self flag reported by the workers,
    or True if the executor was not offloaded to begin with.
    """
    if not self.is_offloaded():
        logger.warning("Executor is not offloaded.")
        return True
    while True:
        time_before_reload = time.perf_counter()
        res = self.collective_rpc("try_reload_vram")
        time_after_reload = time.perf_counter()
        # Each worker returns a (succeeded, prev_is_self) pair.
        succ = all(x[0] for x in res)
        if succ:
            self._is_offloaded = False
            logger.info(f"Reloading VRAM took {time_after_reload - time_before_reload:.6f} seconds.")
            prev_is_self = all(x[1] for x in res)
            return prev_is_self
        # Some workers did not acquire the device lock: release it on all
        # workers and retry after a short sleep.
        self.collective_rpc("vnpu_unlock_gpu")
        time.sleep(0.001)


# Monkey-patch the methods onto vLLM's Executor class.
Executor.is_offloaded = is_offloaded
Executor.offload_vram = offload_vram
Executor.reload_vram = reload_vram
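
For context, a minimal sketch of how a caller might drive the patched methods. The `executor` handle, the `sleep_and_wake` helper, and the surrounding sleep/wake flow are assumptions for illustration only; just `offload_vram()` and `reload_vram()` come from the patch above.

# Illustrative usage sketch (assumed caller-side code, not part of this patch).
def sleep_and_wake(executor: Executor) -> bool:
    executor.offload_vram()                 # free device memory on every worker
    # ... device memory can be reused by another tenant while offloaded ...
    prev_is_self = executor.reload_vram()   # retries until all workers reload
    return prev_is_self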