Partially support multi-NPU
This commit is contained in:
@@ -339,12 +339,13 @@ class NPUWorker(WorkerBase):
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
|
||||
# save memory to host with lock
|
||||
self.offload_vram()
|
||||
self.reload_vram()
|
||||
succ, _ = self.try_reload_vram()
|
||||
assert succ, "Failed to reload model weights after offloading."
|
||||
|
||||
def offload_vram(self) -> None:
|
||||
# free_bytes_before_offload = NPUPlatform.mem_get_info()[0]
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
allocator.offload_vram(offload_tags=("weights", ))
|
||||
allocator.offload_vram(offload_tags=("weights",))
|
||||
# free_bytes_after_offload, total = NPUPlatform.mem_get_info()
|
||||
# freed_bytes = free_bytes_after_offload - free_bytes_before_offload
|
||||
# used_bytes = total - free_bytes_after_offload
|
||||
@@ -354,9 +355,13 @@ class NPUWorker(WorkerBase):
|
||||
# "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
|
||||
# used_bytes / GiB_bytes)
|
||||
|
||||
def reload_vram(self) -> bool:
|
||||
def try_reload_vram(self) -> tuple[bool, bool]:
|
||||
allocator = CaMemAllocator.get_instance()
|
||||
return allocator.reload_vram(tags=None)
|
||||
return allocator.try_reload_vram(tags=None)
|
||||
|
||||
def vnpu_unlock_gpu(self) -> None:
    """Forward an unlock request to the ``CaMemAllocator`` instance.

    Looks up the allocator through ``CaMemAllocator.get_instance()`` and
    calls its ``vnpu_unlock_gpu()`` method; nothing is returned.

    NOTE(review): presumably this releases the device lock taken around
    the offload/reload path -- confirm against ``CaMemAllocator``.
    """
    CaMemAllocator.get_instance().vnpu_unlock_gpu()
|
||||
|
||||
def compile_or_warm_up_model(self) -> None:
|
||||
# Note: need to adapt for graph mode.
|
||||
|
||||
Reference in New Issue
Block a user