Add partial multi-NPU support

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -339,12 +339,13 @@ class NPUWorker(WorkerBase):
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
# save memory to host with lock
self.offload_vram()
self.reload_vram()
succ, _ = self.try_reload_vram()
assert succ, "Failed to reload model weights after offloading."
def offload_vram(self) -> None:
# free_bytes_before_offload = NPUPlatform.mem_get_info()[0]
allocator = CaMemAllocator.get_instance()
allocator.offload_vram(offload_tags=("weights", ))
allocator.offload_vram(offload_tags=("weights",))
# free_bytes_after_offload, total = NPUPlatform.mem_get_info()
# freed_bytes = free_bytes_after_offload - free_bytes_before_offload
# used_bytes = total - free_bytes_after_offload
@@ -354,9 +355,13 @@ class NPUWorker(WorkerBase):
# "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
# used_bytes / GiB_bytes)
def reload_vram(self) -> bool:
def try_reload_vram(self) -> tuple[bool, bool]:
allocator = CaMemAllocator.get_instance()
return allocator.reload_vram(tags=None)
return allocator.try_reload_vram(tags=None)
def vnpu_unlock_gpu(self) -> None:
    """Forward an unlock request to the memory allocator.

    Fetches the allocator through ``CaMemAllocator.get_instance()`` and
    calls its ``vnpu_unlock_gpu()`` method. Nothing is returned.
    """
    # NOTE(review): what the allocator actually releases is not visible
    # from this method -- confirm against CaMemAllocator.vnpu_unlock_gpu.
    CaMemAllocator.get_instance().vnpu_unlock_gpu()
def compile_or_warm_up_model(self) -> None:
# Note: need to adapt for graph mode.