Partially support multi-NPU

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -63,14 +63,14 @@ try:
init_module_offload as init_module,
python_create_and_map_offload as python_create_and_map,python_unmap_and_release_offload as python_unmap_and_release,
python_get_mem_info_offload as python_get_mem_info,
python_lock_gpu_offload as python_lock_gpu,
python_try_lock_gpu_offload as python_try_lock_gpu,
python_unlock_gpu_offload as python_unlock_gpu
)
else:
from vllm_ascend.vllm_ascend_C import ( # type: ignore # noqa: F401
init_module, python_create_and_map, python_unmap_and_release)
python_get_mem_info = None
python_lock_gpu = None
python_try_lock_gpu = None
python_unlock_gpu = None
lib_name = find_loaded_library("vllm_ascend_C")
camem_available = True
@@ -81,7 +81,7 @@ except ImportError as e:
python_create_and_map = None
python_unmap_and_release = None
python_get_mem_info = None
python_lock_gpu = None
python_try_lock_gpu = None
python_unlock_gpu = None
lib_name = None
libcudart = None
@@ -109,12 +109,14 @@ def get_pluggable_allocator(
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
python_free_func: Callable[[int], tuple[int, int, int, int]]
) -> torch.npu.memory.NPUPluggableAllocator:
init_module(python_malloc_fn, python_free_func)
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
current_device = torch.npu.current_device()
init_module(python_malloc_fn, python_free_func, current_device)
new_alloc = torch.npu.memory.NPUPluggableAllocator(
lib_name, 'my_malloc_offload', 'my_free_offload'
)
else:
init_module(python_malloc_fn, python_free_func)
new_alloc = torch.npu.memory.NPUPluggableAllocator(
lib_name, 'my_malloc', 'my_free'
)
@@ -280,7 +282,7 @@ class CaMemAllocator:
self.allocator_and_pools[tag] = data
# lock gpu
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
self.vnpu_lock_gpu()
self._vnpu_lock_gpu()
yield
# PyTorch's bug, calling torch.cuda.empty_cache() will error
# when using pluggable allocator, see
@@ -306,12 +308,18 @@ class CaMemAllocator:
sum_bytes += handle[1]
return sum_bytes
def vnpu_try_lock_gpu(self) -> tuple[bool, bool]:
    """Attempt to acquire the NPU lock without blocking.

    Returns:
        tuple[bool, bool]: ``(acquired, prev_is_self)``.
            ``acquired`` is True when the lock was obtained;
            ``prev_is_self`` is whatever the native binding reports about
            the previous holder (False when the binding is unavailable).
    """
    # python_try_lock_gpu is None when the offload C extension failed to
    # import (see the module-level try/except); report "not acquired".
    if python_try_lock_gpu:
        return python_try_lock_gpu()
    return False, False
def _vnpu_lock_gpu(self) -> bool:
while True:
success, _ = self.vnpu_try_lock_gpu()
if success:
return True
time.sleep(0.001)
def vnpu_unlock_gpu(self):
@@ -373,15 +381,15 @@ class CaMemAllocator:
self.vnpu_unlock_gpu()
# logger.info(f"offload: tags {offload_tags}: {sz_weights/(1024**3):.2f} GB, discard kv cache: {sz_kvcache/(1024**3):.2f} GB")
def reload_vram(self, tags: Optional[list[str]] = None) -> bool:
"""
Wake up the allocator from sleep mode.
All data that is previously offloaded will be loaded back to GPU
memory, and the rest of the data will have empty memory."""
prev_is_self = self.vnpu_lock_gpu()
def try_reload_vram(self, tags: Optional[list[str]] = None) -> tuple[bool, bool]:
succ, prev_is_self = self.vnpu_try_lock_gpu()
if not succ:
# failed to acquire the lock
return False, prev_is_self
if prev_is_self:
# nothing to do
return True
return succ, prev_is_self
for ptr, data in self.pointer_to_data.items():
handle = data.handle
@@ -401,4 +409,4 @@ class CaMemAllocator:
# else:
# size_in_bytes = handle[1]
# memset(ptr, size_in_bytes, 0, size_in_bytes)
return False
return succ, prev_is_self