Partially support multi-NPU

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -63,14 +63,14 @@ try:
init_module_offload as init_module,
python_create_and_map_offload as python_create_and_map,python_unmap_and_release_offload as python_unmap_and_release,
python_get_mem_info_offload as python_get_mem_info,
python_lock_gpu_offload as python_lock_gpu,
python_try_lock_gpu_offload as python_try_lock_gpu,
python_unlock_gpu_offload as python_unlock_gpu
)
else:
from vllm_ascend.vllm_ascend_C import ( # type: ignore # noqa: F401
init_module, python_create_and_map, python_unmap_and_release)
python_get_mem_info = None
python_lock_gpu = None
python_try_lock_gpu = None
python_unlock_gpu = None
lib_name = find_loaded_library("vllm_ascend_C")
camem_available = True
@@ -81,7 +81,7 @@ except ImportError as e:
python_create_and_map = None
python_unmap_and_release = None
python_get_mem_info = None
python_lock_gpu = None
python_try_lock_gpu = None
python_unlock_gpu = None
lib_name = None
libcudart = None
@@ -109,12 +109,14 @@ def get_pluggable_allocator(
python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
python_free_func: Callable[[int], tuple[int, int, int, int]]
) -> torch.npu.memory.NPUPluggableAllocator:
init_module(python_malloc_fn, python_free_func)
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
current_device = torch.npu.current_device()
init_module(python_malloc_fn, python_free_func, current_device)
new_alloc = torch.npu.memory.NPUPluggableAllocator(
lib_name, 'my_malloc_offload', 'my_free_offload'
)
else:
init_module(python_malloc_fn, python_free_func)
new_alloc = torch.npu.memory.NPUPluggableAllocator(
lib_name, 'my_malloc', 'my_free'
)
@@ -280,7 +282,7 @@ class CaMemAllocator:
self.allocator_and_pools[tag] = data
# lock gpu
if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
self.vnpu_lock_gpu()
self._vnpu_lock_gpu()
yield
# PyTorch's bug, calling torch.cuda.empty_cache() will error
# when using pluggable allocator, see
@@ -306,12 +308,18 @@ class CaMemAllocator:
sum_bytes += handle[1]
return sum_bytes
def vnpu_try_lock_gpu(self) -> tuple[bool, bool]:
    """Attempt to acquire the NPU lock without blocking.

    Returns:
        tuple[bool, bool]: ``(acquired, prev_is_self)``.
            ``acquired`` is True when the lock was obtained;
            ``prev_is_self`` is whatever the native binding reports about
            the previous holder (False when the binding is unavailable).
    """
    # python_try_lock_gpu is None when the offload C extension failed to
    # import (see the module-level try/except); report "not acquired".
    if python_try_lock_gpu:
        return python_try_lock_gpu()
    return False, False
def _vnpu_lock_gpu(self) -> bool:
while True:
success, _ = self.vnpu_try_lock_gpu()
if success:
return True
time.sleep(0.001)
def vnpu_unlock_gpu(self):
@@ -373,15 +381,15 @@ class CaMemAllocator:
self.vnpu_unlock_gpu()
# logger.info(f"offload: tags {offload_tags}: {sz_weights/(1024**3):.2f} GB, discard kv cache: {sz_kvcache/(1024**3):.2f} GB")
def reload_vram(self, tags: Optional[list[str]] = None) -> bool:
"""
Wake up the allocator from sleep mode.
All data that is previously offloaded will be loaded back to GPU
memory, and the rest of the data will have empty memory."""
prev_is_self = self.vnpu_lock_gpu()
def try_reload_vram(self, tags: Optional[list[str]] = None) -> tuple[bool, bool]:
succ, prev_is_self = self.vnpu_try_lock_gpu()
if not succ:
# failed to acquire the lock
return False, prev_is_self
if prev_is_self:
# nothing to do
return True
return succ, prev_is_self
for ptr, data in self.pointer_to_data.items():
handle = data.handle
@@ -401,4 +409,4 @@ class CaMemAllocator:
# else:
# size_in_bytes = handle[1]
# memset(ptr, size_in_bytes, 0, size_in_bytes)
return False
return succ, prev_is_self