adapt to vllm-ascend v0.18.0rc1

This commit is contained in:
starkwj
2026-04-21 03:05:32 +00:00
parent 99e1ea0fe6
commit e4d898b245
132 changed files with 28743 additions and 100 deletions


@@ -21,10 +21,12 @@ import os
 from collections.abc import Callable
 from contextlib import contextmanager
 from typing import Any
+import time

 import torch
 from acl.rt import memcpy  # type: ignore # noqa: F401
 from vllm.logger import logger

+import vllm_ascend.envs as envs_ascend

 def find_loaded_library(lib_name) -> str | None:
@@ -54,11 +56,23 @@ def find_loaded_library(lib_name) -> str | None:

 camem_available = False
 try:
-    from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
-        init_module,
-        python_create_and_map,
-        python_unmap_and_release,
-    )
+    if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+        from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
+            init_module_offload as init_module,
+            python_create_and_map_offload as python_create_and_map,
+            python_unmap_and_release_offload as python_unmap_and_release,
+            python_get_mem_info_offload as python_get_mem_info,
+            python_try_lock_gpu_offload as python_try_lock_gpu,
+            python_unlock_gpu_offload as python_unlock_gpu,
+        )
+    else:
+        from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
+            init_module,
+            python_create_and_map,
+            python_unmap_and_release,
+        )
+        python_get_mem_info = None
+        python_try_lock_gpu = None
+        python_unlock_gpu = None
     lib_name = find_loaded_library("vllm_ascend_C")
     camem_available = True
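The hunk above selects the C extension's entry points once, at import time: with VLLM_ASCEND_ENABLE_VNPU set, the *_offload symbols are re-bound under the plain names, so the rest of the module never branches on the flag again. A minimal sketch of the pattern, where the module and symbol names are hypothetical stand-ins rather than the real vllm_ascend_C API:

import os

# Hypothetical stand-in for envs_ascend.VLLM_ASCEND_ENABLE_VNPU.
ENABLE_OFFLOAD = os.environ.get("VLLM_ASCEND_ENABLE_VNPU", "0") == "1"

if ENABLE_OFFLOAD:
    # Re-bind offload-capable symbols under the plain names so that
    # call sites stay identical in both modes.
    from my_ext import init_module_offload as init_module  # hypothetical
    from my_ext import try_lock_offload as try_lock        # hypothetical
else:
    from my_ext import init_module  # hypothetical
    try_lock = None  # offload-only helpers degrade to None

# Callers can now use init_module() unconditionally and only need to
# guard the helpers that may be None.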
@@ -67,6 +81,9 @@ except ImportError as e:
init_module = None
python_create_and_map = None
python_unmap_and_release = None
python_get_mem_info = None
python_try_lock_gpu = None
python_unlock_gpu = None
lib_name = None
libcudart = None
@@ -93,8 +110,17 @@ def get_pluggable_allocator(
     python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
     python_free_func: Callable[[int], tuple[int, int, int, int]],
 ) -> torch.npu.memory.NPUPluggableAllocator:
-    init_module(python_malloc_fn, python_free_func)
-    new_alloc = torch.npu.memory.NPUPluggableAllocator(lib_name, "my_malloc", "my_free")
+    if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+        current_device = torch.npu.current_device()
+        init_module(python_malloc_fn, python_free_func, current_device)
+        new_alloc = torch.npu.memory.NPUPluggableAllocator(
+            lib_name, 'my_malloc_offload', 'my_free_offload'
+        )
+    else:
+        init_module(python_malloc_fn, python_free_func)
+        new_alloc = torch.npu.memory.NPUPluggableAllocator(
+            lib_name, 'my_malloc', 'my_free'
+        )
     return new_alloc
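To see where get_pluggable_allocator fits: in vLLM's CUDA cumem path the returned allocator is wrapped in a memory pool so that every allocation inside a with block is routed through the registered malloc/free hooks. A hedged sketch of the NPU analogue, assuming torch.npu.memory.MemPool and use_mem_pool mirror the torch.cuda.memory pool API:

from contextlib import contextmanager

import torch

@contextmanager
def use_memory_pool_with_allocator(python_malloc_fn, python_free_func):
    # get_pluggable_allocator() is the function patched above.
    new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
    # Assumption: torch.npu.memory mirrors torch.cuda.memory's pool API.
    mem_pool = torch.npu.memory.MemPool(new_alloc._allocator)
    with torch.npu.memory.use_mem_pool(mem_pool):
        # Every NPU allocation in this block goes through my_malloc /
        # my_free (or the *_offload variants when VNPU is enabled).
        yield mem_pool, new_alloc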
@@ -245,6 +271,9 @@ class CaMemAllocator:
             # to avoid the issue, we keep a reference of the data.
             # see https://github.com/pytorch/pytorch/issues/146431 .
             self.allocator_and_pools[tag] = data
+            # lock gpu before handing the pool back to the caller
+            if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+                self._vnpu_lock_gpu()
             yield
             # PyTorch's bug, calling torch.cuda.empty_cache() will error
             # when using pluggable allocator, see
@@ -256,6 +285,8 @@ class CaMemAllocator:
             # allocate memory.
             # TODO: we need to find a way to release the memory,
             # i.e. calling torch.cuda.empty_cache()
+            if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+                self.vnpu_unlock_gpu()
             self.current_tag = old_tag

     def get_current_usage(self) -> int:
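The two hunks above bracket use_memory_pool with a device lock: the lock is taken before the pool is exposed to the caller and released when the context exits. The try-lock contract used throughout the new code returns (success, prev_is_self), where prev_is_self signals that this process was also the previous holder, so its device mappings are still valid. A toy single-process model of that contract; the real lock lives in the vllm_ascend_C extension and is shared across processes on one device:

import threading

class ToyVnpuLock:
    """Toy model of the (success, prev_is_self) try-lock contract."""

    def __init__(self):
        self._mutex = threading.Lock()
        self._held_by = None     # current holder, or None
        self._last_owner = None  # most recent holder

    def try_lock(self, me) -> tuple[bool, bool]:
        with self._mutex:
            if self._held_by is not None and self._held_by != me:
                return False, False  # device busy
            prev_is_self = self._last_owner == me
            self._held_by = self._last_owner = me
            # prev_is_self == True means our device mappings survived.
            return True, prev_is_self

    def unlock(self, me) -> None:
        with self._mutex:
            if self._held_by == me:
                self._held_by = None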
@@ -267,3 +298,104 @@ class CaMemAllocator:
             handle = data.handle
             sum_bytes += handle[1]
         return sum_bytes
+
+    def vnpu_try_lock_gpu(self) -> tuple[bool, bool]:
+        if python_try_lock_gpu:
+            return python_try_lock_gpu()
+        else:
+            return False, False
+
+    def _vnpu_lock_gpu(self) -> bool:
+        # busy-wait until the device lock is acquired
+        while True:
+            success, _ = self.vnpu_try_lock_gpu()
+            if success:
+                return True
+            time.sleep(0.001)
+
+    def vnpu_unlock_gpu(self):
+        if python_unlock_gpu:
+            python_unlock_gpu()
+
+    def get_pool_mem_info(self) -> tuple[int, int]:
+        """Get the available memory in the reserved pool."""
+        # only bound when VLLM_ASCEND_ENABLE_VNPU is set
+        return python_get_mem_info()
+
+    def offload_vram(
+            self,
+            offload_tags: tuple[str, ...] | str | None = None) -> None:
+        """
+        Put the allocator into offload mode.
+
+        All data in memory allocations with the specified tags will be
+        offloaded to CPU memory; the rest will be discarded.
+
+        :param offload_tags: The tags of the memory allocations that will
+            be offloaded. The rest of the memory allocations will be
+            discarded.
+        """
+        if offload_tags is None:
+            # by default, allocated tensors are offloaded
+            # when the allocator sleeps
+            offload_tags = (CaMemAllocator.default_tag, )
+        elif isinstance(offload_tags, str):
+            offload_tags = (offload_tags, )
+
+        assert isinstance(offload_tags, tuple)
+
+        sz_weights = 0
+        sz_kvcache = 0
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if data.tag in offload_tags:
+                size_in_bytes = handle[1]
+                if data.cpu_backup_tensor is None:
+                    cpu_backup_tensor = torch.empty(
+                        size_in_bytes,
+                        dtype=torch.uint8,
+                        device='cpu',
+                        pin_memory=True)
+                    cpu_ptr = cpu_backup_tensor.data_ptr()
+                    ACL_MEMCPY_DEVICE_TO_HOST = 2
+                    dest_max = cpu_ptr + size_in_bytes * 2
+                    # acl.rt.memcpy(dst, dest_max, src, count, kind)
+                    memcpy(cpu_ptr, dest_max, ptr, size_in_bytes,
+                           ACL_MEMCPY_DEVICE_TO_HOST)
+                    data.cpu_backup_tensor = cpu_backup_tensor
+                unmap_and_release(handle)
+                sz_weights += size_in_bytes
+            else:
+                size_in_bytes = handle[1]
+                unmap_and_release(handle)
+                sz_kvcache += size_in_bytes
+        # self.requested_vram_size = sz_weights + sz_kvcache
+        self.vnpu_unlock_gpu()
+        # logger.info(f"offload: tags {offload_tags}: {sz_weights/(1024**3):.2f} GB, discard kv cache: {sz_kvcache/(1024**3):.2f} GB")
+
+    def try_reload_vram(self,
+                        tags: list[str] | None = None) -> tuple[bool, bool]:
+        succ, prev_is_self = self.vnpu_try_lock_gpu()
+        if not succ:
+            # did not get the lock
+            return False, prev_is_self
+        if prev_is_self:
+            # we were the last holder, so mappings are intact; nothing to do
+            return succ, prev_is_self
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if tags is None or data.tag in tags:
+                create_and_map(handle)
+                if data.cpu_backup_tensor is not None:
+                    cpu_backup_tensor = data.cpu_backup_tensor
+                    size_in_bytes = (cpu_backup_tensor.numel() *
+                                     cpu_backup_tensor.element_size())
+                    cpu_ptr = cpu_backup_tensor.data_ptr()
+                    ACL_MEMCPY_HOST_TO_DEVICE = 1
+                    dest_max = ptr + size_in_bytes * 2
+                    memcpy(ptr, dest_max, cpu_ptr, size_in_bytes,
+                           ACL_MEMCPY_HOST_TO_DEVICE)
+                    # data.cpu_backup_tensor = None
+            # TODO: check that no re-memset is needed if we reset_prefix_cache
+            # else:
+            #     size_in_bytes = handle[1]
+            #     memset(ptr, size_in_bytes, 0, size_in_bytes)
+        return succ, prev_is_self
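Putting the new methods together, a typical offload/reload cycle looks like the sketch below. It assumes VLLM_ASCEND_ENABLE_VNPU is set and that get_instance() mirrors vLLM's CuMemAllocator singleton accessor:

import time

allocator = CaMemAllocator.get_instance()  # assumed singleton accessor

# Back up "weights"-tagged buffers to pinned host memory, discard the
# rest (e.g. KV cache), and release the device lock for other processes.
allocator.offload_vram(offload_tags=("weights",))

# Later, poll until the device lock can be re-acquired. If this process
# was the previous holder, try_reload_vram() returns immediately with
# nothing to remap.
while True:
    success, prev_is_self = allocator.try_reload_vram()
    if success:
        break
    time.sleep(0.01)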