add vxpu

2026-02-05 19:36:06 +08:00
parent 070bfa4a73
commit e273ef01b8
131 changed files with 28539 additions and 2 deletions
--- a/vllm_kunlun/device_allocator/xpumem.py
+++ b/vllm_kunlun/device_allocator/xpumem.py
@@ -0,0 +1,317 @@
+import dataclasses
+import os
+from contextlib import contextmanager
+from typing import Any, Callable, Optional, Union
+import time
+
+import torch
+from vllm.logger import logger
+import vllm_kunlun.platforms.envs as xenvs
+
+
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """ # noqa
+    found_line = None
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found_line = line
+                break
+    if found_line is None:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = found_line.index("/")
+    path = found_line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), \
+        f"Unexpected filename: {filename} for library {lib_name}"
+    return path
+
+
+xpumem_available = False
+try:
+    if xenvs.VLLM_KUNLUN_ENABLE_VXPU:
+        from vllm_kunlun._kunlun_vxpu import (
+            init_module,
+            create_and_map as py_create_and_map,
+            unmap_and_release as py_unmap_and_release,
+            my_xpu_memcpy as xpu_memcpy,
+            get_mem_info,
+            try_lock_gpu,
+            unlock_gpu,
+        )
+
+        lib_name = find_loaded_library("_kunlun_vxpu")
+        xpumem_available = True
+    else:
+        init_module = None
+        py_create_and_map = None
+        py_unmap_and_release = None
+        xpu_memcpy = None
+        get_mem_info = None
+        try_lock_gpu = None
+        unlock_gpu = None
+        lib_name = None
+except ImportError as e:
+    logger.warning("Failed to import vllm_kunlun._kunlun_vxpu:%s.", e)
+    init_module = None
+    py_create_and_map = None
+    py_unmap_and_release = None
+    xpu_memcpy = None
+    get_mem_info = None
+    try_lock_gpu = None
+    unlock_gpu = None
+    lib_name = None
+
+# Allocation handle, in order:
+# (py_device, py_alignedSize, py_d_mem, py_p_memHandle).
+# The tuple is unpacked positionally into the extension's create_and_map /
+# unmap_and_release calls. Within this file, element 1 is read as the
+# allocation size in bytes and element 2 (py_d_mem) is used as the key that
+# identifies the allocation.
+HandleType = tuple[int, int, int, int]
+
+
+@dataclasses.dataclass
+class AllocationData:
+    """Bookkeeping record kept for one allocation tracked by the allocator."""
+    # Handle of the allocation as received by the malloc callback.
+    handle: HandleType
+    # Tag that was current when the allocation was recorded.
+    tag: str
+    # Host-side copy of the allocation's bytes, created when the allocation
+    # is offloaded; None while no backup exists.
+    cpu_backup_tensor: Optional[torch.Tensor] = None
+
+
+def create_and_map(allocation_handle: HandleType) -> None:
+    """
+    Call the extension's `create_and_map` with the unpacked handle.
+
+    :param allocation_handle: The (py_device, py_alignedSize, py_d_mem,
+        py_p_memHandle) tuple describing the allocation.
+    """
+    # NOTE: `py_create_and_map` is None when the extension is disabled or
+    # failed to import; calling this function then raises TypeError.
+    py_create_and_map(*allocation_handle)
+
+
+def unmap_and_release(allocation_handle: HandleType) -> None:
+    """
+    Call the extension's `unmap_and_release` with the unpacked handle.
+
+    :param allocation_handle: The (py_device, py_alignedSize, py_d_mem,
+        py_p_memHandle) tuple describing the allocation.
+    """
+    # NOTE: `py_unmap_and_release` is None when the extension is disabled or
+    # failed to import; calling this function then raises TypeError.
+    py_unmap_and_release(*allocation_handle)
+
+
+def get_pluggable_allocator(
+    python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
+    python_free_func: Callable[[int], tuple[int, int, int, int]],
+) -> torch.cuda.memory.CUDAPluggableAllocator:
+    current_device = torch.cuda.current_device()
+    init_module(python_malloc_fn, python_free_func, current_device)
+    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        lib_name, 'my_malloc', 'my_free'
+    )
+    return new_alloc
+
+
+@contextmanager
+def use_memory_pool_with_allocator(
+        python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
+        python_free_func: Callable[[int], tuple[int, int, int, int]]):
+    new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
+    mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator)
+    with torch.cuda.memory.use_mem_pool(mem_pool):
+        yield mem_pool, new_alloc
+
+
+class XpuMemAllocator:
+    """
+    A singleton class that manages a memory pool for Kunlun XPU tensors.
+    The memory in this pool can be offloaded or discarded when the
+    allocator sleeps.
+    Inside the `use_memory_pool(tag)` context, all tensors created will
+    be allocated in the memory pool, and has the same tag as the
+    tag passed to the context.
+    When we call `sleep`, all tensors with the specified tag will be
+    offloaded to CPU memory, and the rest of the tensors will be discarded.
+    When we call `wake_up`, all tensors that are previously offloaded
+    will be loaded back to GPU memory, and the rest of the tensors will
+    have empty memory.
+    Why it needs to be a singleton?
+    When allocated tensors are garbage collected, PyTorch will call
+    the free callback, which will call the `python_free_callback` method.
+    The C-extension uses a global variable to store the function of an
+    instance of this class. If we create multiple instances of this class,
+    the global variable will be overwritten and the free callback will
+    not work as expected.
+    """
+    nstance = None
+    default_tag: str = "default"
+
+    @staticmethod
+    def get_instance() -> "XpuMemAllocator":
+        """
+        XpuMemAllocator is a singleton class.
+        We cannot call the constructor directly.
+        Call this method to get the instance.
+        """
+        assert xpumem_available, "xpumem allocator is not available"
+        if XpuMemAllocator.nstance is None:
+            XpuMemAllocator.nstance = XpuMemAllocator()
+        return XpuMemAllocator.nstance
+
+    def __init__(self):
+        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+        assert "expandable_segments:True" not in conf, \
+            ("Expandable segments are not compatible with memory pool. "
+            "Please track https://github.com/pytorch/pytorch/issues/147851 "
+            "for the latest updates.")
+
+        self.pointer_to_data: dict[int, AllocationData] = {}
+        self.current_tag: str = XpuMemAllocator.default_tag
+        self.allocator_and_pools: dict[str, Any] = {}
+
+    def python_malloc_callback(self, allocation_handle: HandleType) -> None:
+        """
+        Internal method to store the allocation data
+        when memory is allocated in the memory pool."""
+        py_d_mem = allocation_handle[2]
+        self.pointer_to_data[py_d_mem] = AllocationData(
+            allocation_handle, self.current_tag)
+        return
+
+    def python_free_callback(self, ptr: int) -> HandleType:
+        """
+        Internal method to look up the allocation data
+        when memory is freed in the memory pool."""
+        data = self.pointer_to_data.pop(ptr)
+        if data.cpu_backup_tensor is not None:
+            data.cpu_backup_tensor = None
+        return data.handle
+
+    @contextmanager
+    def use_memory_pool(self, tag: Optional[str] = None):
+        """
+        A context manager to use the memory pool.
+        All memory allocation created inside the context will be allocated 
+        in the memory pool, and has the specified tag.
+        :param tag: The tag of the memory allocation. If None, the default tag
+            will be used.
+        """
+        if tag is None:
+            tag = XpuMemAllocator.default_tag
+
+        assert isinstance(tag, str)
+
+        old_tag = self.current_tag
+        self.current_tag = tag
+        with use_memory_pool_with_allocator(self.python_malloc_callback,
+                                            self.python_free_callback) as data:
+            # start to hit another PyTorch bug in PyTorch 2.6,
+            # possibly because of gc-related issue w.r.t. the allocator and
+            # the memory pool.
+            # to avoid the issue, we keep a reference of the data.
+            # see https://github.com/pytorch/pytorch/issues/146431 .
+            self.allocator_and_pools[tag] = data
+            yield
+            # PyTorch's bug, calling torch.cuda.empty_cache() will error
+            # when using pluggable allocator, see
+            # https://github.com/pytorch/pytorch/issues/145168 .
+            # if we have some memory allocated and then freed,
+            # the memory will not be released, e.g. in online quantization,
+            # where the model is created in higher precision, and then
+            # quantized in lower precision.
+            # Find all unused allocations and manually release them.
+            # TODO: we should expose `empty_cache` method in the memory pool.
+            # TODO: ask for help from PyTorch team to expose this method.
+            # allocations = data[0].snapshot()
+            # for allocation in allocations:
+            #     if allocation["allocated_size"] == 0:
+            #         handle = self._python_free_callback(allocation["address"])
+            #         unmap_and_release(handle)
+            self.current_tag = old_tag
+
+    def get_current_usage(self) -> int:
+        """
+        Get the total number of bytes allocated in the memory pool.
+        """
+        sum_bytes: int = 0
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            sum_bytes += handle[1]
+        return sum_bytes
+
+    def vxpu_try_lock_gpu(self) -> tuple[bool, bool]:
+        if try_lock_gpu:
+            return try_lock_gpu()
+        else:
+            return False, False
+
+    def _vxpu_lock_gpu(self) -> bool:
+        while True:
+            success, _ = self.vxpu_try_lock_gpu()
+            if success:
+                return True
+            time.sleep(0.001)
+
+    def vxpu_unlock_gpu(self):
+        if unlock_gpu:
+            unlock_gpu()
+
+    def get_pool_mem_info(self) -> tuple[int, int]:
+        """
+        get memory info (available, total) in reserved pool.
+        """
+        return get_mem_info()
+
+    def offload_vram(
+            self,
+            offload_tags: Optional[Union[tuple[str, ...],
+                                         str]] = None) -> None:
+        """
+        Put the allocator in sleep mode.
+        All data in the memory allocation with the specified tag will be 
+        offloaded to CPU memory, and others will be discarded.
+        :param offload_tags: The tags of the memory allocation that will be
+            offloaded. The rest of the memory allocation will be discarded.
+        """
+        if offload_tags is None:
+            # by default, allocated tensors are offloaded
+            # when the allocator sleeps
+            offload_tags = (XpuMemAllocator.default_tag,)
+        elif isinstance(offload_tags, str):
+            offload_tags = (offload_tags,)
+
+        assert isinstance(offload_tags, tuple)
+
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if data.tag in offload_tags:
+                size_in_bytes = handle[1]
+                if data.cpu_backup_tensor is None:
+                    cpu_backup_tensor = torch.empty(
+                        size_in_bytes,
+                        dtype=torch.uint8,
+                        device='cpu',
+                        pin_memory=True)
+                    cpu_ptr = cpu_backup_tensor.data_ptr()
+                    XPU_DEVICE_TO_HOST = 0
+                    xpu_memcpy(cpu_ptr, ptr, size_in_bytes, XPU_DEVICE_TO_HOST)
+                    data.cpu_backup_tensor = cpu_backup_tensor
+                unmap_and_release(handle)
+            else:
+                unmap_and_release(handle)
+
+        self.vxpu_unlock_gpu()
+
+    def try_reload_vram(self, tags: Optional[list[str]] = None) -> tuple[bool, bool]:
+        succ, prev_is_self = self.vxpu_try_lock_gpu()
+        if not succ:
+            # not get the lock
+            return False, prev_is_self
+
+        if prev_is_self:
+            # nothing to do
+            return succ, prev_is_self
+
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if tags is None or data.tag in tags:
+                create_and_map(handle)
+                if data.cpu_backup_tensor is not None:
+                    cpu_backup_tensor = data.cpu_backup_tensor
+                    size_in_bytes = (
+                        cpu_backup_tensor.numel() * cpu_backup_tensor.element_size()
+                    )
+                    cpu_ptr = cpu_backup_tensor.data_ptr()
+                    XPU_HOST_TO_DEVICE = 1
+                    xpu_memcpy(ptr, cpu_ptr, size_in_bytes, XPU_HOST_TO_DEVICE)
+                    # data.cpu_backup_tensor = None
+        return succ, prev_is_self