Files
xc-llm-kunlun/vllm_kunlun/csrc/xpumem_allocator.cpp
2026-03-02 18:38:10 +08:00

154 lines
4.8 KiB
C++

#include <pybind11/pybind11.h>
#include <iostream>
#include <stdexcept>
#include <stdint.h>
#include <atomic>
#include <tuple>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include "vxpu_offload/xpu_helper.h"
#include "vxpu_offload/shm_worker.h"
namespace py = pybind11;
namespace {
// vxpu: process-wide state of the memory pool served by my_malloc().
// Set to true by init_module(); later init_module() calls skip the pool setup.
static std::atomic<bool> g_initialized(false);
// Base address of the pool, written by xpu_ipc_open_memhandle() in
// init_module(). nullptr until then.
static void *g_vmem = nullptr;
// Pool size in bytes; its address is handed to ShmWorker::register_worker(),
// which presumably fills it in (TODO confirm against shm_worker.h).
static size_t g_size = 0;
// Byte offset of the next allocation inside the pool. my_malloc() advances it;
// nothing in this file ever decreases it (my_free() does not return space).
static std::atomic_uint_fast64_t g_allocated_offset(0);
// Worker created by init_module(); nullptr until then. Not declared `static`,
// but still has internal linkage because it lives in the anonymous namespace.
ShmWorker *shm_worker = nullptr;
// Every allocation size is rounded up to a multiple of this value.
static const size_t granularity = 2 * 1024 * 1024; // 2MB
// Global references to Python callables
// NOTE: these are py::function objects, i.e. they hold owning references to
// the Python callables and release them when reassigned or destroyed. The
// earlier raw-PyObject* variant (kept below, commented out) stored borrowed
// references instead; the "no DECREF needed" remark applied to that variant.
// Because the callbacks are process-wide globals, the allocator is still
// effectively a singleton.
// static PyObject* g_python_malloc_callback = nullptr;
// static PyObject* g_python_free_callback = nullptr;
static py::function g_python_malloc_callback;
static py::function g_python_free_callback;
extern "C" {
/**
 * Allocator hook: carves `size` bytes (rounded up to `granularity`) out of the
 * pre-reserved pool and reports the allocation to the Python malloc callback.
 *
 * @param size   Requested size in bytes; must not be negative.
 * @param device Device index, forwarded to the Python callback.
 * @param stream Unused.
 * @return Pointer inside the pool (`g_vmem` + reserved offset).
 * @throws std::runtime_error if `size` is negative, the Python callback is not
 *         set, the pool has not been mapped, or the pool cannot satisfy the
 *         request. Exceptions raised by the Python callback propagate as-is.
 *
 * The pool is a bump allocator: space is never handed back by my_free().
 */
void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
  if (size < 0) {
    // A negative ssize_t would silently turn into a huge (or wrapped) unsigned
    // value in the rounding arithmetic below.
    throw std::runtime_error(
        "my_malloc ERROR: negative allocation size." +
        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }
  // Validate everything that can fail *before* reserving, so that a failed
  // call never consumes pool space.
  if (!g_python_malloc_callback) {
    throw std::runtime_error(
        "my_malloc ERROR: g_python_malloc_callback is not callable." +
        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }
  if (g_vmem == nullptr) {
    throw std::runtime_error(
        "my_malloc ERROR: memory pool is not initialized." +
        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }

  const size_t requested = static_cast<size_t>(size);
  const size_t aligned_size =
      ((requested + granularity - 1) / granularity) * granularity;

  // Reserve with a CAS loop instead of an unconditional fetch_add: the offset
  // only moves when the request actually fits, so an oversized request does
  // not exhaust the pool for later, smaller ones. The comparison is written
  // as a subtraction so it cannot wrap around.
  uint_fast64_t alloc_offset = g_allocated_offset.load();
  do {
    if (alloc_offset > g_size || aligned_size > g_size - alloc_offset) {
      throw std::runtime_error(
          "my_malloc ERROR: Out of memory in the reserved pool." +
          std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
    }
    // On failure compare_exchange_weak reloads alloc_offset with the current
    // value, and the capacity check is repeated against it.
  } while (!g_allocated_offset.compare_exchange_weak(
      alloc_offset, alloc_offset + aligned_size));

  void *d_ptr = reinterpret_cast<void *>(
      reinterpret_cast<uintptr_t>(g_vmem) + alloc_offset);

  {
    py::gil_scoped_acquire gil;
    // The callback receives (device, aligned_size, pointer, handle); the
    // handle slot is always 0 here.
    const unsigned long long fake_handle = 0;
    auto handle_tuple = std::make_tuple(
        static_cast<unsigned long long>(device),
        static_cast<unsigned long long>(aligned_size),
        static_cast<unsigned long long>(reinterpret_cast<uintptr_t>(d_ptr)),
        fake_handle);
    g_python_malloc_callback(handle_tuple);
  }
  return d_ptr;
}
/**
 * Allocator hook: notifies the Python free callback that `ptr` was released.
 *
 * Only the pointer value is forwarded; `size`, `device` and `stream` are
 * unused, and no pool space is reclaimed here.
 *
 * @throws std::runtime_error if the Python free callback has not been set.
 */
void my_free(void *ptr, ssize_t size, int device, cudaStream_t stream) {
  // Guard clause: without a callback there is nobody to notify.
  if (!g_python_free_callback) {
    throw std::runtime_error(
        "my_free ERROR: g_python_free_callback is not callable." +
        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }

  // The GIL must be held while calling into Python and while the returned
  // object is destroyed; the return value itself is ignored.
  py::gil_scoped_acquire gil;
  g_python_free_callback(reinterpret_cast<unsigned long long>(ptr));
}
} // extern "C"
/**
 * Registers the Python callbacks and, on the first successful call, sets up
 * the memory pool: creates the ShmWorker, registers it for `device_id`, and
 * opens the IPC memory handle it returns.
 *
 * The callbacks are replaced on every call. Pool setup runs only until it has
 * succeeded once; a failed attempt leaves the module uninitialized so the
 * caller may retry.
 *
 * @param malloc_cb Python callable invoked by my_malloc().
 * @param free_cb   Python callable invoked by my_free().
 * @param device_id Device index passed to ShmWorker::register_worker().
 * @throws std::runtime_error if worker registration or opening the memory
 *         handle fails.
 */
void init_module(py::function malloc_cb, py::function free_cb, int device_id) {
  g_python_malloc_callback = malloc_cb;
  g_python_free_callback = free_cb;
  // init vxpu
  if (g_initialized.load()) {
    return;
  }

  // Build everything in locals and publish to the globals only on success.
  // Otherwise a failure would leave g_initialized == true with a leaked
  // worker and possibly g_size != 0 while g_vmem is still nullptr.
  ShmWorker *worker = new ShmWorker();
  XPUIpcMemHandle mem_handle;
  size_t pool_size = 0;
  const bool registered =
      worker->register_worker(device_id, &mem_handle, &pool_size);
  if (!registered) {
    delete worker;
    throw std::runtime_error(
        "init_module ERROR: Failed to register shm worker." + std::string(" ") +
        __FILE__ + ":" + std::to_string(__LINE__));
  }

  // open mem handle
  // NOTE(review): the meaning of the trailing flag value 1 is defined by
  // xpu_ipc_open_memhandle; kept exactly as before.
  void *pool_base = nullptr;
  const int ret = xpu_ipc_open_memhandle(&pool_base, mem_handle, 1);
  if (ret != XPU_SUCCESS) {
    // NOTE(review): assumes ~ShmWorker() releases the registration made
    // above — confirm against shm_worker.h.
    delete worker;
    throw std::runtime_error(
        "init_module ERROR: xpu_ipc_open_memhandle failed." + std::string(" ") +
        __FILE__ + ":" + std::to_string(__LINE__));
  }

  shm_worker = worker;
  g_vmem = pool_base;
  g_size = pool_size;
  g_initialized.store(true);
}
/**
 * Intentionally a no-op: all four arguments are accepted and ignored.
 * The function is exported to Python by the module definition.
 */
void create_and_map(unsigned long long device, size_t size, uintptr_t p_mem,
                    uint64_t handle) {
  // Nothing to do.
}
/**
 * Intentionally a no-op: all four arguments are accepted and ignored.
 * The function is exported to Python by the module definition.
 */
void unmap_and_release(unsigned long long device, uintptr_t p_mem, size_t size,
                       uint64_t handle) {
  // Nothing to do.
}
void my_xpu_memcpy(uintptr_t dst, uintptr_t src, uint64_t sz, int kind) {
XPUMemcpyKind memcpy_kind = static_cast<XPUMemcpyKind>(kind);
int ret = xpu_memcpy((void *)dst, (const void *)src, sz, memcpy_kind);
if (ret != XPU_SUCCESS) {
throw std::runtime_error("my_xpu_memcpy ERROR: xpu_memcpy failed." +
std::string(" ") + __FILE__ + ":" +
std::to_string(__LINE__));
}
}
/**
 * Reports pool usage as (free_bytes, total_bytes).
 *
 * free_bytes is the pool size minus the current allocation offset, clamped to
 * zero when the offset has reached or passed the pool size.
 */
std::tuple<size_t, size_t> get_mem_info() {
  const size_t used = g_allocated_offset.load();
  const size_t remaining = (used < g_size) ? (g_size - used) : 0;
  return std::make_tuple(remaining, g_size);
}
/**
 * Attempts to take the GPU lock through the shared-memory worker.
 *
 * @return (success, prev_is_self): the worker's return value, plus the flag
 *         passed to ShmWorker::try_lock_gpu() (presumably set by the worker
 *         when this process was the previous holder — confirm against
 *         shm_worker.h).
 * @throws std::runtime_error if init_module() has not created the worker yet;
 *         previously this dereferenced a null pointer.
 */
std::tuple<bool, bool> try_lock_gpu() {
  if (shm_worker == nullptr) {
    throw std::runtime_error(
        "try_lock_gpu ERROR: shm worker is not initialized, call init_module first." +
        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }
  bool prev_is_self = false;
  const bool success = shm_worker->try_lock_gpu(prev_is_self);
  return std::make_tuple(success, prev_is_self);
}
/**
 * Releases the GPU lock through the shared-memory worker.
 *
 * @throws std::runtime_error if init_module() has not created the worker yet;
 *         previously this dereferenced a null pointer.
 */
void unlock_gpu() {
  if (shm_worker == nullptr) {
    throw std::runtime_error(
        "unlock_gpu ERROR: shm worker is not initialized, call init_module first." +
        std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
  }
  shm_worker->unlock_gpu();
}
} // namespace
// Python bindings for the `_kunlun_vxpu` extension module.
PYBIND11_MODULE(_kunlun_vxpu, m) {
  // Setup: the only binding with named (keyword-capable) arguments.
  m.def("init_module", &init_module,
        py::arg("malloc_cb"),
        py::arg("free_cb"),
        py::arg("device_id"));

  // Mapping entry points (currently no-ops on the C++ side).
  m.def("create_and_map", &create_and_map);
  m.def("unmap_and_release", &unmap_and_release);

  // Raw memory copy and pool statistics.
  m.def("my_xpu_memcpy", &my_xpu_memcpy);
  m.def("get_mem_info", &get_mem_info);

  // GPU lock handling through the shared-memory worker.
  m.def("try_lock_gpu", &try_lock_gpu);
  m.def("unlock_gpu", &unlock_gpu);
}