// Kunlun vXPU offload: pybind11 extension module providing a bump-pointer
// allocator over a shared device memory pool plus IPC lock/memcpy helpers.
#include <pybind11/pybind11.h>

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <atomic>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <stdint.h>
#include <string>
#include <tuple>

#include "vxpu_offload/xpu_helper.h"
#include "vxpu_offload/shm_worker.h"
|
|
|
|
namespace py = pybind11;
|
|
|
|
namespace {
|
|
|
|
// vxpu
|
|
static std::atomic<bool> g_initialized(false);
|
|
static void *g_vmem = nullptr;
|
|
static size_t g_size = 0;
|
|
static std::atomic_uint_fast64_t g_allocated_offset(0);
|
|
ShmWorker *shm_worker = nullptr;
|
|
static const size_t granularity = 2 * 1024 * 1024; // 2MB
|
|
|
|
// Global references to Python callables
|
|
// NOTE: this is borrowed reference, so we don't need to DECREF them.
|
|
// This brings the limitation that the allocator needs to be singleton.
|
|
// static PyObject* g_python_malloc_callback = nullptr;
|
|
// static PyObject* g_python_free_callback = nullptr;
|
|
static py::function g_python_malloc_callback;
|
|
static py::function g_python_free_callback;
|
|
|
|
|
|
extern "C" {
|
|
void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
|
|
size_t aligned_size = ((size + granularity - 1) / granularity) * granularity;
|
|
size_t alloc_offset = g_allocated_offset.fetch_add(aligned_size);
|
|
if (alloc_offset + aligned_size > g_size) {
|
|
throw std::runtime_error(
|
|
"my_malloc ERROR: Out of memory in the reserved pool." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
void *d_ptr = (void *)((uintptr_t)g_vmem + alloc_offset);
|
|
|
|
if (!g_python_malloc_callback) {
|
|
throw std::runtime_error(
|
|
"my_malloc ERROR: g_python_malloc_callback is not callable." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
} else {
|
|
py::gil_scoped_acquire gil;
|
|
unsigned long long fake_handle = 0;
|
|
auto handle_tuple = std::make_tuple(
|
|
(unsigned long long)device, (unsigned long long)aligned_size,
|
|
(unsigned long long)d_ptr, (unsigned long long)fake_handle);
|
|
g_python_malloc_callback(handle_tuple);
|
|
}
|
|
|
|
return d_ptr;
|
|
}
|
|
|
|
void my_free(void *ptr, ssize_t size, int device, cudaStream_t stream) {
|
|
if (!g_python_free_callback) {
|
|
throw std::runtime_error(
|
|
"my_free ERROR: g_python_free_callback is not callable." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
} else {
|
|
py::gil_scoped_acquire gil;
|
|
py::object result = g_python_free_callback((unsigned long long)ptr);
|
|
// nothing to do
|
|
}
|
|
}
|
|
|
|
} // extern "C"
|
|
|
|
|
|
void init_module(py::function malloc_cb, py::function free_cb, int device_id) {
|
|
g_python_malloc_callback = malloc_cb;
|
|
g_python_free_callback = free_cb;
|
|
|
|
// init vxpu
|
|
if (g_initialized.load()) {
|
|
return;
|
|
}
|
|
g_initialized.store(true);
|
|
|
|
shm_worker = new ShmWorker();
|
|
XPUIpcMemHandle mem_handle;
|
|
bool res = shm_worker->register_worker(device_id, &mem_handle, &g_size);
|
|
if (!res) {
|
|
throw std::runtime_error(
|
|
"init_module ERROR: Failed to register shm worker." + std::string(" ") +
|
|
__FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
// open mem handle
|
|
int ret = xpu_ipc_open_memhandle(&g_vmem, mem_handle, 1);
|
|
if (ret != XPU_SUCCESS) {
|
|
throw std::runtime_error(
|
|
"init_module ERROR: xpu_ipc_open_memhandle failed." + std::string(" ") +
|
|
__FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
}
|
|
|
|
void create_and_map(unsigned long long device, size_t size, uintptr_t p_mem,
|
|
uint64_t handle) {
|
|
return;
|
|
}
|
|
|
|
void unmap_and_release(unsigned long long device, uintptr_t p_mem, size_t size,
|
|
uint64_t handle) {
|
|
return;
|
|
}
|
|
|
|
void my_xpu_memcpy(uintptr_t dst, uintptr_t src, uint64_t sz, int kind) {
|
|
XPUMemcpyKind memcpy_kind = static_cast<XPUMemcpyKind>(kind);
|
|
int ret = xpu_memcpy((void *)dst, (const void *)src, sz, memcpy_kind);
|
|
if (ret != XPU_SUCCESS) {
|
|
throw std::runtime_error("my_xpu_memcpy ERROR: xpu_memcpy failed." +
|
|
std::string(" ") + __FILE__ + ":" +
|
|
std::to_string(__LINE__));
|
|
}
|
|
}
|
|
|
|
std::tuple<size_t, size_t> get_mem_info() {
|
|
size_t allocated_bytes = g_allocated_offset.load();
|
|
size_t free_mem = 0;
|
|
if (allocated_bytes >= g_size) {
|
|
free_mem = 0;
|
|
} else {
|
|
free_mem = g_size - allocated_bytes;
|
|
}
|
|
return std::make_tuple(free_mem, g_size);
|
|
}
|
|
|
|
std::tuple<bool, bool> try_lock_gpu() {
|
|
bool prev_is_self = false;
|
|
bool success = shm_worker->try_lock_gpu(prev_is_self);
|
|
return std::make_tuple(success, prev_is_self);
|
|
}
|
|
|
|
void unlock_gpu() { shm_worker->unlock_gpu(); }
|
|
|
|
} // namespace
|
|
|
|
|
|
PYBIND11_MODULE(_kunlun_vxpu, m) {
|
|
m.def("init_module", &init_module, py::arg("malloc_cb"), py::arg("free_cb"), py::arg("device_id"));
|
|
m.def("create_and_map", &create_and_map);
|
|
m.def("unmap_and_release", &unmap_and_release);
|
|
m.def("my_xpu_memcpy", &my_xpu_memcpy);
|
|
m.def("get_mem_info", &get_mem_info);
|
|
m.def("try_lock_gpu", &try_lock_gpu);
|
|
m.def("unlock_gpu", &unlock_gpu);
|
|
}
|