vllm-ascend vnpu v1
This commit is contained in:
@@ -17,6 +17,9 @@
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <atomic>
|
||||
|
||||
#include "idle_offload/shm_worker.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -26,6 +29,13 @@ extern "C" {
|
||||
#include <sys/types.h>
|
||||
#include "acl/acl.h"
|
||||
|
||||
// idle offload
|
||||
static std::atomic<bool> g_initialized(false);
|
||||
static void *g_d_mem = nullptr;
|
||||
static size_t g_size = 0;
|
||||
static std::atomic_uint_fast64_t g_allocated_offset(0);
|
||||
ShmWorker *shm_worker = nullptr;
|
||||
|
||||
// Global references to Python callables
|
||||
// NOTE: this is borrowed reference, so we don't need to DECREF them.
|
||||
// This brings the limitation that the allocator needs to be singleton.
|
||||
@@ -248,6 +258,144 @@ __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, in
|
||||
free(p_memHandle);
|
||||
}
|
||||
|
||||
__attribute__((visibility("default"))) void *
|
||||
my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
|
||||
ensure_context(device);
|
||||
|
||||
// first allocation, align the size, and reserve an address, and also allocate
|
||||
// a aclrtDrvMemHandle
|
||||
|
||||
// Define memory allocation properties
|
||||
aclrtPhysicalMemProp prop = {};
|
||||
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
|
||||
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.memAttr = ACL_HBM_MEM_HUGE;
|
||||
prop.location.id = device;
|
||||
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.reserve = 0;
|
||||
|
||||
// Check if the allocation is supported
|
||||
size_t granularity;
|
||||
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
|
||||
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
|
||||
&granularity);
|
||||
if (error_code != 0) {
|
||||
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
|
||||
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
||||
void *d_mem;
|
||||
// error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
|
||||
// if (error_code != 0) {
|
||||
// if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
|
||||
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
||||
// std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
|
||||
// __FILE__ + ":" + std::to_string(__LINE__));
|
||||
// } else {
|
||||
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
||||
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
// }
|
||||
// }
|
||||
|
||||
// allocate from the reserved pool
|
||||
size_t alloc_offset = g_allocated_offset.fetch_add(alignedSize);
|
||||
if (alloc_offset + alignedSize > g_size) {
|
||||
throw std::runtime_error(
|
||||
"my_malloc ERROR: Out of memory in the reserved pool." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
d_mem = (void *)((char *)g_d_mem + alloc_offset);
|
||||
|
||||
// allocate the aclrtDrvMemHandle
|
||||
aclrtDrvMemHandle* p_memHandle =
|
||||
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
||||
|
||||
if (!g_python_malloc_callback) {
|
||||
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
PyObject* arg_tuple = create_tuple_from_c_integers(
|
||||
(unsigned long long)device, (unsigned long long)alignedSize,
|
||||
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
|
||||
|
||||
// Call g_python_malloc_callback
|
||||
PyObject* py_result =
|
||||
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
|
||||
Py_DECREF(arg_tuple);
|
||||
|
||||
if (!py_result) {
|
||||
PyErr_Print();
|
||||
PyGILState_Release(gstate);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
// // do the final mapping
|
||||
// create_and_map(device, alignedSize, d_mem, p_memHandle);
|
||||
|
||||
return (void*)d_mem;
|
||||
}
|
||||
|
||||
__attribute__((visibility("default"))) void
|
||||
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
|
||||
// get memory handle from the pointer
|
||||
if (!g_python_free_callback) {
|
||||
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
PyObject* py_ptr =
|
||||
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
|
||||
|
||||
PyObject* py_result =
|
||||
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
|
||||
|
||||
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
|
||||
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned long long recv_device, recv_size;
|
||||
unsigned long long recv_d_mem, recv_p_memHandle;
|
||||
// Unpack the tuple into four C integers
|
||||
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
|
||||
&recv_d_mem, &recv_p_memHandle)) {
|
||||
// PyArg_ParseTuple sets an error if it fails
|
||||
return;
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
// recv_size == size
|
||||
// recv_device == device
|
||||
|
||||
// Free memory
|
||||
|
||||
// nothing to do
|
||||
|
||||
// void *d_mem = (void*)recv_d_mem;
|
||||
// // allocate the aclrtDrvMemHandle
|
||||
// aclrtDrvMemHandle* p_memHandle =
|
||||
// (aclrtDrvMemHandle*)recv_p_memHandle;
|
||||
// unmap_and_release(device, size, d_mem, p_memHandle);
|
||||
|
||||
// // free address and the handle
|
||||
// aclError error_code = aclrtReleaseMemAddress(d_mem);
|
||||
// if (error_code != 0) {
|
||||
// throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
|
||||
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
// }
|
||||
// free(p_memHandle);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Python extension boilerplate:
|
||||
|
||||
@@ -322,6 +470,116 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
|
||||
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
  // One-time initialization of the offload allocator:
  //  1. store the Python malloc/free callbacks,
  //  2. register this worker (by device pid) with the shared-memory
  //     coordinator and receive a shareable handle + pool size,
  //  3. import the handle, reserve a virtual range, and map the whole
  //     pool into this process (g_d_mem / g_size).
  // Subsequent calls only refresh the callbacks and return early.
  PyObject* malloc_callback = nullptr;
  PyObject* free_callback = nullptr;

  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
    return nullptr;
  }

  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // Save the Python callables.
  // NOTE: borrowed references — this module does not handle GC of
  // these objects, so they must be kept alive outside of this module.
  g_python_malloc_callback = malloc_callback;
  g_python_free_callback = free_callback;

  // One-shot guard. exchange() makes test-and-set atomic; the original
  // load()+store() pair allowed two threads to both pass the check and
  // double-initialize.
  if (g_initialized.exchange(true)) {
    printf("Module already initialized.\n");
    Py_RETURN_NONE;
  }

  shm_worker = new ShmWorker();

  // Get this process's device-side pid for worker registration.
  aclError error_code;
  int32_t pid;
  error_code = aclrtDeviceGetBareTgid(&pid);
  if (error_code != 0) {
    throw std::runtime_error(
        "aclrtDeviceGetBareTgid failed with acl error code: " +
        std::to_string(error_code) + " " + __FILE__ + ":" +
        std::to_string(__LINE__));
  }
  uint64_t shareable_handle;
  shm_worker->register_worker(pid, &shareable_handle, &g_size);

  // Import the shareable handle published by the pool owner.
  // NOTE(review): device is hard-coded to 0 here — confirm whether
  // multi-device workers need the real device index.
  uint32_t device = 0;
  aclrtDrvMemHandle memHandle;
  error_code =
      aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
  if (error_code != 0) {
    throw std::runtime_error(
        "aclrtMemImportFromShareableHandle failed with acl error code: " +
        std::to_string(error_code) + " " + __FILE__ + ":" +
        std::to_string(__LINE__));
  }

  // Reserve a virtual address range covering the whole pool.
  error_code = aclrtReserveMemAddress(&g_d_mem, g_size, 0, nullptr, 0);
  if (error_code != 0) {
    throw std::runtime_error(
        "aclrtReserveMemAddress failed with acl error code: " +
        std::to_string(error_code) + " " + __FILE__ + ":" +
        std::to_string(__LINE__));
  }
  // Map the imported physical memory into the reserved range.
  error_code = aclrtMapMem(g_d_mem, g_size, 0, memHandle, 0);
  if (error_code != 0) {
    throw std::runtime_error("aclrtMapMem failed with acl error code: " +
                             std::to_string(error_code) + " " + __FILE__ + ":" +
                             std::to_string(__LINE__));
  }

  Py_RETURN_NONE;
}
|
||||
|
||||
static PyObject *python_unmap_and_release_offload(PyObject *self,
                                                  PyObject *args) {
  // Deliberate no-op: the offload pool is mapped once at init time and
  // stays mapped, so there is nothing to unmap per allocation.
  Py_INCREF(Py_None);
  return Py_None;
}
|
||||
|
||||
static PyObject *python_create_and_map_offload(PyObject *self, PyObject *args) {
  // Deliberate no-op: allocations are carved from the pre-mapped pool,
  // so no per-allocation create/map step is required.
  Py_INCREF(Py_None);
  return Py_None;
}
|
||||
|
||||
static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
  // Return (free_bytes, total_bytes) for the reserved offload pool.
  // The bump allocator never reclaims, so "free" is the unused tail of
  // the pool, clamped at 0 in case the offset has raced past g_size.
  size_t allocated_bytes = g_allocated_offset.load();
  size_t free_mem =
      (allocated_bytes >= g_size) ? 0 : (g_size - allocated_bytes);
  // Py_BuildValue propagates item-creation failures as NULL; the
  // original PyTuple_New/PyTuple_SetItem sequence could silently embed
  // a NULL item when PyLong_FromSize_t failed.
  return Py_BuildValue("(KK)", (unsigned long long)free_mem,
                       (unsigned long long)g_size);
}
|
||||
|
||||
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
  // Acquire the cross-process GPU lock through the shared-memory
  // worker. The returned bool is whatever ShmWorker::lock_gpu()
  // reports — presumably whether the previous holder was this worker;
  // verify against the ShmWorker implementation.
  if (!shm_worker) {
    // Guard against calls before init_module_offload; the original
    // dereferenced a null pointer here.
    PyErr_SetString(PyExc_RuntimeError,
                    "init_module_offload must be called before lock_gpu");
    return nullptr;
  }
  bool prev_is_self = shm_worker->lock_gpu();
  return PyBool_FromLong(prev_is_self);
}
|
||||
|
||||
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
  // Release the cross-process GPU lock held via the shared-memory
  // worker.
  if (!shm_worker) {
    // Guard against calls before init_module_offload; the original
    // dereferenced a null pointer here.
    PyErr_SetString(PyExc_RuntimeError,
                    "init_module_offload must be called before unlock_gpu");
    return nullptr;
  }
  shm_worker->unlock_gpu();
  Py_RETURN_NONE;
}
|
||||
|
||||
static PyMethodDef module_methods[] = {
|
||||
{"init_module", (PyCFunction)py_init_module, METH_VARARGS,
|
||||
"Initialize module with python_malloc and python_free callables."},
|
||||
@@ -329,7 +587,21 @@ static PyMethodDef module_methods[] = {
|
||||
"Create and map memory on the device."},
|
||||
{"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
|
||||
METH_VARARGS, "Unmap and release memory on the device."},
|
||||
{NULL, NULL, 0, NULL} // sentinel
|
||||
{"init_module_offload", (PyCFunction)py_init_module_offload, METH_VARARGS,
|
||||
"Initialize module with python_malloc and python_free callables."},
|
||||
{"python_create_and_map_offload",
|
||||
(PyCFunction)python_create_and_map_offload, METH_VARARGS,
|
||||
"Create and map memory on the device."},
|
||||
{"python_unmap_and_release_offload",
|
||||
(PyCFunction)python_unmap_and_release_offload, METH_VARARGS,
|
||||
"Unmap and release memory on the device."},
|
||||
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
|
||||
METH_NOARGS, "Get mem info in the reserved pool."},
|
||||
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
|
||||
METH_NOARGS, "Lock GPU."},
|
||||
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
|
||||
METH_NOARGS, "Unlock GPU."},
|
||||
{NULL, NULL, 0, NULL} // sentinel
|
||||
};
|
||||
|
||||
static struct PyModuleDef camem_allocator_module = {
|
||||
|
||||
Reference in New Issue
Block a user