vllm-ascend vnpu v1

This commit is contained in:
starkwj
2025-12-26 07:37:35 +00:00
parent 2f1aed98cc
commit 135cc0a505
168 changed files with 28337 additions and 9 deletions

View File

@@ -17,6 +17,9 @@
#include <iostream>
#include <stdexcept>
#include <string>
#include <atomic>
#include "idle_offload/shm_worker.h"
extern "C" {
@@ -26,6 +29,13 @@ extern "C" {
#include <sys/types.h>
#include "acl/acl.h"
// idle offload
// Flipped to true by py_init_module_offload the first time it runs; later
// calls see it set and skip the one-time pool setup.
static std::atomic<bool> g_initialized(false);
// Base of the virtual address range that py_init_module_offload reserves with
// aclrtReserveMemAddress and maps with aclrtMapMem.
static void *g_d_mem = nullptr;
// Size in bytes of that range; filled in by ShmWorker::register_worker during
// py_init_module_offload.
static size_t g_size = 0;
// Byte offset of the next unallocated position inside the pool.
// my_malloc_offload advances it; my_free_offload never moves it back.
static std::atomic_uint_fast64_t g_allocated_offset(0);
// Created in py_init_module_offload; used by the lock/unlock Python entry
// points. Stays nullptr until initialization has run.
ShmWorker *shm_worker = nullptr;
// Global references to Python callables
// NOTE: this is borrowed reference, so we don't need to DECREF them.
// This brings the limitation that the allocator needs to be singleton.
@@ -248,6 +258,144 @@ __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, in
free(p_memHandle);
}
__attribute__((visibility("default"))) void *
my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
ensure_context(device);
// first allocation, align the size, and reserve an address, and also allocate
// a aclrtDrvMemHandle
// Define memory allocation properties
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
// Check if the allocation is supported
size_t granularity;
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
&granularity);
if (error_code != 0) {
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
void *d_mem;
// error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
// if (error_code != 0) {
// if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
// std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
// __FILE__ + ":" + std::to_string(__LINE__));
// } else {
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
// }
// }
// allocate from the reserved pool
size_t alloc_offset = g_allocated_offset.fetch_add(alignedSize);
if (alloc_offset + alignedSize > g_size) {
throw std::runtime_error(
"my_malloc ERROR: Out of memory in the reserved pool." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
d_mem = (void *)((char *)g_d_mem + alloc_offset);
// allocate the aclrtDrvMemHandle
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
if (!g_python_malloc_callback) {
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* arg_tuple = create_tuple_from_c_integers(
(unsigned long long)device, (unsigned long long)alignedSize,
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
// Call g_python_malloc_callback
PyObject* py_result =
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
Py_DECREF(arg_tuple);
if (!py_result) {
PyErr_Print();
PyGILState_Release(gstate);
return nullptr;
}
PyGILState_Release(gstate);
// // do the final mapping
// create_and_map(device, alignedSize, d_mem, p_memHandle);
return (void*)d_mem;
}
__attribute__((visibility("default"))) void
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
// get memory handle from the pointer
if (!g_python_free_callback) {
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* py_ptr =
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
PyObject* py_result =
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
return;
}
unsigned long long recv_device, recv_size;
unsigned long long recv_d_mem, recv_p_memHandle;
// Unpack the tuple into four C integers
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
&recv_d_mem, &recv_p_memHandle)) {
// PyArg_ParseTuple sets an error if it fails
return;
}
PyGILState_Release(gstate);
// recv_size == size
// recv_device == device
// Free memory
// nothing to do
// void *d_mem = (void*)recv_d_mem;
// // allocate the aclrtDrvMemHandle
// aclrtDrvMemHandle* p_memHandle =
// (aclrtDrvMemHandle*)recv_p_memHandle;
// unmap_and_release(device, size, d_mem, p_memHandle);
// // free address and the handle
// aclError error_code = aclrtReleaseMemAddress(d_mem);
// if (error_code != 0) {
// throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
// }
// free(p_memHandle);
}
// ---------------------------------------------------------------------------
// Python extension boilerplate:
@@ -322,6 +470,116 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
Py_RETURN_NONE;
}
// Python entry point `init_module_offload(malloc_callback, free_callback)`.
//
// Stores the two Python callables (borrowed references: the caller must keep
// them alive) and, on the first call only, sets up the offload pool:
//   1. creates the ShmWorker and registers this process with it, which
//      yields a shareable memory handle and the pool size (g_size);
//   2. imports the shareable handle;
//   3. reserves a virtual address range of g_size bytes (g_d_mem);
//   4. maps the imported memory into that range.
// Subsequent calls only replace the callbacks and print a notice.
//
// Returns None on success. On failure a Python exception is set and nullptr
// is returned (TypeError for bad arguments, RuntimeError for setup failures);
// the module is left uninitialized so that a later call can retry.
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
  PyObject* malloc_callback = nullptr;
  PyObject* free_callback = nullptr;

  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
    return nullptr;
  }

  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // Save the Python callables
  // This module does not handle GC of these objects, so they must be kept alive
  // outside of this module.
  g_python_malloc_callback = malloc_callback;
  g_python_free_callback = free_callback;

  // init idle
  // exchange() makes the check-and-claim a single atomic step.
  if (g_initialized.exchange(true)) {
    printf("Module already initialized.\n");
    Py_RETURN_NONE;
  }

  // C++ exceptions must not propagate into the Python interpreter, so the
  // whole setup runs inside a try block and failures are converted into a
  // Python RuntimeError.
  try {
    shm_worker = new ShmWorker();

    // get pid
    int32_t pid;
    aclError error_code = aclrtDeviceGetBareTgid(&pid);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtDeviceGetBareTgid failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    uint64_t shareable_handle;
    shm_worker->register_worker(pid, &shareable_handle, &g_size);

    // import shareable handle
    // NOTE(review): the device id is fixed to 0 here -- confirm this is
    // intended for every worker.
    uint32_t device = 0;
    aclrtDrvMemHandle memHandle;
    error_code =
        aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtMemImportFromShareableHandle failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    // reserve virtual address
    error_code = aclrtReserveMemAddress(&g_d_mem, g_size, 0, nullptr, 0);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtReserveMemAddress failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    // map
    error_code = aclrtMapMem(g_d_mem, g_size, 0, memHandle, 0);
    if (error_code != 0) {
      throw std::runtime_error("aclrtMapMem failed with acl error code: " +
                               std::to_string(error_code) + " " + __FILE__ + ":" +
                               std::to_string(__LINE__));
    }
  } catch (const std::exception& e) {
    // Roll back to the "not initialized" state so the allocator cannot hand
    // out addresses from a pool that was never mapped.
    // NOTE(review): assumes ShmWorker's destructor undoes register_worker;
    // an imported handle / reserved address range obtained before the
    // failure is not released on this path.
    delete shm_worker;
    shm_worker = nullptr;
    g_d_mem = nullptr;
    g_size = 0;
    g_initialized.store(false);
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return nullptr;
  }

  Py_RETURN_NONE;
}
// Python entry point `python_unmap_and_release_offload`.
// Intentionally a no-op: it ignores its arguments and always returns None.
static PyObject *python_unmap_and_release_offload(PyObject *self,
                                                  PyObject *args) {
  (void)self;
  (void)args;
  // nothing to do
  Py_INCREF(Py_None);
  return Py_None;
}
// Python entry point `python_create_and_map_offload`.
// Intentionally a no-op: it ignores its arguments and always returns None.
static PyObject *python_create_and_map_offload(PyObject *self, PyObject *args) {
  (void)self;
  (void)args;
  // nothing to do
  Py_INCREF(Py_None);
  return Py_None;
}
// Python entry point `python_get_mem_info_offload()`.
//
// Returns a 2-tuple `(free_bytes, total_bytes)` describing the reserved pool:
// `total_bytes` is g_size and `free_bytes` is g_size minus the current bump
// offset, clamped at 0. Returns nullptr with a Python error set if any of the
// result objects cannot be created.
static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
  const size_t total_bytes = g_size;
  const size_t allocated_bytes = g_allocated_offset.load();
  // The offset can exceed the pool size, so clamp instead of letting the
  // subtraction wrap around.
  const size_t free_mem =
      allocated_bytes >= total_bytes ? 0 : total_bytes - allocated_bytes;

  PyObject* py_free = PyLong_FromSize_t(free_mem);
  PyObject* py_total = PyLong_FromSize_t(total_bytes);
  PyObject* tuple = PyTuple_New(2);
  if (!py_free || !py_total || !tuple) {
    // Never hand Python a tuple with an empty slot.
    Py_XDECREF(py_free);
    Py_XDECREF(py_total);
    Py_XDECREF(tuple);
    return nullptr;
  }

  // PyTuple_SetItem takes ownership of the item references.
  PyTuple_SetItem(tuple, 0, py_free);
  PyTuple_SetItem(tuple, 1, py_total);
  return tuple;
}
// Python entry point `python_lock_gpu_offload()`.
//
// Forwards to ShmWorker::lock_gpu() and returns its boolean result as a
// Python bool. Raises RuntimeError if the ShmWorker has not been created yet,
// i.e. init_module_offload has not run.
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
  if (!shm_worker) {
    PyErr_SetString(PyExc_RuntimeError,
                    "python_lock_gpu_offload called before init_module_offload");
    return nullptr;
  }
  const bool prev_is_self = shm_worker->lock_gpu();
  return PyBool_FromLong(prev_is_self);
}
// Python entry point `python_unlock_gpu_offload()`.
//
// Forwards to ShmWorker::unlock_gpu() and returns None. Raises RuntimeError
// if the ShmWorker has not been created yet, i.e. init_module_offload has not
// run.
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
  if (!shm_worker) {
    PyErr_SetString(PyExc_RuntimeError,
                    "python_unlock_gpu_offload called before init_module_offload");
    return nullptr;
  }
  shm_worker->unlock_gpu();
  Py_RETURN_NONE;
}
static PyMethodDef module_methods[] = {
{"init_module", (PyCFunction)py_init_module, METH_VARARGS,
"Initialize module with python_malloc and python_free callables."},
@@ -329,7 +587,21 @@ static PyMethodDef module_methods[] = {
"Create and map memory on the device."},
{"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
METH_VARARGS, "Unmap and release memory on the device."},
{NULL, NULL, 0, NULL} // sentinel
{"init_module_offload", (PyCFunction)py_init_module_offload, METH_VARARGS,
"Initialize module with python_malloc and python_free callables."},
{"python_create_and_map_offload",
(PyCFunction)python_create_and_map_offload, METH_VARARGS,
"Create and map memory on the device."},
{"python_unmap_and_release_offload",
(PyCFunction)python_unmap_and_release_offload, METH_VARARGS,
"Unmap and release memory on the device."},
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
METH_NOARGS, "Get mem info in the reserved pool."},
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
METH_NOARGS, "Lock GPU."},
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
METH_NOARGS, "Unlock GPU."},
{NULL, NULL, 0, NULL} // sentinel
};
static struct PyModuleDef camem_allocator_module = {