639 lines
22 KiB
C++
639 lines
22 KiB
C++
/*
 * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include "vnpu_offload/shm_worker.h"
#include "vnpu_offload/npu_helper.h"
|
|
|
|
extern "C" {
|
|
|
|
#define PY_SSIZE_T_CLEAN
|
|
#include <Python.h>
|
|
|
|
#include <sys/types.h>
|
|
#include "acl/acl.h"
|
|
|
|
// idle offload
|
|
static std::atomic<bool> g_initialized(false);
|
|
static void *g_d_mem = nullptr;
|
|
static size_t g_size = 0;
|
|
static std::atomic_uint_fast64_t g_allocated_offset(0);
|
|
ShmWorker *shm_worker = nullptr;
|
|
|
|
// Global references to Python callables
|
|
// NOTE: this is borrowed reference, so we don't need to DECREF them.
|
|
// This brings the limitation that the allocator needs to be singleton.
|
|
static PyObject* g_python_malloc_callback = nullptr;
|
|
static PyObject* g_python_free_callback = nullptr;
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helper functions:
|
|
|
|
void ensure_context(unsigned long long device) {
|
|
aclrtContext pctx;
|
|
aclrtGetCurrentContext(&pctx);
|
|
if (!pctx) {
|
|
// Ensure device context.
|
|
aclrtCreateContext(&pctx, device);
|
|
aclrtSetCurrentContext(pctx);
|
|
}
|
|
}
|
|
|
|
void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
|
|
aclrtDrvMemHandle* p_memHandle) {
|
|
ensure_context(device);
|
|
// Define memory allocation properties
|
|
aclrtPhysicalMemProp prop = {};
|
|
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
|
|
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
prop.location.id = device;
|
|
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
prop.reserve = 0;
|
|
|
|
// Allocate memory using aclrtMallocPhysical
|
|
aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
|
|
if (error_code != 0) {
|
|
if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
|
|
throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
|
|
std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
|
|
__FILE__ + ":" + std::to_string(__LINE__));
|
|
} else {
|
|
throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
}
|
|
|
|
// Map memory
|
|
error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
|
|
if (error_code != 0) {
|
|
throw std::runtime_error("aclrtMapMem failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
}
|
|
|
|
void unmap_and_release(unsigned long long device, ssize_t size,
|
|
void* d_mem,
|
|
aclrtDrvMemHandle* p_memHandle) {
|
|
// std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
|
|
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
|
|
ensure_context(device);
|
|
aclError error_code = aclrtUnmapMem(d_mem);
|
|
if (error_code != 0) {
|
|
throw std::runtime_error("aclrtUnmapMem failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
error_code = aclrtFreePhysical(*p_memHandle);
|
|
if (error_code != 0) {
|
|
throw std::runtime_error("aclrtFreePhysical failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
}
|
|
|
|
// Build a new Python tuple (a, b, c, d) of ints.
//
// Returns a new reference, or NULL with a Python exception set if the tuple or
// any of its items cannot be created. The caller must hold the GIL.
PyObject* create_tuple_from_c_integers(unsigned long long a,
                                       unsigned long long b,
                                       unsigned long long c,
                                       unsigned long long d) {
  // Create a new tuple of size 4
  PyObject* tuple = PyTuple_New(4);
  if (!tuple) {
    return NULL;  // Return NULL on failure
  }

  const unsigned long long values[4] = {a, b, c, d};
  for (Py_ssize_t index = 0; index < 4; ++index) {
    PyObject* item = PyLong_FromUnsignedLongLong(values[index]);
    if (!item) {
      // Never hand out a tuple with an empty slot: drop the partially filled
      // tuple (this also releases the items already stored) and propagate
      // the pending exception.
      Py_DECREF(tuple);
      return NULL;
    }
    // PyTuple_SetItem "steals" the reference to item, so no Py_DECREF of the
    // PyLong is needed here.
    PyTuple_SetItem(tuple, index, item);
  }

  return tuple;  // Return the created tuple
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Our exported C functions that call Python:
|
|
|
|
__attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device, aclrtStream stream) {
|
|
ensure_context(device);
|
|
|
|
// first allocation, align the size, and reserve an address, and also allocate
|
|
// a aclrtDrvMemHandle
|
|
|
|
// Define memory allocation properties
|
|
aclrtPhysicalMemProp prop = {};
|
|
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
|
|
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
prop.location.id = device;
|
|
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
prop.reserve = 0;
|
|
|
|
// Check if the allocation is supported
|
|
size_t granularity;
|
|
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
|
|
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
|
|
&granularity);
|
|
if (error_code != 0) {
|
|
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
|
void *d_mem;
|
|
error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
|
|
if (error_code != 0) {
|
|
if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
|
|
throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
|
std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
|
|
__FILE__ + ":" + std::to_string(__LINE__));
|
|
} else {
|
|
throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
}
|
|
// allocate the aclrtDrvMemHandle
|
|
aclrtDrvMemHandle* p_memHandle =
|
|
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
|
|
|
if (!g_python_malloc_callback) {
|
|
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
|
|
// Acquire GIL (not in stable ABI officially, but often works)
|
|
PyGILState_STATE gstate = PyGILState_Ensure();
|
|
|
|
PyObject* arg_tuple = create_tuple_from_c_integers(
|
|
(unsigned long long)device, (unsigned long long)alignedSize,
|
|
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
|
|
|
|
// Call g_python_malloc_callback
|
|
PyObject* py_result =
|
|
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
|
|
Py_DECREF(arg_tuple);
|
|
|
|
if (!py_result) {
|
|
PyErr_Print();
|
|
PyGILState_Release(gstate);
|
|
return nullptr;
|
|
}
|
|
|
|
PyGILState_Release(gstate);
|
|
|
|
// do the final mapping
|
|
create_and_map(device, alignedSize, d_mem, p_memHandle);
|
|
|
|
return (void*)d_mem;
|
|
}
|
|
|
|
__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
|
|
// get memory handle from the pointer
|
|
if (!g_python_free_callback) {
|
|
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
|
|
// Acquire GIL (not in stable ABI officially, but often works)
|
|
PyGILState_STATE gstate = PyGILState_Ensure();
|
|
|
|
PyObject* py_ptr =
|
|
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
|
|
|
|
PyObject* py_result =
|
|
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
|
|
|
|
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
|
|
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
|
|
return;
|
|
}
|
|
|
|
unsigned long long recv_device, recv_size;
|
|
unsigned long long recv_d_mem, recv_p_memHandle;
|
|
// Unpack the tuple into four C integers
|
|
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
|
|
&recv_d_mem, &recv_p_memHandle)) {
|
|
// PyArg_ParseTuple sets an error if it fails
|
|
return;
|
|
}
|
|
|
|
PyGILState_Release(gstate);
|
|
|
|
// recv_size == size
|
|
// recv_device == device
|
|
|
|
// Free memory
|
|
|
|
void *d_mem = (void*)recv_d_mem;
|
|
// allocate the aclrtDrvMemHandle
|
|
aclrtDrvMemHandle* p_memHandle =
|
|
(aclrtDrvMemHandle*)recv_p_memHandle;
|
|
unmap_and_release(device, size, d_mem, p_memHandle);
|
|
|
|
// free address and the handle
|
|
aclError error_code = aclrtReleaseMemAddress(d_mem);
|
|
if (error_code != 0) {
|
|
throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
free(p_memHandle);
|
|
}
|
|
|
|
__attribute__((visibility("default"))) void *
|
|
my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
|
|
ensure_context(device);
|
|
|
|
// first allocation, align the size, and reserve an address, and also allocate
|
|
// a aclrtDrvMemHandle
|
|
|
|
// Define memory allocation properties
|
|
aclrtPhysicalMemProp prop = {};
|
|
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
|
|
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
prop.location.id = device;
|
|
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
prop.reserve = 0;
|
|
|
|
// Check if the allocation is supported
|
|
size_t granularity;
|
|
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
|
|
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
|
|
&granularity);
|
|
if (error_code != 0) {
|
|
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
|
|
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
|
void *d_mem;
|
|
// error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
|
|
// if (error_code != 0) {
|
|
// if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
|
|
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
|
// std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
|
|
// __FILE__ + ":" + std::to_string(__LINE__));
|
|
// } else {
|
|
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
|
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
// }
|
|
// }
|
|
|
|
// allocate from the reserved pool
|
|
size_t alloc_offset = g_allocated_offset.fetch_add(alignedSize);
|
|
if (alloc_offset + alignedSize > g_size) {
|
|
throw std::runtime_error(
|
|
"my_malloc ERROR: Out of memory in the reserved pool." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
d_mem = (void *)((char *)g_d_mem + alloc_offset);
|
|
|
|
// allocate the aclrtDrvMemHandle
|
|
aclrtDrvMemHandle* p_memHandle =
|
|
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
|
|
|
if (!g_python_malloc_callback) {
|
|
throw std::runtime_error(
|
|
"my_malloc ERROR: g_python_malloc_callback not set." +
|
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
|
|
// Acquire GIL (not in stable ABI officially, but often works)
|
|
PyGILState_STATE gstate = PyGILState_Ensure();
|
|
|
|
PyObject* arg_tuple = create_tuple_from_c_integers(
|
|
(unsigned long long)device, (unsigned long long)alignedSize,
|
|
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
|
|
|
|
// Call g_python_malloc_callback
|
|
PyObject* py_result =
|
|
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
|
|
Py_DECREF(arg_tuple);
|
|
|
|
if (!py_result) {
|
|
PyErr_Print();
|
|
PyGILState_Release(gstate);
|
|
return nullptr;
|
|
}
|
|
|
|
PyGILState_Release(gstate);
|
|
|
|
// // do the final mapping
|
|
// create_and_map(device, alignedSize, d_mem, p_memHandle);
|
|
|
|
return (void*)d_mem;
|
|
}
|
|
|
|
__attribute__((visibility("default"))) void
|
|
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
|
|
// get memory handle from the pointer
|
|
if (!g_python_free_callback) {
|
|
throw std::runtime_error(
|
|
"my_free ERROR: g_python_malloc_callback not set." + std::string(" ") +
|
|
__FILE__ + ":" + std::to_string(__LINE__));
|
|
}
|
|
|
|
// Acquire GIL (not in stable ABI officially, but often works)
|
|
PyGILState_STATE gstate = PyGILState_Ensure();
|
|
|
|
PyObject* py_ptr =
|
|
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
|
|
|
|
PyObject* py_result =
|
|
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
|
|
|
|
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
|
|
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
|
|
return;
|
|
}
|
|
|
|
unsigned long long recv_device, recv_size;
|
|
unsigned long long recv_d_mem, recv_p_memHandle;
|
|
// Unpack the tuple into four C integers
|
|
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
|
|
&recv_d_mem, &recv_p_memHandle)) {
|
|
// PyArg_ParseTuple sets an error if it fails
|
|
return;
|
|
}
|
|
|
|
PyGILState_Release(gstate);
|
|
|
|
// recv_size == size
|
|
// recv_device == device
|
|
|
|
// Free memory
|
|
|
|
// nothing to do
|
|
|
|
// void *d_mem = (void*)recv_d_mem;
|
|
// // allocate the aclrtDrvMemHandle
|
|
// aclrtDrvMemHandle* p_memHandle =
|
|
// (aclrtDrvMemHandle*)recv_p_memHandle;
|
|
// unmap_and_release(device, size, d_mem, p_memHandle);
|
|
|
|
// // free address and the handle
|
|
// aclError error_code = aclrtReleaseMemAddress(d_mem);
|
|
// if (error_code != 0) {
|
|
// throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
|
|
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
// }
|
|
// free(p_memHandle);
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Python extension boilerplate:
|
|
|
|
// Python-exposed function: init_module(python_malloc, python_free)
|
|
static PyObject* py_init_module(PyObject* self, PyObject* args) {
|
|
PyObject* malloc_callback = nullptr;
|
|
PyObject* free_callback = nullptr;
|
|
|
|
if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
|
|
return nullptr;
|
|
}
|
|
|
|
if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
|
|
PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
|
|
return nullptr;
|
|
}
|
|
|
|
// Save the Python callables
|
|
// This module does not handle GC of these objects, so they must be kept alive
|
|
// outside of this module.
|
|
g_python_malloc_callback = malloc_callback;
|
|
g_python_free_callback = free_callback;
|
|
|
|
Py_RETURN_NONE;
|
|
}
|
|
|
|
// Python-exposed function:
// python_unmap_and_release(device, size, d_mem, p_memHandle)
//
// Unpacks four unsigned integers and forwards them to unmap_and_release.
// Returns None, or NULL with a Python exception set: TypeError for a wrong
// argument count, RuntimeError when unmap_and_release throws.
static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    return nullptr;
  }

  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  // Unpack the tuple into four C integers
  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
                        &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    return nullptr;
  }

  void* d_mem_ptr = reinterpret_cast<void*>(recv_d_mem);
  aclrtDrvMemHandle* p_memHandle =
      reinterpret_cast<aclrtDrvMemHandle*>(recv_p_memHandle);

  // A C++ exception must not unwind into the CPython interpreter; turn it
  // into a Python exception instead.
  try {
    unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
  } catch (const std::exception& e) {
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return nullptr;
  } catch (...) {
    PyErr_SetString(PyExc_RuntimeError,
                    "unmap_and_release failed with an unknown C++ exception");
    return nullptr;
  }

  Py_RETURN_NONE;
}
|
|
|
|
// Python-exposed function:
// python_create_and_map(device, size, d_mem, p_memHandle)
//
// Unpacks four unsigned integers and forwards them to create_and_map.
// Returns None, or NULL with a Python exception set: TypeError for a wrong
// argument count, RuntimeError when create_and_map throws.
static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    return nullptr;
  }

  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  // Unpack the tuple into four C integers
  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
                        &recv_p_memHandle)) {
    // PyArg_ParseTuple sets an error if it fails
    return nullptr;
  }

  void* d_mem_ptr = reinterpret_cast<void*>(recv_d_mem);
  aclrtDrvMemHandle* p_memHandle =
      reinterpret_cast<aclrtDrvMemHandle*>(recv_p_memHandle);

  // A C++ exception must not unwind into the CPython interpreter; turn it
  // into a Python exception instead.
  try {
    create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
  } catch (const std::exception& e) {
    PyErr_SetString(PyExc_RuntimeError, e.what());
    return nullptr;
  } catch (...) {
    PyErr_SetString(PyExc_RuntimeError,
                    "create_and_map failed with an unknown C++ exception");
    return nullptr;
  }

  Py_RETURN_NONE;
}
|
|
|
|
|
|
// Python-exposed function:
// init_module_offload(python_malloc, python_free, device)
//
// Registers the two Python callables and, on the first call only, sets up the
// shared pool used by my_malloc_offload:
//   1. translate `device` through get_npu_ids(),
//   2. register this process with a new ShmWorker, which supplies a shareable
//      handle and the pool size (stored in g_size),
//   3. import the handle, reserve g_size bytes of virtual address space at
//      g_d_mem and map the imported memory there.
//
// Returns None, or NULL with a Python exception set: TypeError for
// non-callable arguments, RuntimeError if any setup step fails. After a
// failure the module is left uninitialized so the call can be retried.
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
  PyObject* malloc_callback = nullptr;
  PyObject* free_callback = nullptr;
  unsigned long long device = 0;

  if (!PyArg_ParseTuple(args, "OOK", &malloc_callback, &free_callback,
                        &device)) {
    return nullptr;
  }

  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // Save the Python callables
  // This module does not handle GC of these objects, so they must be kept alive
  // outside of this module.
  g_python_malloc_callback = malloc_callback;
  g_python_free_callback = free_callback;

  // init idle
  // exchange() tests and sets the flag in one atomic step, so two callers can
  // never both run the setup below.
  if (g_initialized.exchange(true)) {
    printf("Module already initialized.\n");
    Py_RETURN_NONE;
  }

  bool address_reserved = false;
  try {
    std::vector<int> gpu_ids = get_npu_ids();
    if (device >= gpu_ids.size()) {
      throw std::runtime_error("Invalid device id: " + std::to_string(device) +
                               " " + __FILE__ + ":" + std::to_string(__LINE__));
    }
    int gpu_id = gpu_ids[device];

    // get pid
    int32_t pid = 0;
    aclError error_code = aclrtDeviceGetBareTgid(&pid);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtDeviceGetBareTgid failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    shm_worker = new ShmWorker();
    uint64_t shareable_handle = 0;
    shm_worker->register_worker(pid, gpu_id, &shareable_handle, &g_size);

    // import shareable handle
    // NOTE(review): the imported handle is not released on the failure paths
    // below; confirm whether it needs an explicit release.
    aclrtDrvMemHandle memHandle;
    error_code =
        aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtMemImportFromShareableHandle failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }

    // reserve virtual address
    error_code = aclrtReserveMemAddress(&g_d_mem, g_size, 0, nullptr, 0);
    if (error_code != 0) {
      throw std::runtime_error(
          "aclrtReserveMemAddress failed with acl error code: " +
          std::to_string(error_code) + " " + __FILE__ + ":" +
          std::to_string(__LINE__));
    }
    address_reserved = true;

    // map
    error_code = aclrtMapMem(g_d_mem, g_size, 0, memHandle, 0);
    if (error_code != 0) {
      throw std::runtime_error("aclrtMapMem failed with acl error code: " +
                               std::to_string(error_code) + " " + __FILE__ + ":" +
                               std::to_string(__LINE__));
    }
  } catch (...) {
    // Roll back to the uninitialized state so the module is not stuck
    // half-initialized with the flag set.
    if (address_reserved) {
      (void)aclrtReleaseMemAddress(g_d_mem);
    }
    g_d_mem = nullptr;
    g_size = 0;
    // NOTE(review): assumes destroying the worker is the right way to undo
    // register_worker — confirm against ShmWorker.
    delete shm_worker;
    shm_worker = nullptr;
    g_initialized.store(false);

    // A C++ exception must not unwind into the CPython interpreter; report
    // it as a Python RuntimeError instead.
    try {
      throw;
    } catch (const std::exception& e) {
      PyErr_SetString(PyExc_RuntimeError, e.what());
    } catch (...) {
      PyErr_SetString(PyExc_RuntimeError,
                      "init_module_offload failed with an unknown C++ exception");
    }
    return nullptr;
  }

  Py_RETURN_NONE;
}
|
|
|
|
// Python-exposed function: python_unmap_and_release_offload(...)
//
// Offload counterpart of python_unmap_and_release. It ignores its arguments,
// performs no work and always returns None.
static PyObject *python_unmap_and_release_offload(PyObject *self,
                                                  PyObject *args) {
  (void)self;
  (void)args;
  Py_INCREF(Py_None);
  return Py_None;
}
|
|
|
|
// Python-exposed function: python_create_and_map_offload(...)
//
// Offload counterpart of python_create_and_map. It ignores its arguments,
// performs no work and always returns None.
static PyObject *python_create_and_map_offload(PyObject *self, PyObject *args) {
  (void)self;
  (void)args;
  Py_INCREF(Py_None);
  return Py_None;
}
|
|
|
|
// Python-exposed function: python_get_mem_info_offload()
//
// Returns the tuple (free_bytes, total_bytes) for the reserved pool, where
// total_bytes is g_size and free_bytes is g_size minus the current value of
// g_allocated_offset, clamped at 0. Returns NULL with a Python exception set
// if the result objects cannot be created.
static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
  const size_t allocated_bytes = g_allocated_offset.load();
  // Clamp instead of subtracting blindly so an offset at or past the end of
  // the pool cannot underflow.
  const size_t free_mem =
      (allocated_bytes >= g_size) ? 0 : g_size - allocated_bytes;

  // Create the items first and verify them: PyTuple_SetItem would silently
  // store a NULL item and hand a broken tuple to Python.
  PyObject* py_free = PyLong_FromSize_t(free_mem);
  PyObject* py_total = PyLong_FromSize_t(g_size);
  PyObject* tuple = (py_free && py_total) ? PyTuple_New(2) : nullptr;
  if (!tuple) {
    Py_XDECREF(py_free);
    Py_XDECREF(py_total);
    return nullptr;
  }
  PyTuple_SetItem(tuple, 0, py_free);   // steals the reference
  PyTuple_SetItem(tuple, 1, py_total);  // steals the reference
  return tuple;
}
|
|
|
|
// Python-exposed function: python_try_lock_gpu_offload()
//
// Calls ShmWorker::try_lock_gpu and returns the tuple (success, prev_is_self)
// as two bools. Returns NULL with RuntimeError set if the offload module has
// not been initialized (no ShmWorker exists yet).
static PyObject* python_try_lock_gpu_offload(PyObject* self, PyObject* args) {
  // shm_worker is only created by init_module_offload; calling through a
  // null pointer would crash the interpreter.
  if (!shm_worker) {
    PyErr_SetString(PyExc_RuntimeError,
                    "Offload module is not initialized; call "
                    "init_module_offload first.");
    return nullptr;
  }

  bool prev_is_self = false;
  bool success = shm_worker->try_lock_gpu(prev_is_self);

  PyObject* tuple = PyTuple_New(2);
  if (!tuple) {
    return nullptr;
  }
  PyTuple_SetItem(tuple, 0, PyBool_FromLong(success));
  PyTuple_SetItem(tuple, 1, PyBool_FromLong(prev_is_self));
  return tuple;
}
|
|
|
|
// Python-exposed function: python_unlock_gpu_offload()
//
// Calls ShmWorker::unlock_gpu and returns None. Returns NULL with
// RuntimeError set if the offload module has not been initialized (no
// ShmWorker exists yet).
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
  // shm_worker is only created by init_module_offload; calling through a
  // null pointer would crash the interpreter.
  if (!shm_worker) {
    PyErr_SetString(PyExc_RuntimeError,
                    "Offload module is not initialized; call "
                    "init_module_offload first.");
    return nullptr;
  }

  shm_worker->unlock_gpu();
  Py_RETURN_NONE;
}
|
|
|
|
// Method table of the extension module. Each row is
// {python name, C entry point, calling convention, docstring}; the all-null
// row terminates the table.
static PyMethodDef module_methods[] = {
    // Default allocator.
    {"init_module",
     reinterpret_cast<PyCFunction>(py_init_module),
     METH_VARARGS,
     "Initialize module with python_malloc and python_free callables."},
    {"python_create_and_map",
     reinterpret_cast<PyCFunction>(python_create_and_map),
     METH_VARARGS,
     "Create and map memory on the device."},
    {"python_unmap_and_release",
     reinterpret_cast<PyCFunction>(python_unmap_and_release),
     METH_VARARGS,
     "Unmap and release memory on the device."},

    // Offload (shared reserved pool) allocator.
    {"init_module_offload",
     reinterpret_cast<PyCFunction>(py_init_module_offload),
     METH_VARARGS,
     "Initialize module with python_malloc and python_free callables."},
    {"python_create_and_map_offload",
     reinterpret_cast<PyCFunction>(python_create_and_map_offload),
     METH_VARARGS,
     "Create and map memory on the device."},
    {"python_unmap_and_release_offload",
     reinterpret_cast<PyCFunction>(python_unmap_and_release_offload),
     METH_VARARGS,
     "Unmap and release memory on the device."},
    {"python_get_mem_info_offload",
     reinterpret_cast<PyCFunction>(python_get_mem_info_offload),
     METH_NOARGS,
     "Get mem info in the reserved pool."},
    {"python_try_lock_gpu_offload",
     reinterpret_cast<PyCFunction>(python_try_lock_gpu_offload),
     METH_NOARGS,
     "Lock GPU."},
    {"python_unlock_gpu_offload",
     reinterpret_cast<PyCFunction>(python_unlock_gpu_offload),
     METH_NOARGS,
     "Unlock GPU."},

    {nullptr, nullptr, 0, nullptr}  // sentinel
};
|
|
|
|
// Module definition passed to PyModule_Create in PyInit_vllm_ascend_C.
static struct PyModuleDef camem_allocator_module = {
    PyModuleDef_HEAD_INIT,
    "camem_allocator",                                     // m_name
    "CANN-mem-based allocator for NPUPluggableAllocator",  // m_doc
    -1,                                                    // m_size
    module_methods                                         // m_methods
};
|
|
|
|
// Module initialization function. Creates the module object from
// camem_allocator_module and returns it; PyModule_Create yields NULL on
// failure, which is passed straight back to the caller.
PyMODINIT_FUNC PyInit_vllm_ascend_C(void) {
  return PyModule_Create(&camem_allocator_module);
}
|
|
} // extern "C"
|