vllm-ascend vnpu v1
This commit is contained in:
@@ -17,6 +17,9 @@
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <atomic>
|
||||
|
||||
#include "idle_offload/shm_worker.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -26,6 +29,13 @@ extern "C" {
|
||||
#include <sys/types.h>
|
||||
#include "acl/acl.h"
|
||||
|
||||
// idle offload
|
||||
static std::atomic<bool> g_initialized(false);
|
||||
static void *g_d_mem = nullptr;
|
||||
static size_t g_size = 0;
|
||||
static std::atomic_uint_fast64_t g_allocated_offset(0);
|
||||
ShmWorker *shm_worker = nullptr;
|
||||
|
||||
// Global references to Python callables
|
||||
// NOTE: this is borrowed reference, so we don't need to DECREF them.
|
||||
// This brings the limitation that the allocator needs to be singleton.
|
||||
@@ -248,6 +258,144 @@ __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, in
|
||||
free(p_memHandle);
|
||||
}
|
||||
|
||||
__attribute__((visibility("default"))) void *
|
||||
my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
|
||||
ensure_context(device);
|
||||
|
||||
// first allocation, align the size, and reserve an address, and also allocate
|
||||
// a aclrtDrvMemHandle
|
||||
|
||||
// Define memory allocation properties
|
||||
aclrtPhysicalMemProp prop = {};
|
||||
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
|
||||
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.memAttr = ACL_HBM_MEM_HUGE;
|
||||
prop.location.id = device;
|
||||
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.reserve = 0;
|
||||
|
||||
// Check if the allocation is supported
|
||||
size_t granularity;
|
||||
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
|
||||
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
|
||||
&granularity);
|
||||
if (error_code != 0) {
|
||||
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
|
||||
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
||||
void *d_mem;
|
||||
// error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
|
||||
// if (error_code != 0) {
|
||||
// if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
|
||||
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
||||
// std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
|
||||
// __FILE__ + ":" + std::to_string(__LINE__));
|
||||
// } else {
|
||||
// throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
|
||||
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
// }
|
||||
// }
|
||||
|
||||
// allocate from the reserved pool
|
||||
size_t alloc_offset = g_allocated_offset.fetch_add(alignedSize);
|
||||
if (alloc_offset + alignedSize > g_size) {
|
||||
throw std::runtime_error(
|
||||
"my_malloc ERROR: Out of memory in the reserved pool." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
d_mem = (void *)((char *)g_d_mem + alloc_offset);
|
||||
|
||||
// allocate the aclrtDrvMemHandle
|
||||
aclrtDrvMemHandle* p_memHandle =
|
||||
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
||||
|
||||
if (!g_python_malloc_callback) {
|
||||
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
PyObject* arg_tuple = create_tuple_from_c_integers(
|
||||
(unsigned long long)device, (unsigned long long)alignedSize,
|
||||
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
|
||||
|
||||
// Call g_python_malloc_callback
|
||||
PyObject* py_result =
|
||||
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
|
||||
Py_DECREF(arg_tuple);
|
||||
|
||||
if (!py_result) {
|
||||
PyErr_Print();
|
||||
PyGILState_Release(gstate);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
// // do the final mapping
|
||||
// create_and_map(device, alignedSize, d_mem, p_memHandle);
|
||||
|
||||
return (void*)d_mem;
|
||||
}
|
||||
|
||||
__attribute__((visibility("default"))) void
|
||||
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
|
||||
// get memory handle from the pointer
|
||||
if (!g_python_free_callback) {
|
||||
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
PyObject* py_ptr =
|
||||
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
|
||||
|
||||
PyObject* py_result =
|
||||
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
|
||||
|
||||
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
|
||||
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned long long recv_device, recv_size;
|
||||
unsigned long long recv_d_mem, recv_p_memHandle;
|
||||
// Unpack the tuple into four C integers
|
||||
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
|
||||
&recv_d_mem, &recv_p_memHandle)) {
|
||||
// PyArg_ParseTuple sets an error if it fails
|
||||
return;
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
// recv_size == size
|
||||
// recv_device == device
|
||||
|
||||
// Free memory
|
||||
|
||||
// nothing to do
|
||||
|
||||
// void *d_mem = (void*)recv_d_mem;
|
||||
// // allocate the aclrtDrvMemHandle
|
||||
// aclrtDrvMemHandle* p_memHandle =
|
||||
// (aclrtDrvMemHandle*)recv_p_memHandle;
|
||||
// unmap_and_release(device, size, d_mem, p_memHandle);
|
||||
|
||||
// // free address and the handle
|
||||
// aclError error_code = aclrtReleaseMemAddress(d_mem);
|
||||
// if (error_code != 0) {
|
||||
// throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
|
||||
// std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
// }
|
||||
// free(p_memHandle);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Python extension boilerplate:
|
||||
|
||||
@@ -322,6 +470,116 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
|
||||
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
  // One-time initialization of the offload allocator:
  //  1. store the Python malloc/free callbacks,
  //  2. register this worker (by device pid) with the shared-memory
  //     coordinator and receive a shareable handle + pool size,
  //  3. import the handle, reserve a virtual range, and map the whole
  //     pool into this process (g_d_mem / g_size).
  // Subsequent calls only refresh the callbacks and return early.
  PyObject* malloc_callback = nullptr;
  PyObject* free_callback = nullptr;

  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
    return nullptr;
  }

  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // Save the Python callables.
  // NOTE: borrowed references — this module does not handle GC of
  // these objects, so they must be kept alive outside of this module.
  g_python_malloc_callback = malloc_callback;
  g_python_free_callback = free_callback;

  // One-shot guard. exchange() makes test-and-set atomic; the original
  // load()+store() pair allowed two threads to both pass the check and
  // double-initialize.
  if (g_initialized.exchange(true)) {
    printf("Module already initialized.\n");
    Py_RETURN_NONE;
  }

  shm_worker = new ShmWorker();

  // Get this process's device-side pid for worker registration.
  aclError error_code;
  int32_t pid;
  error_code = aclrtDeviceGetBareTgid(&pid);
  if (error_code != 0) {
    throw std::runtime_error(
        "aclrtDeviceGetBareTgid failed with acl error code: " +
        std::to_string(error_code) + " " + __FILE__ + ":" +
        std::to_string(__LINE__));
  }
  uint64_t shareable_handle;
  shm_worker->register_worker(pid, &shareable_handle, &g_size);

  // Import the shareable handle published by the pool owner.
  // NOTE(review): device is hard-coded to 0 here — confirm whether
  // multi-device workers need the real device index.
  uint32_t device = 0;
  aclrtDrvMemHandle memHandle;
  error_code =
      aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
  if (error_code != 0) {
    throw std::runtime_error(
        "aclrtMemImportFromShareableHandle failed with acl error code: " +
        std::to_string(error_code) + " " + __FILE__ + ":" +
        std::to_string(__LINE__));
  }

  // Reserve a virtual address range covering the whole pool.
  error_code = aclrtReserveMemAddress(&g_d_mem, g_size, 0, nullptr, 0);
  if (error_code != 0) {
    throw std::runtime_error(
        "aclrtReserveMemAddress failed with acl error code: " +
        std::to_string(error_code) + " " + __FILE__ + ":" +
        std::to_string(__LINE__));
  }
  // Map the imported physical memory into the reserved range.
  error_code = aclrtMapMem(g_d_mem, g_size, 0, memHandle, 0);
  if (error_code != 0) {
    throw std::runtime_error("aclrtMapMem failed with acl error code: " +
                             std::to_string(error_code) + " " + __FILE__ + ":" +
                             std::to_string(__LINE__));
  }

  Py_RETURN_NONE;
}
|
||||
|
||||
static PyObject *python_unmap_and_release_offload(PyObject *self,
                                                  PyObject *args) {
  // Deliberate no-op: the offload pool is mapped once at init time and
  // stays mapped, so there is nothing to unmap per allocation.
  Py_INCREF(Py_None);
  return Py_None;
}
|
||||
|
||||
static PyObject *python_create_and_map_offload(PyObject *self, PyObject *args) {
  // Deliberate no-op: allocations are carved from the pre-mapped pool,
  // so no per-allocation create/map step is required.
  Py_INCREF(Py_None);
  return Py_None;
}
|
||||
|
||||
static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
  // Return (free_bytes, total_bytes) for the reserved offload pool.
  // The bump allocator never reclaims, so "free" is the unused tail of
  // the pool, clamped at 0 in case the offset has raced past g_size.
  size_t allocated_bytes = g_allocated_offset.load();
  size_t free_mem =
      (allocated_bytes >= g_size) ? 0 : (g_size - allocated_bytes);
  // Py_BuildValue propagates item-creation failures as NULL; the
  // original PyTuple_New/PyTuple_SetItem sequence could silently embed
  // a NULL item when PyLong_FromSize_t failed.
  return Py_BuildValue("(KK)", (unsigned long long)free_mem,
                       (unsigned long long)g_size);
}
|
||||
|
||||
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
  // Acquire the cross-process GPU lock through the shared-memory
  // worker. The returned bool is whatever ShmWorker::lock_gpu()
  // reports — presumably whether the previous holder was this worker;
  // verify against the ShmWorker implementation.
  if (!shm_worker) {
    // Guard against calls before init_module_offload; the original
    // dereferenced a null pointer here.
    PyErr_SetString(PyExc_RuntimeError,
                    "init_module_offload must be called before lock_gpu");
    return nullptr;
  }
  bool prev_is_self = shm_worker->lock_gpu();
  return PyBool_FromLong(prev_is_self);
}
|
||||
|
||||
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
  // Release the cross-process GPU lock held via the shared-memory
  // worker.
  if (!shm_worker) {
    // Guard against calls before init_module_offload; the original
    // dereferenced a null pointer here.
    PyErr_SetString(PyExc_RuntimeError,
                    "init_module_offload must be called before unlock_gpu");
    return nullptr;
  }
  shm_worker->unlock_gpu();
  Py_RETURN_NONE;
}
|
||||
|
||||
static PyMethodDef module_methods[] = {
|
||||
{"init_module", (PyCFunction)py_init_module, METH_VARARGS,
|
||||
"Initialize module with python_malloc and python_free callables."},
|
||||
@@ -329,7 +587,21 @@ static PyMethodDef module_methods[] = {
|
||||
"Create and map memory on the device."},
|
||||
{"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
|
||||
METH_VARARGS, "Unmap and release memory on the device."},
|
||||
{NULL, NULL, 0, NULL} // sentinel
|
||||
{"init_module_offload", (PyCFunction)py_init_module_offload, METH_VARARGS,
|
||||
"Initialize module with python_malloc and python_free callables."},
|
||||
{"python_create_and_map_offload",
|
||||
(PyCFunction)python_create_and_map_offload, METH_VARARGS,
|
||||
"Create and map memory on the device."},
|
||||
{"python_unmap_and_release_offload",
|
||||
(PyCFunction)python_unmap_and_release_offload, METH_VARARGS,
|
||||
"Unmap and release memory on the device."},
|
||||
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
|
||||
METH_NOARGS, "Get mem info in the reserved pool."},
|
||||
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
|
||||
METH_NOARGS, "Lock GPU."},
|
||||
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
|
||||
METH_NOARGS, "Unlock GPU."},
|
||||
{NULL, NULL, 0, NULL} // sentinel
|
||||
};
|
||||
|
||||
static struct PyModuleDef camem_allocator_module = {
|
||||
|
||||
Reference in New Issue
Block a user