Add sleep mode feature for Ascend NPU (#513)

### What this PR does / why we need it? This PR adds sleep mode feature for vllm-ascend, when sleeps, we do mainly two things: - offload model weights - discard kv cache RLHF tools(such as https://github.com/volcengine/verl and https://github.com/OpenRLHF/OpenRLHF) have a strong need of sleep mode to accelerate the training process. This PR may solve #375 and #320 . ### Does this PR introduce _any_ user-facing change? No existing user interfaces changed. Users will have two new methods(`sleep()` and `wake_up()`) to use. ### How was this patch tested? This PR is tested with Qwen/Qwen2.5-0.5B-Instruct. At first, we have free NPU memory M1. After `llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)` executed, we have free NPU memory M2. M2 < M1. Then we call `llm.sleep(level=1)`, we have free NPU memory M3. We have M3 > M2, M3 is very close to M1. Plus, we have the same output tokens before sleep and after wake up, with the config of `SamplingParams(temperature=0, max_tokens=10)` and with the same input tokens of course. This PR is utilizing the CMake procedure of #371 , thanks a lot. Signed-off-by: Shuqiao Li <celestialli@outlook.com>
2025-04-18 13:11:39 +08:00
parent 42c7fbb10e
commit 84563fc65d
13 changed files with 1020 additions and 9 deletions
--- a/csrc/camem_allocator.cpp
+++ b/csrc/camem_allocator.cpp
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+extern "C" {
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include <sys/types.h>
+#include "acl/acl.h"
+
+// Global references to Python callables
+// NOTE: this is borrowed reference, so we don't need to DECREF them.
+// This brings the limitation that the allocator needs to be singleton.
+static PyObject* g_python_malloc_callback = nullptr;
+static PyObject* g_python_free_callback = nullptr;
+
+
+// ---------------------------------------------------------------------------
+// Helper functions:
+
+void ensure_context(unsigned long long device) {
+  aclrtContext pctx;
+  aclrtGetCurrentContext(&pctx);
+  if (!pctx) {
+    // Ensure device context.
+    aclrtCreateContext(&pctx, device);
+    aclrtSetCurrentContext(pctx);
+  }
+}
+
+void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
+                    aclrtDrvMemHandle* p_memHandle) {
+  ensure_context(device);
+  // Define memory allocation properties
+  aclrtPhysicalMemProp prop = {};
+  prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
+  prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+  prop.memAttr = ACL_HBM_MEM_HUGE;
+  prop.location.id = device;
+  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+  prop.reserve = 0;
+
+  // Allocate memory using aclrtMallocPhysical
+  aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+            << __LINE__ << std::endl;  
+    return;
+  }
+  error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+            << __LINE__ << std::endl;  
+    return;
+  }
+}
+
+void unmap_and_release(unsigned long long device, ssize_t size,
+                       void* d_mem,
+                       aclrtDrvMemHandle* p_memHandle) {
+  // std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
+  // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
+  ensure_context(device);
+  aclError error_code = aclrtUnmapMem(d_mem);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+            << __LINE__ << std::endl;  
+    return;
+  }
+  error_code = aclrtFreePhysical(*p_memHandle);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+            << __LINE__ << std::endl;  
+    return;
+  }
+}
+
+PyObject* create_tuple_from_c_integers(unsigned long long a,
+                                       unsigned long long b,
+                                       unsigned long long c,
+                                       unsigned long long d) {
+  // Create a new tuple of size 4
+  PyObject* tuple = PyTuple_New(4);
+  if (!tuple) {
+    return NULL;  // Return NULL on failure
+  }
+
+  // Convert integers to Python objects and set them in the tuple
+  PyTuple_SetItem(
+      tuple, 0,
+      PyLong_FromUnsignedLongLong(a));  // Steals reference to the PyLong
+  PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b));
+  PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c));
+  PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d));
+
+  // Note: PyTuple_SetItem "steals" a reference to each object,
+  // so we do not need to Py_DECREF the PyLong objects explicitly.
+
+  return tuple;  // Return the created tuple
+}
+
+// ---------------------------------------------------------------------------
+// Our exported C functions that call Python:
+
+__attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device, aclrtStream stream) {
+  ensure_context(device);
+
+  // first allocation, align the size, and reserve an address, and also allocate
+  // a aclrtDrvMemHandle
+
+  // Define memory allocation properties
+  aclrtPhysicalMemProp prop = {};
+  prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
+  prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+  prop.memAttr = ACL_HBM_MEM_HUGE;
+  prop.location.id = device;
+  prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+  prop.reserve = 0;
+
+  // Check if the allocation is supported
+  size_t granularity;
+  aclError error_code = aclrtMemGetAllocationGranularity(&prop,
+                                   ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
+                                   &granularity);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+            << __LINE__ << std::endl;  
+    return nullptr;
+  }
+  size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
+  void *d_mem;
+  error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+                << __LINE__ << std::endl;  
+    return nullptr;
+  }
+  // allocate the aclrtDrvMemHandle
+  aclrtDrvMemHandle* p_memHandle =
+      (aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
+
+  if (!g_python_malloc_callback) {
+    std::cerr << "ERROR: g_python_malloc_callback not set.\n";
+    return nullptr;
+  }
+
+  // Acquire GIL (not in stable ABI officially, but often works)
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  PyObject* arg_tuple = create_tuple_from_c_integers(
+      (unsigned long long)device, (unsigned long long)alignedSize,
+      (unsigned long long)d_mem, (unsigned long long)p_memHandle);
+
+  // Call g_python_malloc_callback
+  PyObject* py_result =
+      PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
+  Py_DECREF(arg_tuple);
+
+  if (!py_result) {
+    PyErr_Print();
+    PyGILState_Release(gstate);
+    return nullptr;
+  }
+
+  PyGILState_Release(gstate);
+
+  // do the final mapping
+  create_and_map(device, alignedSize, d_mem, p_memHandle);
+
+  return (void*)d_mem;
+}
+
+__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
+  // get memory handle from the pointer
+  if (!g_python_free_callback) {
+    std::cerr << "ERROR: g_python_free_callback not set.\n";
+    return;
+  }
+
+  // Acquire GIL (not in stable ABI officially, but often works)
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  PyObject* py_ptr =
+      PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
+
+  PyObject* py_result =
+      PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
+
+  if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    return;
+  }
+
+  unsigned long long recv_device, recv_size;
+  unsigned long long recv_d_mem, recv_p_memHandle;
+  // Unpack the tuple into four C integers
+  if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
+                        &recv_d_mem, &recv_p_memHandle)) {
+    // PyArg_ParseTuple sets an error if it fails
+    return;
+  }
+
+  PyGILState_Release(gstate);
+
+  // recv_size == size
+  // recv_device == device
+
+  // Free memory
+
+  void *d_mem = (void*)recv_d_mem;
+    // allocate the aclrtDrvMemHandle
+  aclrtDrvMemHandle* p_memHandle =
+      (aclrtDrvMemHandle*)recv_p_memHandle;
+  unmap_and_release(device, size, d_mem, p_memHandle);
+
+  // free address and the handle
+  aclError error_code = aclrtReleaseMemAddress(d_mem);
+  if (error_code != 0) {
+    std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
+        << __LINE__ << std::endl;  
+    return;
+  }
+  free(p_memHandle);
+}
+
+// ---------------------------------------------------------------------------
+// Python extension boilerplate:
+
+// Python-exposed function: init_module(python_malloc, python_free)
+static PyObject* py_init_module(PyObject* self, PyObject* args) {
+  PyObject* malloc_callback = nullptr;
+  PyObject* free_callback = nullptr;
+
+  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
+    return nullptr;
+  }
+
+  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
+    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
+    return nullptr;
+  }
+
+  // Save the Python callables
+  // This module does not handle GC of these objects, so they must be kept alive
+  // outside of this module.
+  g_python_malloc_callback = malloc_callback;
+  g_python_free_callback = free_callback;
+
+  Py_RETURN_NONE;
+}
+
+static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
+  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    return nullptr;
+  }
+
+  unsigned long long recv_device, recv_size;
+  unsigned long long recv_d_mem, recv_p_memHandle;
+  // Unpack the tuple into four C integers
+  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
+                        &recv_p_memHandle)) {
+    // PyArg_ParseTuple sets an error if it fails
+    return nullptr;
+  }
+
+  void *d_mem_ptr = (void*)recv_d_mem;
+  aclrtDrvMemHandle* p_memHandle =
+      (aclrtDrvMemHandle*)recv_p_memHandle;
+
+  unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
+
+  Py_RETURN_NONE;
+}
+
+static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
+  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
+    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
+    return nullptr;
+  }
+
+  unsigned long long recv_device, recv_size;
+  unsigned long long recv_d_mem, recv_p_memHandle;
+  // Unpack the tuple into four C integers
+  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
+                        &recv_p_memHandle)) {
+    // PyArg_ParseTuple sets an error if it fails
+    return nullptr;
+  }
+
+  void *d_mem_ptr = (void*)recv_d_mem;
+  aclrtDrvMemHandle* p_memHandle =
+      (aclrtDrvMemHandle*)recv_p_memHandle;
+
+  create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
+
+  Py_RETURN_NONE;
+}
+
+static PyMethodDef module_methods[] = {
+    {"init_module", (PyCFunction)py_init_module, METH_VARARGS,
+     "Initialize module with python_malloc and python_free callables."},
+    {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS,
+     "Create and map memory on the device."},
+    {"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
+     METH_VARARGS, "Unmap and release memory on the device."},
+    {NULL, NULL, 0, NULL}  // sentinel
+};
+
+static struct PyModuleDef camem_allocator_module = {
+    PyModuleDef_HEAD_INIT, "camem_allocator",
+    "CANN-mem-based allocator for NPUPluggableAllocator", -1, module_methods};
+
+PyMODINIT_FUNC PyInit_vllm_ascend_C(void) {
+  // Initialize the module
+  PyObject* module = PyModule_Create(&camem_allocator_module);
+  if (!module) {
+    return NULL;
+  }
+  return module;
+}
+}  // extern "C"