Partially support multiple NPUs

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -20,6 +20,7 @@
#include <atomic>
#include "idle_offload/shm_worker.h"
#include "idle_offload/npu_helper.h"
extern "C" {
@@ -474,8 +475,10 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
PyObject* malloc_callback = nullptr;
PyObject* free_callback = nullptr;
unsigned long long device = 0;
if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
if (!PyArg_ParseTuple(args, "OOK", &malloc_callback, &free_callback,
&device)) {
return nullptr;
}
@@ -497,7 +500,13 @@ static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
}
g_initialized.store(true);
shm_worker = new ShmWorker();
std::vector<int> gpu_ids = get_npu_ids();
if (device >= gpu_ids.size()) {
throw std::runtime_error("Invalid device id: " + std::to_string(device) +
" " + __FILE__ + ":" + std::to_string(__LINE__));
}
int gpu_id = gpu_ids[device];
// get pid
aclError error_code;
int32_t pid;
@@ -508,11 +517,12 @@ static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
std::to_string(error_code) + " " + __FILE__ + ":" +
std::to_string(__LINE__));
}
shm_worker = new ShmWorker();
uint64_t shareable_handle;
shm_worker->register_worker(pid, &shareable_handle, &g_size);
shm_worker->register_worker(pid, gpu_id, &shareable_handle, &g_size);
// import shareable handle
uint32_t device = 0;
aclrtDrvMemHandle memHandle;
error_code =
aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
@@ -570,9 +580,16 @@ static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
return tuple;
}
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
bool prev_is_self = shm_worker->lock_gpu();
return PyBool_FromLong(prev_is_self);
static PyObject* python_try_lock_gpu_offload(PyObject* self, PyObject* args) {
  // Non-blocking GPU lock attempt via the shared-memory worker.
  // Returns a 2-tuple: (acquired, previous_holder_was_self).
  bool was_self = false;
  const bool acquired = shm_worker->try_lock_gpu(was_self);
  PyObject* result = PyTuple_New(2);
  if (result == nullptr) {
    return nullptr;
  }
  // PyTuple_SetItem steals the reference produced by PyBool_FromLong,
  // so no explicit Py_DECREF is needed here.
  PyTuple_SetItem(result, 0, PyBool_FromLong(acquired));
  PyTuple_SetItem(result, 1, PyBool_FromLong(was_self));
  return result;
}
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
@@ -597,7 +614,7 @@ static PyMethodDef module_methods[] = {
"Unmap and release memory on the device."},
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
METH_NOARGS, "Get mem info in the reserved pool."},
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
{"python_try_lock_gpu_offload", (PyCFunction)python_try_lock_gpu_offload,
METH_NOARGS, "Lock GPU."},
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
METH_NOARGS, "Unlock GPU."},