[Model] Support DeepSeek-V4
This commit is contained in:
310
csrc/cnmem_allocator.cpp
Normal file
310
csrc/cnmem_allocator.cpp
Normal file
@@ -0,0 +1,310 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
||||
// A MLU PluggableAllocator based on cn_api APIs.
|
||||
|
||||
#include <iostream>
|
||||
|
||||
extern "C" {
|
||||
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include <cn_api.h>
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
// Error-checking helpers.
//
// DRV_CHECK_GET_RETURN(...) expands to the bare `return` keyword whenever it
// is given at least one argument, and to `return` with an empty value
// otherwise. Combined with the trailing `__VA_ARGS__;` in CN_CHECK below:
//   CN_CHECK(rc)           -> on error: `return ;`        (void functions)
//   CN_CHECK(rc, nullptr)  -> on error: `return nullptr;` (pointer functions)
#define DRV_CHECK_GET_RETURN(...) \
  DRV_CHECK_GET_RETURN_IMPL(__VA_ARGS__, return, )
#define DRV_CHECK_GET_RETURN_IMPL(_1, _2, ...) _2

// Evaluate a CNresult-returning driver call; on a non-zero result, print the
// driver's error string with file/line context and return from the enclosing
// function (optionally with the value supplied as the second argument).
#define CN_CHECK(return_code, ...) \
  do { \
    CNresult rc = (return_code); \
    if (rc) { \
      const char *error_str; \
      cnGetErrorString(rc, &error_str); \
      std::cout << "Error: " << error_str \
                << " at " << __FILE__ \
                << ":" << __LINE__ \
                << std::endl; \
      DRV_CHECK_GET_RETURN(__VA_ARGS__) \
      __VA_ARGS__; \
    } \
  } while (0)

// Global references to Python callables.
// Strong references installed via init_module(); nullptr until then.
static PyObject* g_python_malloc_callback = nullptr;
static PyObject* g_python_free_callback = nullptr;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper functions:
|
||||
// ---------------------------------------------------------------------------
// Helper functions:

// Make sure the calling thread has a current MLU driver context; when none is
// active, create one for `device` and install it on this thread.
void ensure_context(CNdev device) {
  CNcontext current;
  CN_CHECK(cnCtxGetCurrent(&current));
  if (current) {
    return;  // A context is already bound to this thread.
  }
  CN_CHECK(cnCtxCreate(&current, 0, device));
  CN_CHECK(cnCtxSetCurrent(current));
}
|
||||
|
||||
// Back the reserved virtual range [d_mem, d_mem + size) with freshly
// allocated physical device memory and grant the device read/write access.
// `*p_memHandle` receives the allocation handle for a later release.
void create_and_map(CNdev device, ssize_t size, CNaddr d_mem, CNmemGenericAllocationHandle* p_memHandle) {
  ensure_context(device);

  // Describe the physical allocation. Per the cndrv developer guide the
  // allocation type must currently be CN_MEM_ALLOCATION_TYPE_DEFAULT.
  CNmemAllocationProp alloc_prop = {};
  alloc_prop.type = CN_MEM_ALLOCATION_TYPE_DEFAULT;  // cf. CU_MEM_ALLOCATION_TYPE_PINNED
  alloc_prop.location.type = CN_MEM_LOCATION_TYPE_DEVICE;
  alloc_prop.location.id = device;
  alloc_prop.requestedHandleTypes = CN_MEM_HANDLE_TYPE_NONE;
  alloc_prop.allocFlags.compressionType = CN_MEM_ALLOCATION_COMP_NONE;

  // Allocate the physical memory and map it into the reserved range.
  CN_CHECK(cnMemCreate(p_memHandle, size, &alloc_prop, 0));
  CN_CHECK(cnMemMap(d_mem, size, 0, *p_memHandle, 0));

  // Enable read/write access to the mapping from the owning device.
  CNmemAccessDesc access_desc = {};
  access_desc.location.type = CN_MEM_LOCATION_TYPE_DEVICE;
  access_desc.location.id = device;
  access_desc.accessFlags = CN_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CN_CHECK(cnMemSetAccess(d_mem, size, &access_desc, 1));
}
|
||||
|
||||
// Undo create_and_map(): unmap the range and release the physical handle.
// The virtual address reservation itself is left intact; the caller frees it
// separately with cnMemAddressFree().
void unmap_and_release(CNdev device, ssize_t size, CNaddr d_mem, CNmemGenericAllocationHandle* p_memHandle) {
  ensure_context(device);
  CN_CHECK(cnMemUnmap(d_mem, size));
  CN_CHECK(cnMemRelease(*p_memHandle));
}
|
||||
|
||||
// Build a 4-tuple (a, b, c, d) of Python ints from C unsigned long longs.
// Returns a new reference, or NULL (with a Python error set) on failure.
//
// Fix: the original stored the result of PyLong_FromUnsignedLongLong into the
// tuple without checking it; on allocation failure a NULL slot would silently
// corrupt the tuple. Each conversion is now checked.
PyObject* create_tuple_from_c_integers(unsigned long long a,
                                       unsigned long long b,
                                       unsigned long long c,
                                       unsigned long long d) {
  // Create a new tuple of size 4.
  PyObject* tuple = PyTuple_New(4);
  if (!tuple) {
    return NULL;
  }

  const unsigned long long values[4] = {a, b, c, d};
  for (Py_ssize_t i = 0; i < 4; ++i) {
    PyObject* item = PyLong_FromUnsignedLongLong(values[i]);
    if (!item) {
      // Int creation failed; drop the partially built tuple and propagate.
      Py_DECREF(tuple);
      return NULL;
    }
    // PyTuple_SetItem "steals" the reference to `item`, so no Py_DECREF here.
    PyTuple_SetItem(tuple, i, item);
  }

  return tuple;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Our exported C functions that call Python:
|
||||
__attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device, CNqueue stream) {
|
||||
ensure_context(device);
|
||||
// first allocation, align the size, and reserve an address, and also allocate
|
||||
// a CNmemGenericAllocationHandle
|
||||
|
||||
// Define memory allocation properties
|
||||
CNmemAllocationProp prop = {};
|
||||
// The memory allocation type requested, which must be CN_MEM_ALLOCATION_TYPE_DEFAULT currently according to cndrv developer guide.
|
||||
prop.type = CN_MEM_ALLOCATION_TYPE_DEFAULT; //CU_MEM_ALLOCATION_TYPE_PINNED
|
||||
prop.location.type = CN_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.location.id = device;
|
||||
prop.requestedHandleTypes = CN_MEM_HANDLE_TYPE_NONE;
|
||||
prop.allocFlags.compressionType = CN_MEM_ALLOCATION_COMP_NONE;
|
||||
|
||||
//Check if the allocation is supported
|
||||
size_t granularity;
|
||||
CN_CHECK(cnMemGetAllocationGranularity(&granularity, &prop, CN_MEM_ALLOC_GRANULARITY_MINIMUM), nullptr);
|
||||
|
||||
size_t alignedSize = ((size+granularity-1)/granularity)*granularity;
|
||||
CNaddr d_mem;
|
||||
CN_CHECK(cnMemAddressReserve(&d_mem, alignedSize, 0, 0, 0), nullptr);
|
||||
|
||||
// allocate the CNmemGenericAllocationHandle
|
||||
CNmemGenericAllocationHandle* p_memHandle = (CNmemGenericAllocationHandle*)malloc(sizeof(CNmemGenericAllocationHandle));
|
||||
|
||||
if (!g_python_malloc_callback) {
|
||||
std::cerr << "ERROR: g_python_malloc_callback not set.\n";
|
||||
return nullptr;
|
||||
}
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
PyObject* arg_tuple = create_tuple_from_c_integers(
|
||||
(unsigned long long)device, (unsigned long long)alignedSize,
|
||||
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
|
||||
|
||||
// Call g_python_malloc_callback
|
||||
PyObject* py_result = PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
|
||||
Py_DECREF(arg_tuple);
|
||||
|
||||
if (!py_result) {
|
||||
PyErr_Print();
|
||||
PyGILState_Release(gstate);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
// do the final mapping
|
||||
create_and_map(device, alignedSize, d_mem, p_memHandle);
|
||||
|
||||
return (void*)d_mem;
|
||||
}
|
||||
|
||||
__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, CNqueue stream) {
|
||||
// get memory handle from the pointer
|
||||
if (!g_python_free_callback) {
|
||||
std::cerr << "ERROR: g_python_free_callback not set.\n";
|
||||
return;
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||
|
||||
PyObject* py_ptr = PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
|
||||
PyObject* py_result = PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
|
||||
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
|
||||
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned long long recv_device, recv_size;
|
||||
unsigned long long recv_d_mem, recv_p_memHandle;
|
||||
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, &recv_d_mem, &recv_p_memHandle)) {
|
||||
// PyArg_ParseTuple sets an error if it fails
|
||||
return;
|
||||
}
|
||||
|
||||
PyGILState_Release(gstate);
|
||||
|
||||
// Free memory
|
||||
CNaddr d_mem = (CNaddr)recv_d_mem;
|
||||
CNmemGenericAllocationHandle* p_memHandle = (CNmemGenericAllocationHandle*)recv_p_memHandle;
|
||||
unmap_and_release(device, size, d_mem, p_memHandle);
|
||||
|
||||
//free address and the handle
|
||||
CN_CHECK(cnMemAddressFree(d_mem, size));
|
||||
free(p_memHandle);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Python extension boilerplate:
|
||||
// Python-exposed function: init_module(python_malloc, python_free)
|
||||
// ---------------------------------------------------------------------------
// Python extension boilerplate:

// init_module(python_malloc, python_free)
// Register the two Python callables that my_malloc/my_free invoke. The module
// holds strong references so the callbacks cannot be garbage-collected while
// registered.
static PyObject* py_init_module(PyObject* self, PyObject* args) {
  PyObject* malloc_callback = nullptr;
  PyObject* free_callback = nullptr;

  if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
    return nullptr;
  }
  if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // INCREF the new callables before DECREFing any previously registered
  // ones; this order stays safe even if the same objects are re-registered.
  Py_XINCREF(malloc_callback);
  Py_XINCREF(free_callback);
  Py_XDECREF(g_python_malloc_callback);
  Py_XDECREF(g_python_free_callback);

  g_python_malloc_callback = malloc_callback;
  g_python_free_callback = free_callback;

  Py_RETURN_NONE;
}
|
||||
|
||||
// python_unmap_and_release((device, size, d_mem, p_memHandle))
// Python-visible wrapper around unmap_and_release().
static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    return nullptr;
  }

  // Unpack the tuple into four C integers.
  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
                        &recv_p_memHandle)) {
    return nullptr;  // PyArg_ParseTuple already set the error.
  }

  unmap_and_release((CNdev)recv_device, (ssize_t)recv_size, (CNaddr)recv_d_mem,
                    (CNmemGenericAllocationHandle*)recv_p_memHandle);

  Py_RETURN_NONE;
}
|
||||
|
||||
// python_create_and_map((device, size, d_mem, p_memHandle))
// Python-visible wrapper around create_and_map().
static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    return nullptr;
  }

  // Unpack the tuple into four C integers.
  unsigned long long recv_device, recv_size;
  unsigned long long recv_d_mem, recv_p_memHandle;
  if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
                        &recv_p_memHandle)) {
    return nullptr;  // PyArg_ParseTuple already set the error.
  }

  create_and_map((CNdev)recv_device, (ssize_t)recv_size, (CNaddr)recv_d_mem,
                 (CNmemGenericAllocationHandle*)recv_p_memHandle);

  Py_RETURN_NONE;
}
|
||||
|
||||
// python_cn_memcpy((dst, src, nbytes))
// Copy `nbytes` bytes from the `src` device address to `dst` via cnMemcpy.
static PyObject* python_cn_memcpy(PyObject* self, PyObject* args){
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 3) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 3");
    return nullptr;
  }

  CNaddr dst, src;
  cn_uint64_t bytes;
  if (!PyArg_ParseTuple(args, "KKK", &dst, &src, &bytes)) {
    return nullptr;  // PyArg_ParseTuple already set the error.
  }

  CN_CHECK(cnMemcpy(dst, src, bytes), nullptr);

  Py_RETURN_NONE;
}
|
||||
|
||||
// Method table for the cnmem_allocator extension module.
static PyMethodDef module_methods[] = {
    {"init_module", (PyCFunction)py_init_module, METH_VARARGS,
     "Initialize module with python_malloc and python_free callables."},
    {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS,
     "Create and map memory on the device."},
    {"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
     METH_VARARGS, "Unmap and release memory on the device."},
    {"python_cn_memcpy", (PyCFunction)python_cn_memcpy, METH_VARARGS,
     "Copies data from source address to destination address."},
    {NULL, NULL, 0, NULL}  // sentinel
};

static struct PyModuleDef cnmem_allocator_module = {
    PyModuleDef_HEAD_INIT, "cnmem_allocator",
    "cnapi-mem-based allocator for MLUPluggableAllocator", -1, module_methods};

// Module entry point.
// NOTE(review): the init symbol is PyInit_vllm_mlu_C, so the shared object
// must be importable as `vllm_mlu_C`, while the module def above is named
// "cnmem_allocator" — confirm this mismatch is intentional in the build setup.
PyMODINIT_FUNC PyInit_vllm_mlu_C(void) {
  // Initialize the module
  PyObject* module = PyModule_Create(&cnmem_allocator_module);
  if (!module) {
    return NULL;
  }
  return module;
}
|
||||
|
||||
} // extern "C"
|
||||
34
csrc/ops.h
Normal file
34
csrc/ops.h
Normal file
@@ -0,0 +1,34 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
#include <torch/library.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace vllm_mlu {
|
||||
|
||||
// Create a non-owning alias of `tensor`: a new tensor sharing the same
// storage, sizes, strides, and options, without bumping the refcount of the
// underlying storage (torch::from_blob does not take ownership). The alias
// dangles if the source tensor is freed first — callers must keep it alive.
//
// Throws std::runtime_error if the tensor is not on the MLU (PrivateUse1)
// device.
//
// Fix: this function is DEFINED in a header; without `inline` (or `static`),
// including ops.h from more than one translation unit violates the ODR and
// fails at link time with multiple-definition errors. `inline` is
// backward-compatible for all callers.
inline torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
  // Ensure tensor is on MLU (dispatched via the PrivateUse1 key).
  if (!tensor.is_privateuseone()) {
    throw std::runtime_error("Tensor must be on MLU device");
  }

  // Get the raw data pointer plus the geometry and options to replicate.
  void* data_ptr = tensor.data_ptr();
  std::vector<int64_t> sizes = tensor.sizes().vec();
  std::vector<int64_t> strides = tensor.strides().vec();
  auto options = tensor.options();

  // Wrap the raw storage in a new (non-owning) tensor.
  return torch::from_blob(data_ptr, sizes, strides, options);
}
|
||||
|
||||
}
|
||||
18
csrc/torch_bindings.cpp
Normal file
18
csrc/torch_bindings.cpp
Normal file
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
||||
#include <torch/extension.h>
|
||||
#include <torch/library.h>
|
||||
#include <torch/version.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
#include "ops.h"
|
||||
#include "utils.h"
|
||||
|
||||
|
||||
// Register vLLM-MLU custom ops with the PyTorch dispatcher under the `_C`
// library namespace.
TORCH_LIBRARY_EXPAND(_C, ops)
{
  // vLLM-MLU custom ops
  // weak_ref_tensor: returns a non-owning alias of the input tensor;
  // implemented only for the PrivateUse1 (MLU) dispatch key.
  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_mlu::weak_ref_tensor);
}

// Expose this shared library as an importable Python extension module `_C`.
REGISTER_EXTENSION(_C)
|
||||
29
csrc/utils.h
Normal file
29
csrc/utils.h
Normal file
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright contributors to the vLLM-MLU project
|
||||
#pragma once
|
||||
|
||||
#include <Python.h>
|
||||
|
||||
// Token-pasting / stringification helpers. The two-level expansion lets the
// arguments themselves be macros, which are expanded before concatenation or
// stringification.
#define _CONCAT(A, B) A##B
#define CONCAT(A, B) _CONCAT(A, B)

#define _STRINGIFY(A) #A
#define STRINGIFY(A) _STRINGIFY(A)

// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)

// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
  TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)

// REGISTER_EXTENSION allows the shared library to be loaded and initialized
// via python's import statement.
// It defines PyInit_<NAME>, which creates a minimal module object with no
// methods of its own; the actual ops are registered separately through the
// torch library macros above.
#define REGISTER_EXTENSION(NAME)                                               \
  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
    return PyModule_Create(&module);                                           \
  }
|
||||
Reference in New Issue
Block a user