Refactor sgl-kernel build (#2642)
This commit is contained in:
@@ -1,5 +1,15 @@
|
||||
from .ops import moe_align_block_size
|
||||
from sgl_kernel.ops import (
|
||||
custom_dispose,
|
||||
custom_reduce,
|
||||
init_custom_reduce,
|
||||
moe_align_block_size,
|
||||
warp_reduce,
|
||||
)
|
||||
|
||||
# Names exported by `from sgl_kernel import *`; each one is imported from
# sgl_kernel.ops at the top of this module.
__all__ = [
    "moe_align_block_size",
    "warp_reduce",
    "init_custom_reduce",
    "custom_dispose",
    "custom_reduce",
]
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <torch/all.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include <THC/THCAtomics.cuh>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
@@ -133,7 +133,3 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b
|
||||
token_cnts_buffer.data_ptr<int32_t>(), cumsum_buffer.data_ptr<int32_t>());
|
||||
});
|
||||
}
|
||||
|
||||
// Python binding for this translation unit: registers moe_align_block_size
// on the extension module under the same name.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)");
}
|
||||
|
||||
32
sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu
Normal file
32
sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu
Normal file
@@ -0,0 +1,32 @@
|
||||
#include "utils.hpp"
|
||||
|
||||
// warp_reduce
|
||||
torch::Tensor warp_reduce_cuda(torch::Tensor input);
|
||||
|
||||
torch::Tensor warp_reduce(torch::Tensor input) {
|
||||
CHECK_CUDA_INPUT(input);
|
||||
return warp_reduce_cuda(input);
|
||||
}
|
||||
|
||||
// trt_reduce
|
||||
using fptr_t = int64_t;
|
||||
fptr_t init_custom_ar(int64_t rank_id, int64_t world_size, const std::vector<fptr_t>& buffers,
|
||||
const std::vector<fptr_t>& barrier_in, const std::vector<fptr_t>& barrier_out);
|
||||
void dispose(fptr_t _fa);
|
||||
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
|
||||
|
||||
// moe_align_block_size
|
||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size,
|
||||
torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad,
|
||||
torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer);
|
||||
|
||||
// Single Python extension module exposing every kernel entry point declared
// in this file. The first string of each m.def is the Python-visible name.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // warp_reduce -- note the Python-side name is "reduce", not "warp_reduce".
  m.def("reduce", &warp_reduce, "Warp Reduce (CUDA)");

  // trt_reduce -- create, destroy and run the custom all-reduce.
  m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)");
  m.def("dispose", &dispose, "dispose custom allreduce meta");
  m.def("all_reduce", &all_reduce, "custom all reduce (CUDA)");

  // moe_align_block_size
  m.def("moe_align_block_size", &moe_align_block_size, "MOE Align Block Size (CUDA)");
}
|
||||
@@ -1,13 +0,0 @@
|
||||
#include <torch/extension.h>
|
||||
|
||||
using fptr_t = int64_t;
|
||||
fptr_t init_custom_ar(int64_t rank_id, int64_t world_size, const std::vector<fptr_t>& buffers,
|
||||
const std::vector<fptr_t>& barrier_in, const std::vector<fptr_t>& barrier_out);
|
||||
void dispose(fptr_t _fa);
|
||||
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
|
||||
|
||||
// Python bindings for the custom all-reduce entry points declared in this
// file; each function is registered under its C++ name.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)");
  m.def("dispose", &dispose, "dispose custom allreduce meta");
  m.def("all_reduce", &all_reduce, "custom all reduce (CUDA)");
}
|
||||
@@ -1,14 +0,0 @@
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
// Implemented in a separate translation unit; only declared here.
torch::Tensor warp_reduce_cuda(torch::Tensor input);

// Host-side entry point: validates `input` with CHECK_CUDA_INPUT (presumably
// provided by utils.hpp -- confirm) and then hands it to warp_reduce_cuda,
// returning that function's result unchanged.
torch::Tensor warp_reduce(torch::Tensor input) {
  CHECK_CUDA_INPUT(input);
  torch::Tensor reduced = warp_reduce_cuda(input);
  return reduced;
}
|
||||
|
||||
// Python binding for warp_reduce; the Python-visible name is "reduce".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("reduce", &warp_reduce, "Warp Reduce (CUDA)");
}
|
||||
@@ -1,6 +1,7 @@
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include "utils.hpp"
|
||||
|
||||
// 32-bit value with every bit set.
// NOTE(review): presumably the lane mask passed to warp-level intrinsics -- confirm at its use sites.
#define FINAL_MASK 0xffffffff
|
||||
// NOTE(review): presumably the thread-block size used when launching kernels in this file -- confirm at its use sites.
#define BLOCK_SIZE 256
|
||||
|
||||
@@ -1,4 +1,24 @@
|
||||
from .moe_align_block_size import moe_align_block_size as _moe_align_block_size
|
||||
from sgl_kernel.ops._kernels import all_reduce as _all_reduce
|
||||
from sgl_kernel.ops._kernels import dispose as _dispose
|
||||
from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar
|
||||
from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size
|
||||
from sgl_kernel.ops._kernels import reduce as _reduce
|
||||
|
||||
|
||||
def warp_reduce(input_tensor):
    """Call the compiled ``reduce`` kernel on ``input_tensor`` and return its result."""
    reduced = _reduce(input_tensor)
    return reduced
|
||||
|
||||
|
||||
def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, barrier_out):
    """Forward all five arguments, in order, to the compiled ``init_custom_ar`` kernel.

    Returns whatever the kernel binding returns, unchanged.
    """
    handle = _init_custom_ar(
        rank_id,
        num_devices,
        buffers,
        barrier_in,
        barrier_out,
    )
    return handle
|
||||
|
||||
|
||||
def custom_dispose(fa):
    """Pass ``fa`` to the compiled ``dispose`` kernel; the kernel's result is discarded."""
    _dispose(fa)
    return None
|
||||
|
||||
|
||||
def custom_reduce(fa, inp, out):
    """Call the compiled ``all_reduce`` kernel with ``fa``, ``inp`` and ``out``.

    The kernel's return value is discarded and this wrapper returns ``None``.
    """
    _all_reduce(fa, inp, out)
    return None
|
||||
|
||||
|
||||
def moe_align_block_size(
|
||||
|
||||
Reference in New Issue
Block a user