From b6b57fc2007524a1c16b71ff88e5c04f8271caca Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 31 Dec 2024 14:52:00 +0800 Subject: [PATCH] minor: cleanup sgl-kernel (#2679) --- sgl-kernel/CMakeLists.txt | 1 - sgl-kernel/setup.py | 37 -------- sgl-kernel/src/sgl-kernel/__init__.py | 2 - .../src/sgl-kernel/csrc/sgl_kernel_ops.cu | 10 --- .../src/sgl-kernel/csrc/warp_reduce_kernel.cu | 90 ------------------- sgl-kernel/src/sgl-kernel/ops/__init__.py | 5 -- 6 files changed, 145 deletions(-) delete mode 100644 sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 26081a8e7..9a556c860 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -28,7 +28,6 @@ find_package(Torch REQUIRED) # Warp Reduce library add_library(_kernels SHARED - src/sgl-kernel/csrc/warp_reduce_kernel.cu src/sgl-kernel/csrc/trt_reduce_internal.cu src/sgl-kernel/csrc/trt_reduce_kernel.cu src/sgl-kernel/csrc/moe_align_kernel.cu diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py index 4a40aeb2d..eb81991e3 100644 --- a/sgl-kernel/setup.py +++ b/sgl-kernel/setup.py @@ -1,6 +1,3 @@ -import os -import shutil -import zipfile from pathlib import Path from setuptools import setup @@ -16,39 +13,6 @@ def get_version(): return line.split("=")[1].strip().strip('"') -def rename_wheel(): - if not os.environ.get("CUDA_VERSION"): - return - cuda_version = os.environ["CUDA_VERSION"].replace(".", "") - base_version = get_version() - - wheel_dir = Path("dist") - old_wheel = next(wheel_dir.glob("*.whl")) - tmp_dir = wheel_dir / "tmp" - tmp_dir.mkdir(exist_ok=True) - - with zipfile.ZipFile(old_wheel, "r") as zip_ref: - zip_ref.extractall(tmp_dir) - - old_info = tmp_dir / f"sgl_kernel-{base_version}.dist-info" - new_info = tmp_dir / f"sgl_kernel-{base_version}.post0+cu{cuda_version}.dist-info" - old_info.rename(new_info) - - platform = "manylinux2014_x86_64" - new_wheel = wheel_dir / old_wheel.name.replace("linux_x86_64", platform) - new_wheel = wheel_dir / new_wheel.name.replace( - base_version, f"{base_version}.post0+cu{cuda_version}" - ) - - with zipfile.ZipFile(new_wheel, "w", zipfile.ZIP_DEFLATED) as new_zip: - for file_path in tmp_dir.rglob("*"): - if file_path.is_file(): - new_zip.write(file_path, file_path.relative_to(tmp_dir)) - - old_wheel.unlink() - shutil.rmtree(tmp_dir) - - def update_wheel_platform_tag(): wheel_dir = Path("dist") old_wheel = next(wheel_dir.glob("*.whl")) @@ -81,7 +45,6 @@ ext_modules = [ CUDAExtension( name="sgl_kernel.ops._kernels", sources=[ - "src/sgl-kernel/csrc/warp_reduce_kernel.cu", "src/sgl-kernel/csrc/trt_reduce_internal.cu", "src/sgl-kernel/csrc/trt_reduce_kernel.cu", "src/sgl-kernel/csrc/moe_align_kernel.cu", diff --git a/sgl-kernel/src/sgl-kernel/__init__.py b/sgl-kernel/src/sgl-kernel/__init__.py index c0a5caa10..0705b6efa 100644 --- a/sgl-kernel/src/sgl-kernel/__init__.py +++ b/sgl-kernel/src/sgl-kernel/__init__.py @@ -3,12 +3,10 @@ from sgl_kernel.ops import ( custom_reduce, init_custom_reduce, moe_align_block_size, - warp_reduce, ) __all__ = [ "moe_align_block_size", - "warp_reduce", "init_custom_reduce", "custom_dispose", "custom_reduce", diff --git a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu index 9518eb4ff..69b8f8eeb 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu @@ -1,13 +1,5 @@ #include "utils.hpp" -// warp_reduce -torch::Tensor warp_reduce_cuda(torch::Tensor input); - -torch::Tensor warp_reduce(torch::Tensor input) { - CHECK_CUDA_INPUT(input); - return warp_reduce_cuda(input); -} - // trt_reduce using fptr_t = int64_t; fptr_t init_custom_ar(int64_t rank_id, int64_t world_size, const std::vector& buffers, @@ -21,8 +13,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // warp_reduce - m.def("reduce", &warp_reduce, "Warp Reduce (CUDA)"); // trt_reduce m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)"); m.def("dispose", &dispose, "dispose custom allreduce meta"); diff --git a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu deleted file mode 100644 index 7a3f2f5fc..000000000 --- a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include - -#include "utils.hpp" - -#define FINAL_MASK 0xffffffff -#define BLOCK_SIZE 256 - -template -__device__ __forceinline__ scalar_t add(scalar_t a, scalar_t b) { - return a + b; -} - -template -__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { -#pragma unroll - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_down_sync(FINAL_MASK, val, offset); - } - return val; -} - -template -__device__ __forceinline__ scalar_t blockReduceSum(scalar_t val) { - __shared__ scalar_t shared[32]; - int lane = threadIdx.x % 32; - int wid = threadIdx.x / 32; - - val = warpReduceSum(val); // First reduce within warp - - if (lane == 0) shared[wid] = val; // Write reduced value to shared memory - - __syncthreads(); // Wait for all partial reductions - - // Read from shared memory only if that warp existed - val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane] : 0; - - if (wid == 0) val = warpReduceSum(val); // Final reduce within first warp - - return val; -} - -template -__global__ void warp_reduce_cuda_kernel( - const torch::PackedTensorAccessor32 input, - torch::PackedTensorAccessor32 output, int N) { - scalar_t sum = 0; - - // Grid-stride loop - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { - sum += input[i]; - } - - // Perform block-wide reduction - sum = blockReduceSum(sum); - - // Write result for this block to global memory - if (threadIdx.x == 0) { - output[blockIdx.x] = sum; - } -} - -torch::Tensor warp_reduce_cuda(torch::Tensor input) { - // Input validation - TORCH_CHECK(input.dim() == 1, "1D tensor expected"); - TORCH_CHECK(input.is_cuda(), "CUDA tensor expected"); - - const auto N = input.size(0); - - // Handle empty tensor - if (N == 0) { - return torch::zeros({1}, input.options()); - } - - // Calculate grid dimensions - const int threads = BLOCK_SIZE; - const int blocks = (N + threads - 1) / threads; - - // Allocate output tensor for partial sums - auto output = torch::empty({blocks}, input.options()); - - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "warp_reduce_cuda", ([&] { - warp_reduce_cuda_kernel<<>>( - input.packed_accessor32(), - output.packed_accessor32(), N); - })); - - // Sum the partial results - return output.sum(); -} diff --git a/sgl-kernel/src/sgl-kernel/ops/__init__.py b/sgl-kernel/src/sgl-kernel/ops/__init__.py index 3b4cbfb17..aee968ae6 100644 --- a/sgl-kernel/src/sgl-kernel/ops/__init__.py +++ b/sgl-kernel/src/sgl-kernel/ops/__init__.py @@ -2,11 +2,6 @@ from sgl_kernel.ops._kernels import all_reduce as _all_reduce from sgl_kernel.ops._kernels import dispose as _dispose from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size -from sgl_kernel.ops._kernels import reduce as _reduce - - -def warp_reduce(input_tensor): - return _reduce(input_tensor) def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, barrier_out):