diff --git a/.gitignore b/.gitignore
index 6470c7df5..6d0987f27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -218,3 +218,5 @@ work_dirs/
 *.exe
 *.out
 *.app
+
+compile_commands.json
diff --git a/sgl-kernel/Makefile b/sgl-kernel/Makefile
new file mode 100644
index 000000000..3186031ac
--- /dev/null
+++ b/sgl-kernel/Makefile
@@ -0,0 +1,19 @@
+.PHONY: tree ln install build clean test
+
+tree:
+	@tree --prune -I "__pycache__|*.egg-info|*.so|build"
+
+ln:
+	@rm -rf build && cmake . -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -DCMAKE_CUDA_COMPILER=nvcc -B build && rm -rf compile_commands.json && ln -s build/compile_commands.json compile_commands.json
+
+install:
+	@pip install -e .
+
+build:
+	@python3 setup.py bdist_wheel
+
+clean:
+	@rm -rf build dist *.egg-info
+
+test:
+	@pytest tests/
diff --git a/sgl-kernel/build.sh b/sgl-kernel/build.sh
new file mode 100755
index 000000000..b276f0141
--- /dev/null
+++ b/sgl-kernel/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -ex
+
+docker run --rm -it \
+    -v "$(pwd)":/sgl-kernel \
+    pytorch/manylinux-builder:cuda12.1 \
+    bash -c "
+    pip install --no-cache-dir torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121 && \
+    export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX' && \
+    cd /sgl-kernel && \
+    python setup.py bdist_wheel
+    "
diff --git a/sgl-kernel/setup.py b/sgl-kernel/setup.py
index 4702c7f20..f2af83643 100644
--- a/sgl-kernel/setup.py
+++ b/sgl-kernel/setup.py
@@ -13,6 +13,18 @@ setup(
                 "src/sgl-kernel/csrc/warp_reduce.cc",
                 "src/sgl-kernel/csrc/warp_reduce_kernel.cu",
             ],
+            extra_compile_args={
+                "nvcc": [
+                    "-O3",
+                    "-Xcompiler",
+                    "-fPIC",
+                    "-gencode=arch=compute_75,code=sm_75",
+                    "-gencode=arch=compute_80,code=sm_80",
+                    "-gencode=arch=compute_89,code=sm_89",
+                    "-gencode=arch=compute_90,code=sm_90",
+                ],
+                "cxx": ["-O3"],
+            },
         )
     ],
     cmdclass={"build_ext": BuildExtension},
diff --git a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc
index 66033c9d2..46c6a41c3 100644
--- a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc
+++ b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc
@@ -1,5 +1,4 @@
 #include
-#include
 torch::Tensor warp_reduce_cuda(torch::Tensor input);