diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
index c884c4ba8..b74fefb77 100644
--- a/sgl-kernel/CMakeLists.txt
+++ b/sgl-kernel/CMakeLists.txt
@@ -3,6 +3,7 @@ project(sgl-kernel LANGUAGES CXX CUDA)
 
 # CMake
 cmake_policy(SET CMP0169 OLD)
+cmake_policy(SET CMP0177 NEW)
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 set(CMAKE_COLOR_DIAGNOSTICS ON)
 set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON")
@@ -50,14 +51,7 @@ FetchContent_Declare(
 )
 FetchContent_Populate(repo-cutlass)
 
-FetchContent_Declare(
-  repo-fmt
-  GIT_REPOSITORY https://github.com/fmtlib/fmt
-  GIT_TAG 553ec11ec06fbe0beebfbb45f9dc3c9eabd83d28
-  GIT_SHALLOW OFF
-)
-FetchContent_Populate(repo-fmt)
-
+# DeepGEMM
 FetchContent_Declare(
   repo-deepgemm
   GIT_REPOSITORY https://github.com/sgl-project/DeepGEMM
@@ -66,6 +60,14 @@ FetchContent_Declare(
 )
 FetchContent_Populate(repo-deepgemm)
 
+FetchContent_Declare(
+  repo-fmt
+  GIT_REPOSITORY https://github.com/fmtlib/fmt
+  GIT_TAG 553ec11ec06fbe0beebfbb45f9dc3c9eabd83d28
+  GIT_SHALLOW OFF
+)
+FetchContent_Populate(repo-fmt)
+
 # Triton
 FetchContent_Declare(
   repo-triton
@@ -148,21 +150,40 @@ set(SGL_KERNEL_CUDA_FLAGS
   "--expt-extended-lambda"
   "--threads=32"
 
-  # Suppress warnings
-  "-Xcompiler=-Wconversion"
-  "-Xcompiler=-fno-strict-aliasing"
+  # Suppress warnings
+  "-Xcompiler=-Wno-clang-format-violations"
+  "-Xcompiler=-Wno-conversion"
+  "-Xcompiler=-Wno-deprecated-declarations"
+  "-Xcompiler=-Wno-terminate"
+  "-Xcompiler=-Wfatal-errors"
+  "-Xcompiler=-ftemplate-backtrace-limit=1"
+  "-Xcudafe=--diag_suppress=177"  # variable was declared but never referenced
 
   # uncomment to debug
   # "--ptxas-options=-v"
   # "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage"
 )
 
-option(SGL_KERNEL_ENABLE_SM100A "Enable SM100A" OFF)
-option(SGL_KERNEL_ENABLE_SM90A "Enable SM90A" OFF)
 option(SGL_KERNEL_ENABLE_BF16 "Enable BF16" ON)
 option(SGL_KERNEL_ENABLE_FP8 "Enable FP8" ON)
 option(SGL_KERNEL_ENABLE_FP4 "Enable FP4" OFF)
 option(SGL_KERNEL_ENABLE_FA3 "Enable FA3" OFF)
+option(SGL_KERNEL_ENABLE_SM90A "Enable SM90A" OFF)
+option(SGL_KERNEL_ENABLE_SM100A "Enable SM100A" OFF)
+
+if (SGL_KERNEL_ENABLE_BF16)
+  list(APPEND SGL_KERNEL_CUDA_FLAGS
+    "-DFLASHINFER_ENABLE_BF16"
+  )
+endif()
+
+if (SGL_KERNEL_ENABLE_FP8)
+  list(APPEND SGL_KERNEL_CUDA_FLAGS
+    "-DFLASHINFER_ENABLE_FP8"
+    "-DFLASHINFER_ENABLE_FP8_E4M3"
+    "-DFLASHINFER_ENABLE_FP8_E5M2"
+  )
+endif()
 
 if (ENABLE_BELOW_SM90)
   list(APPEND SGL_KERNEL_CUDA_FLAGS
@@ -210,31 +231,12 @@ if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4" OR SGL_KERNEL_ENABLE_SM90A)
   )
 endif()
 
-if (SGL_KERNEL_ENABLE_BF16)
-  list(APPEND SGL_KERNEL_CUDA_FLAGS
-    "-DFLASHINFER_ENABLE_BF16"
-  )
-endif()
-
-if (SGL_KERNEL_ENABLE_FP8)
-  list(APPEND SGL_KERNEL_CUDA_FLAGS
-    "-DFLASHINFER_ENABLE_FP8"
-    "-DFLASHINFER_ENABLE_FP8_E4M3"
-    "-DFLASHINFER_ENABLE_FP8_E5M2"
-  )
-endif()
-
 if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8" OR SGL_KERNEL_ENABLE_FP4)
   list(APPEND SGL_KERNEL_CUDA_FLAGS
     "-DENABLE_NVFP4=1"
   )
 endif()
 
-string(REPLACE "-D__CUDA_NO_HALF_OPERATORS__" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
-string(REPLACE "-D__CUDA_NO_HALF_CONVERSIONS__" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
-string(REPLACE "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
-string(REPLACE "-D__CUDA_NO_HALF2_OPERATORS__" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
-
 set(SOURCES
   "csrc/allreduce/custom_all_reduce.cu"
   "csrc/allreduce/mscclpp_allreduce.cu"
diff --git a/sgl-kernel/Makefile b/sgl-kernel/Makefile
index 58e0897dc..c40489800 100644
--- a/sgl-kernel/Makefile
+++ b/sgl-kernel/Makefile
@@ -21,12 +21,11 @@ submodule: ## Initialize and update git submodules
 
 ln: submodule ## Create compilation database
 	@rm -rf build && mkdir build && cd build && cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=YES -DCMAKE_POLICY_VERSION_MINIMUM=3.5
-
 install: submodule ## Install package in development mode
 	@pip install -e . --no-build-isolation
 
 build: install-deps submodule ## Build and install wheel package
-	@rm -rf dist/* || true && export MAX_JOBS=$(nproc) && CMAKE_POLICY_VERSION_MINIMUM=3.5 CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) uv build --wheel -Cbuild-dir=build . --verbose --color=always --no-build-isolation && pip3 install dist/*whl --force-reinstall --no-deps
+	@rm -rf dist/* || true && CMAKE_POLICY_VERSION_MINIMUM=3.5 MAX_JOBS=$(nproc) CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) uv build --wheel -Cbuild-dir=build . --verbose --color=always --no-build-isolation && pip3 install dist/*whl --force-reinstall --no-deps
 
 clean: ## Remove build artifacts
 	@rm -rf build dist *.egg-info
diff --git a/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu b/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
index 88c4c89e2..6f4d46577 100644
--- a/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
+++ b/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
@@ -162,7 +162,7 @@ typename T::Fmha::Arguments args_from_options(
       // TODO(trevor-m): Change split_kv back to -1 when
       // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
       // perform worse with larger context length and smaller batch sizes.
-      num_kv_splits,  // split_kv
+      static_cast<int>(num_kv_splits),  // split_kv
       nullptr,  // is_var_split_kv
   };
   // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
@@ -259,7 +259,7 @@ int64_t cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches,
   // Assumes device 0 when getting sm_count.
   arguments.hw_info.sm_count =
       sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
-  arguments.split_kv = num_kv_splits;
+  arguments.split_kv = static_cast<int>(num_kv_splits);
 
   MlaSm100Type::Fmha::set_split_kv(arguments);
   return MlaSm100Type::Fmha::get_workspace_size(arguments);
diff --git a/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu b/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu
index 28dcaaee1..37aff1b9a 100644
--- a/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu
+++ b/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu
@@ -131,6 +131,7 @@ __device__ bool try_wait_barrier(uint64_t* smem_ptr, int phase_bit) {
       : "r"(smem_int_ptr), "r"(phase_bit));
   return static_cast<bool>(wait_complete);
 #endif
+  return false;
 }
 
 // Barrier arrive
diff --git a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
index 1228b21c5..e18f2057b 100644
--- a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
+++ b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
@@ -541,6 +541,11 @@ void quant_impl(
   }
 }
 
+// Avoid redefinition warnings
+#undef CHECK_CONTIGUOUS
+#undef CHECK_TH_CUDA
+#undef CHECK_INPUT
+
 /*Quantization entry for fp4 experts quantization*/
 #define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x, m) TORCH_CHECK(x.is_contiguous(), m, "must be contiguous")
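
Note on the static_cast<int> additions in cutlass_mla_kernel.cu: num_kv_splits arrives as int64_t from the torch-facing entry point, while the CUTLASS split_kv field is a plain int, so the assignment narrows. The casts make that narrowing explicit instead of implicit. A stand-alone sketch (struct and field names simplified from the real arguments type):

    #include <cstdint>

    struct FmhaArgumentsSketch {
      int split_kv;  // the CUTLASS-side field is a 32-bit int
    };

    int main() {
      int64_t num_kv_splits = 4;  // 64-bit value from the torch-facing API
      FmhaArgumentsSketch args{};
      // Explicit narrowing documents the intent and avoids
      // implicit-conversion warnings.
      args.split_kv = static_cast<int>(num_kv_splits);
      return args.split_kv == 4 ? 0 : 1;
    }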
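
Note on the try_wait_barrier change: the inline-PTX body exists only under a preprocessor architecture guard, so on targets where the guard is false the original code fell off the end of a non-void function, which is undefined behavior and draws a compiler warning. The added return false; gives those targets a defined "barrier not ready" result. A minimal sketch of the function's shape, with an illustrative __CUDA_ARCH__ >= 900 guard (the exact guard and PTX in the source may differ):

    #include <cstdint>

    __device__ bool try_wait_barrier_sketch(uint64_t* smem_ptr, int phase_bit) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
      // Convert the generic pointer to a shared-memory address for the
      // mbarrier instruction.
      uint32_t smem_int_ptr =
          static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
      uint32_t wait_complete;
      asm volatile(
          "{\n"
          "  .reg .pred p;\n"
          "  mbarrier.try_wait.parity.shared::cta.b64 p, [%1], %2;\n"
          "  selp.b32 %0, 1, 0, p;\n"
          "}\n"
          : "=r"(wait_complete)
          : "r"(smem_int_ptr), "r"(phase_bit));
      return static_cast<bool>(wait_complete);
    #endif
      // Reached only when the guarded block is compiled out; without it the
      // function has a path with no return statement.
      return false;
    }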
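
Note on the #undef block in nvfp4_expert_quant.cu: CHECK_TH_CUDA, CHECK_CONTIGUOUS, and CHECK_INPUT are common macro names that other included headers may already define, and redefining a macro with a different body triggers a redefinition warning. Undefining first keeps the local definitions clean. A self-contained illustration of the pattern, with a hypothetical macro name:

    #include <cassert>
    #include <cstdio>

    // First definition, standing in for one pulled in from a shared header.
    #define CHECK_POSITIVE(x) assert((x) > 0)

    // Redefining with a different body would warn ("macro redefined");
    // #undef-ing first makes the second definition warning-free.
    #undef CHECK_POSITIVE
    #define CHECK_POSITIVE(x) \
      do { if ((x) <= 0) std::printf("check failed: %s\n", #x); } while (0)

    int main() {
      CHECK_POSITIVE(1);
      CHECK_POSITIVE(-1);  // prints the failure message instead of aborting
      return 0;
    }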