Unify SGL Kernel Releases (#10701)

This commit is contained in:
Kangyan-Zhou
2025-09-28 19:48:28 -07:00
committed by GitHub
parent 2572886367
commit 0c9174108a
3 changed files with 241 additions and 30 deletions

View File

@@ -239,14 +239,9 @@ if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8" OR SGL_KERNEL_ENABLE_SM100A)
"-gencode=arch=compute_101a,code=sm_101a"
)
endif()
else()
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-use_fast_math"
)
endif()
if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4" OR SGL_KERNEL_ENABLE_SM90A)
if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4")
set(SGL_KERNEL_ENABLE_FA3 ON)
list(APPEND SGL_KERNEL_CUDA_FLAGS
"-gencode=arch=compute_90a,code=sm_90a"
@@ -334,14 +329,47 @@ set(SOURCES
"${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/flash_sparse_api.cpp"
)
Python_add_library(common_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SOURCES})
# Build SM90 library with fast math optimization (same namespace, different directory)
Python_add_library(common_ops_sm90_build MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SOURCES})
# NOTE(review): the two lines below still configure the old single "common_ops"
# target, and the second one opens a parenthesis that is never closed. They look
# like stale pre-change lines left over from this diff hunk (the +/- markers were
# lost in extraction) — confirm they were removed in the applied change, since the
# file cannot parse with the unterminated call present.
target_compile_options(common_ops PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS}>)
target_include_directories(common_ops PRIVATE
# SM90 build flavor: same sources as the SM100 flavor, but compiled with CUDA
# fast-math. USE_FAST_MATH=1 lets the C++/CUDA sources detect this at compile time.
target_compile_definitions(common_ops_sm90_build PRIVATE
    USE_FAST_MATH=1
)
# The generator expression must be quoted: SGL_KERNEL_CUDA_FLAGS is a ;-list, and
# an unquoted expansion is split into separate arguments before the genex is
# evaluated — only the first flag would stay gated on COMPILE_LANGUAGE:CUDA and the
# trailing fragment would be malformed. The original also separated -use_fast_math
# with a space, which splits it out of the genex entirely; join it with ';' inside
# the quoted expression instead.
target_compile_options(common_ops_sm90_build PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS};-use_fast_math>"
)
target_include_directories(common_ops_sm90_build PRIVATE
    ${PROJECT_SOURCE_DIR}/csrc
    ${repo-cutlass_SOURCE_DIR}/examples/77_blackwell_fmha
    ${repo-cutlass_SOURCE_DIR}/examples/common
    ${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src
)
# Both arch flavors export a module named "common_ops"; build each into its own
# subdirectory so the two artifacts cannot collide before install.
set_target_properties(common_ops_sm90_build PROPERTIES
    OUTPUT_NAME "common_ops"
    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/sm90"
)
# SM100+ build flavor: identical sources compiled with precise (IEEE) math —
# no -use_fast_math. USE_FAST_MATH=0 mirrors the SM90 flavor's define so the
# sources can branch on it uniformly.
Python_add_library(common_ops_sm100_build MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SOURCES})
target_compile_definitions(common_ops_sm100_build PRIVATE
    USE_FAST_MATH=0
)
# Quoted genex: SGL_KERNEL_CUDA_FLAGS is a ;-list; unquoted, the expansion is
# split into separate arguments and only the first flag remains gated on
# COMPILE_LANGUAGE:CUDA (the rest would also reach non-CUDA compiles).
target_compile_options(common_ops_sm100_build PRIVATE
    "$<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS}>"
)
target_include_directories(common_ops_sm100_build PRIVATE
    ${PROJECT_SOURCE_DIR}/csrc
    ${repo-cutlass_SOURCE_DIR}/examples/77_blackwell_fmha
    ${repo-cutlass_SOURCE_DIR}/examples/common
    ${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src
)
# Same exported module name as the SM90 flavor; separate build directory avoids
# artifact collisions (installed to sgl_kernel/sm100 later).
set_target_properties(common_ops_sm100_build PROPERTIES
    OUTPUT_NAME "common_ops"
    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/sm100"
)
find_package(Python3 COMPONENTS Interpreter REQUIRED)
execute_process(
@@ -367,16 +395,26 @@ add_subdirectory(
${repo-mscclpp_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}/mscclpp-build
)
# Link both arch flavors against the Torch libraries, the CUDA driver/runtime
# math libs, and the statically built mscclpp.
# NOTE(review): the next line still links the old single "common_ops" target —
# it appears to be a stale pre-change line from this diff hunk (the +/- markers
# were lost in extraction); confirm it was removed in the applied change.
target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
target_link_libraries(common_ops_sm90_build PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
target_link_libraries(common_ops_sm100_build PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
# flash attention
# NOTE(review): the next line opens a target_compile_definitions() call on the
# old "common_ops" target and is never closed — another stale pre-change
# remnant; the file cannot parse with it present.
target_compile_definitions(common_ops PRIVATE
# Inference-only flash-attention build for both flavors: strip the backward
# pass, dropout support, and the uneven-K specializations.
target_compile_definitions(common_ops_sm90_build PRIVATE
FLASHATTENTION_DISABLE_BACKWARD
FLASHATTENTION_DISABLE_DROPOUT
FLASHATTENTION_DISABLE_UNEVEN_K
)
target_compile_definitions(common_ops_sm100_build PRIVATE
FLASHATTENTION_DISABLE_BACKWARD
FLASHATTENTION_DISABLE_DROPOUT
FLASHATTENTION_DISABLE_UNEVEN_K
)
# NOTE(review): install of the old "common_ops" target — stale pre-change line;
# the per-arch installs below supersede it.
install(TARGETS common_ops LIBRARY DESTINATION sgl_kernel)
# Install to different subdirectories
# CMake will find the built libraries in their respective LIBRARY_OUTPUT_DIRECTORY locations
# and install them to the specified destinations
install(TARGETS common_ops_sm90_build LIBRARY DESTINATION sgl_kernel/sm90)
install(TARGETS common_ops_sm100_build LIBRARY DESTINATION sgl_kernel/sm100)
# ============================ Optional Install ============================= #
# set flash-attention sources file