diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
index 7fa1c723c..58ac06c08 100644
--- a/sgl-kernel/CMakeLists.txt
+++ b/sgl-kernel/CMakeLists.txt
@@ -148,7 +148,10 @@ set(SGL_KERNEL_CUDA_FLAGS
     "-DCUTLASS_DEBUG_TRACE_LEVEL=0"
     "--expt-relaxed-constexpr"
     "--expt-extended-lambda"
-    "--threads=32"
+    # "--threads=<N>" is no longer hard-coded in this list: a fixed value of 32
+    # breaks CMAKE_BUILD_PARALLEL_LEVEL and triggers OOM on low-memory hosts.
+    # The flag is appended after this list from the SGL_KERNEL_COMPILE_THREADS
+    # cache variable (default 32).
 
     # Supress warnings
     "-Xcompiler=-Wno-clang-format-violations"
@@ -164,6 +167,20 @@ set(SGL_KERNEL_CUDA_FLAGS
     # "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage"
 )
 
+set(SGL_KERNEL_COMPILE_THREADS 32 CACHE STRING "Set compilation threads, default 32")
+
+# Validate the thread count: anything but a non-negative integer is a hard error; 0 falls back to 1.
+if(NOT SGL_KERNEL_COMPILE_THREADS MATCHES "^[0-9]+$")
+    message(FATAL_ERROR "SGL_KERNEL_COMPILE_THREADS must be a non-negative integer, but was set to '${SGL_KERNEL_COMPILE_THREADS}'.")
+elseif(SGL_KERNEL_COMPILE_THREADS LESS 1)
+    message(WARNING "SGL_KERNEL_COMPILE_THREADS was set to a value less than 1. Using 1 instead.")
+    set(SGL_KERNEL_COMPILE_THREADS 1)
+endif()
+
+list(APPEND SGL_KERNEL_CUDA_FLAGS
+    "--threads=${SGL_KERNEL_COMPILE_THREADS}"
+)
+
 option(SGL_KERNEL_ENABLE_BF16 "Enable BF16" ON)
 option(SGL_KERNEL_ENABLE_FP8 "Enable FP8" ON)
 option(SGL_KERNEL_ENABLE_FP4 "Enable FP4" OFF)
diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md
index 5871d5347..47f3dea54 100644
--- a/sgl-kernel/README.md
+++ b/sgl-kernel/README.md
@@ -52,10 +52,12 @@ See CMakeLists.txt for more options.
 ### Parallel Build
 
 We highly recommend you build sgl-kernel with Ninja. Ninja can automatically build sgl-kernel in parallel.
-And if you build the sgl-kernel with cmake, you need to add `CMAKE_BUILD_PARALLEL_LEVEL` for parallel build like:
+If you build sgl-kernel with CMake, set `CMAKE_BUILD_PARALLEL_LEVEL` for a parallel build and limit
+nvcc to a single thread by setting `SGL_KERNEL_COMPILE_THREADS=1`, for example:
 
 ```bash
-CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) python -m uv build --wheel -Cbuild-dir=build --color=always .
+CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) python -m uv build --wheel -Cbuild-dir=build \
+-Ccmake.define.SGL_KERNEL_COMPILE_THREADS=1 --color=always .
 ```
 
 ### ⚠️ Compilation Issue with `sgl-kernel` and CUDA 12.6