diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index dd6370091..ee0b10c0e 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -187,8 +187,6 @@ jobs: timeout-minutes: 10 run: | cd test/srt - USE_VLLM_CUSTOM_ALLREDUCE=1 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - name: Benchmark single latency + torch.compile (TP=2) diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 8b056d4cc..cd5aa5786 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -4,6 +4,10 @@ project(sgl-kernel LANGUAGES CXX CUDA) # CMake cmake_policy(SET CMP0169 OLD) include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) +set(CMAKE_COLOR_DIAGNOSTICS ON) +set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON") +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_SHARED_LIBRARY_PREFIX "") # Python find_package(Python COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT} REQUIRED) @@ -82,8 +86,6 @@ include_directories( ${PROJECT_SOURCE_DIR}/csrc ${repo-cutlass_SOURCE_DIR}/include ${repo-cutlass_SOURCE_DIR}/tools/util/include - ${repo-cutlass_SOURCE_DIR}/examples/77_blackwell_fmha - ${repo-cutlass_SOURCE_DIR}/examples/common ${repo-flashinfer_SOURCE_DIR}/include ${repo-flashinfer_SOURCE_DIR}/csrc ) @@ -109,6 +111,8 @@ set(SGL_KERNEL_CUDA_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--threads=32" + + # Suppress warnings "-Xcompiler=-Wconversion" "-Xcompiler=-fno-strict-aliasing" @@ -209,17 +213,19 @@ Python_add_library(common_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI target_compile_options(common_ops PRIVATE $<$:${SGL_KERNEL_CUDA_FLAGS}>) target_include_directories(common_ops PRIVATE - ${TORCH_INCLUDE_DIRS} - ${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src) + ${repo-cutlass_SOURCE_DIR}/examples/77_blackwell_fmha + ${repo-cutlass_SOURCE_DIR}/examples/common + 
${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src +) target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt) target_compile_definitions(common_ops PRIVATE - FLASHATTENTION_DISABLE_BACKWARD - FLASHATTENTION_DISABLE_DROPOUT - FLASHATTENTION_DISABLE_UNEVEN_K - ) + FLASHATTENTION_DISABLE_BACKWARD + FLASHATTENTION_DISABLE_DROPOUT + FLASHATTENTION_DISABLE_UNEVEN_K +) -install(TARGETS common_ops LIBRARY DESTINATION "sgl_kernel") +install(TARGETS common_ops LIBRARY DESTINATION sgl_kernel) # ============================ Optional Install ============================= # # set flash-attention sources file @@ -280,8 +286,8 @@ if (SGL_KERNEL_ENABLE_FA3) target_compile_options(flash_ops PRIVATE $<$:${SGL_FLASH_KERNEL_CUDA_FLAGS}>) target_include_directories(flash_ops PRIVATE - ${TORCH_INCLUDE_DIRS} - ${repo-flash-attention_SOURCE_DIR}/hopper) + ${repo-flash-attention_SOURCE_DIR}/hopper + ) target_link_libraries(flash_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda) install(TARGETS flash_ops LIBRARY DESTINATION "sgl_kernel") diff --git a/sgl-kernel/build.sh b/sgl-kernel/build.sh index 945d60ca5..6b82613a2 100755 --- a/sgl-kernel/build.sh +++ b/sgl-kernel/build.sh @@ -35,6 +35,8 @@ docker run --rm \ ${PYTHON_ROOT_PATH}/bin/pip install --no-cache-dir ninja setuptools==75.0.0 wheel==0.41.0 numpy uv scikit-build-core && \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX' && \ export CUDA_VERSION=${CUDA_VERSION} && \ + export CMAKE_BUILD_PARALLEL_LEVEL=96 && \ + export MAX_JOBS=96 && \ mkdir -p /usr/lib/x86_64-linux-gnu/ && \ ln -s /usr/local/cuda-${CUDA_VERSION}/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so && \ cd /sgl-kernel && \