将分散在各文件中的CUDA/HIP/MUSA硬件相关头文件引用统一到vendors目录下的对应头文件中,提高代码可维护性。移除重复的头文件引用,优化构建配置。
126 lines
4.4 KiB
CMake
126 lines
4.4 KiB
CMake
#
# vLLM build settings that apply only to the MUSA backend.
#

# Language tag for the GPU sources. Later commands in this file pass it to the
# architecture-override and torch-flag helpers.
set(VLLM_GPU_LANG "MUSA")
|
|
|
|
# Torch release the MUSA build is expected to be configured against.
set(TORCH_SUPPORTED_VERSION_MUSA "2.7.1")

# A different torch version is reported as a warning only; configuration
# continues either way.
if(NOT Torch_VERSION VERSION_EQUAL "${TORCH_SUPPORTED_VERSION_MUSA}")
  message(WARNING
    "Pytorch version ${TORCH_SUPPORTED_VERSION_MUSA} expected for MUSA build, "
    "saw ${Torch_VERSION} instead.")
endif()
|
|
|
|
# Locate the MUSA toolkit's CMake package.
#
# The toolkit's CMake modules are looked up under $MUSA_HOME/cmake. If
# MUSA_HOME is unset or empty, expanding it would put the meaningless entry
# "/cmake" on CMAKE_MODULE_PATH, so the append is skipped and a warning is
# printed instead; find_package() then relies on whatever search paths are
# already configured. The path is quoted so it is always appended as a single
# list element.
if(DEFINED ENV{MUSA_HOME} AND NOT "$ENV{MUSA_HOME}" STREQUAL "")
  list(APPEND CMAKE_MODULE_PATH "$ENV{MUSA_HOME}/cmake")
else()
  message(WARNING
    "MUSA_HOME is not set; not adding <MUSA_HOME>/cmake to CMAKE_MODULE_PATH. "
    "find_package(MUSA) may fail unless the MUSA CMake modules can be found elsewhere.")
endif()

# MUSA is mandatory for this configuration: configuration stops if it is not found.
find_package(MUSA REQUIRED)
|
|
|
|
# Report the MUSA architectures requested for this build.
# (Original note: MUSA architectures are similar to CUDA's but may be named
# differently.)
# NOTE(review): MUSA_ARCHS is not assigned anywhere in the visible part of this
# file; presumably it comes from the including CMakeLists or from
# find_package(MUSA) — confirm, otherwise this prints an empty value.
message(STATUS "MUSA target architectures: ${MUSA_ARCHS}")

# Architectures supported by MUSA SDK 4.3.0, stored as a CMake list.
set(MUSA_SUPPORTED_ARCHS 21 22)

# Override the GPU architectures detected by cmake/torch with the supported
# set. The list is quoted so it reaches the helper as one argument.
override_gpu_arches(VLLM_GPU_ARCHES ${VLLM_GPU_LANG} "${MUSA_SUPPORTED_ARCHS}")
|
|
|
|
# Ask torch for the extra compiler flags GPU sources need; the helper stores
# them in VLLM_GPU_FLAGS.
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})

# Optional compiler parallelism. The variable keeps its nvcc name; per the
# original note, the MUSA compiler also accepts --threads.
if(NVCC_THREADS)
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
|
|
|
|
# Make the MUSA headers visible to the host C++ compiler by adding
# $MUSA_HOME/include as an include directory on the global CXX flags.
string(APPEND CMAKE_CXX_FLAGS " -I$ENV{MUSA_HOME}/include")
|
|
|
|
# CUTLASS configuration for MUSA builds. Per the original note, MUSA is
# compatible with CUDA, so the CUDA CUTLASS configuration is reused.
# Only the header library is enabled.
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# CUTLASS revision handed to FetchContent. Per the original note, setting it
# also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.2.1")
|
|
|
|
# Make CUTLASS available to the build.
#
# If VLLM_CUTLASS_SRC_DIR is provided (the environment variable takes
# precedence over a CMake variable of the same name), that local checkout is
# used. Otherwise CUTLASS is downloaded from GitHub at CUTLASS_REVISION.
if(DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
  set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
endif()

if(VLLM_CUTLASS_SRC_DIR)
  # IS_ABSOLUTE tests the path string it is given and does not dereference
  # variable names, so the variable must be expanded explicitly here.
  if(NOT IS_ABSOLUTE "${VLLM_CUTLASS_SRC_DIR}")
    # Without BASE_DIR, the relative path is resolved against
    # CMAKE_CURRENT_SOURCE_DIR.
    get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
  endif()
  message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
  FetchContent_Declare(cutlass SOURCE_DIR "${VLLM_CUTLASS_SRC_DIR}")
else()
  FetchContent_Declare(
    cutlass
    GIT_REPOSITORY https://github.com/nvidia/cutlass.git
    # The tag comes from CUTLASS_REVISION, which is set earlier in this file.
    GIT_TAG ${CUTLASS_REVISION}
    GIT_PROGRESS TRUE

    # Speed up the CUTLASS download by retrieving only the specified GIT_TAG
    # instead of the full history.
    # Important: with GIT_SHALLOW enabled, GIT_TAG works only with branch names
    # and tags. If CUTLASS_REVISION is changed to a commit hash, GIT_SHALLOW
    # must be set to FALSE.
    GIT_SHALLOW TRUE
  )
endif()
FetchContent_MakeAvailable(cutlass)
|
|
|
|
# Sources of the main extension for MUSA builds. Per the original note, these
# are the same files the CUDA build uses, since MUSA is compatible with CUDA
# code.
set(VLLM_EXT_SRC
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
  "csrc/cache_kernels.cu"
  # Attention
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
  "csrc/attention/vertical_slash_index.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/fused_qknorm_rope_kernel.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  # The "cuda" file name is kept for compatibility.
  "csrc/cuda_view.cu"
  # Quantization
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/w8a8/int8/scaled_quant.cu"
  "csrc/quantization/w8a8/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  # The "cuda" file name is kept for compatibility.
  "csrc/cuda_utils_kernels.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp"
)
|
|
|
|
# Permute/unpermute sources, kept in their own variable and also included at
# the end of the MoE source list below.
set(MOE_PERMUTE_SRC
  "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
  "csrc/moe/moe_permute_unpermute_op.cu"
)

# Sources of the MoE extension for MUSA builds.
set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu"
  "csrc/moe/moe_wna16.cu"
  "csrc/moe/grouped_topk_kernels.cu"
  ${MOE_PERMUTE_SRC}
)
|
|
|
|
# Source of the cumem allocator extension for MUSA builds.
set(VLLM_CUMEM_EXT_SRC "csrc/cumem_allocator.cpp")

# The cumem extension links against the MUSA driver library.
list(APPEND CUMEM_LIBS musa::musa_driver)
|