#
# MUSA-specific configuration for vLLM
#
# Selects MUSA as the GPU language, locates the MUSA CMake package, pins the
# GPU architecture list, collects GPU compiler flags, and enables the
# header-only CUTLASS configuration.
#

set(VLLM_GPU_LANG "MUSA")

# Set the supported torch version for MUSA
set(TORCH_SUPPORTED_VERSION_MUSA "2.7.1")

# Warn (do not fail) if the torch version doesn't match what we expect
if(NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_MUSA})
  message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_MUSA} "
          "expected for MUSA build, saw ${Torch_VERSION} instead.")
endif()

# Find MUSA package.
# MUSA_HOME is read from the environment at configure time. Only extend the
# module search path when it is actually set; otherwise the expansion would
# degrade to the bogus root-level path "/cmake".
if(NOT "$ENV{MUSA_HOME}" STREQUAL "")
  list(APPEND CMAKE_MODULE_PATH "$ENV{MUSA_HOME}/cmake")
else()
  message(WARNING "MUSA_HOME is not set in the environment; relying on the "
          "existing CMAKE_MODULE_PATH to locate the MUSA package.")
endif()
find_package(MUSA REQUIRED)

# Extract and filter MUSA architectures
# MUSA architectures are similar to CUDA, but may have different naming
# NOTE(review): MUSA_ARCHS is not set in this file; presumably it comes from
# the MUSA package or the caller -- confirm.
message(STATUS "MUSA target architectures: ${MUSA_ARCHS}")

# Filter the target architectures by the supported archs
# MUSA SDK 4.3.0 supports the following architectures
set(MUSA_SUPPORTED_ARCHS "21;22")

# Override GPU architectures detected by cmake/torch
override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${MUSA_SUPPORTED_ARCHS}")

# Query torch for additional GPU compilation flags
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})

# Set nvcc parallelism (MUSA compiler also supports --threads flag)
if(NVCC_THREADS)
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

# Set MUSA include flags for CXX compiler.
# Skipped when MUSA_HOME is unset so that a meaningless "-I/include" is not
# injected into every C++ compile line.
if(NOT "$ENV{MUSA_HOME}" STREQUAL "")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I$ENV{MUSA_HOME}/include")
endif()

# Set up CUTLASS for MUSA builds
# MUSA is compatible with CUDA, so we can use the same CUTLASS configuration
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.2.1")

# Use the specified CUTLASS source directory for compilation if
# VLLM_CUTLASS_SRC_DIR is provided. The environment variable, when defined,
# takes precedence over any value already held by the CMake variable.
if(DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
  set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
endif()

if(VLLM_CUTLASS_SRC_DIR)
  # if(IS_ABSOLUTE ...) tests its argument as a literal path and does not
  # dereference variable names, so the value must be expanded explicitly.
  if(NOT IS_ABSOLUTE "${VLLM_CUTLASS_SRC_DIR}")
    get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
  endif()
  message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")

  # Local checkout: no download step, just point FetchContent at the directory.
  FetchContent_Declare(cutlass SOURCE_DIR "${VLLM_CUTLASS_SRC_DIR}")
else()
  FetchContent_Declare(
    cutlass
    GIT_REPOSITORY https://github.com/nvidia/cutlass.git
    # Please keep this in sync with CUTLASS_REVISION line above.
    GIT_TAG ${CUTLASS_REVISION}
    GIT_PROGRESS TRUE

    # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
    # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
    # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
    GIT_SHALLOW TRUE
  )
endif()
FetchContent_MakeAvailable(cutlass)

# Set MUSA extension sources
# These are the same source files as CUDA, since MUSA is compatible with CUDA code
set(VLLM_EXT_SRC
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
  "csrc/cache_kernels.cu"
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
  "csrc/attention/vertical_slash_index.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/fused_qknorm_rope_kernel.cu"
  "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"  # Note: Keeping this name for compatibility
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/w8a8/int8/scaled_quant.cu"
  "csrc/quantization/w8a8/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"  # Note: Keeping this name for compatibility
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

# MOE extension sources for MUSA
set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

list(APPEND VLLM_MOE_EXT_SRC
  "csrc/moe/moe_wna16.cu"
  "csrc/moe/grouped_topk_kernels.cu")

# Permute/unpermute kernels are kept in their own variable and then folded
# into the MOE source list.
set(MOE_PERMUTE_SRC
  "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
  "csrc/moe/moe_permute_unpermute_op.cu")

list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")

# Cumem allocator for MUSA
set(VLLM_CUMEM_EXT_SRC
  "csrc/cumem_allocator.cpp")

# Link against musa driver library for cumem
list(APPEND CUMEM_LIBS musa::musa_driver)