refactor: 统一硬件相关头文件引用

将分散在各文件中的CUDA/HIP/MUSA硬件相关头文件引用统一到vendors目录下的对应头文件中，提高代码可维护性。移除重复的头文件引用，优化构建配置。
2026-01-20 10:14:31 +08:00
parent 5aef6c175a
commit 2bd9bd4cc2
98 changed files with 1757 additions and 1286 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/cmake/comm.cmake
+++ b/cmake/comm.cmake
@@ -0,0 +1,23 @@
+# Source files of the vLLM extension, collected in VLLM_EXT_SRC.
+# NOTE(review): cmake/cuda.cmake assigns the same file list to VLLM_EXT_SRC;
+# confirm which of the two definitions is meant to be authoritative.
+set(VLLM_EXT_SRC
+  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+  "csrc/cache_kernels.cu"
+  "csrc/attention/paged_attention_v1.cu"
+  "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/merge_attn_states.cu"
+  "csrc/attention/vertical_slash_index.cu"
+  "csrc/pos_encoding_kernels.cu"
+  "csrc/activation_kernels.cu"
+  "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
+  "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
+  "csrc/cuda_view.cu"
+  "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+  "csrc/quantization/gguf/gguf_kernel.cu"
+  "csrc/quantization/activation_kernels.cu"
+  "csrc/cuda_utils_kernels.cu"
+  "csrc/custom_all_reduce.cu"
+  "csrc/torch_bindings.cpp")
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -0,0 +1,753 @@
+#
+# CUDA backend configuration for vLLM
+#
+
+set(VLLM_GPU_LANG "CUDA")
+
+# Torch release the CUDA build expects
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+
+# Any other torch version only triggers a warning; configuration continues
+if(NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
+  message(WARNING
+    "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
+    "expected for CUDA build, saw ${Torch_VERSION} instead.")
+endif()
+
+# Extract and filter CUDA architectures
+# CUDA_ARCHS is first filled from CUDA_ARCH_FLAGS by the project helpers below.
+# NOTE(review): the helpers are defined outside this file; their exact
+# semantics (de-duplication, ordering) are inferred from their names only.
+clear_cuda_arches(CUDA_ARCH_FLAGS)
+extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
+message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
+
+# Filter the target architectures by the supported archs
+# (CUDA_SUPPORTED_ARCHS is expected to be set before this file is processed)
+cuda_archs_loose_intersection(CUDA_ARCHS
+  "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
+message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
+
+# Query torch for additional GPU compilation flags
+# VLLM_GPU_ARCHES mirrors the filtered CUDA_ARCHS list.
+set(VLLM_GPU_ARCHES "${CUDA_ARCHS}")
+get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
+
+# Forward the requested nvcc thread count, if any
+if(NVCC_THREADS)
+  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
+endif()
+
+# Request size-oriented compression when the CUDA compiler is 13.0 or newer
+if(DEFINED CMAKE_CUDA_COMPILER_VERSION
+   AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+# Make the CUDA toolkit headers visible to the host C++ compiler;
+# for CUDA_VERSION >= 13.0 the include/cccl directory is added as well
+string(APPEND CMAKE_CXX_FLAGS " -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+  string(APPEND CMAKE_CXX_FLAGS " -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+endif()
+
+# Set up CUTLASS for CUDA builds
+set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+
+# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+set(CUTLASS_REVISION "v4.2.1")
+
+# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided.
+# The environment variable, when defined, takes precedence over a CMake variable of the same name.
+if(DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+  set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+endif()
+
+if(VLLM_CUTLASS_SRC_DIR)
+  # IS_ABSOLUTE tests its argument as a path string, so the variable has to be
+  # expanded; passing the bare variable name would test the name itself.
+  if(NOT IS_ABSOLUTE "${VLLM_CUTLASS_SRC_DIR}")
+    get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+  endif()
+  message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+  FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+else()
+  FetchContent_Declare(
+      cutlass
+      GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+      # Please keep this in sync with CUTLASS_REVISION line above.
+      GIT_TAG ${CUTLASS_REVISION}
+      GIT_PROGRESS TRUE
+
+      # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
+      # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
+      # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
+      GIT_SHALLOW TRUE
+  )
+endif()
+FetchContent_MakeAvailable(cutlass)
+
+# Set CUDA extension sources
+# Base list of extension sources; the CUDA-only files are appended right after it.
+set(VLLM_EXT_SRC
+  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+  "csrc/cache_kernels.cu"
+  "csrc/attention/paged_attention_v1.cu"
+  "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/merge_attn_states.cu"
+  "csrc/attention/vertical_slash_index.cu"
+  "csrc/pos_encoding_kernels.cu"
+  "csrc/activation_kernels.cu"
+  "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
+  "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
+  "csrc/cuda_view.cu"
+  "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+  "csrc/quantization/gguf/gguf_kernel.cu"
+  "csrc/quantization/activation_kernels.cu"
+  "csrc/cuda_utils_kernels.cu"
+  "csrc/custom_all_reduce.cu"
+  "csrc/torch_bindings.cpp")
+
+# Add CUDA-specific sources
+list(APPEND VLLM_EXT_SRC
+  "csrc/quantization/awq/gemm_kernels.cu"
+  "csrc/permute_cols.cu"
+  "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
+  "csrc/quantization/fp4/nvfp4_quant_entry.cu"
+  "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+  "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
+  "csrc/cutlass_extensions/common.cpp"
+  "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+  "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
+
+# Apply the gencode flags for the full CUDA_ARCHS list to everything collected
+# so far; sources appended further down receive their own arch lists.
+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
+# Marlin kernels configuration
+# marlin arches for fp16 output
+cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+# marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+# marlin arches for fp8 input
+# - sm80 doesn't support fp8 computation
+# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+# so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
+cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+
+if(MARLIN_ARCHS)
+  # Generate Marlin kernel sources.
+  # The generator is re-run only when its MD5 hash or the arch list differs
+  # from the value recorded in the cache by the last successful run.
+  set(MARLIN_GEN_SCRIPT
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
+  file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+  list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+  set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
+
+  message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+  message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+
+  # Both operands are quoted so the comparison stays well-formed when the
+  # cache entry is missing or empty.
+  if(NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+      OR NOT "$CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}" STREQUAL "${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    # The exit code and the captured output go to separate variables so the
+    # output can never overwrite the exit code checked below.
+    execute_process(
+      COMMAND ${CMAKE_COMMAND} -E env
+      PYTHONPATH=$ENV{PYTHONPATH}
+        ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
+      RESULT_VARIABLE marlin_generation_result
+      OUTPUT_VARIABLE marlin_generation_output
+      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+    )
+
+    if(NOT marlin_generation_result EQUAL 0)
+      message(FATAL_ERROR "Marlin generation failed."
+                          " Result: \"${marlin_generation_result}\""
+                          "\nCheck the log for details: "
+                          "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
+    else()
+      # Record the hash so an unchanged script/arch combination is skipped next time
+      set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+          CACHE STRING "Last run Marlin generate script hash" FORCE)
+      message(STATUS "Marlin generation completed successfully.")
+    endif()
+  else()
+    message(STATUS "Marlin generation script has not changed, skipping generation.")
+  endif()
+
+  # fp16-output template kernels
+  file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+    CUDA_ARCHS "${MARLIN_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+  endif()
+  list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+
+  # bf16-output template kernels
+  file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+    CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+  endif()
+  list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+
+  # fp8-input template kernels, only when a matching arch was requested
+  if(MARLIN_FP8_ARCHS)
+    file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+  endif()
+
+  # Hand-written Marlin sources
+  set(MARLIN_SRCS
+     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+     "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
+     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_SRCS}"
+    CUDA_ARCHS "${MARLIN_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+  endif()
+  list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+
+  message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+else()
+  message(STATUS "Not building Marlin kernels as no compatible archs found"
+                 " in CUDA target architectures")
+endif()
+
+# AllSpark kernels: built only when one of the listed archs is targeted
+cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
+if(ALLSPARK_ARCHS)
+  set(ALLSPARK_SRCS
+    "csrc/quantization/gptq_allspark/allspark_repack.cu"
+    "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu"
+  )
+  set_gencode_flags_for_srcs(SRCS "${ALLSPARK_SRCS}" CUDA_ARCHS "${ALLSPARK_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${ALLSPARK_SRCS})
+  message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
+else()
+  message(STATUS
+    "Not building AllSpark kernels as no compatible archs found"
+    " in CUDA target architectures")
+endif()
+
+# Scaled MM 3X kernels for Hopper
+# SCALED_MM_3X_ARCHS starts out empty and accumulates every arch covered by a 3X kernel.
+set(SCALED_MM_3X_ARCHS)
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+if(SCALED_MM_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0)
+  set(SRCS
+    "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu"
+  )
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
+  # Tell the scaled_mm_c2x section these arches are already handled
+  list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+  message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
+elseif(SCALED_MM_ARCHS)
+  # Archs matched, so the only reason to get here is a too-old CUDA compiler
+  message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
+                 "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                 "later if you intend on running FP8 quantized models on "
+                 "Hopper.")
+else()
+  message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
+                 "in CUDA target architectures")
+endif()
+
+# Scaled MM 3X (Geforce Blackwell SM120) kernels
+# The arch is spelled "12.0f" for CUDA >= 13.0 and "12.0a" for older compilers.
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+  set(SRCS
+    "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+  )
+  set_gencode_flags_for_srcs(
+    SRCS "${SRCS}"
+    CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC "${SRCS}")
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
+  # Let scaled_mm_c2x know it doesn't need to build these arches
+  list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+  message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
+else()
+  if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+    message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
+                   "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+                   "later if you intend on running FP8 quantized models on "
+                   "Blackwell.")
+  else()
+    # Kernel name spelled the same way as in the other status messages
+    message(STATUS "Not building scaled_mm_c3x_sm120 as no compatible archs found "
+                   "in CUDA target architectures")
+  endif()
+endif()
+
+# Scaled MM 3X (Blackwell SM100) kernels
+# CUDA >= 13.0 uses the "f" arch spellings, older compilers the "a" spellings.
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+  set(SRCS
+    "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+    "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+  )
+  set_gencode_flags_for_srcs(
+    SRCS "${SRCS}"
+    CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC "${SRCS}")
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
+  # Let scaled_mm_c2x know it doesn't need to build these arches
+  list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+  message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
+else()
+  if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+    message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
+                   "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+                   "later if you intend on running FP8 quantized models on "
+                   "Blackwell.")
+  else()
+    # Kernel name spelled the same way as in the other status messages
+    message(STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
+                   "in CUDA target architectures")
+  endif()
+endif()
+
+# Scaled MM 2X kernels cover whatever the 3X kernels did not claim
+cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+  "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+# Drop every arch already recorded in SCALED_MM_3X_ARCHS
+list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
+if(SCALED_MM_2X_ARCHS)
+  set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
+  message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
+elseif(SCALED_MM_3X_ARCHS)
+  message(STATUS "Not building scaled_mm_c2x as all archs are already built"
+                 " for and covered by scaled_mm_c3x")
+else()
+  message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
+                "in CUDA target architectures")
+endif()
+
+# 2:4 sparse kernels (need a matching arch and a CUDA compiler >= 12.2)
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+if(SCALED_MM_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2)
+  set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
+  message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+elseif(SCALED_MM_ARCHS)
+  # Archs matched, so the compiler version is what ruled the kernels out
+  message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
+                 "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
+                 "if you intend on running FP8 sparse quantized models on Hopper.")
+else()
+  message(STATUS "Not building sparse_scaled_mm_c3x as no compatible archs found "
+                 "in CUDA target architectures")
+endif()
+
+# NVFP4 kernels, Geforce Blackwell SM120 variant
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+endif()
+if(FP4_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+  set(SRCS
+    "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+    "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
+    "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+    "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
+  )
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS
+    "-DENABLE_NVFP4_SM120=1"
+    "-DENABLE_CUTLASS_MOE_SM120=1")
+  message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+else()
+  message(STATUS "Not building NVFP4 as no compatible archs were found.")
+  # Reset FP4_ARCHS to an unset state
+  set(FP4_ARCHS)
+endif()
+
+# NVFP4 kernels, remaining Blackwell architectures
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+endif()
+if(FP4_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+  set(SRCS
+    "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+    "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
+    "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+    "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
+  )
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS
+    "-DENABLE_NVFP4_SM100=1"
+    "-DENABLE_CUTLASS_MOE_SM100=1")
+  message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+else()
+  message(STATUS "Not building NVFP4 as no compatible archs were found.")
+  # Reset FP4_ARCHS to an unset state
+  set(FP4_ARCHS)
+endif()
+
+# CUTLASS MLA kernels
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+endif()
+if(MLA_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+  set(SRCS "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${MLA_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
+  # The CUTLASS example headers are attached to the MLA source file only.
+  # NOTE(review): CUTLASS_DIR is not set in this file; confirm it is defined by the time this runs.
+  set_source_files_properties(
+    ${SRCS}
+    PROPERTIES
+      INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
+  message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
+else()
+  message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
+  # Reset MLA_ARCHS to an unset state
+  set(MLA_ARCHS)
+endif()
+
+# CUTLASS MoE kernels, Hopper variant (need a CUDA compiler >= 12.3)
+cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
+if(SCALED_MM_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+  set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
+  message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+elseif(SCALED_MM_ARCHS)
+  # Archs matched, so the compiler version is what ruled the kernels out
+  message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
+                 "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
+                 "if you intend on running FP8 quantized MoE models on Hopper.")
+else()
+  message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+                 "in CUDA target architectures.")
+endif()
+
+# CUTLASS MoE kernels, Blackwell SM100 variant (need a CUDA compiler >= 12.8)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+endif()
+if(SCALED_MM_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+  set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
+  message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+elseif(SCALED_MM_ARCHS)
+  # Archs matched, so the compiler version is what ruled the kernels out
+  message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
+                 "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
+                 "if you intend on running FP8 quantized MoE models on Blackwell.")
+else()
+  message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+                 "in CUDA target architectures.")
+endif()
+
+# MoE data kernel shared by all CUTLASS MoE kernels (needs a CUDA compiler >= 12.3)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+endif()
+if(CUTLASS_MOE_DATA_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+  set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
+elseif(CUTLASS_MOE_DATA_ARCHS)
+  # Archs matched, so the compiler version is what ruled the kernel out
+  message(STATUS "Not building moe_data as CUDA Compiler version is "
+                 "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
+                 "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
+else()
+  message(STATUS "Not building moe_data as no compatible archs found "
+                 "in CUDA target architectures.")
+endif()
+
+# Blockwise scaled group MM, Blackwell SM100 (needs a CUDA compiler >= 12.8)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+endif()
+if(SCALED_MM_ARCHS AND ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+  set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
+  message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
+elseif(SCALED_MM_ARCHS)
+  # Archs matched, so the compiler version is what ruled the kernels out
+  message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
+                 "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
+                 "if you intend on running FP8 quantized MoE models on Blackwell.")
+else()
+  message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
+                 "in CUDA target architectures")
+endif()
+
+# Machete kernels
+cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
+  # Generate Machete kernel sources.
+  # The generator is re-run only when its MD5 hash differs from the value
+  # recorded in the cache by the last successful run.
+  set(MACHETE_GEN_SCRIPT
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
+  file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
+
+  message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
+  message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
+
+  # Both operands are quoted so the comparison stays well-formed when the
+  # cache entry is missing or empty.
+  if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
+      OR NOT "$CACHE{MACHETE_GEN_SCRIPT_HASH}" STREQUAL "${MACHETE_GEN_SCRIPT_HASH}")
+    execute_process(
+      COMMAND ${CMAKE_COMMAND} -E env
+      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH}
+        ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
+      RESULT_VARIABLE machete_generation_result
+      OUTPUT_VARIABLE machete_generation_output
+      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
+    )
+
+    if (NOT machete_generation_result EQUAL 0)
+      message(FATAL_ERROR "Machete generation failed."
+                          " Result: \"${machete_generation_result}\""
+                          "\nCheck the log for details: "
+                          "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
+    else()
+      # Record the hash so an unchanged script is skipped next time
+      set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+          CACHE STRING "Last run machete generate script hash" FORCE)
+      message(STATUS "Machete generation completed successfully.")
+    endif()
+  else()
+    message(STATUS "Machete generation script has not changed, skipping generation.")
+  endif()
+
+  # Add machete generated sources
+  file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
+  list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
+
+  # forward compatible
+  set_gencode_flags_for_srcs(
+    SRCS "${MACHETE_GEN_SOURCES}"
+    CUDA_ARCHS "${MACHETE_ARCHS}")
+
+  # The hand-written entry point is appended without a per-file arch list here
+  list(APPEND VLLM_EXT_SRC
+    csrc/quantization/machete/machete_pytorch.cu)
+
+  message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
+else()
+  if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+      AND MACHETE_ARCHS)
+    message(STATUS "Not building Machete kernels as CUDA Compiler version is "
+                   "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                   "later if you intend on running w4a16 quantized models on "
+                   "Hopper.")
+  else()
+    message(STATUS "Not building Machete kernels as no compatible archs "
+                   "found in CUDA target architectures")
+  endif()
+endif()
+
+# W4A8 kernels (need a matching arch and a CUDA compiler >= 12.0)
+cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
+  set(SRCS
+     "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+     "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+     "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+     )
+
+  set_gencode_flags_for_srcs(
+    SRCS "${SRCS}"
+    CUDA_ARCHS "${W4A8_ARCHS}")
+
+  list(APPEND VLLM_EXT_SRC "${SRCS}")
+
+  message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
+else()
+  if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+      AND W4A8_ARCHS)
+    # The hint names w4a8, the quantization scheme this section builds
+    message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
+                   "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+                   "later if you intend on running w4a8 quantized models on "
+                   "Hopper.")
+  else()
+    message(STATUS "Not building W4A8 kernels as no compatible archs "
+                   "found in CUDA target architectures")
+  endif()
+endif()
+
+# Hadacore kernels: added only when a matching arch is targeted (no message otherwise)
+cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+if(HADACORE_ARCHS)
+  set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+  set_gencode_flags_for_srcs(SRCS "${SRCS}" CUDA_ARCHS "${HADACORE_ARCHS}")
+  list(APPEND VLLM_EXT_SRC ${SRCS})
+  message(STATUS "Building hadacore")
+endif()
+
+# Sources of the MOE extension for CUDA builds
+set(VLLM_MOE_EXT_SRC
+  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/topk_softmax_kernels.cu"
+  "csrc/moe/moe_wna16.cu"
+  "csrc/moe/grouped_topk_kernels.cu"
+)
+
+# The permute/unpermute kernels keep their own variable and are appended to the list
+set(MOE_PERMUTE_SRC
+  "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+  "csrc/moe/moe_permute_unpermute_op.cu"
+)
+list(APPEND VLLM_MOE_EXT_SRC ${MOE_PERMUTE_SRC})
+
+set_gencode_flags_for_srcs(SRCS "${VLLM_MOE_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}")
+
+# Marlin MOE kernels
+# note that we always set `use_atomic_add=False` for moe marlin now,
+# so we don't need 9.0 for bf16 atomicAdd PTX
+cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+# moe marlin arches for fp8 input
+# - sm80 doesn't support fp8 computation
+# - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
+# so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
+cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+if (MARLIN_MOE_ARCHS)
+  # Generate Marlin MOE kernel sources.
+  # The generator is re-run only when its MD5 hash or the arch list differs
+  # from the value recorded in the cache by the last successful run.
+  set(MOE_MARLIN_GEN_SCRIPT
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
+  file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+  list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+  set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
+
+  message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+  message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+
+  # Both operands are quoted so the comparison stays well-formed when the
+  # cache entry is missing or empty.
+  if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+      OR NOT "$CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}" STREQUAL "${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+    execute_process(
+      COMMAND ${CMAKE_COMMAND} -E env
+      PYTHONPATH=$ENV{PYTHONPATH}
+        ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
+      RESULT_VARIABLE moe_marlin_generation_result
+      OUTPUT_VARIABLE moe_marlin_generation_output
+      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+    )
+
+    if (NOT moe_marlin_generation_result EQUAL 0)
+      message(FATAL_ERROR "Marlin MOE generation failed."
+                          " Result: \"${moe_marlin_generation_result}\""
+                          "\nCheck the log for details: "
+                          "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
+    else()
+      # Record the hash so an unchanged script/arch combination is skipped next time
+      set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+          CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
+      message(STATUS "Marlin MOE generation completed successfully.")
+    endif()
+  else()
+    message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
+  endif()
+
+  # Generated sm80 kernels plus the hand-written ops file
+  file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+  list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_MOE_SRC}"
+    CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    set_source_files_properties(${MARLIN_MOE_SRC}
+      PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+  endif()
+  list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
+
+  # fp8-input kernels, only when a matching arch was requested
+  if (MARLIN_MOE_FP8_ARCHS)
+    file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_FP8_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+  endif()
+
+  message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+else()
+  message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
+                 " in CUDA target architectures")
+endif()
+
+# Cumem allocator extension for CUDA
+set(VLLM_CUMEM_EXT_SRC "csrc/cumem_allocator.cpp")
+
+set_gencode_flags_for_srcs(SRCS "${VLLM_CUMEM_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}")
+
+# The allocator links against the CUDA driver library
+list(APPEND CUMEM_LIBS CUDA::cuda_driver)
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -0,0 +1,146 @@
+#
+# HIP/ROCm-specific configuration for vLLM
+#
+
+set(VLLM_GPU_LANG "HIP")
+
+# Set the supported torch version for ROCm
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
+
+# Warn if the torch version doesn't match what we expect
+if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
+    Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
+  message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
+    "expected for ROCm build, saw ${Torch_VERSION} instead.")
+endif()
+
+# Enable HIP language support
+# Importing torch recognizes and sets up some HIP/ROCm configuration but does
+# not let cmake recognize .hip files. In order to get cmake to understand the
+# .hip extension automatically, HIP must be enabled explicitly.
+enable_language(HIP)
+
+# Supported AMD GPU architectures
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
+
+# Override GPU architectures detected by cmake/torch and filter by supported versions
+override_gpu_arches(VLLM_GPU_ARCHES
+  ${VLLM_GPU_LANG}
+  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
+
+# Query torch for additional GPU compilation flags
+get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
+
+# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
+set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
+
+# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
+# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
+set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
+
+# Set up CUTLASS for HIP builds
+SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+
+# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+set(CUTLASS_REVISION "v4.2.1")
+
+# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
+if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+  set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+endif()
+
+if(VLLM_CUTLASS_SRC_DIR)
+  if(NOT IS_ABSOLUTE "${VLLM_CUTLASS_SRC_DIR}")  # IS_ABSOLUTE tests a path string; it does not dereference a variable name
+    get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+  endif()
+  message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+  FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+else()
+  FetchContent_Declare(
+      cutlass
+      GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+      # Please keep this in sync with CUTLASS_REVISION line above.
+      GIT_TAG ${CUTLASS_REVISION}
+      GIT_PROGRESS TRUE
+
+      # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
+      # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
+      # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
+      GIT_SHALLOW TRUE
+  )
+endif()
+FetchContent_MakeAvailable(cutlass)
+
+# Set HIP extension sources
+set(VLLM_EXT_SRC
+  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+  "csrc/cache_kernels.cu"
+  "csrc/attention/paged_attention_v1.cu"
+  "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/merge_attn_states.cu"
+  "csrc/attention/vertical_slash_index.cu"
+  "csrc/pos_encoding_kernels.cu"
+  "csrc/activation_kernels.cu"
+  "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
+  "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
+  "csrc/cuda_view.cu"  # Note: Keeping this name for compatibility
+  "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+  "csrc/quantization/gguf/gguf_kernel.cu"
+  "csrc/quantization/activation_kernels.cu"
+  "csrc/cuda_utils_kernels.cu"  # Note: Keeping this name for compatibility
+  "csrc/custom_all_reduce.cu"
+  "csrc/torch_bindings.cpp")
+
+# Add QuickReduce kernels for ROCm
+list(APPEND VLLM_EXT_SRC
+  "csrc/custom_quickreduce.cu"
+)
+
+# MOE extension sources for ROCm
+set(VLLM_MOE_EXT_SRC
+  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/topk_softmax_kernels.cu")
+
+# Cumem allocator for ROCm
+set(VLLM_CUMEM_EXT_SRC
+  "csrc/cumem_allocator.cpp")
+
+# Link against rocm driver library for cumem
+# Prefer an absolute path to libamdhip64.so inside ${ROCM_PATH}/lib if available,
+# otherwise fall back to linking by name "amdhip64".
+find_library(AMDHIP64_LIB
+  NAMES amdhip64 libamdhip64.so
+  PATHS ${ROCM_PATH}/lib
+  NO_DEFAULT_PATH)
+if(AMDHIP64_LIB)
+  message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+  list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+else()
+  message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+  list(APPEND CUMEM_LIBS amdhip64)
+endif()
+
+# ROCm-specific extension sources
+set(VLLM_ROCM_EXT_SRC
+  "csrc/rocm/torch_bindings.cpp"
+  "csrc/rocm/skinny_gemms.cu"
+  "csrc/rocm/attention.cu")
+
+# Define ROCm-specific extension target
+define_extension_target(
+  _rocm_C
+  DESTINATION vllm
+  LANGUAGE ${VLLM_GPU_LANG}
+  SOURCES ${VLLM_ROCM_EXT_SRC}
+  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+  ARCHITECTURES ${VLLM_GPU_ARCHES}
+  USE_SABI 3
+  WITH_SOABI)
--- a/cmake/musa.cmake
+++ b/cmake/musa.cmake
@@ -0,0 +1,125 @@
+#
+# MUSA-specific configuration for vLLM
+#
+
+set(VLLM_GPU_LANG "MUSA")
+
+# Set the supported torch version for MUSA
+set(TORCH_SUPPORTED_VERSION_MUSA "2.7.1")
+
+# Warn if the torch version doesn't match what we expect
+if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_MUSA})
+  message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_MUSA} "
+    "expected for MUSA build, saw ${Torch_VERSION} instead.")
+endif()
+
+# Find MUSA package
+list(APPEND CMAKE_MODULE_PATH $ENV{MUSA_HOME}/cmake)
+find_package(MUSA REQUIRED)
+
+# Extract and filter MUSA architectures
+# MUSA architectures are similar to CUDA, but may have different naming
+message(STATUS "MUSA target architectures: ${MUSA_ARCHS}")
+
+# Filter the target architectures by the supported archs
+# MUSA SDK 4.3.0 supports the following architectures
+set(MUSA_SUPPORTED_ARCHS "21;22")
+
+# Override GPU architectures detected by cmake/torch
+override_gpu_arches(VLLM_GPU_ARCHES
+  ${VLLM_GPU_LANG}
+  "${MUSA_SUPPORTED_ARCHS}")
+
+# Query torch for additional GPU compilation flags
+get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
+
+# Set nvcc parallelism (MUSA compiler also supports --threads flag)
+if(NVCC_THREADS)
+  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
+endif()
+
+# Set MUSA include flags for CXX compiler
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I$ENV{MUSA_HOME}/include")
+
+# Set up CUTLASS for MUSA builds
+# MUSA is compatible with CUDA, so we can use the same CUTLASS configuration
+SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+
+# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+set(CUTLASS_REVISION "v4.2.1")
+
+# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
+if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
+  set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
+endif()
+
+if(VLLM_CUTLASS_SRC_DIR)
+  if(NOT IS_ABSOLUTE "${VLLM_CUTLASS_SRC_DIR}")  # IS_ABSOLUTE tests a path string; it does not dereference a variable name
+    get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
+  endif()
+  message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
+  FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
+else()
+  FetchContent_Declare(
+      cutlass
+      GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+      # Please keep this in sync with CUTLASS_REVISION line above.
+      GIT_TAG ${CUTLASS_REVISION}
+      GIT_PROGRESS TRUE
+
+      # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
+      # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
+      # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
+      GIT_SHALLOW TRUE
+  )
+endif()
+FetchContent_MakeAvailable(cutlass)
+
+# Set MUSA extension sources
+# These are the same source files as CUDA, since MUSA is compatible with CUDA code
+set(VLLM_EXT_SRC
+  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+  "csrc/cache_kernels.cu"
+  "csrc/attention/paged_attention_v1.cu"
+  "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/merge_attn_states.cu"
+  "csrc/attention/vertical_slash_index.cu"
+  "csrc/pos_encoding_kernels.cu"
+  "csrc/activation_kernels.cu"
+  "csrc/layernorm_kernels.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
+  "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
+  "csrc/cuda_view.cu"  # Note: Keeping this name for compatibility
+  "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
+  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
+  "csrc/quantization/gguf/gguf_kernel.cu"
+  "csrc/quantization/activation_kernels.cu"
+  "csrc/cuda_utils_kernels.cu"  # Note: Keeping this name for compatibility
+  "csrc/custom_all_reduce.cu"
+  "csrc/torch_bindings.cpp")
+
+# MOE extension sources for MUSA
+set(VLLM_MOE_EXT_SRC
+  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/topk_softmax_kernels.cu")
+
+list(APPEND VLLM_MOE_EXT_SRC
+  "csrc/moe/moe_wna16.cu"
+  "csrc/moe/grouped_topk_kernels.cu")
+
+set(MOE_PERMUTE_SRC
+    "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+    "csrc/moe/moe_permute_unpermute_op.cu")
+
+list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+
+# Cumem allocator for MUSA
+set(VLLM_CUMEM_EXT_SRC
+  "csrc/cumem_allocator.cpp")
+
+# Link against musa driver library for cumem
+list(APPEND CUMEM_LIBS musa::musa_driver)
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -1,6 +1,5 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "vendors/functions.h"
+

 #include <cmath>

--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -17,9 +17,7 @@
 * limitations under the License.
 */

-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "../vendors/functions.h"
 #include <algorithm>

 #include "attention_dtypes.h"
@@ -27,9 +25,7 @@
 #include "../cuda_compat.h"

 #ifdef USE_ROCM
-  #include <hip/hip_bf16.h>
  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
-typedef __hip_bfloat16 __nv_bfloat16;
 #else
  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
@@ -23,16 +23,8 @@
 #include "attention_generic.cuh"
 #include "dtype_float32.cuh"

-#ifndef USE_ROCM
-  #include <cuda_bf16.h>
-  #include <cuda_fp16.h>
-#else
-  #include <hip/hip_bf16.h>
-  #include <hip/hip_fp16.h>

-typedef __hip_bfloat162 __nv_bfloat162;
-typedef __hip_bfloat16 __nv_bfloat16;
-#endif
+#include "../vendors/functions.h"

 #include <stdint.h>

--- a/csrc/attention/dtype_float16.cuh
+++ b/csrc/attention/dtype_float16.cuh
@@ -23,9 +23,7 @@
 #include "attention_generic.cuh"
 #include "dtype_float32.cuh"

-#ifdef USE_ROCM
-  #include <hip/hip_fp16.h>
-#endif
+#include "../vendors/functions.h"

 #include <stdint.h>

--- a/csrc/attention/dtype_fp8.cuh
+++ b/csrc/attention/dtype_fp8.cuh
@@ -3,11 +3,7 @@
 #include "attention_generic.cuh"

 #include <stdint.h>
-#ifdef ENABLE_FP8
-  #ifndef USE_ROCM
-    #include <cuda_fp8.h>
-  #endif  // USE_ROCM
-#endif    // ENABLE_FP8
+#include "../vendors/functions.h"

 namespace vllm {

--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -1,7 +1,5 @@
 #include <optional>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "../vendors/functions.h"
 #include <algorithm>

 #include "attention_dtypes.h"
--- a/csrc/attention/vertical_slash_index.cu
+++ b/csrc/attention/vertical_slash_index.cu
@@ -3,9 +3,7 @@

 #include <assert.h>

-#include <cuda.h>
-
-#include <torch/all.h>
+#include "../vendors/functions.h"

 __device__ int64_t save_blocks(int* block_offset, int64_t range_start,
                               int64_t range_end, int64_t block_size,
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,7 +1,7 @@
 #pragma once

-#include <torch/all.h>
-#include <c10/util/Optional.h>
+#include "vendors/functions.h"
+

 #include <map>
 #include <vector>
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -1,8 +1,6 @@
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAException.h>
-#include <c10/util/Optional.h>
+#include "vendors/functions.h"
+
+

 #include "cuda_utils.h"
 #include "cuda_compat.h"
@@ -19,10 +17,7 @@
 #include <cassert>
 #include <cfloat>

-#ifdef USE_ROCM
-  #include <hip/hip_bf16.h>
-typedef __hip_bfloat16 __nv_bfloat16;
-#endif
+

 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping) {
--- a/csrc/cub_helpers.h
+++ b/csrc/cub_helpers.h
@@ -1,18 +1,4 @@
 #pragma once

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-  #if CUB_VERSION >= 200800
-    #include <cuda/std/functional>
-using CubAddOp = cuda::std::plus<>;
-using CubMaxOp = cuda::maximum<>;
-  #else   // if CUB_VERSION < 200800
-using CubAddOp = cub::Sum;
-using CubMaxOp = cub::Max;
-  #endif  // CUB_VERSION
-#else
-  #include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-using CubAddOp = hipcub::Sum;
-using CubMaxOp = hipcub::Max;
-#endif  // USE_ROCM
+#include "vendors/functions.h"
+
--- a/csrc/cuda_utils_kernels.cu
+++ b/csrc/cuda_utils_kernels.cu
@@ -1,8 +1,5 @@
 #include "cuda_utils.h"
-#ifdef USE_ROCM
-  #include <hip/hip_runtime.h>
-  #include <hip/hip_runtime_api.h>
-#endif
+#include "vendors/functions.h"

 int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
  // Return the cached value on subsequent calls
--- a/csrc/cuda_view.cu
+++ b/csrc/cuda_view.cu
@@ -1,6 +1,5 @@
-#include <torch/all.h>
-#include <torch/cuda.h>
-#include <cuda_runtime.h>
+#include "vendors/functions.h"
+

 // This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
 // memory, and that UVA (Unified Virtual Addressing) is enabled.
--- a/csrc/cumem_allocator_compat.h
+++ b/csrc/cumem_allocator_compat.h
@@ -104,6 +104,6 @@ CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) {
 ////////////////////////////////////////
 // Import CUDA headers for NVIDIA GPUs
 ////////////////////////////////////////
-  #include <cuda_runtime_api.h>
-  #include <cuda.h>
+#include "vendors/functions.h"
+
 #endif
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -1,7 +1,5 @@
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
+#include "vendors/functions.h"
+

 #include "custom_all_reduce.cuh"

--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -1,13 +1,7 @@
 #pragma once

-#include <cuda.h>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
+#include "vendors/functions.h"

-#if defined(USE_ROCM)
-typedef __hip_bfloat16 nv_bfloat16;
-#endif

 #include <iostream>
 #include <array>
--- a/csrc/custom_all_reduce_test.cu
+++ b/csrc/custom_all_reduce_test.cu
@@ -11,8 +11,8 @@
 * To run:
 * mpirun --allow-run-as-root -np 8 ./custom_all_reduce_test
 */
-#include <cuda.h>
-#include <curand_kernel.h>
+#include "vendors/functions.h"
+
 #include <stdio.h>
 #include <stdlib.h>

--- a/csrc/custom_quickreduce.cu
+++ b/csrc/custom_quickreduce.cu
@@ -1,7 +1,5 @@
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
+#include "vendors/functions.h"
+

 #ifdef USE_ROCM

--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -4,7 +4,8 @@
 */
 #pragma once

-#include <torch/all.h>
+#include "vendors/functions.h"
+

 // Need a special dispatch case macro since we will nest the FP8 dispatch.
 // Instead of the usual 'scalar_t', this names the dispatched type 'fp8_t'.
--- a/csrc/fused_qknorm_rope_kernel.cu
+++ b/csrc/fused_qknorm_rope_kernel.cu
@@ -15,11 +15,10 @@
 */

 #include <cmath>
-#include <cuda_runtime.h>
 #include <type_traits>

-#include <torch/cuda.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "vendors/functions.h"
+

 #include "cuda_compat.h"
 #include "dispatch_utils.h"
--- a/csrc/launch_bounds_utils.h
+++ b/csrc/launch_bounds_utils.h
@@ -1,6 +1,7 @@
 #pragma once

-#include <cuda_runtime_api.h>
+#include "vendors/functions.h"
+
 #include <algorithm>

 // maximum blocks per SM cap
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -4,8 +4,8 @@
 #include "core/batch_invariant.hpp"
 #include "quantization/vectorization_utils.cuh"

-#include <torch/cuda.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "vendors/functions.h"
+

 namespace vllm {

--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -12,8 +12,8 @@
 #include "core/batch_invariant.hpp"
 #include "quantization/vectorization_utils.cuh"

-#include <torch/cuda.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "vendors/functions.h"
+

 namespace vllm {

--- a/csrc/mamba/mamba_ssm/selective_scan.h
+++ b/csrc/mamba/mamba_ssm/selective_scan.h
@@ -6,12 +6,7 @@

 #pragma once

-#ifndef USE_ROCM
-    #include <cuda_bf16.h>
-#else
-    #include <hip/hip_bf16.h>
-#endif
-#include <cuda_fp16.h>
+#include "vendors/functions.h"
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 struct SSMParamsBase {
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -1,27 +1,9 @@
 // clang-format off
 // adapted from https://github.com/state-spaces/mamba/blob/main/csrc/selective_scan/selective_scan_fwd_kernel.cuh
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+
+#include "vendors/functions.h"
+
 #include "selective_scan.h"
-
-#include <c10/util/BFloat16.h>
-#include <c10/util/Half.h>
-#ifdef USE_ROCM
-    #include <c10/hip/HIPException.h>  // For C10_HIP_CHECK and C10_HIP_KERNEL_LAUNCH_CHECK
-#else
-    #include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
-#endif
-
-#ifndef USE_ROCM
-    #include <cub/block/block_load.cuh>
-    #include <cub/block/block_store.cuh>
-    #include <cub/block/block_scan.cuh>
-#else
-    #include <hipcub/hipcub.hpp>
-    namespace cub = hipcub;
-#endif
-
 #include "selective_scan.h"
 #include "static_switch.h"

--- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
+++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp
@@ -1,7 +1,4 @@
-#include <ATen/ATen.h>
-#include <ATen/Parallel.h>
-#include <torch/all.h>
-
+#include "../vendors/functions.h"
 // _dyn_quant_matmul_4bit is only available on AArch64.
 #if defined(__aarch64__)
  #include <ATen/ops/_dyn_quant_matmul_4bit.h>
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -17,13 +17,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include <cuda/std/limits>
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
+
+#include "../vendors/functions.h"
 namespace cg = cooperative_groups;

 namespace vllm {
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -1,10 +1,4 @@
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cub/cub.cuh>
-
-#include <ATen/ATen.h>
-#include <ATen/cuda/Atomic.cuh>
+#include "../vendors/functions.h"

 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include "../vendors/functions.h"

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -1,6 +1,4 @@
-#include <c10/core/ScalarType.h>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
+#include "../vendors/functions.h"
 #include "permute_unpermute_kernels/moe_permute_unpermute_kernel.h"
 #include "permute_unpermute_kernels/dispatch.h"
 #include "core/registration.h"
--- a/csrc/moe/moe_wna16.cu
+++ b/csrc/moe/moe_wna16.cu
@@ -1,11 +1,7 @@

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
+#include "../vendors/functions.h"
+

-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
 #include "moe_wna16_utils.h"

 #define DIVIDE(x, size) (((x) + (size) - 1) / (size))
--- a/csrc/moe/moe_wna16_utils.h
+++ b/csrc/moe/moe_wna16_utils.h
@@ -1,6 +1,5 @@

-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include "../vendors/functions.h"

 template <typename scalar_t>
 class ScalarType {};
--- a/csrc/moe/permute_unpermute_kernels/dispatch.h
+++ b/csrc/moe/permute_unpermute_kernels/dispatch.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <cuda_fp8.h>
+#include "vendors/functions.h"
 #define MOE_SWITCH(TYPE, ...)                                     \
  at::ScalarType _st = ::detail::scalar_type(TYPE);               \
  switch (_st) {                                                  \
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h
@@ -2,12 +2,11 @@
 // reference from tensorrt_llm moe kernel implementation archive in
 // https://github.com/BBuf/tensorrt-llm-moe/tree/master

-#include <c10/core/ScalarType.h>
 #include <torch/all.h>
 #include "dispatch.h"
-#include <cub/cub.cuh>
-#include <cub/device/device_radix_sort.cuh>
-#include <cub/util_type.cuh>
+
+
+#include "../../vendors/functions.h"
 #include "cutlass/numeric_size.h"
 #include "cutlass/array.h"

--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -17,9 +17,9 @@
 * limitations under the License.
 */
 #include <type_traits>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "../vendors/functions.h"
+
+
 #include "../cuda_compat.h"
 #include "../cub_helpers.h"

--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -1,6 +1,7 @@
 #pragma once

 #include <optional>
+
 #include <torch/library.h>

 #include "core/scalar_type.hpp"
--- a/csrc/permute_cols.cu
+++ b/csrc/permute_cols.cu
@@ -1,9 +1,6 @@
-#include <torch/all.h>
+#include "vendors/functions.h"

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>

-#include <cuda_fp16.h>

 static constexpr int default_threads = 256;
 static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -1,6 +1,5 @@
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "vendors/functions.h"
+

 #include "cuda_compat.h"
 #include "dispatch_utils.h"
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -1,6 +1,5 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
+
+#include "../vendors/functions.h"

 #include <cmath>
 #include "core/math.hpp"
@@ -9,29 +8,8 @@

 #include "quantization/w8a8/fp8/common.cuh"

-#include <c10/util/Float8_e4m3fn.h>

-#ifndef USE_ROCM
-  #include <cuda_bf16.h>
-  #include <cuda_fp16.h>
-  #include <cuda_fp8.h>
-#else
-  #include <hip/hip_bf16.h>
-  #include <hip/hip_fp16.h>
-  #include <hip/hip_fp8.h>

-typedef __hip_bfloat162 __nv_bfloat162;
-typedef __hip_bfloat16 __nv_bfloat16;
-typedef __hip_bfloat16_raw __nv_bfloat16_raw;
-  #if defined(HIP_FP8_TYPE_OCP)
-typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
-typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
-  #else
-// ROCm 6.2 fallback: only *_fnuz types exist
-typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
-typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
-  #endif
-#endif

 #include "core/registration.h"
 namespace vllm {
--- a/csrc/quantization/awq/gemm_kernels.cu
+++ b/csrc/quantization/awq/gemm_kernels.cu
@@ -7,12 +7,11 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
 }
 */

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
+

 #include "dequantize.cuh"

-#include <cuda_fp16.h>
+#include "../../vendors/functions.h"

 namespace vllm {
 namespace awq {
--- a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
+++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh
@@ -1,9 +1,7 @@
 // see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
 #pragma once

-#include <cuda.h>
-#include <torch/all.h>
-#include <c10/cuda/CUDAStream.h>
+#include "../../vendors/functions.h"

 #include "core/scalar_type.hpp"
 #include "cutlass/bfloat16.h"
--- a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu
@@ -14,9 +14,8 @@
 #include "cutlass/util/mixed_dtype_utils.hpp"

 // vllm includes
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include "../../vendors/functions.h"
+
 #include "cutlass_extensions/torch_utils.hpp"
 #include "cutlass_extensions/common.hpp"

--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -3,9 +3,10 @@
 //   https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu
 //

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+
+#include "../../vendors/functions.h"
+
+
 #include "cutlass_extensions/torch_utils.hpp"
 #include "w4a8_utils.cuh"

@@ -26,7 +27,6 @@
 #include "cutlass_extensions/common.hpp"
 #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"

-#include <cuda_runtime.h>

 namespace vllm::cutlass_w4a8 {

--- a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu
@@ -1,7 +1,10 @@
 #include "w4a8_utils.cuh"

+
+#include "../../vendors/functions.h"
+
+
 #include <array>
-#include <cuda_runtime.h>
 #include <cstdio>

 namespace vllm::cutlass_w4a8_utils {
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -14,15 +14,10 @@
 * limitations under the License.
 */

-#include <torch/all.h>

-#include <cuda_runtime_api.h>
-#include <cuda_runtime.h>
+#include "../../vendors/functions.h"

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <cuda_fp8.h>
+
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -19,9 +19,9 @@
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAStream.h>
+#include "../../vendors/functions.h"
+
+
 #include "cutlass_extensions/common.hpp"

 #include "cute/tensor.hpp"
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@@ -14,15 +14,10 @@
 * limitations under the License.
 */

-#include <torch/all.h>

-#include <cuda_runtime_api.h>
-#include <cuda_runtime.h>
+#include "../../vendors/functions.h"

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <cuda_fp8.h>
+
 #include "dispatch_utils.h"

 #include "nvfp4_utils.cuh"
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -14,7 +14,9 @@
 * limitations under the License.
 */

-#include <torch/all.h>
+
+#include "../../vendors/functions.h"
+

 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -14,15 +14,9 @@
 * limitations under the License.
 */

-#include <torch/all.h>
+#include "../../vendors/functions.h"

-#include <cuda_runtime_api.h>
-#include <cuda_runtime.h>

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <cuda_fp8.h>
 #include "dispatch_utils.h"

 #include "cuda_utils.h"
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
@@ -14,8 +14,10 @@
 * limitations under the License.
 */

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
+
+#include "../../vendors/functions.h"
+
+
 #include "cutlass_extensions/common.hpp"

 #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
@@ -14,10 +14,7 @@
 * limitations under the License.
 */

-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "../../vendors/functions.h"

 #include "cutlass_extensions/common.hpp"

--- a/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu
@@ -14,10 +14,8 @@
 * limitations under the License.
 */

-#include <torch/all.h>
+#include "../../vendors/functions.h"

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>

 #include "cutlass_extensions/common.hpp"

--- a/csrc/quantization/fp4/nvfp4_utils.cuh
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@@ -16,8 +16,8 @@

 #pragma once

-#include <cuda_runtime.h>
-#include <cuda_fp8.h>
+#include "../../vendors/functions.h"
+

 #define ELTS_PER_THREAD 8

--- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
+++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
@@ -1,6 +1,7 @@

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+
+#include "../../vendors/functions.h"
+

 #include "../../dispatch_utils.h"
 #include "layernorm_utils.cuh"
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@@ -1,8 +1,5 @@
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
+#include "../../vendors/functions.h"

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>

 #include "../../cuda_compat.h"
 #include "dispatch_utils.h"
--- a/csrc/quantization/gptq/matrix_view.cuh
+++ b/csrc/quantization/gptq/matrix_view.cuh
@@ -6,8 +6,8 @@ https://github.com/turboderp/exllama
 #ifndef _matrix_view_cuh
 #define _matrix_view_cuh

-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
+#include "../../vendors/functions.h"
+

 #include "qdq_util.cuh"

--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -6,11 +6,8 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include <cstdint>
 #include <cstdio>

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
+#include "../../vendors/functions.h"
+

 #include "compat.cuh"
 #include "matrix_view.cuh"
--- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
+++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu
@@ -1,5 +1,5 @@
 #include "allspark_utils.cuh"
-#include <torch/all.h>
+#include "../../vendors/functions.h"
 #include "core/registration.h"
 #include <cublas_v2.h>

--- a/csrc/quantization/gptq_allspark/allspark_repack.cu
+++ b/csrc/quantization/gptq_allspark/allspark_repack.cu
@@ -1,5 +1,5 @@
 #include "allspark_utils.cuh"
-#include <torch/all.h>
+#include "../../vendors/functions.h"
 #include "core/registration.h"

 namespace allspark {
--- a/csrc/quantization/gptq_allspark/allspark_utils.cuh
+++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh
@@ -1,11 +1,6 @@
 #pragma once

-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include "../../vendors/functions.h"
 #include <iostream>
 #include "../gptq_marlin/marlin_dtypes.cuh"
 using marlin::MarlinScalarType2;
--- a/csrc/quantization/gptq_marlin/marlin.cuh
+++ b/csrc/quantization/gptq_marlin/marlin.cuh
@@ -1,12 +1,7 @@
 #pragma once

-#include <torch/all.h>
+#include "../../vendors/functions.h"

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
 #include <iostream>

 #ifndef MARLIN_NAMESPACE_NAME
--- a/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
+++ b/csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu
@@ -11,15 +11,9 @@ Redistribution and use in source and binary forms, with or without modification,
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ***********/

-#include <torch/all.h>
+#include "../../../vendors/functions.h"
 #include <stdint.h>
-#include <cuda_runtime.h>
 #include <mma.h>
-#include <cuda/annotated_ptr>
-#include <c10/cuda/CUDAException.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>

 #include "core/registration.h"
 #include "dispatch_utils.h"
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -1,8 +1,6 @@
 #pragma once

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include "../../vendors/functions.h"

 // clang-format off
 // The cutlass include order matters (annoyingly)
--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -17,8 +17,7 @@

 #pragma once
 #include "base.h"
-#include <cudaTypedefs.h>
-
+#include "../../../../vendors/functions.h"
 namespace marlin_24 {

 // On CUDA earlier than 12.5, the ordered_metadata version of this instruction
--- a/csrc/quantization/vectorization.cuh
+++ b/csrc/quantization/vectorization.cuh
@@ -4,8 +4,7 @@
 */

 // Include both AMD and NVIDIA fp8 types to avoid circular import
-#include <c10/util/Float8_e4m3fnuz.h>
-#include <c10/util/Float8_e4m3fn.h>
+#include "../vendors/functions.h"

 namespace vllm {

--- a/csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/cutlass_gemm_caller.cuh
@@ -2,9 +2,9 @@

 // clang-format will break include orders
 // clang-format off
-#include <torch/all.h>
+#include "../../../../vendors/functions.h"
+

-#include <ATen/cuda/CUDAContext.h>

 #include "cutlass/cutlass.h"

--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_helper.hpp
@@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include "../../../../vendors/functions.h"
 #include "cuda_utils.h"
 #include "cutlass_extensions/common.hpp"

--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_kernels.hpp
@@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include "../../../../vendors/functions.h"

 namespace vllm {

--- a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu
@@ -1,11 +1,10 @@
 #include "core/registration.h"

-#include <torch/all.h>
-#include <cutlass/arch/arch.h>
+// #include <cutlass/arch/arch.h>
+
+#include "../../../../vendors/functions.h"
+

-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/cuda/CUDAStream.h>

 #include "cute/tensor.hpp"
 #include "cutlass/tensor_ref.h"
--- a/csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
+++ b/csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh
@@ -1,8 +1,6 @@
 #pragma once

-#include <cuda.h>
-#include <torch/all.h>
-#include <c10/cuda/CUDAStream.h>
+#include "../../../../vendors/functions.h"

 #include "core/scalar_type.hpp"
 #include "cutlass/bfloat16.h"
--- a/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
+++ b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x.cuh
@@ -1,5 +1,7 @@
 #pragma once

+#include "../../../../vendors/functions.h"
+
 #include "cutlass/cutlass.h"

 #include "cutlass/gemm/collective/collective_builder.hpp"
--- a/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu
@@ -1,7 +1,5 @@
-#include <cudaTypedefs.h>

-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include "../../../../vendors/functions.h"

 #include "cutlass/cutlass.h"
 #include "grouped_mm_c3x.cuh"
--- a/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu
@@ -1,7 +1,5 @@
-#include <cudaTypedefs.h>

-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include "../../../../vendors/functions.h"

 #include "cutlass/cutlass.h"
 #include "grouped_mm_c3x.cuh"
--- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu
@@ -1,7 +1,4 @@
-#include <cudaTypedefs.h>
-
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include "../../../../vendors/functions.h"

 #include <iostream>

--- a/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cuh
@@ -2,7 +2,7 @@
 #include <stddef.h>
 #include <torch/all.h>

-#include <ATen/cuda/CUDAContext.h>
+#include "../../../vendors/functions.h"

 // clang-format will break include orders
 // clang-format off
--- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
@@ -1,7 +1,7 @@
-#include <cudaTypedefs.h>

-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include "../../../vendors/functions.h"
+
+

 #include "cutlass_extensions/common.hpp"

--- a/csrc/quantization/w8a8/fp8/common.cu
+++ b/csrc/quantization/w8a8/fp8/common.cu
@@ -2,8 +2,9 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "quantization/vectorization_utils.cuh"
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/Exceptions.h>
+#include "../../../vendors/functions.h"
+
+

 namespace vllm {

--- a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu
@@ -1,5 +1,4 @@
-#include <ATen/cuda/CUDAContext.h>
-
+#include "../../../vendors/functions.h"
 #include "quantization/w8a8/per_token_group_quant_8bit.h"

 #include <cmath>
--- a/csrc/quantization/w8a8/int8/per_token_group_quant.cu
+++ b/csrc/quantization/w8a8/int8/per_token_group_quant.cu
@@ -1,6 +1,4 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
-
+#include "../../../vendors/functions.h"
 #include "quantization/w8a8/per_token_group_quant_8bit.h"

 void per_token_group_quant_int8(const torch::Tensor& input,
--- a/csrc/quantization/w8a8/int8/scaled_quant.cu
+++ b/csrc/quantization/w8a8/int8/scaled_quant.cu
@@ -1,6 +1,6 @@
-#include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "../../../vendors/functions.h"
+
+

 #include <cmath>

--- a/csrc/quantization/w8a8/per_token_group_quant_8bit.h
+++ b/csrc/quantization/w8a8/per_token_group_quant_8bit.h
@@ -1,6 +1,5 @@
 #pragma once
-#include <torch/all.h>
-
+#include "../../vendors/functions.h"
 // 8-bit per-token-group quantization helper used by both FP8 and INT8
 void per_token_group_quant_8bit(const torch::Tensor& input,
                                torch::Tensor& output_q,
--- a/csrc/quickreduce/base.h
+++ b/csrc/quickreduce/base.h
@@ -1,9 +1,7 @@
 #pragma once

 #include <cstdint>
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
-#include <hip/hip_bf16.h>
+#include "../vendors/functions.h"

 #define __quickreduce_device_inline__ __device__ __forceinline__
 #define __quickreduce_launch_bounds_two_shot__ __launch_bounds__(256, 4)
@@ -11,8 +9,7 @@

 namespace quickreduce {

-typedef __hip_bfloat16 nv_bfloat16;
-typedef __hip_bfloat162 nv_bfloat162;
+

 using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
 using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
--- a/csrc/quickreduce/quick_reduce.h
+++ b/csrc/quickreduce/quick_reduce.h
@@ -1,7 +1,7 @@
 #pragma once

 #include <vector>
-#include <hip/hip_runtime.h>
+#include "../vendors/functions.h"
 #include "quick_reduce_impl.cuh"

 #define HIP_CHECK(err)                                                     \
--- a/csrc/quickreduce/quick_reduce_impl.cuh
+++ b/csrc/quickreduce/quick_reduce_impl.cuh
@@ -1,6 +1,6 @@
 #pragma once

-#include <hip/hip_runtime.h>
+#include "../vendors/functions.h"
 #include "base.h"

 namespace quickreduce {
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@@ -1,13 +1,7 @@
 #include "dispatch_utils.h"

-#include <torch/cuda.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "vendors/functions.h"

-#ifndef USE_ROCM
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/hipcub.hpp>
-#endif

 namespace vllm {

--- a/csrc/sparse/cutlass/sparse_compressor_c3x.cuh
+++ b/csrc/sparse/cutlass/sparse_compressor_c3x.cuh
@@ -2,7 +2,7 @@

 // clang-format will break include orders
 // clang-format off
-#include <cudaTypedefs.h>
+#include "../../vendors/functions.h"

 #if defined CUDA_VERSION && CUDA_VERSION >= 12020
 #include "sparse_scaled_mm_c3x.cuh"
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -1,6 +1,6 @@
 // clang-format will break include orders
 // clang-format off
-#include <cudaTypedefs.h>
+#include "../../vendors/functions.h"

 #if defined CUDA_VERSION && CUDA_VERSION >= 12020
 #include "sparse_scaled_mm_c3x.cuh"
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -2,11 +2,8 @@

 // clang-format will break include orders
 // clang-format off
-#include <cudaTypedefs.h>
+#include "../../vendors/functions.h"

-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>

 #include "cuda_utils.h"

--- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
@@ -1,7 +1,5 @@
-#include <cudaTypedefs.h>
+#include "../../vendors/functions.h"

-#include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>

 #include "cutlass_extensions/common.hpp"

--- a/csrc/type_convert.cuh
+++ b/csrc/type_convert.cuh
@@ -1,17 +1,7 @@
 #pragma once

-#include <torch/all.h>
+#include "vendors/functions.h"

-#ifndef USE_ROCM
-  #include <cuda_bf16.h>
-  #include <cuda_fp16.h>
-#else
-  #include <hip/hip_bf16.h>
-  #include <hip/hip_fp16.h>
-
-using __nv_bfloat16 = __hip_bfloat16;
-using __nv_bfloat162 = __hip_bfloat162;
-#endif

 namespace vllm {
 /* Converter structs for the conversion from torch types to HIP/CUDA types,
--- a/csrc/vendors/cuda.h
+++ b/csrc/vendors/cuda.h
@@ -0,0 +1,50 @@
+#pragma once
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/cub.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/util_type.cuh>
+#include <c10/core/ScalarType.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda/std/limits>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <cuda/annotated_ptr>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Optional.h>
+// <cuda_fp8.h> is needed by files that used to include it unconditionally, so
+// it is enabled from CUDA 11.8, the first toolkit release that provides it.
+#if CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif // CUDART_VERSION >= 11080
+// Compatibility names used when CUDART_VERSION < 11020: newer identifiers are mapped onto older ones.
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
+// Sum/max functor aliases: cuda::std::plus / cuda::maximum when CUB_VERSION >= 200800, cub::Sum / cub::Max otherwise.
+#if CUB_VERSION >= 200800
+    #include <cuda/std/functional>
+using CubAddOp = cuda::std::plus<>;
+using CubMaxOp = cuda::maximum<>;
+  #else   // if CUB_VERSION < 200800
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+#endif  // CUB_VERSION
--- a/csrc/vendors/functions.h
+++ b/csrc/vendors/functions.h
@@ -0,0 +1,16 @@
+// Single include point for vendor-specific GPU headers. Pulls in musa.h when
+// USE_MUSA is defined, otherwise hip.h for USE_HIP, otherwise cuda.h for
+// USE_CUDA; any other configuration stops the build with an explicit error.
+// NOTE(review): code replaced by this header keyed its AMD paths on USE_ROCM;
+// confirm the build also defines USE_HIP for those targets.
+#pragma once
+
+#if defined(USE_MUSA)
+#include "musa.h"
+#elif defined(USE_HIP)
+#include "hip.h"
+#elif defined(USE_CUDA)
+#include "cuda.h"
+#else
+#error "No supported GPU backend: define USE_MUSA, USE_HIP, or USE_CUDA"
+#endif
--- a/csrc/vendors/hip.h
+++ b/csrc/vendors/hip.h
@@ -0,0 +1,307 @@
+#pragma once
+#include <torch/all.h>
+
+#define HIP_DISABLE_WARP_SYNC_BUILTINS 1
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bf16.h>
+
+#include <c10/core/ScalarType.h>
+
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/hip/HIPException.h> 
+
+#include <hipcub/hipcub.hpp>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e4m3fn.h>
+namespace cub = hipcub;
+// CUDA spellings (__nv_bfloat16, __nv_bfloat162) of the HIP bfloat16 types.
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+// cuBLAS and CUDA driver constants mapped to HIP; CU_CHECK reports a failing call through TORCH_CHECK.
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_16BF HIPBLAS_R_16B
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT
+#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
+#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { TORCH_CHECK(false, "HipVMM Failure: ", hipGetErrorString(err)); }}
+// Warp intrinsics plus cuBLAS / CUDA runtime / driver / graph names mapped onto their HIP equivalents.
+#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
+#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define __all_sync(mask, var) __all(var)
+#define __any_sync(mask, var) __any(var)
+#define cublasStrsmBatched hipblasStrsmBatched
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cublasOperation_t hipblasOperation_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaHostRegister hipHostRegister
+#define cudaHostRegisterPortable hipHostRegisterPortable
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+#define cudaHostUnregister hipHostUnregister
+#define cudaLaunchHostFunc hipLaunchHostFunc
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamPerThread hipStreamPerThread
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResult hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define cudaFuncSetAttribute hipFuncSetAttribute
+#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
+#define __trap() do { abort(); __builtin_unreachable(); } while(0)
+// Remaining cuBLAS status codes (CUBLAS_STATUS_SUCCESS is already mapped earlier in this header).
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+
+// Plain dereference stands in for __ldg; parenthesized so member or index access on the result binds correctly.
+#define __ldg(arg) (*(arg))
+
+// cuBLAS compute-type names: HIPBLAS_COMPUTE_* when HIP_VERSION >= 60500000, otherwise the HIPBLAS_R_* datatype constants.
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#define cublasComputeType_t hipblasComputeType_t
+#define cudaDataType_t hipDataType
+#else
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define cublasComputeType_t hipblasDatatype_t
+#define cudaDataType_t hipblasDatatype_t
+#endif // HIP_VERSION >= 60500000
+// Stop the build unless __HIP_PLATFORM_AMD__ is defined.
+#if !defined(__HIP_PLATFORM_AMD__)
+#error "The HIP backend supports only AMD targets"
+#endif // !defined(__HIP_PLATFORM_AMD__)
+// NOTE(review): defined unconditionally, so any code gated on __CUDA_ARCH__ sees 1300 under HIP — confirm this is intended.
+#define __CUDA_ARCH__ 1300
+// GPU family macros (GCN*, CDNA*, RDNA*) derived from the __gfx*__ / __GFX*__ target macros.
+#if defined(__gfx900__) || defined(__gfx906__)
+#define GCN5
+#endif // defined(__gfx900__) || defined(__gfx906__)
+
+#if defined(__gfx803__)
+#define GCN4
+#endif // defined(__gfx803__)
+
+#if defined(GCN5) || defined(GCN4)
+#define GCN
+#endif // defined(GCN5) || defined(GCN4)
+
+#if defined(__gfx942__)
+#define CDNA3
+#endif // defined(__gfx942__)
+
+#if defined(__gfx90a__)
+#define CDNA2
+#endif // defined(__gfx90a__)
+
+#if defined(__gfx908__)
+#define CDNA1
+#endif // defined(__gfx908__)
+
+#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
+#define CDNA // For the entire family
+#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
+
+#if defined(__GFX12__)
+#define RDNA4
+#endif // defined(__GFX12__)
+
+#if defined(__GFX11__)
+#define RDNA3
+#endif // defined(__GFX11__)
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#if defined(__gfx1010__) || defined(__gfx1012__)
+#define RDNA1
+#endif // defined(__gfx1010__) || defined(__gfx1012__)
+
+#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
+#define RDNA // For the entire family
+#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(RDNA1)
+// Fallback so __has_builtin(x) can be used in #if when it is not already defined; it then evaluates to 0.
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+// Unprefixed nv_bfloat16 / nv_bfloat162 spellings of the HIP bfloat16 types.
+typedef __hip_bfloat16 nv_bfloat16;
+typedef __hip_bfloat162 nv_bfloat162;
+// Four-element int8 / uint8 vector types declared with the ext_vector_type attribute.
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+// Per-byte saturating signed subtraction (a - b) over the four int8 lanes packed in each int.
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t lhs = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t rhs = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t diff = __builtin_elementwise_sub_sat(lhs, rhs);
+    return reinterpret_cast<const int &>(diff);
+#else
+    int8x4_t diff;
+#pragma unroll
+    for (int lane = 0; lane < 4; ++lane) {
+        int16_t wide = lhs[lane] - rhs[lane];
+        if (wide > std::numeric_limits<int8_t>::max()) wide = std::numeric_limits<int8_t>::max();
+        else if (wide < std::numeric_limits<int8_t>::min()) wide = std::numeric_limits<int8_t>::min();
+        diff[lane] = wide;
+    }
+    return reinterpret_cast<int &>(diff);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+// NOTE(review): forwards to the saturating __vsubss4; CUDA documents __vsub4 as plain per-byte subtraction — confirm callers accept saturation.
+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+// Per-byte equality compare: each result byte is 0xff where the bytes of a and b match, else 0x00.
+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
+    const uint8x4_t& lhs = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& rhs = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int packed;
+    uint8x4_t& out = reinterpret_cast<uint8x4_t&>(packed);
+#pragma unroll
+    for (int lane = 0; lane < 4; ++lane) {
+        out[lane] = (lhs[lane] != rhs[lane]) ? 0x00 : 0xff;
+    }
+    return packed;
+}
+// Per-byte inequality compare: each result byte is 0xff where the bytes of a and b differ, else 0x00.
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& lhs = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& rhs = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int packed;
+    uint8x4_t& out = reinterpret_cast<uint8x4_t&>(packed);
+#pragma unroll
+    for (int lane = 0; lane < 4; ++lane) {
+        out[lane] = (lhs[lane] != rhs[lane]) ? 0xff : 0x00;
+    }
+    return packed;
+}
+// CUDA-named aliases for HIP bf16/fp8 types. The fp8 pair falls back to the
+// *_fnuz variants when HIP_FP8_TYPE_OCP is not defined.
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat16_raw __nv_bfloat16_raw;
+#if defined(HIP_FP8_TYPE_OCP)
+typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
+typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
+#else
+// ROCm 6.2 fallback: only *_fnuz types exist
+typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
+typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
+#endif  // defined(HIP_FP8_TYPE_OCP)
+// Sum/max functor aliases, defined on both fp8 paths; hipcub is included and aliased to cub earlier in this header.
+using CubAddOp = hipcub::Sum;
+using CubMaxOp = hipcub::Max;
--- a/csrc/vendors/musa.h
+++ b/csrc/vendors/musa.h
@@ -0,0 +1,181 @@
+// All header files
+
+#pragma once
+#include <torch/all.h>
+
+#include <musa_runtime.h>
+#include <musa.h>
+#include <mublas.h>
+#include <musa_bf16.h>
+#include <musa_fp16.h>
+#include <c10/core/ScalarType.h>
+#include <c10/musa/MUSAStream.h>
+#include <musa/std/limits>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#include <cudaTypedefs>
+#include <c10/musa/MUSAException.h>
+#include <c10/musa/MUSAGuard.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/util/Optional.h>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/cub.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/util_type.cuh>
+#include <ATen/musa/MUSAContext.h>
+
+// Vendor-neutral names for CUB's sum and max functors.
+typedef cub::Sum CubAddOp;
+typedef cub::Max CubMaxOp;
+
+// ---------------------------------------------------------------------------
+// CUDA / cuBLAS spellings mapped onto their MUSA / muBLAS counterparts.
+// Macros may refer to one another (e.g. CUBLAS_COMPUTE_16F -> CUDA_R_16F ->
+// MUSA_R_16F); expansion happens at the point of use, so definition order
+// within this table is irrelevant.
+// ---------------------------------------------------------------------------
+
+// cuBLAS constants
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
+#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH
+#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT
+#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER
+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N MUBLAS_OP_N
+#define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT
+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
+
+// Data-type tags
+#define CUDA_R_16BF MUSA_R_16BF
+#define CUDA_R_16F MUSA_R_16F
+#define CUDA_R_32F MUSA_R_32F
+#define cudaDataType_t musaDataType_t
+
+// cuBLAS type names
+#define cublasComputeType_t cudaDataType_t
+#define cublasHandle_t mublasHandle_t
+#define cublasOperation_t mublasOperation_t
+#define cublasStatus_t mublasStatus_t
+
+// cuBLAS entry points
+#define cublasCreate mublasCreate
+#define cublasDestroy mublasDestroy
+#define cublasGemmBatchedEx mublasGemmBatchedEx
+#define cublasGemmEx mublasGemmEx
+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
+#define cublasGetStatusString mublasGetStatusString
+#define cublasSetMathMode mublasSetMathMode
+#define cublasSetStream mublasSetStream
+#define cublasSgemm mublasSgemm
+#define cublasStrsmBatched mublasStrsmBatched
+
+// Runtime: status and error names
+#define cudaError_t musaError_t
+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
+#define cudaGetErrorString musaGetErrorString
+#define cudaGetLastError musaGetLastError
+#define cudaSuccess musaSuccess
+
+// Runtime: device names
+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
+#define cudaDeviceProp musaDeviceProp
+#define cudaDeviceSynchronize musaDeviceSynchronize
+#define cudaGetDevice musaGetDevice
+#define cudaGetDeviceCount musaGetDeviceCount
+#define cudaGetDeviceProperties musaGetDeviceProperties
+#define cudaSetDevice musaSetDevice
+
+// Runtime: host-function launch and occupancy names
+#define cudaLaunchHostFunc musaLaunchHostFunc
+#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
+
+// Runtime: event names
+#define cudaEvent_t musaEvent_t
+#define cudaEventCreateWithFlags musaEventCreateWithFlags
+#define cudaEventDestroy musaEventDestroy
+#define cudaEventDisableTiming musaEventDisableTiming
+#define cudaEventRecord musaEventRecord
+#define cudaEventSynchronize musaEventSynchronize
+
+// Runtime: memory names
+#define cudaFree musaFree
+#define cudaFreeHost musaFreeHost
+#define cudaHostRegister musaHostRegister
+#define cudaHostRegisterPortable musaHostRegisterPortable
+#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
+#define cudaHostUnregister musaHostUnregister
+#define cudaMalloc musaMalloc
+#define cudaMallocHost musaMallocHost
+#define cudaMallocManaged musaMallocManaged
+#define cudaMemGetInfo musaMemGetInfo
+#define cudaMemcpy musaMemcpy
+#define cudaMemcpy2DAsync musaMemcpy2DAsync
+#define cudaMemcpyAsync musaMemcpyAsync
+#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
+#define cudaMemcpyKind musaMemcpyKind
+#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
+#define cudaMemset musaMemset
+#define cudaMemsetAsync musaMemsetAsync
+
+// Runtime: stream names
+#define cudaStream_t musaStream_t
+#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
+#define cudaStreamDestroy musaStreamDestroy
+#define cudaStreamFireAndForget musaStreamFireAndForget
+#define cudaStreamNonBlocking musaStreamNonBlocking
+#define cudaStreamPerThread musaStreamPerThread
+#define cudaStreamSynchronize musaStreamSynchronize
+#define cudaStreamWaitEvent musaStreamWaitEvent
+
+// Additional mappings for MUSA virtual memory pool
+
+// Driver-style (CU* / cu*) type names
+#define CUdevice MUdevice
+#define CUdeviceptr MUdeviceptr
+#define CUmemAccessDesc MUmemAccessDesc
+#define CUmemAllocationProp MUmemAllocationProp
+#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
+
+// Driver-style constants. Note that the MUSA attribute is spelled
+// ..._VIRTUAL_ADDRESS_MANAGEMENT_... where CUDA says ..._VIRTUAL_MEMORY_....
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
+#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
+
+// Driver-style entry points
+#define cuDeviceGet muDeviceGet
+#define cuDeviceGetAttribute muDeviceGetAttribute
+#define cuMemAddressFree muMemAddressFree
+#define cuMemAddressReserve muMemAddressReserve
+#define cuMemCreate muMemCreate
+#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
+#define cuMemMap muMemMap
+#define cuMemRelease muMemRelease
+#define cuMemSetAccess muMemSetAccess
+#define cuMemUnmap muMemUnmap
+
+// Runtime-style (cuda*) names kept in this section
+#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
+#define cudaFuncSetAttribute musaFuncSetAttribute
+#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
+#define make_cudaExtent make_musaExtent
+#define make_cudaPitchedPtr make_musaPitchedPtr
+
+// Additional mappings for MUSA graphs
+
+// Status and error names
+#define CUDA_SUCCESS MUSA_SUCCESS
+#define CUresult MUresult
+#define cuGetErrorString muGetErrorString
+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
+
+// Graph handle, descriptor and enumerator names
+#define cudaGraph_t musaGraph_t
+#define cudaGraphExec_t musaGraphExec_t
+#define cudaGraphExecUpdateResult musaGraphExecUpdateResult
+#define cudaGraphNode_t musaGraphNode_t
+#define cudaGraphNodeType musaGraphNodeType
+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
+#define cudaKernelNodeParams musaKernelNodeParams
+
+// Graph entry points
+#define cudaGraphDestroy musaGraphDestroy
+#define cudaGraphExecDestroy musaGraphExecDestroy
+#define cudaGraphExecUpdate musaGraphExecUpdate
+#define cudaGraphGetNodes musaGraphGetNodes
+#define cudaGraphInstantiate musaGraphInstantiate
+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
+#define cudaGraphLaunch musaGraphLaunch
+#define cudaGraphNodeGetType musaGraphNodeGetType
+
+// Stream-capture names
+#define cudaStreamBeginCapture musaStreamBeginCapture
+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
+#define cudaStreamEndCapture musaStreamEndCapture
+
+// Occupancy query
+#define cudaOccupancyMaxActiveBlocksPerMultiprocessor musaOccupancyMaxActiveBlocksPerMultiprocessor
+// Replacement for CUDA's __ldg on this backend: a plain dereference of the
+// pointer argument. The whole expansion is parenthesized so that postfix
+// operators applied to the call (e.g. `__ldg(p).x` or `__ldg(p)[i]`) act on
+// the loaded value instead of binding to `(p)` before the dereference.
+#define __ldg(arg) (*(arg))
+// CUDA-spelled aliases for the MUSA bfloat16 and FP8 types. Each alias is
+// declared exactly once.
+typedef __mt_bfloat16 nv_bfloat16;
+typedef __mt_bfloat16 __nv_bfloat16;
+typedef __mt_bfloat16_raw __nv_bfloat16_raw;
+typedef __mt_bfloat162 nv_bfloat162;
+typedef __mt_bfloat162 __nv_bfloat162;
+typedef __mt_fp8_e4m3 __nv_fp8_e4m3;
+typedef __mt_fp8x4_e4m3 __nv_fp8x4_e4m3;