sync from b7516

2026-01-16 11:16:14 +08:00
parent f4ae4cc7da
commit 6ee41dd9e3
380 changed files with 18435 additions and 38806 deletions
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -15,7 +15,6 @@ if (CUDAToolkit_FOUND)
        # 80     == Ampere, asynchronous data loading, faster tensor core instructions
        # 86     == RTX 3000, needs CUDA v11.1
        # 89     == RTX 4000, needs CUDA v11.8
-        # 120    == Blackwell, needs CUDA v12.8, FP4 tensor cores
        #
        # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
        # XX-real    == compile CUDA code as device code for this specific architecture
@@ -35,69 +34,12 @@ if (CUDAToolkit_FOUND)
            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
                list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
            endif()
-
-            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
-                # The CUDA architecture 120f-virtual would in principle work for Blackwell support
-                #     but the newly added "f" suffix conflicted with a preexising regex for validating CUDA architectures in CMake.
-                # So either a recent CMake version or one with the backported fix is needed.
-                # The following versions should work:
-                #   - CMake >= v3.31.8 && CMake < v4.0.0
-                #   - CMake >= v4.0.2
-                # This is NOT documented in the CMake release notes,
-                #     check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
-                # However, the architectures 120a-real and 121a-real should work with basically any CMake version and
-                #     until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
-                list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
-            endif()
-            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
-                list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
-            endif()
        endif()
    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

    enable_language(CUDA)

-    # TODO: Remove once CCCL 3.2 has been released and bundled with CUDA Toolkit
-    if (GGML_CUDA_CUB_3DOT2)
-        include(FetchContent)
-
-        FetchContent_Declare(
-            CCCL
-            GIT_REPOSITORY https://github.com/nvidia/cccl.git
-            GIT_TAG        v3.2.0-rc2
-            GIT_SHALLOW    TRUE
-        )
-
-        FetchContent_MakeAvailable(CCCL)
-    endif()
-
-    # Replace any plain 12X CUDA architectures with their "architecture-specific" equivalents 12Xa.
-    # 12X is forwards-compatible, 12Xa is not.
-    # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
-    # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
-    # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
-    foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
-        set(FIXED_ARCHS "")
-        foreach(ARCH IN LISTS ${ARCHS})
-            if (ARCH MATCHES "^12[0-9](-real|-virtual)?$")
-                string(REGEX REPLACE "^(12[0-9])((-real|-virtual)?)$" "\\1a\\2" FIXED_ARCH ${ARCH})
-                message(STATUS "Replacing ${ARCH} in ${ARCHS} with ${FIXED_ARCH}")
-                list(APPEND FIXED_ARCHS "${FIXED_ARCH}")
-            else()
-                list(APPEND FIXED_ARCHS "${ARCH}")
-            endif()
-        endforeach()
-        set(${ARCHS} ${FIXED_ARCHS})
-    endforeach()
-
-    # If we try to compile a "native" build it will use the 12X architectures and fail.
-    # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
-    # But if at the time of the build no GPUs are connected at all CMAKE_CUDA_ARCHITECTURES will contain garbage that we should not use.
-    if (CMAKE_CUDA_ARCHITECTURES STREQUAL "native" AND CMAKE_CUDA_ARCHITECTURES_NATIVE MATCHES "^[0-9]+(a|f)?(-real|-virtual)?(;[0-9]+(a|f)?(-real|-virtual)?|;)*$")
-        set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NATIVE})
-    endif()
-    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} CMAKE_CUDA_ARCHITECTURES_NATIVE=${CMAKE_CUDA_ARCHITECTURES_NATIVE}")
-
    file(GLOB   GGML_HEADERS_CUDA "*.cuh")
    list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")

@@ -160,9 +102,6 @@ if (CUDAToolkit_FOUND)
            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
            target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas)
        else ()
-            if (GGML_CUDA_CUB_3DOT2)
-                target_link_libraries(ggml-cuda PRIVATE  CCCL::CCCL)
-            endif()
            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "10.1")
                target_link_libraries(ggml-cuda PRIVATE  CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
            else()
@@ -170,9 +109,6 @@ if (CUDAToolkit_FOUND)
            endif()
        endif()
    else()
-        if (GGML_CUDA_CUB_3DOT2)
-            target_link_libraries(ggml-cuda PRIVATE  CCCL::CCCL)
-        endif()
        target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas)
    endif()

@@ -241,10 +177,6 @@ if (CUDAToolkit_FOUND)

    if (NOT MSVC)
        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
-    else()
-        # CCCL 3.2 onwards will require a cpp-standard-compliant preprocessor for MSVC
-        # https://github.com/NVIDIA/cccl/pull/6827
-        list(APPEND CUDA_CXX_FLAGS /Zc:preprocessor)
    endif()

    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -22,13 +22,13 @@ static __global__ void init_offsets(int * offsets, const int ncols, const int nr
 }

 #ifdef GGML_CUDA_USE_CUB
-void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
-                              const float *    x,
-                              int *            dst,
-                              const int        ncols,
-                              const int        nrows,
-                              ggml_sort_order  order,
-                              cudaStream_t     stream) {
+static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
+                                     const float *    x,
+                                     int *            dst,
+                                     const int        ncols,
+                                     const int        nrows,
+                                     ggml_sort_order  order,
+                                     cudaStream_t     stream) {
    ggml_cuda_pool_alloc<int>   temp_indices_alloc(pool, ncols * nrows);
    ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
    ggml_cuda_pool_alloc<int>   offsets_alloc(pool, nrows + 1);
@@ -49,49 +49,28 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
    size_t temp_storage_bytes = 0;

    if (order == GGML_SORT_ORDER_ASC) {
-        if (nrows == 1) {
-            DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
-                                       temp_indices, dst,                                  // values (indices)
-                                       ncols, 0, sizeof(float) * 8, stream);
-        } else {
-            DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
-                                           temp_indices, dst,                                  // values (indices)
-                                           ncols * nrows, nrows,  // num items, num segments
-                                           d_offsets, d_offsets + 1, stream);
-        }
+        DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
+                                            temp_indices, dst,                                  // values (indices)
+                                            ncols * nrows, nrows,                            // num items, num segments
+                                            d_offsets, d_offsets + 1, 0, sizeof(float) * 8,  // all bits
+                                            stream);
    } else {
-        if (nrows == 1) {
-            DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
-                                                 temp_indices, dst,                                  // values (indices)
-                                                 ncols, 0, sizeof(float) * 8, stream);
-        } else {
-            DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
-                                                     dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
-        }
+        DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
+                                                      dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0,
+                                                      sizeof(float) * 8, stream);
    }

    ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
    void *                        d_temp_storage = temp_storage_alloc.get();

    if (order == GGML_SORT_ORDER_ASC) {
-        if (nrows == 1) {
-            DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
-                                       temp_indices, dst,  // values (indices)
-                                       ncols, 0, sizeof(float) * 8, stream);
-        } else {
-            DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
-                                           ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
-        }
+        DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
+                                            ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8,
+                                            stream);
    } else {
-        if (nrows == 1) {
-            DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
-                                                 temp_indices, dst,                                  // values (indices)
-                                                 ncols, 0, sizeof(float) * 8, stream);
-        } else {
-            DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
-                                                     temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
-                                                     stream);
-        }
+        DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
+                                                      temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
+                                                      0, sizeof(float) * 8, stream);
    }
 }
 #endif  // GGML_CUDA_USE_CUB
@@ -162,12 +141,12 @@ static int next_power_of_2(int x) {
    return n;
 }

-void argsort_f32_i32_cuda_bitonic(const float *   x,
-                                  int *           dst,
-                                  const int       ncols,
-                                  const int       nrows,
-                                  ggml_sort_order order,
-                                  cudaStream_t    stream) {
+static void argsort_f32_i32_cuda_bitonic(const float *   x,
+                                         int *           dst,
+                                         const int       ncols,
+                                         const int       nrows,
+                                         ggml_sort_order order,
+                                         cudaStream_t    stream) {
    // bitonic sort requires ncols to be power of 2
    const int ncols_pad = next_power_of_2(ncols);

--- a/ggml/src/ggml-cuda/argsort.cuh
+++ b/ggml/src/ggml-cuda/argsort.cuh
@@ -1,19 +1,3 @@
 #include "common.cuh"

 void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-#ifdef GGML_CUDA_USE_CUB
-void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
-                              const float *    x,
-                              int *            dst,
-                              const int        ncols,
-                              const int        nrows,
-                              ggml_sort_order  order,
-                              cudaStream_t     stream);
-#endif  // GGML_CUDA_USE_CUB
-void argsort_f32_i32_cuda_bitonic(const float *   x,
-                                  int *           dst,
-                                  const int       ncols,
-                                  const int       nrows,
-                                  ggml_sort_order order,
-                                  cudaStream_t    stream);
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -50,10 +50,6 @@
 #define GGML_CUDA_CC_TURING          750
 #define GGML_CUDA_CC_AMPERE          800
 #define GGML_CUDA_CC_ADA_LOVELACE    890
-// While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see
-// https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
-#define GGML_CUDA_CC_BLACKWELL       1200
-#define GGML_CUDA_CC_RUBIN           1300
 #define GGML_CUDA_CC_OFFSET_AMD      0x1000000
 #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
 #define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
@@ -250,10 +246,6 @@ static const char * cu_get_error_str(CUresult err) {
 #define AMPERE_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL && __CUDA_ARCH__ < GGML_CUDA_CC_RUBIN
-#    define BLACKWELL_MMA_AVAILABLE
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL
-
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -262,10 +254,6 @@ static const char * cu_get_error_str(CUresult err) {
 #define FLASH_ATTN_AVAILABLE
 #endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)

-#if defined(TURING_MMA_AVAILABLE)
-#define LDMATRIX_TRANS_AVAILABLE
-#endif // defined(TURING_MMA_AVAILABLE)
-
 static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
@@ -328,11 +316,6 @@ static bool cp_async_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
 }

-static bool blackwell_mma_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_BLACKWELL &&
-           ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_RUBIN;
-}
-
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
 #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
@@ -530,86 +513,6 @@ static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

-enum class block_reduce_method {
-    MAX,
-    SUM,
-};
-
-template<block_reduce_method method_t, typename T>
-struct block_reduce_policy;
-
-template <typename T, typename... Ts>
-inline constexpr bool is_any = (std::is_same_v<T, Ts> || ...);
-
-template<typename...>
-inline constexpr bool ggml_cuda_dependent_false_v = false;
-
-template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
-    static __device__ T reduce(T val) {
-        if constexpr(is_any<T, float, float2, half2, int>) {
-            return warp_reduce_sum(val);
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
-        }
-    }
-
-    static __device__ T sentinel() {
-        if constexpr (std::is_same_v<T, float>) {
-            return 0.0f;
-        } else if constexpr (std::is_same_v<T, float2>) {
-            return make_float2(0.0f, 0.0f);
-        } else if constexpr (std::is_same_v<T, half2>) {
-            return make_half2(0.0f, 0.0f);
-        } else if constexpr (std::is_same_v<T, int>) {
-            return 0;
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
-        }
-    }
-};
-
-template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
-    static __device__ T reduce(T val) {
-        if constexpr (is_any<T, float, half2>) {
-            return warp_reduce_max(val);
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
-        }
-    }
-
-    static __device__ T sentinel() {
-        if constexpr (std::is_same_v<T, float>) {
-            return -INFINITY;
-        } else if constexpr (std::is_same_v<T, half2>) {
-            return make_half2(-INFINITY, -INFINITY);
-        } else {
-            static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
-        }
-    }
-};
-
-template <block_reduce_method reduce_method_t, const unsigned int block_size_template = 0, typename T>
-static __device__ T block_reduce(T val, T * shared_vals) {
-    val                           = block_reduce_policy<reduce_method_t, T>::reduce(val);
-    const unsigned int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-    if (block_size > WARP_SIZE) {
-        assert((block_size <= 1024) && (block_size % WARP_SIZE) == 0);
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            shared_vals[warp_id] = val;
-        }
-        __syncthreads();
-        val = block_reduce_policy<reduce_method_t, T>::sentinel();
-        if (lane_id < (static_cast<int>(block_size) / WARP_SIZE)) {
-            val = shared_vals[lane_id];
-        }
-        return block_reduce_policy<reduce_method_t, T>::reduce(val);
-    }
-
-    return val;
-}
-
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE

@@ -798,28 +701,6 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 #endif // CUDART_VERSION >= 12050
 }

-__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
-    const uint8_t sign_bit = (x < 0.0f) << 3;
-    float         ax       = fabsf(x) * e;
-
-    // Positive LUT
-    static constexpr float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
-
-    int   best_i   = 0;
-    float best_err = fabsf(ax - pos_lut[0]);
-
-#pragma unroll
-    for (int i = 1; i < 8; ++i) {
-        const float err = fabsf(ax - pos_lut[i]);
-        if (err < best_err) {
-            best_err = err;
-            best_i   = i;
-        }
-    }
-
-    return static_cast<uint8_t>(best_i | sign_bit);
-}
-
 // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
 // Precompute mp (m' in the paper) and L such that division
 // can be computed using a multiply (high 32b of 64b result)
@@ -1034,16 +915,15 @@ struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
-        int     cc;                             // compute capability
-        int     nsm;                            // number of streaming multiprocessors
-        size_t  smpb;                           // max. shared memory per block
-        size_t  smpbo;                          // max. shared memory per block (with opt-in)
-        bool    integrated;                     // Device is integrated as opposed to discrete
-        bool    vmm;                            // virtual memory support
-        size_t  vmm_granularity;                // granularity of virtual memory
+        int     cc;                 // compute capability
+        int     nsm;                // number of streaming multiprocessors
+        size_t  smpb;               // max. shared memory per block
+        size_t  smpbo;              // max. shared memory per block (with opt-in)
+        bool    integrated;         // Device is integrated as opposed to discrete
+        bool    vmm;                // virtual memory support
+        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
-        int     warp_size;                      // Number of threads in a dispatch
-        bool    supports_cooperative_launch;    // whether cooperative launch is supported
+        int     warp_size;          // Number of threads in a dispatch
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
@@ -1120,7 +1000,7 @@ struct ggml_tensor_extra_gpu {
 #define USE_CUDA_GRAPH
 #endif

-struct ggml_cuda_graph_node_properties {
+struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
@@ -1143,27 +1023,12 @@ struct ggml_cuda_graph {
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
+    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
+    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
-    std::vector<ggml_cuda_graph_node_properties> props;
-
-    void record_update(bool use_graph, bool update_required) {
-        if (use_graph && update_required) {
-            number_consecutive_updates++;
-        } else {
-            number_consecutive_updates = 0;
-        }
-        if (number_consecutive_updates >= 4) {
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
-            disable_due_to_too_many_updates = true;
-        }
-    }
-
-    bool is_enabled() const {
-        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
-        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
-    }
+    std::vector<ggml_graph_node_properties> ggml_graph_properties;
 #endif
 };

--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -12,11 +12,11 @@ const int CUDA_CPY_BLOCK_NM = 8;     // block size of 3rd dimension if available
 const int CUDA_CPY_BLOCK_ROWS = 8;   // block dimension for marching through rows

 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne,
-                                  const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-                                  const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-                                  const int64_t nb12, const int64_t nb13) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
+                                  const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                  const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
@@ -40,10 +40,10 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
 }

 template <typename T>
-static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int64_t ne,
-                               const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-                               const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-                               const int64_t nb12, const int64_t nb13) {
+static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
+                               const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                               const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13) {

    const T* src = reinterpret_cast<const T*>(cx);
    T* dst = reinterpret_cast<T*>(cdst);
@@ -117,60 +117,60 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
 }

 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne,
-                                 const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-                                 const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-                                 const int64_t nb12, const int64_t nb13) {
-    const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
 }

 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne,
-                                 const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-                                 const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
-                                 const int64_t nb12, const int64_t nb13) {
-    const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk;
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

-    const int64_t i03 = i/(ne00 * ne01 * ne02);
-    const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
-    const int64_t i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
-    const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
-    const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

-    const int64_t i13 = i/(ne10 * ne11 * ne12);
-    const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
 }

 template<typename src_t, typename dst_t>
 static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
-    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
@@ -188,20 +188,19 @@ static void ggml_cpy_scalar_contiguous_cuda(
 cudaStream_t stream) {

    const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    GGML_ASSERT(num_blocks < UINT_MAX);
    cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne);
 }

 template<typename src_t, typename dst_t, bool transposed = false>
 static void ggml_cpy_scalar_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    if (transposed) {
        GGML_ASSERT(ne == ne00*ne01*ne02);  // ne[3] is 1 assumed
-        int64_t ne00n, ne01n, ne02n;
+        int ne00n, ne01n, ne02n;
        if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
            ne00n = ne00;
            ne01n = ne01;
@@ -212,159 +211,143 @@ static void ggml_cpy_scalar_cuda(
            ne02n = 1;
        }

-        int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
-        int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D;
-        int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM;
-        GGML_ASSERT(grid_x < UINT_MAX);
-        GGML_ASSERT(grid_y < USHRT_MAX);
-        GGML_ASSERT(grid_z < USHRT_MAX);
-        dim3 dimGrid(grid_x, grid_y, grid_z);
+        dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
+                      (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
+                      (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
        dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
        cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
            (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
    } else {
-        const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-        GGML_ASSERT(num_blocks < UINT_MAX);
+        const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
        cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
            (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
    }
 }

 static void ggml_cpy_f32_q8_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
-    const int64_t num_blocks = ne / QK8_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q8_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q4_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
-    const int64_t num_blocks = ne / QK4_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q4_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q4_1_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
-    const int64_t num_blocks = ne / QK4_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q4_1_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q5_0_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
-    const int64_t num_blocks = ne / QK5_0;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q5_0_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q5_1_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
-    const int64_t num_blocks = ne / QK5_1;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_q5_1_f32_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02,
-    const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
-    const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02,
+    const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12,
+    const int nb10, const int nb11, const int nb12, const int nb13,
    cudaStream_t stream) {
-    const int64_t num_blocks = ne;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_iq4_nl_cuda(
-    const char * cx, char * cdst, const int64_t ne,
-    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
-    const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) {
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
-    const int64_t num_blocks = ne / QK4_NL;
-    GGML_ASSERT(num_blocks < UINT_MAX);
+    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
@@ -373,6 +356,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));

+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -5,7 +5,7 @@
 #include "ggml.h"

 #ifdef GGML_CUDA_USE_CUB
-#   include <cub/cub.cuh>
+#   include <cub/device/device_scan.cuh>
 #endif // GGML_CUDA_USE_CUB

 template<typename T, int BLOCK_SIZE>
@@ -16,14 +16,12 @@ static __global__ void cumsum_cub_kernel(
        const int64_t  s01, const int64_t  s02, const int64_t  s03,
        const int64_t   s1,  const int64_t   s2,  const int64_t   s3) {
 #ifdef GGML_CUDA_USE_CUB
-    using BlockScanT = cub::BlockScan<T, BLOCK_SIZE>;
+    using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;

-    __shared__ typename BlockScanT::TempStorage temp_storage;
-    __shared__ T block_carry;
+    __shared__ typename BlockScan::TempStorage temp_storage;
+    __shared__ T block_carry;      // carry from previous tile

    const int tid = threadIdx.x;
-    constexpr int UNROLL_FACTOR = 4;
-    constexpr int TILE_SIZE = BLOCK_SIZE * UNROLL_FACTOR;

    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.y;
@@ -41,47 +39,37 @@ static __global__ void cumsum_cub_kernel(
    }
    __syncthreads();

-    for (int64_t start = 0; start < ne00; start += TILE_SIZE) {
-        T items[UNROLL_FACTOR];
-        T thread_sum = T(0);
+    for (int64_t start = 0; start < ne00; start += BLOCK_SIZE) {
+        int64_t idx = start + tid;
+        T x = (idx < ne00) ? src_row[idx] : T(0);

-#pragma unroll
-        for (int i = 0; i < UNROLL_FACTOR; i++) {
-            int64_t idx = start + tid * UNROLL_FACTOR + i;
-            T val = (idx < ne00) ? src_row[idx] : T(0);
-            thread_sum += val;
-            items[i] = thread_sum;
-        }
-
-        // Block-wide scan on thread sums
-        T thread_prefix;
+        T inclusive;
        T block_total;
-        BlockScanT(temp_storage).InclusiveSum(thread_sum, thread_prefix, block_total);
+        BlockScan(temp_storage).InclusiveSum(x, inclusive, block_total);
+
        __syncthreads();

-        // Add offset to each item and store
-        T thread_offset = thread_prefix - thread_sum + block_carry;
-#pragma unroll
-        for (int i = 0; i < UNROLL_FACTOR; i++) {
-            int64_t idx = start + tid * UNROLL_FACTOR + i;
-            if (idx < ne00) {
-                dst_row[idx] = items[i] + thread_offset;
-            }
+        T final_val = inclusive + block_carry;
+
+        // store result
+        if (idx < ne00) {
+            dst_row[idx] = final_val;
        }

        __syncthreads();

-        // Update carry for next tile
        if (tid == 0) {
            block_carry += block_total;
        }
+
+        __syncthreads();
    }
 #else
    NO_DEVICE_CODE;
 #endif // GGML_CUDA_USE_CUB
 }

-// Fallback kernel implementation
+// Fallback kernel implementation (original)
 template<typename T>
 static __global__ void cumsum_kernel(
        const T * src, T * dst,
@@ -98,10 +86,10 @@ static __global__ void cumsum_kernel(
    const int warps_per_block = blockDim.x / warp_size;

    extern __shared__ float smem[];
-    float *                 s_vals        = smem;
-    float *                 s_warp_sums   = smem + blockDim.x;
-    float *                 s_carry       = smem + blockDim.x + warps_per_block;
-    float *                 s_chunk_total = s_carry + 1;
+    float * s_vals = smem;
+    float * s_warp_sums = smem + blockDim.x;
+    float * s_carry = smem + blockDim.x + warps_per_block;
+    float * s_chunk_total = s_carry + 1;

    // Initialize carry
    if (tid == 0) {
@@ -119,39 +107,21 @@ static __global__ void cumsum_kernel(
    const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
    T       * dst_row = dst + i1 * s1  + i2 * s2  + i3 * s3;

-    // register blocking: process 4 elements per thread to hide latency
-    // and reduce synchronization overhead
-    constexpr int num_unroll = 4;
-    T             temp[num_unroll];
+    for (int64_t start = 0; start < ne00; start += blockDim.x) {
+        int64_t idx = start + tid;
+        float val = (idx < ne00) ? ggml_cuda_cast<float, T>(src_row[idx]) : 0.0f;

-    for (int64_t i = 0; i < ne00; i += num_unroll * blockDim.x) {
-        int64_t idx = i + tid * num_unroll;
-
-        // thread local sequential scan
-        temp[0] = (idx < ne00 ? src_row[idx] : T(0));
-#pragma unroll
-        for (int64_t j = 1; j < num_unroll; j++) {
-            temp[j] = temp[j - 1];
-            if (idx + j < ne00) {
-                temp[j] += src_row[idx + j];
-            } else {
-                temp[j] += 0;
-            }
-        }
-
-        // last emenent is sum of all values assigned to thread
-        float val = (idx < ne00) ? ggml_cuda_cast<float, T>(temp[num_unroll - 1]) : 0.0f;
-
-        // Warp inclusive scan
+        // 1. Warp inclusive scan
        val = warp_prefix_inclusive_sum<T, warp_size>(val);
        s_vals[tid] = val;

+        // Store warp total
        if (lane == warp_size - 1) {
            s_warp_sums[warp] = val;
        }
        __syncthreads();

-        // Exclusive scan of warp sums (warp 0 only)
+        // 2. Exclusive scan of warp sums (warp 0 only)
        if (warp == 0) {
            float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
            float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
@@ -164,55 +134,24 @@ static __global__ void cumsum_kernel(
        }
        __syncthreads();

-        // write back results
        float carry = *s_carry;
-        // calculate sum offset for this thread
-        float final_val_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];
-
-#pragma unroll
-        for (int32_t j = 0; j < num_unroll; j++) {
-            if (idx + j < ne00) {
-                dst_row[idx + j] = temp[j] + ggml_cuda_cast<T, float>(final_val_offset);
-            }
+        float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
+        if (idx < ne00) {
+            dst_row[idx] = ggml_cuda_cast<T, float>(final_val);
        }
-
        __syncthreads();

        // Update carry for next chunk
        if (tid == 0) {
            *s_carry += *s_chunk_total;
        }
+        __syncthreads();
    }
 }

-#ifdef GGML_CUDA_USE_CUB
-template <typename T>
-static void cumsum_cub(ggml_cuda_pool & pool,
-                       const T *        src,
-                       T *              dst,
-                       int64_t          ne,
-                       cudaStream_t     stream) {
-    size_t tmp_size = 0;
-
-    // Query how much temp storage CUDA UnBound (CUB) needs
-    cub::DeviceScan::InclusiveSum(nullptr,   // d_temp_storage (null = just query size)
-                                  tmp_size,  // reference to size (will be set by CUB)
-                                  src,       // input pointer
-                                  dst,       // output pointer
-                                  ne,        // number of elements
-                                  stream     // CUDA stream to use
-    );
-
-    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
-
-    // Perform the inclusive scan
-    cub::DeviceScan::InclusiveSum((void *) tmp_alloc.get(), tmp_size, src, dst, ne, stream);
-}
-#endif // GGML_CUDA_USE_CUB
-
 template<typename T>
 static void cumsum_cuda(
-        [[maybe_unused]] ggml_backend_cuda_context & ctx, const T * src, T * dst,
+        const T * src, T * dst,
        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
        const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
        const int64_t  nb0,  const int64_t nb1, const int64_t  nb2, const int64_t  nb3,
@@ -226,15 +165,6 @@ static void cumsum_cuda(

    if (is_contiguous) {
        use_cub = true;
-        const int64_t nrows = ne01 * ne02 * ne03;
-        // TODO: Compare with DeviceSegmentedScan::InclusiveSegmentedSum for nrows > 1 once InclusiveSegmentedSum is released
-        // Heuristics were determined as part of https://github.com/ggml-org/llama.cpp/pull/17004
-        if (((nrows == 1) && (ne00 > 1024)) || (ne00 / nrows > 4096)) {
-            for (int i=0; i<nrows; i++) {
-                cumsum_cub(ctx.pool(), src + i * ne00, dst + i * ne00, ne00, stream);
-            }
-            return;
-        }
    }
 #endif // GGML_CUDA_USE_CUB
    dim3 grid_dims(ne01, ne02, ne03);
@@ -247,7 +177,7 @@ static void cumsum_cuda(
    const int warps_per_block = block_size / warp_size;
    const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);

-    if (use_cub && ne00 >= 1024) {
+    if (use_cub) {
        cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
            src, dst,
            ne00, ne01, ne02, ne03,
@@ -273,7 +203,7 @@ void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        case GGML_TYPE_F32:
            {
                cumsum_cuda(
-                    ctx, (const float *)src0->data, (float *)dst->data,
+                    (const float *)src0->data, (float *)dst->data,
                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -11,12 +11,10 @@
 #define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

 // log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
-//     by the VKQ accumulators is effectively being shifted up by a factor of 2.
+//     by the VKQ accumulators is effectively being shifted up by a factor of 8.
 // This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
 // However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
-// Still, the value range should be shifted as much as necessary but as little as possible.
-// The macro on the following line shifts it by a factor of 2**3=8, as was needed to fix https://github.com/ggml-org/llama.cpp/issues/18606 .
-#define FATTN_KQ_MAX_OFFSET (3.0f*0.6931f)
+#define FATTN_KQ_MAX_OFFSET 0.6931f

 typedef void (* fattn_kernel_t)(
        const char * __restrict__ Q,
@@ -59,7 +57,7 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(

 #pragma unroll
    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
-        __align__(16) half2 tmp[cpy_ne];
+        half2 tmp[cpy_ne];
        ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_h2 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
 #pragma unroll
        for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
@@ -309,7 +307,7 @@ static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict_
        ggml_cuda_memcpy_1<ne*sizeof(half)>(dst, (const half *) vx + i0);
    } else if constexpr (std::is_same_v<T, float>) {
        static_assert(ne % 2 == 0, "bad ne");
-        __align__(16) half2 tmp[ne/2];
+        half2 tmp[ne/2];
        ggml_cuda_memcpy_1<ne*sizeof(half)>(tmp, (const half *) vx + i0);
        float2 * dst_f2 = (float2 *) dst;
 #pragma unroll
@@ -914,15 +912,13 @@ void launch_fattn(

        const int nblocks_stream_k = max_blocks;

-        const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
+        const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || tiles_efficiency_percent < 75;

        blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
        blocks_num.y = 1;
        blocks_num.z = 1;

-        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
-            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
-        }
+        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
    } else {
        const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.

--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -98,19 +98,6 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
    return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
 }

-static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_rdna(const int DKQ, const int DV, const int ncols) {
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 128, 2,  64, 128, 128, 128, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2,  64, 128, 128,  64, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2,  64, 128, 128,  64, 2, true);
-
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16,  64, 4,  32,  96,  64, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2,  32, 160, 128, 128, 1, false);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1,  32, 160, 128, 128, 1, false);
-
-    // TODO tune specifically for RDNA
-    return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
-}
-
 static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
    if (ampere_mma_available(cc)) {
        return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
@@ -118,9 +105,6 @@ static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, c
    if (turing_mma_available(cc)) {
        return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
    }
-    if (amd_wmma_available(cc)) {
-        return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
-    }
    GGML_ASSERT(volta_mma_available(cc));
    return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
 }
@@ -132,8 +116,6 @@ static constexpr __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config(cons
    return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
 #elif defined(VOLTA_MMA_AVAILABLE)
    return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
-#elif defined(AMD_WMMA_AVAILABLE)
-    return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
 #else
    GGML_UNUSED_VARS(DKQ, DV, ncols);
    return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false);
@@ -204,23 +186,6 @@ static constexpr __device__ bool ggml_cuda_fattn_mma_get_Q_in_reg(const int DKQ,
    return ggml_cuda_fattn_mma_get_config(DKQ, DV, ncols).Q_in_reg;
 }

-static constexpr __device__ int get_cols_per_thread() {
-#if defined(AMD_WMMA_AVAILABLE)
-    return 1; // RDNA has a single column.
-#else
-    return 2; // This is specifically KQ columns, Volta only has a single VKQ column.
-#endif // defined(AMD_WMMA_AVAILABLE)
-}
-
-static __host__ int get_cols_per_warp(const int cc) {
-    if (turing_mma_available(cc) || amd_wmma_available(cc)) {
-        return 16;
-    } else {
-        // Volta
-        return 32;
-    }
-}
-
 // ------------------------------------------------------------------------------------------------------------------

 static __host__ int ggml_cuda_fattn_mma_get_nstages(const int DKQ, const int DV, const int ncols1, const int ncols2, const int cc) {
@@ -428,10 +393,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        const int jt,
        const int kb0,
        const int k_VKQ_sup) {
-#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
    constexpr int  ncols           = ncols1 * ncols2;
    constexpr int  cols_per_warp   = T_B_KQ::I;
-    constexpr int  cols_per_thread = get_cols_per_thread();
+    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
@@ -448,8 +413,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    const int k_VKQ_0 = kb0 * nbatch_fa;
 #if defined(TURING_MMA_AVAILABLE)
    T_C_KQ KQ_C[nbatch_fa/(np*(cols_per_warp == 8 ? T_C_KQ::I : T_C_KQ::J))];
-#elif defined(AMD_WMMA_AVAILABLE)
-    T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
 #else // Volta
    T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
 #endif // defined(TURING_MMA_AVAILABLE)
@@ -498,14 +461,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    if constexpr (cols_per_warp == 8) {
                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
                    } else {
-                        // Wide version of KQ_C is column-major
-#if defined(AMD_WMMA_AVAILABLE)
-                        // RDNA matrix C is column-major.
-                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
-#else
-                        // swap A and B for CUDA.
+                        // Wide version of KQ_C is column-major => swap A and B.
                        mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[k_KQ_0/T_A_KQ::J], K_A);
-#endif // defined(AMD_WMMA_AVAILABLE)
                    }
                }
            }
@@ -522,14 +479,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                    T_A_KQ K_A;
                    load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);

-                    // Wide version of KQ_C is column-major
-#if defined(AMD_WMMA_AVAILABLE)
-                    // RDNA matrix C is column-major.
-                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
-#else
-                    // swap A and B for CUDA.
+                    // Wide version of KQ_C is column-major => swap A and B.
                    mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
-#endif // defined(AMD_WMMA_AVAILABLE)
                }
            }
        }
@@ -580,14 +531,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) {
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
-                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
-                    // Turing + Volta:
-                    const int KQ_idx = l % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
+                if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
+                    KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
                }
            }
        }
@@ -607,14 +552,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
-                    // Turing + Volta:
-                    const int KQ_idx = l % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[KQ_idx]);
-                    KQ_rowsum_add[KQ_idx] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
+                    KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[l % 2]);
+                    KQ_rowsum_add[l % 2] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
                } else {
                    KQ_C[k0/(np*T_C_KQ::I)].x[l] = 0.0f;
                }
@@ -644,14 +583,9 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
-                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
+                if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
                    // Turing + Volta:
-                    const int KQ_idx = (l/2) % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
+                    KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
                }
            }
        }
@@ -662,11 +596,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
            // Values per KQ column are spread across 4 threads:
            constexpr int offset_first = 2;
            constexpr int offset_last  = 1;
-#elif defined(AMD_WMMA_AVAILABLE)
-            // Values per KQ column are spread across 2 threads:
-            constexpr int offset_first = 16;
-            constexpr int offset_last  = 16;
-#else // Volta
+#else
            // Values per KQ column are spread across 2 threads:
            constexpr int offset_first = 2;
            constexpr int offset_last  = 2;
@@ -682,15 +612,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
 #pragma unroll
            for (int l = 0; l < T_C_KQ::ne; ++l) {
+                // Turing + Volta:
                if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
-#if defined(AMD_WMMA_AVAILABLE)
-                    constexpr int KQ_idx = 0;
-#else
-                    // Turing + Volta:
-                    const int KQ_idx = (l/2) % 2;
-#endif // defined(AMD_WMMA_AVAILABLE)
-                    KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[KQ_idx]);
-                    KQ_rowsum_add[KQ_idx] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
+                    KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[(l/2) % 2]);
+                    KQ_rowsum_add[(l/2) % 2] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
                } else {
                    KQ_C[(k0/(np*T_C_KQ::J))].x[l] = 0.0f;
                }
@@ -714,7 +639,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(

 #if defined(TURING_MMA_AVAILABLE)
        if constexpr (cols_per_warp == 8) {
-            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[cols_per_thread - 1]);
+            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
 #pragma unroll
            for (int i = 0; i < DV/T_C_VKQ::I; ++i) {
 #pragma unroll
@@ -735,16 +660,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                }
            }
        }
-#elif defined(AMD_WMMA_AVAILABLE)
-        const half2 KQ_max_scale_h2 = make_half2(
-            KQ_max_scale[0], KQ_max_scale[0]);
-#pragma unroll
-        for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
-#pragma unroll
-            for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                VKQ_C[i].x[l] *= KQ_max_scale_h2;
-            }
-        }
 #else // Volta
        const half2 KQ_max_scale_h2 = make_half2(
            KQ_max_scale[(threadIdx.x / 2) % 2], KQ_max_scale[(threadIdx.x / 2) % 2]);
@@ -792,10 +707,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    // Therefore, iterate over V in reverse and re-use the data if possible.
    static_assert(!mla || nstages <= 1, "combination of MLA and multi-stage loading not implemented");
    constexpr int reusable_cutoff = mla ? (DKQ - 1) - (DKQ - 1) % (2*nbatch_K2) - (DKQ - DV) : DV;
-#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
-    T_A_VKQ A_identity;
-    make_identity_mat(A_identity);
-#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)

    // Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
 #pragma unroll
@@ -816,7 +727,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        }
        const half2 * tile_V_i = i0_start < reusable_cutoff ? tile_V : tile_V + (i0_start - reusable_cutoff)/2;

-#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#if defined(TURING_MMA_AVAILABLE)
        constexpr int i0_stride = cols_per_warp == 8 ? T_C_VKQ::I : 2*T_C_VKQ::J;
 #pragma unroll
        for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += i0_stride) {
@@ -826,26 +737,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;

                T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
-#if defined(LDMATRIX_TRANS_AVAILABLE)
                load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
-#else
-                // TODO: Try to transpose tile_V when loading gmem to smem.
-                // Use mma to transpose T_A_VKQ for RDNA.
-                T_A_VKQ A_trans;
-                load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
-                mma(A, A_trans, A_identity);
-#endif // defined(TURING_MMA_AVAILABLE)
                if constexpr (T_B_KQ::I == 8) {
                    mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
                } else {
-                    // Wide version of VKQ_C is column-major.
-#if defined(AMD_WMMA_AVAILABLE)
-                    // RDNA matrix C is column-major.
-                    mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
-#else
-                    // swap A and B for CUDA.
+                    // Wide version of VKQ_C is column-major => swap A and B.
                    mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::J)], A);
-#endif // defined(AMD_WMMA_AVAILABLE)
                }
            }
        }
@@ -864,7 +761,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
                mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::I)], A);
            }
        }
-#endif // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#endif // defined(TURING_MMA_AVAILABLE)

        if constexpr (nstages <= 1) {
            __syncthreads(); // Only needed if tile_K == tile_V.
@@ -877,7 +774,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
        tile_Q, tile_K, tile_V, tile_mask,
        Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
    NO_DEVICE_CODE;
-#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
 }

 #if defined(TURING_MMA_AVAILABLE)
@@ -897,15 +794,6 @@ template<> struct mma_tile_sizes<8> {
    using T_B_VKQ = tile< 8,  8, half2>; // column-major
    using T_C_VKQ = tile<16,  4, half2>; // row-major
 };
-#elif defined(AMD_WMMA_AVAILABLE)
-template<int ncols> struct mma_tile_sizes {
-    using T_A_KQ  = tile<16,  8, half2>; // row-major
-    using T_B_KQ  = tile<16,  8, half2>; // column-major
-    using T_C_KQ  = tile<16, 16, float>; // column-major
-    using T_A_VKQ = tile<16,  8, half2>; // row-major
-    using T_B_VKQ = tile<16,  8, half2>; // column-major
-    using T_C_VKQ = tile<16,  8, half2>; // column-major
-};
 #else // Volta
 template<int ncols> struct mma_tile_sizes {
    using T_A_KQ  = tile< 8,  4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
@@ -940,7 +828,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const int jt,
        const int kb0_start,
        const int kb0_stop) {
-#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    constexpr int ncols = ncols1 * ncols2;
@@ -952,7 +840,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    using     T_C_VKQ   = typename mma_tile_sizes<ncols>::T_C_VKQ;

    constexpr int  cols_per_warp   = T_B_KQ::I;
-    constexpr int  cols_per_thread = get_cols_per_thread();
+    constexpr int  cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
    constexpr int  np              = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
    constexpr int  nbatch_fa       = ggml_cuda_fattn_mma_get_nbatch_fa     (DKQ, DV, ncols);
    constexpr int  nbatch_K2       = ggml_cuda_fattn_mma_get_nbatch_K2     (DKQ, DV, ncols);
@@ -983,8 +871,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
    T_B_KQ    Q_B[(Q_in_reg ? DKQ/(2*T_B_KQ::J) : 1)];
 #if defined(TURING_MMA_AVAILABLE)
    T_C_VKQ VKQ_C[cols_per_warp == 8 ? DV/T_C_VKQ::I : DV/(2*T_C_VKQ::J)];
-#elif defined(AMD_WMMA_AVAILABLE)
-    T_C_VKQ VKQ_C[                                     DV/(2*T_C_VKQ::J)];
 #else // Volta
    T_C_VKQ VKQ_C[                                     DV/(2*T_C_VKQ::J)];
 #endif // defined(TURING_MMA_AVAILABLE)
@@ -1124,10 +1010,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        // The partial sums are spread across 8/4 threads.
        constexpr int offset_first = cols_per_warp == 8 ? 16 : 2;
        constexpr int offset_last  = cols_per_warp == 8 ?  4 : 1;
-#elif defined(AMD_WMMA_AVAILABLE)
-        // The partial sums are spread across 2 threads.
-        constexpr int offset_first = 16;
-        constexpr int offset_last  = 16;
 #else // Volta
        // The partial sums are spread across 2 threads.
        constexpr int offset_first = 2;
@@ -1165,7 +1047,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(

 #if defined(TURING_MMA_AVAILABLE)
        if constexpr (cols_per_warp == 8) {
-            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[cols_per_thread - 1]);
+            const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
 #pragma unroll
            for (int i = 0; i < DV/T_C_VKQ::I; ++i) {
 #pragma unroll
@@ -1186,15 +1068,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
                }
            }
        }
-#elif defined(AMD_WMMA_AVAILABLE)
-        const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
-#pragma unroll
-        for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
-#pragma unroll
-            for (int l = 0; l < T_C_VKQ::ne; ++l) {
-                VKQ_C[i].x[l] *= KQ_max_scale_h2;
-            }
-        }
 #else // Volta
        const int col = (threadIdx.x / 2) % 2;
        const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]);
@@ -1246,10 +1119,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(threadIdx.x % 4);
        const float2 KQ_cmr = make_float2(KQ_max[threadIdx.x % cols_per_thread], KQ_rowsum[threadIdx.x % cols_per_thread]);
        const bool thread_should_write = threadIdx.x % 4 < cols_per_thread;
-#elif defined(AMD_WMMA_AVAILABLE)
-        const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(0);
-        const float2 KQ_cmr = make_float2(KQ_max[0], KQ_rowsum[0]);
-        const bool thread_should_write = threadIdx.x / 16 < cols_per_thread;
 #else // Volta
        const int jc_cwm = threadIdx.y*cols_per_warp + T_C_KQ::get_i(threadIdx.x & 2);
        const float2 KQ_cmr = make_float2(KQ_max[(threadIdx.x & 2) / 2], KQ_rowsum[(threadIdx.x & 2) / 2]);
@@ -1450,7 +1319,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        stride_Q1, stride_Q2, stride_K, stride_V, stride_mask,
        jt, kb0_start, kb0_stop);
    NO_DEVICE_CODE;
-#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
+#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
 }

 template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool mla>
@@ -1477,7 +1346,7 @@ static __global__ void flash_attn_ext_f16(
                            const int32_t nb21, const int32_t nb22, const int64_t nb23,
                            const int32_t ne31, const int32_t ne32, const int32_t ne33,
                            const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
+#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))

    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
@@ -1491,13 +1360,6 @@ static __global__ void flash_attn_ext_f16(
    }
 #endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING

-#if defined(AMD_WMMA_AVAILABLE)
-    if (ncols1*ncols2 > 32 || ncols1*ncols2 < 16 || DKQ > 128 || ncols2 == 1) {
-        NO_DEVICE_CODE;
-        return;
-    }
-#endif // defined(AMD_WMMA_AVAILABLE)
-
    static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");

    constexpr int ncols     = ncols1 * ncols2;
@@ -1611,7 +1473,7 @@ static __global__ void flash_attn_ext_f16(
              ne31, ne32, ne33,
              nb31, nb32, nb33);
    NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
+#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))
 }

 template <int DKQ, int DV, int ncols1, int ncols2>
@@ -1630,7 +1492,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    const bool Q_in_reg       = ggml_cuda_fattn_mma_get_Q_in_reg      (DKQ, DV, ncols, cc);
    const int  nstages        = ggml_cuda_fattn_mma_get_nstages       (DKQ, DV, ncols1, ncols2, cc);

-    const int cols_per_warp = std::min(ncols, get_cols_per_warp(cc));
+    const int cols_per_warp = std::min(ncols, turing_mma_available(cc) ? 16 : 32);
    const int nwarps        = nthreads / WARP_SIZE;

    constexpr bool mla = DKQ == 576;
@@ -1650,34 +1512,29 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-#if defined(GGML_USE_HIP)
-    using fattn_kernel_ptr_t = const void*;
-#else
-    using fattn_kernel_ptr_t = fattn_kernel_t;
-#endif // defined(GGML_USE_HIP)
    fattn_kernel_t fattn_kernel;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, use_logit_softcap, mla>;

-#if !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
        if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(reinterpret_cast<fattn_kernel_ptr_t>(fattn_kernel), cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
+            CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
            shared_memory_limit_raised[id] = true;
        }
-#endif // !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    } else {
        constexpr bool use_logit_softcap = true;
        fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, use_logit_softcap, mla>;

-#if !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
        if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(reinterpret_cast<fattn_kernel_ptr_t>(fattn_kernel), cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
+            CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
            shared_memory_limit_raised[id] = true;
        }
-#endif // !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    }

    launch_fattn<DV, ncols1, ncols2>
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -343,7 +343,7 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
                for (int j0 = j0_start; j0 < j0_stop; j0 += stride_j) {
                    const int j = j0*cpy_ne + (stride_j == warp_size ? threadIdx.x : threadIdx.x % stride_j)*cpy_ne;

-                    const __align__(16) half2 zero[cpy_ne] = {{0.0f, 0.0f}};
+                    const half2 zero[cpy_ne] = {{0.0f, 0.0f}};
                    ggml_cuda_memcpy_1<cpy_nb>(
                        tile_KV + i*(J/2 + J_padding) + j,
                        !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
@@ -394,11 +394,11 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
                    const int j = j0*(cpy_ne/2) + (stride_j == warp_size ? threadIdx.x : threadIdx.x % stride_j)*(cpy_ne/2);

                    const half2 zero[cpy_ne/2] = {{0.0f, 0.0f}};
-                    __align__(16) half2 tmp_h2[cpy_ne/2];
+                    half2 tmp_h2[cpy_ne/2];
                    ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
                        tmp_h2, !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);

-                    __align__(16) float2 tmp_f2[cpy_ne/2];
+                    float2 tmp_f2[cpy_ne/2];
 #pragma unroll
                    for (int l = 0; l < cpy_ne/2; ++l) {
                        tmp_f2[l] = __half22float2(tmp_h2[l]);
@@ -445,14 +445,14 @@ static __device__ __forceinline__ void flash_attn_tile_iter_KQ(
    static_assert((nbatch_K/2) % cpy_ne == 0, "bad nbatch_K");
 #pragma unroll
    for (int k_KQ_1 = 0; k_KQ_1 < nbatch_K/2; k_KQ_1 += cpy_ne) {
-        __align__(16) half2 K_k[nbatch_fa/(np*warp_size)][cpy_ne];
-        __align__(16) half2 Q_k[cpw][cpy_ne];
+        half2 K_k[nbatch_fa/(np*warp_size)][cpy_ne];
+        half2 Q_k[cpw][cpy_ne];
 #else
    static_assert(nbatch_K % cpy_ne == 0, "bad nbatch_K");
 #pragma unroll
    for (int k_KQ_1 = 0; k_KQ_1 < nbatch_K; k_KQ_1 += cpy_ne) {
-        __align__(16) float K_k[nbatch_fa/(np*warp_size)][cpy_ne];
-        __align__(16) float Q_k[cpw][cpy_ne];
+        float K_k[nbatch_fa/(np*warp_size)][cpy_ne];
+        float Q_k[cpw][cpy_ne];
 #endif // FAST_FP16_AVAILABLE

 #pragma unroll
@@ -602,9 +602,9 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
 #pragma unroll
    for (int jc0 = 0; jc0 < cpw; jc0 += KQ_cs) {
 #ifdef FAST_FP16_AVAILABLE
-        __align__(16) half  tmp[nbatch_fa/(np*warp_size)][KQ_cs];
+        half  tmp[nbatch_fa/(np*warp_size)][KQ_cs];
 #else
-        __align__(16) float tmp[nbatch_fa/(np*warp_size)][KQ_cs];
+        float tmp[nbatch_fa/(np*warp_size)][KQ_cs];
 #endif // FAST_FP16_AVAILABLE

 #pragma unroll
@@ -664,8 +664,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
 #ifdef FAST_FP16_AVAILABLE
 #pragma unroll
        for (int k1 = 0; k1 < nbatch_V; k1 += np) {
-            __align__(16) half2 V_k[(DVp/2)/warp_size];
-            __align__(16) half2 KQ_k[cpw];
+            half2 V_k[(DVp/2)/warp_size];
+            half2 KQ_k[cpw];

            constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;
 #pragma unroll
@@ -676,7 +676,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
            for (int jc_VKQ_0 = 0; jc_VKQ_0 < cpw; jc_VKQ_0 += KQ_cs) {
                const int jc_KQ = jc_VKQ_0/KQ_cs + (threadIdx.y / np)*(cpw/KQ_cs);

-                __align__(16) half tmp[KQ_cs];
+                half tmp[KQ_cs];
                ggml_cuda_memcpy_1<KQ_cs*sizeof(half)>(
                    &tmp, KQ + jc_KQ*(nbatch_fa*KQ_cs) + (k0 + k1 + threadIdx.y % np)*KQ_cs);
 #pragma unroll
@@ -696,8 +696,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
 #else
 #pragma unroll
        for (int k1 = 0; k1 < nbatch_V; k1 += np) {
-            __align__(16) float2 V_k[(DVp/2)/warp_size];
-            __align__(16) float  KQ_k[cpw];
+            float2 V_k[(DVp/2)/warp_size];
+            float  KQ_k[cpw];

            constexpr int cpy_ne_D = cpy_ne < DVp/warp_size ? cpy_ne : DVp/warp_size;
 #pragma unroll
@@ -821,12 +821,12 @@ static __global__ void flash_attn_tile(
    __shared__ half2 Q_tmp[ncols * DKQ/2];
    __shared__ half2 KV_tmp[nbatch_fa * (nbatch_K/2 + cpy_ne) + DVp-DV];
    __shared__ half  KQ[ncols * nbatch_fa];
-    __align__(16) half2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
+    half2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
 #else
    __shared__ float Q_tmp[ncols * DKQ];
    __shared__ float KV_tmp[nbatch_fa * (nbatch_K + cpy_ne) + DVp-DV];
    __shared__ float KQ[ncols * nbatch_fa];
-    __align__(16) float2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
+    float2 VKQ[cpw * ((DVp/2)/warp_size)] = {{0.0f, 0.0f}};
 #endif // FAST_FP16_AVAILABLE

    float KQ_max[cpw];
@@ -849,7 +849,7 @@ static __global__ void flash_attn_tile(
 #pragma unroll
        for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
            if (i0 + np*warp_size*cpy_ne_D <= DKQ || i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D < DKQ) {
-                __align__(16) float tmp_f[cpy_ne_D] = {0.0f};
+                float tmp_f[cpy_ne_D] = {0.0f};
                ggml_cuda_memcpy_1<sizeof(tmp_f)>
                    (tmp_f, &Q_f[c*(nb02/sizeof(float)) + fastmodulo(col_Q_0 + j, ne01)*(nb01/sizeof(float))
                                 + i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
@@ -860,7 +860,7 @@ static __global__ void flash_attn_tile(
                }

 #ifdef FAST_FP16_AVAILABLE
-                __align__(16) half2 tmp_h2[cpy_ne_D/2];
+                half2 tmp_h2[cpy_ne_D/2];
 #pragma unroll
                for (int i1 = 0; i1 < cpy_ne_D; i1 += 2) {
                    tmp_h2[i1/2] = make_half2(tmp_f[i1 + 0], tmp_f[i1 + 1]);
@@ -959,7 +959,7 @@ static __global__ void flash_attn_tile(
            constexpr int cpy_ne_D = cpy_ne < (DVp/2)/warp_size ? cpy_ne : (DVp/2)/warp_size;
 #pragma unroll
            for (int i0 = 0; i0 < DVp/2; i0 += warp_size*cpy_ne_D) {
-                __align__(16) half2 tmp[cpy_ne_D];
+                half2 tmp[cpy_ne_D];
                ggml_cuda_memcpy_1<cpy_ne_D*4>(tmp, &VKQ_combine[(threadIdx.y + ip)*(DVp/2) + i0 + threadIdx.x*cpy_ne_D]);
 #pragma unroll
                for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
@@ -970,7 +970,7 @@ static __global__ void flash_attn_tile(
            constexpr int cpy_ne_D = cpy_ne < DVp/warp_size ? cpy_ne : DVp/warp_size;
 #pragma unroll
            for (int i0 = 0; i0 < DVp; i0 += warp_size*cpy_ne_D) {
-                __align__(16) float tmp[cpy_ne_D];
+                float tmp[cpy_ne_D];
                ggml_cuda_memcpy_1<cpy_ne_D*4>(tmp, &VKQ_combine[(threadIdx.y + ip)*DVp + i0 + threadIdx.x*cpy_ne_D]);
 #pragma unroll
                for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
@@ -1033,7 +1033,7 @@ static __global__ void flash_attn_tile(
        constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;
 #pragma unroll
        for (int i0 = 0; i0 < DVp/2; i0 += warp_size*cpy_ne_D) {
-            __align__(16) float2 tmp[cpy_ne_D];
+            float2 tmp[cpy_ne_D];
 #pragma unroll
            for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
                tmp[i1] = __half22float2(VKQ[jc0*((DVp/2)/warp_size) + i0/warp_size + i1]);
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
    return 128;
 }

-// Currenlty llvm with the amdgcn target does not support unrolling loops
+// Currenlty llvm with the amdgcn target dose not support unrolling loops
 // that contain a break that can not be resolved at compile time.
 #ifdef __clang__
 #pragma clang diagnostic push
@@ -132,7 +132,7 @@ static __global__ void flash_attn_ext_vec(
 #ifdef V_DOT2_F32_F16_AVAILABLE
    half2  Q_reg[ncols][(D/2)/nthreads_KQ]; // Will be initialized completely.
 #else
-    __align__(16) float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
+    float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
 #endif // V_DOT2_F32_F16_AVAILABLE
    int    Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
    float2  Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
@@ -200,7 +200,7 @@ static __global__ void flash_attn_ext_vec(
            for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
                const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;

-                __align__(16) float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
+                float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
                if (ncols == 1 || ic0 + j < int(ne01.z)) {
                    ggml_cuda_memcpy_1<cpy_nb>(tmp,            &Q_j[i]);
                    ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -18,12 +18,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
        }
    }

-    if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) {
+    if (turing_mma_available(cc) && Q->ne[1] <= 16/ncols2) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
        return;
    }

-    if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) {
+    if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
        return;
    }
@@ -230,18 +230,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const

    // The effective batch size for the kernel can be increased by gqa_ratio.
    // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded,
-    bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
-    for (const ggml_tensor * t : {Q, K, V, mask}) {
-        if (t == nullptr) {
-            continue;
-        }
-        for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
-            if (t->nb[i] % 16 != 0) {
-                gqa_opt_applies = false;
-                break;
-            }
-        }
-    }
+    const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;

    const int cc = ggml_cuda_info().devices[device].cc;

@@ -348,31 +337,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
        return BEST_FATTN_KERNEL_WMMA_F16;
    }

-    if (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc) && gqa_opt_applies && Q->ne[0] <= 128 && Q->ne[0] != 40 && Q->ne[0] != 72) {
-        if (can_use_vector_kernel) {
-            if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
-                if (Q->ne[1] == 1) {
-                    if (!gqa_opt_applies) {
-                        return BEST_FATTN_KERNEL_VEC;
-                    }
-                }
-            } else {
-                if (Q->ne[1] <= 2) {
-                    return BEST_FATTN_KERNEL_VEC;
-                }
-            }
-        }
-        int gqa_ratio_eff = 1;
-        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
-        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
-            gqa_ratio_eff *= 2;
-        }
-        if (Q->ne[1] * gqa_ratio_eff <= 8) {
-            return BEST_FATTN_KERNEL_TILE; // AMD WMMA is only faster if the full tile width of 16 can be utilized.
-        }
-        return BEST_FATTN_KERNEL_MMA_F16;
-    }
-
    // If there are no tensor cores available, use the generic tile kernel:
    if (can_use_vector_kernel) {
        if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -19,7 +19,6 @@
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
-#include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
 #include "ggml-cuda/fattn.cuh"
@@ -45,7 +44,6 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
-#include "ggml-cuda/top-k.cuh"
 #include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/topk-moe.cuh"
@@ -203,6 +201,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
    GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);

    int64_t total_vram = 0;
+#ifdef GGML_CUDA_FORCE_MMQ
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ:    no\n", __func__);
+#endif // GGML_CUDA_FORCE_MMQ
+#ifdef GGML_CUDA_FORCE_CUBLAS
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
+#else
+    GGML_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
+#endif // GGML_CUDA_FORCE_CUBLAS
    GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);

    std::vector<std::pair<int, std::string>> turing_devices_without_mma;
@@ -233,14 +241,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
        info.devices[id].nsm        = prop.multiProcessorCount;
        info.devices[id].smpb       = prop.sharedMemPerBlock;
        info.devices[id].warp_size  = prop.warpSize;
-
-#ifndef GGML_USE_MUSA
-        int supports_coop_launch = 0;
-        CUDA_CHECK(cudaDeviceGetAttribute(&supports_coop_launch, cudaDevAttrCooperativeLaunch, id));
-        info.devices[id].supports_cooperative_launch = !!supports_coop_launch;
-#else
-        info.devices[id].supports_cooperative_launch = false;
-#endif // !(GGML_USE_MUSA)
 #if defined(GGML_USE_HIP)
        info.devices[id].smpbo = prop.sharedMemPerBlock;

@@ -2211,7 +2211,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor

            const int cc            = ggml_cuda_info().devices[id].cc;
            const int warp_size     = ggml_cuda_info().devices[id].warp_size;
-            use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
+            use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
            use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
            use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
@@ -2219,7 +2219,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    } else {
        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
        const int warp_size     = ggml_cuda_info().devices[ctx.device].warp_size;
-        use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
+        use_mul_mat_q           = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
        use_mul_mat_f           = use_mul_mat_f             && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
        use_mul_mat_vec_f       = use_mul_mat_vec_f         && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
            return;
        }

-        if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
+        if (ggml_cuda_should_use_mmq(src0->type, cc, ne12)) {
            ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
            return;
        }
@@ -2687,9 +2687,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SUM:
            ggml_cuda_op_sum(ctx, dst);
            break;
-        case GGML_OP_CUMSUM:
-            ggml_cuda_op_cumsum(ctx, dst);
-            break;
        case GGML_OP_SUM_ROWS:
            ggml_cuda_op_sum_rows(ctx, dst);
            break;
@@ -2702,9 +2699,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SSM_SCAN:
            ggml_cuda_op_ssm_scan(ctx, dst);
            break;
-        case GGML_OP_TOP_K:
-            ggml_cuda_op_top_k(ctx, dst);
-            break;
        case GGML_OP_ARGSORT:
            ggml_cuda_op_argsort(ctx, dst);
            break;
@@ -2714,6 +2708,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_CROSS_ENTROPY_LOSS:
            ggml_cuda_cross_entropy_loss(ctx, dst);
            break;
+        case GGML_OP_CUMSUM:
+            ggml_cuda_op_cumsum(ctx, dst);
+            break;
        case GGML_OP_TRI:
            ggml_cuda_op_tri(ctx, dst);
            break;
@@ -2853,9 +2850,9 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 }

 #ifdef USE_CUDA_GRAPH
-static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
+static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
+    bool use_cuda_graph) {

-    bool use_cuda_graph = true;
    // Loop over nodes in GGML graph to obtain info needed for CUDA graph

    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
@@ -2915,41 +2912,41 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
    return use_cuda_graph;
 }

-static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
-    props->node_address = node->data;
-    props->node_op = node->op;
+static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node->data;
+    graph_node_properties->node_op = node->op;
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        props->ne[i] = node->ne[i];
-        props->nb[i] = node->nb[i];
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
    }
    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
+        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
    }
-    memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
+    memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
 }

-static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
-    if (node->data != props->node_address &&
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node->data != graph_node_properties->node_address &&
          node->op != GGML_OP_VIEW) {
        return false;
    }

-    if (node->op != props->node_op) {
+    if (node->op != graph_node_properties->node_op) {
        return false;
    }

    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (node->ne[i] != props->ne[i]) {
+        if (node->ne[i] != graph_node_properties->ne[i]) {
            return false;
        }
-        if (node->nb[i] != props->nb[i]) {
+        if (node->nb[i] != graph_node_properties->nb[i]) {
            return false;
        }
    }

    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (node->src[i] &&
-            node->src[i]->data != props->src_address[i] &&
+            node->src[i]->data != graph_node_properties->src_address[i] &&
            node->op != GGML_OP_VIEW
        ) {
            return false;
@@ -2957,55 +2954,44 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
    }

    if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
-        memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
+        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
        return false;
    }

    return true;
 }

-static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
+static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {

-    bool res = false;
+    bool cuda_graph_update_required = false;

    if (cuda_ctx->cuda_graph->instance == nullptr) {
-        res = true;
+        cuda_graph_update_required = true;
    }

    // Check if the graph size has changed
-    if (cuda_ctx->cuda_graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) {
-        res = true;
-        cuda_ctx->cuda_graph->props.resize(cgraph->n_nodes + cgraph->n_leafs);
+    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+        cuda_graph_update_required = true;
+        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
    }

    // Loop over nodes in GGML graph to determine if CUDA graph update is required
    // and store properties to allow this comparison for the next token
    for (int i = 0; i < cgraph->n_nodes; i++) {
-        bool props_match = true;
-        if (!res) {
-            props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &cuda_ctx->cuda_graph->props[i]);
+        bool has_matching_properties = true;
+        if (!cuda_graph_update_required) {
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
        }
-        if (!props_match) {
-            res = true;
+        if (!has_matching_properties) {
+            cuda_graph_update_required = true;
        }
-        ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[i], cgraph->nodes[i]);
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
    }

-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        bool props_match= true;
-        if (!res) {
-            props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &cuda_ctx->cuda_graph->props[cgraph->n_nodes + i]);
-        }
-        if (!props_match) {
-            res = true;
-        }
-        ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[cgraph->n_nodes + i], cgraph->leafs[i]);
-    }
-
-    return res;
+    return cuda_graph_update_required;
 }

-static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_ctx) {
+static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

 #if CUDART_VERSION >= 12000
    cudaGraphExecUpdateResultInfo result_info;
@@ -3236,11 +3222,10 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
    return false;
 }

-static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required) {
-    bool graph_evaluated_or_captured = false;
-
+static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
    // flag used to determine whether it is an integrated_gpu
-    const bool integrated            = ggml_cuda_info().devices[cuda_ctx->device].integrated;
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

    ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
    bool                         is_concurrent_event_active = false;
@@ -3278,7 +3263,6 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                    should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
                }
            }
-
            if (should_launch_concurrent_events) {
                // Restore original node order within each concurrent region to enable fusion within streams

@@ -3330,8 +3314,6 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                        cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
                    }
                }
-            } else {
-                stream_ctx.concurrent_events.clear();
            }

            for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3710,7 +3692,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
        }
        if (cuda_graph_update_required) { // Update graph executable
-            ggml_cuda_graph_update_executable(cuda_ctx);
+            update_cuda_graph_executable(cuda_ctx);
        }
        // Launch graph
        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
@@ -3720,48 +3702,60 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
    }
 }

-static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);

 #ifdef USE_CUDA_GRAPH
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

+    // Objects required for CUDA Graph
    if (cuda_ctx->cuda_graph == nullptr) {
        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
    }

+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
+
    if (cuda_ctx->cuda_graph->graph == nullptr) {
        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
-            if (!cuda_ctx->cuda_graph->disable_due_to_gpu_arch) {
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
-            }
            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+#endif
        }
    }

-    return cuda_ctx->cuda_graph->is_enabled();
-#else
-    GGML_UNUSED(cuda_ctx);
-    return false;
-#endif // USE_CUDA_GRAPH
-}
-
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
-
-    ggml_cuda_set_device(cuda_ctx->device);
-
-    bool use_cuda_graph             = false;
-    bool cuda_graph_update_required = false;
-
-#ifdef USE_CUDA_GRAPH
-    use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
-
-    if (cuda_ctx->cuda_graph->is_enabled()) {
-        cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
-        use_cuda_graph             = ggml_cuda_graph_check_compability(cgraph);
-
-        cuda_ctx->cuda_graph->record_update(use_cuda_graph, cuda_graph_update_required);
+    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+    // or previous graph capture failure.
+    // Also disable for multi-gpu for now. TO DO investigate
+    if (disable_cuda_graphs_due_to_env
+        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+        use_cuda_graph = false;
+    }
+
+    if (use_cuda_graph) {
+        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
+
+        use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (use_cuda_graph && cuda_graph_update_required) {
+            cuda_ctx->cuda_graph->number_consecutive_updates++;
+        } else {
+            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
+        }
+
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+#endif
+        }
    }
-#endif // USE_CUDA_GRAPH

    if (use_cuda_graph && cuda_graph_update_required) {
        // Start CUDA graph capture
@@ -3773,7 +3767,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
    }

-    ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required);
+#else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
+#endif // USE_CUDA_GRAPH
+
+    bool graph_evaluated_or_captured = false;
+
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);

    return GGML_STATUS_SUCCESS;
 }
@@ -3806,10 +3807,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

-    const bool use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
-
    static bool enable_graph_optimization = [] {
-        const char * env     = getenv("GGML_CUDA_GRAPH_OPT");
+        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
        return env != nullptr && atoi(env) == 1;
    }();

@@ -3817,13 +3816,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
        return;
    }

+    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
+    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
+
    ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
    stream_context.reset();

-    if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
-        return;
-    }
-
    // number of out-degrees for a particular node
    std::unordered_map<const ggml_tensor *, int> fan_out;
    // reverse mapping of node to index in the cgraph
@@ -3884,12 +3882,6 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
        if (count >= min_fan_out && count <= max_fan_out) {
            const int root_node_idx = node_indices[root_node];

-            // only optimize for attn_norm
-            // TODO: make this more generic
-            if (!strstr(root_node->name, "attn_norm")) {
-                continue;
-            }
-
            bool is_part_of_event = false;
            for (const auto & [start, end] : concurrent_node_ranges) {
                if (root_node_idx >= start && root_node_idx <= end) {
@@ -4125,7 +4117,6 @@ struct ggml_backend_cuda_device_context {
    std::string name;
    std::string description;
    std::string pci_bus_id;
-    int op_offload_min_batch_size;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4553,7 +4544,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_L2_NORM:
            return true;
        case GGML_OP_RMS_NORM_BACK:
-            return ggml_is_contiguous(op->src[0]);
+            return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
            break;
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
@@ -4619,7 +4610,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            return true;
        case GGML_OP_SUM:
            return ggml_is_contiguous_rows(op->src[0]);
-        case GGML_OP_TOP_K:
        case GGML_OP_ARGSORT:
 #ifndef GGML_CUDA_USE_CUB
            return op->src[0]->ne[0] <= 1024;
@@ -4680,9 +4670,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
 }

 static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const int min_batch_size = 32;

-    return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
+    return get_op_batch_size(op) >= min_batch_size;
+
+    GGML_UNUSED(dev);
 }

 static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4793,16 +4785,6 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
        features.push_back({ "FA_ALL_QUANTS", "1" });
    #endif

-    {
-        const auto & info = ggml_cuda_info();
-        for (int id = 0; id < info.device_count; ++id) {
-            if (blackwell_mma_available(info.devices[id].cc)) {
-                features.push_back({ "BLACKWELL_NATIVE_FP4", "1"});
-                break;
-            }
-        }
-    }
-
    #undef _STRINGIFY
    #undef STRINGIFY

@@ -4850,7 +4832,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
        std::lock_guard<std::mutex> lock(mutex);
        if (!initialized) {
            ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
-            const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

            for (int i = 0; i < ggml_cuda_info().device_count; i++) {
                ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4864,7 +4845,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                char pci_bus_id[16] = {};
                snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
                dev_ctx->pci_bus_id = pci_bus_id;
-                dev_ctx->op_offload_min_batch_size = min_batch_size;

                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
--- a/ggml/src/ggml-cuda/mean.cu
+++ b/ggml/src/ggml-cuda/mean.cu
@@ -34,11 +34,13 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
            // CUDA_GRAPHS_DISABLED
            ((ncols > 65536) &&
             ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-              ctx.cuda_graph->is_enabled())) ||
+              ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
+              ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
        // CUDA_GRAPHS ENABLED
        ((ncols > 32768) &&
         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
-            ctx.cuda_graph->is_enabled()))) {
+           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
+           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
 #else
        (ncols > 65536)) {
 #endif // USE_CUDA_GRAPH
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -206,16 +206,10 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 16) {
+                // matrix C
 #if defined(RDNA3)
-                if constexpr (std::is_same_v<T, float> || std::is_same_v<T, int>) {
-                    // matrix C
-                    return 2 * l + (threadIdx.x / 16);
-                } else {
-                    // matrix A&B
-                    return l;
-                }
+                return 2 * l + (threadIdx.x / 16);
 #else
-                // matrix C is the transposed matrix A&B on RDNA4
                return ne * (threadIdx.x / 16) + l;
 #endif // defined(RDNA3)
            } else if constexpr (I == 16 && J == 8) {
@@ -627,21 +621,6 @@ namespace ggml_cuda_mma {

        return ret;
    }
-#elif defined(AMD_WMMA_AVAILABLE)
-    template <int I, int J>
-    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
-        tile<I, J/2, half2> ret;
-#pragma unroll
-        for (int l0 = 0; l0 < tile_float.ne; l0 += 2) {
-            ret.x[l0/2] = make_half2(tile_float.x[l0 + 0], tile_float.x[l0 + 1]);
-        }
-        return ret;
-    }
-
-    static __device__ __forceinline__ tile<8, 8, half2> get_transposed(const tile<16, 4, half2> & t) {
-        NO_DEVICE_CODE;
-        return tile<8, 8, half2>{};
-    }
 #else // Volta
    template <int I, int J>
    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -660,19 +639,6 @@ namespace ggml_cuda_mma {
    }
 #endif // defined(TURING_MMA_AVAILABLE)

-    static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
-#if defined(RDNA4)
-        const int row = t.get_i(0);
-        const int left_right = t.get_j(0) / 4;
-        const int up_down = row / 8;
-        const int idx = row % 8;
-        reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
-#else
-        GGML_UNUSED_VARS(t);
-        NO_DEVICE_CODE;
-#endif // defined(RDNA4)
-    }
-
    template <int I, int J, typename T, data_layout dl>
    static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
 #if defined(AMD_MFMA_AVAILABLE)
@@ -912,17 +878,6 @@ namespace ggml_cuda_mma {
            : "+r"(Dxi[2]), "+r"(Dxi[3])
            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
-#elif defined(AMD_WMMA_AVAILABLE)
-#if defined(RDNA4)
-        using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
-        halfx8_t& acc_frag = reinterpret_cast<halfx8_t&>(D.x[0]);
-        const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
-        const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
-        acc_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
-#else
-        GGML_UNUSED_VARS(D, A, B);
-        NO_DEVICE_CODE;
-#endif // defined(RDNA4)
 #else
        GGML_UNUSED_VARS(D, A, B);
        NO_DEVICE_CODE;
@@ -945,27 +900,6 @@ namespace ggml_cuda_mma {
 #endif // AMPERE_MMA_AVAILABLE
    }

-    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> &     D,
-                                                            const tile<16, 8, int> & A,
-                                                            const tile<8, 8, int> &  B,
-                                                            uint32_t                 a_scale,
-                                                            uint32_t                 b_scale) {
-#ifdef BLACKWELL_MMA_AVAILABLE
-        const int * Axi = (const int *) A.x;
-        const int * Bxi = (const int *) B.x;
-        float *     Dxi = (float *) D.x;
-
-        asm volatile(
-            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
-            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
-            "%10, {0, 0}, %11, {0, 0};"
-            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
-            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
-#else
-        GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
-#endif  // BLACKWELL_MMA_AVAILABLE
-    }
-
    static __device__ __forceinline__ void mma(
            tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
 #ifdef TURING_MMA_AVAILABLE
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -1,4 +1,3 @@
-#include "common.cuh"
 #include "mmq.cuh"
 #include "quantize.cuh"
 #include "mmid.cuh"
@@ -115,9 +114,6 @@ void ggml_cuda_mul_mat_q(
    const bool use_stream_k = (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
                            || GGML_CUDA_CC_IS_CDNA(cc);

-    // TODO: tighter pool buffer size vs q8 path
-    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
-
    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
@@ -127,24 +123,12 @@ void ggml_cuda_mul_mat_q(
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
-            if (use_native_mxfp4) {
-                static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
-                quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
-                                        ne11, ne12, ne13, stream);
-
-            } else {
-                quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
-                                       ne11, ne12, ne13, stream);
-            }
+            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
+                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
            CUDA_CHECK(cudaGetLastError());
        }

-        // Stride depends on quantization format
-        const int64_t s12 = use_native_mxfp4 ?
-                                ne11 * ne10_padded * sizeof(block_fp4_mmq) /
-                                    (8 * QK_MXFP4 * sizeof(int))  // block_fp4_mmq holds 256 values (8 blocks of 32)
-                                :
-                                ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
+        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
        const int64_t s13 = ne12*s12;

        const mmq_args args = {
@@ -190,20 +174,13 @@ void ggml_cuda_mul_mat_q(
    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
-        const int64_t s13 = src1->nb[3] / ts_src1;
-
-        if (use_native_mxfp4) {
-            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
-                                    ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
-        } else {
-            quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
-                                   ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
-        }
+        const int64_t s13 = src1->nb[2] / ts_src1;
+        quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
+            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
        CUDA_CHECK(cudaGetLastError());
    }

-    const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
-                                           ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
+    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
    const int64_t s13 = ne12*s12;

    // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
@@ -259,7 +236,7 @@ void ggml_cuda_op_mul_mat_q(
    GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_padded_row_size);
 }

-bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
 #ifdef GGML_CUDA_FORCE_CUBLAS
    return false;
 #endif // GGML_CUDA_FORCE_CUBLAS
@@ -320,10 +297,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
        if (GGML_CUDA_CC_IS_CDNA3(cc)) {
            return true;
        }
-        if (n_experts > 64 || ne11 <= 128) {
-            return true;
-        }
-        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
+        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
@@ -333,31 +307,6 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
    }

    if (amd_wmma_available(cc)) {
-        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
-            // High expert counts are almost always better on MMQ due to
-            //     the synchronization overhead in the cuBLAS/hipBLAS path:
-            // https://github.com/ggml-org/llama.cpp/pull/18202
-            if (n_experts >= 64) {
-                return true;
-            }
-
-            // For some quantization types MMQ can have lower peak TOPS than hipBLAS
-            //     so it's only faster for sufficiently small batch sizes:
-            switch (type) {
-                case GGML_TYPE_Q2_K:
-                    return ne11 <= 128;
-                case GGML_TYPE_Q6_K:
-                    return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
-                case GGML_TYPE_IQ2_XS:
-                case GGML_TYPE_IQ2_S:
-                    return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
-                default:
-                    return true;
-            }
-        }
-
-        // For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
-        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
        return true;
    }

--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -11,7 +11,6 @@ using namespace ggml_cuda_mma;

 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K 256
-#define MMQ_ITER_K_MXFP4_FP4    512
 #define MMQ_NWARPS 8

 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
@@ -45,15 +44,8 @@ struct block_q8_1_mmq {
    };
    int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
 };
-
-struct block_fp4_mmq {
-    uint32_t d4[4];       // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
-    int8_t   qs[4 * 32];  // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
-};
-
 static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
 static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1),      "Unexpected block_q8_1_mmq size");
-static_assert(sizeof(block_fp4_mmq)  == sizeof(block_q8_1_mmq),    "Unexpected block_fp4_mmq size");

 static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
    switch (type_x) {
@@ -137,14 +129,6 @@ static int get_mmq_y_host(const int cc) {
        ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ? 128 : 64);
 }

-static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
-#if defined(BLACKWELL_MMA_AVAILABLE)
-    return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
-#else
-    return MMQ_ITER_K;
-#endif // defined(BLACKWELL_MMA_AVAILABLE)
-}
-
 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIP)
 #if defined(RDNA1)
@@ -207,7 +191,6 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
 }

 #define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0                   + 4)
-#define MMQ_MMA_TILE_X_K_FP4  (2*MMQ_TILE_NE_K + 8                                       + 4)
 #define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0                   + 4)
 #define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K                           + 4)
 #define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2                         + 4)
@@ -218,8 +201,6 @@ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
-static_assert(MMQ_MMA_TILE_X_K_FP4  % 8 == 4, "Wrong padding.");
-static_assert(MMQ_MMA_TILE_X_K_FP4 == MMQ_MMA_TILE_X_K_Q8_1, "Wrong tile size for MXFP4");

 static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
    switch (type) {
@@ -228,7 +209,6 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
        case GGML_TYPE_Q5_0:    return MMQ_MMA_TILE_X_K_Q8_0;
        case GGML_TYPE_Q5_1:    return MMQ_MMA_TILE_X_K_Q8_1;
        case GGML_TYPE_Q8_0:    return MMQ_MMA_TILE_X_K_Q8_0;
-        // tile sizes are the same for Q8_1 and FP4 for blackwell
        case GGML_TYPE_MXFP4:   return MMQ_MMA_TILE_X_K_Q8_1;
        case GGML_TYPE_Q2_K:    return MMQ_MMA_TILE_X_K_Q2_K;
        case GGML_TYPE_Q3_K:    return MMQ_MMA_TILE_X_K_Q3_K;
@@ -248,8 +228,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
 }

 // block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales)
-#define MMQ_TILE_Y_K     (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1)
-#define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K
+#define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI8_1)

 static int mmq_get_granularity_host(const int mmq_x, const int cc) {
    if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
@@ -782,50 +761,6 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
    }
 }

-template <int mmq_y, bool need_check>
-static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restrict__ x,
-                                                            int * __restrict__ x_tile,
-                                                            const int kbx0,
-                                                            const int i_max,
-                                                            const int stride) {
-    constexpr int nwarps = mmq_get_nwarps_device();
-    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
-
-    int *      x_qs = (int *) x_tile;
-    uint32_t * x_sc = (uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
-
-    const int txi = threadIdx.x;
-
-    constexpr int iter_k = get_iter_k(GGML_TYPE_MXFP4);
-
-    constexpr int threads_per_row = iter_k / QK_MXFP4;  // each thread processes 1 block
-    constexpr int rows_per_warp   = warp_size / threads_per_row;
-    const int     kbx             = txi % threads_per_row;
-    const int     row_in_warp     = txi / threads_per_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
-        int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
-
-        if constexpr (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i * stride + kbx;
-
-        // quantize_mxfp4_mmq permutes nibbles to match the quantized format
-        const int k0 = kbx * 4;
-        memcpy(x_qs + i * MMQ_MMA_TILE_X_K_FP4 + k0, bxi->qs, 16);
-
-        // Load E8M0 scales: pack 2 consecutive scales into one uint32
-        if (kbx % 2 == 0) {
-            uint32_t e = bxi->e;
-            e |= ((bxi + 1)->e << 8);
-            x_sc[i * MMQ_MMA_TILE_X_K_FP4 + kbx / 2] = e;
-        }
-    }
-}
-
 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
@@ -996,78 +931,6 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 }

-template <int mmq_x, int mmq_y>
-static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
-                                                               const int * __restrict__ y,
-                                                               float * __restrict__ sum,
-                                                               const int k00) {
-    typedef tile<16, 8, int>   tile_A;
-    typedef tile<8, 8, int>    tile_B;
-    typedef tile<16, 8, float> tile_C;  // Output is float for native scaled MMA
-
-    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx           = rows_per_warp / tile_C::I;  // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
-
-    // Match layout from load_tiles_mxfp4_fp4
-    const int *      x_qs = (const int *) x;
-    const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
-    const int *      y_qs = (const int *) y + 4;
-    const uint32_t * y_sc = (const uint32_t *) y;
-
-    // tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
-    tile_A   A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
-    uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
-
-    // Block scale
-    // Each thread has to point to a 4 byte scale value
-    // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
-
-    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
-            const int k0 = k00 + k01;
-
-            load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
-                          MMQ_MMA_TILE_X_K_FP4);
-
-            // based on block-scaling document, 2 threads in each quad need to supply to the scale value
-            const int tidx         = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
-            scaleA[n][k01 / (2 * QI_MXFP4)] =
-                *(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
-#pragma unroll
-        for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
-            tile_B   B;
-            uint32_t scaleB;  // 2xN scales
-
-            load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
-
-            scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                tile_C C;
-
-                mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
-#pragma unroll
-                for (int l = 0; l < tile_C::ne; ++l) {
-                    sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
-                }
-            }
-        }
-    }
-}
-
 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
@@ -3246,13 +3109,8 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q8_0> {
 template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
    static constexpr int              vdr          = VDR_MXFP4_Q8_1_MMQ;
-#ifdef BLACKWELL_MMA_AVAILABLE
-    static constexpr load_tiles_mmq_t load_tiles  = load_tiles_mxfp4_fp4<mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
-#else
    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_mxfp4<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
-#endif // BLACKWELL_MMA_AVAILABLE
    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
 };

@@ -3385,26 +3243,17 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
    constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, need_check>;
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

-#if defined(BLACKWELL_MMA_AVAILABLE)
-    // FP4 tile stores 8 blocks
-    constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
-#else
-    constexpr int ne_block = 4 * QK8_1;
-#endif  // defined(BLACKWELL_MMA_AVAILABLE)
-
-    constexpr int ITER_K          = get_iter_k(type);
-    constexpr int blocks_per_iter = ITER_K / qk;
+    constexpr int blocks_per_iter = MMQ_ITER_K / qk;

    float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};

-    constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int);
-
    for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
        load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
+
        {
-            const int * by0 = y + ncols_y * (kb0 * qk / ne_block) * sz;
+            const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
 #pragma unroll
-            for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
+            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
                int l = l0 + threadIdx.y*warp_size + threadIdx.x;

                tile_y[l] = by0[l];
@@ -3418,9 +3267,9 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
        __syncthreads();

        {
-            const int * by0 = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz);
+            const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
 #pragma unroll
-            for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
+            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
                int l = l0 + threadIdx.y*warp_size + threadIdx.x;

                tile_y[l] = by0[l];
@@ -3552,10 +3401,8 @@ static __global__ void mul_mat_q(
    }
 #endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA

-    constexpr int ITER_K = get_iter_k(type);
-
    const     int64_t blocks_per_ne00 = ncols_x / qk;
-    constexpr int     blocks_per_iter = ITER_K / qk;
+    constexpr int     blocks_per_iter = MMQ_ITER_K / qk;

    // kbc == k block continuous, current index in continuous ijk space.
    int64_t kbc      = (int64_t) blockIdx.x     *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
@@ -3616,7 +3463,7 @@ static __global__ void mul_mat_q(
            __syncthreads();
        }

-        offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
+        offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
        offset_dst += it*mmq_y;

        const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
@@ -3683,7 +3530,7 @@ static __global__ void mul_mat_q(
        __syncthreads();
    }

-    offset_y += (col_low + jt * mmq_x) * (sizeof(block_q8_1_mmq) / sizeof(int));
+    offset_y   += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
    offset_dst += it*mmq_y;

    const int tile_x_max_i = nrows_x  - it*mmq_y - 1;
@@ -3706,9 +3553,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
        const int ncols_max) {
    constexpr int     mmq_y           = get_mmq_y_device();
    constexpr int     qk              = ggml_cuda_type_traits<type>::qk;
-    constexpr int     ITER_K          = get_iter_k(type);
-
-    constexpr int     blocks_per_iter = ITER_K / qk;
+    constexpr int     blocks_per_iter = MMQ_ITER_K / qk;
    const     int64_t blocks_per_ne00 = ncols_x / qk;

    constexpr int nwarps = mmq_get_nwarps_device();
@@ -3866,7 +3711,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
    const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
    const size_t nbs_ids = mmq_x*sizeof(int);
    const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
-    const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq));
+    const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
    return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
 }

@@ -4082,4 +3927,4 @@ void ggml_cuda_op_mul_mat_q(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);

-bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t n_experts);
+bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11);
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -25,8 +25,19 @@ static __global__ void norm_f32(
    }

    // sum up partial sums
-    extern __shared__ float2 s_sum2[];
-    mean_var = block_reduce<block_reduce_method::SUM, block_size>(mean_var, s_sum2);
+    mean_var = warp_reduce_sum(mean_var);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float2 s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
+    }

    const float mean = mean_var.x / ncols;
    const float var = mean_var.y / ncols - mean * mean;
@@ -50,8 +61,19 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
        tmp += x[j];
    }

-    extern __shared__ float s_sum[];
-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    const float mean = tmp / group_size;
    tmp = 0.0f;
@@ -62,7 +84,18 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
        tmp += xi * xi;
    }

-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    const float variance = tmp / group_size;
    const float scale = rsqrtf(variance + eps);
@@ -130,8 +163,22 @@ static __global__ void rms_norm_f32(const float * x,
    }

    // sum up partial sums
-    extern __shared__ float s_sum[];
-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int        warp_id = tid / WARP_SIZE;
+        const int        lane_id = tid % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = 0.0f;
+        if (lane_id < (block_size / WARP_SIZE)) {
+            tmp = s_sum[lane_id];
+        }
+        tmp = warp_reduce_sum(tmp);
+    }

    const float mean = tmp / ncols;
    const float scale = rsqrtf(mean + eps);
@@ -259,8 +306,19 @@ static __global__ void l2_norm_f32(
    }

    // sum up partial sums
-    extern __shared__ float s_sum[];
-    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+    tmp = warp_reduce_sum(tmp);
+    if constexpr (block_size > WARP_SIZE) {
+        static_assert(block_size == 1024, "unexpected block_size");
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
    const float scale = rsqrtf(fmaxf(tmp, eps * eps));
@@ -279,7 +337,7 @@ static void norm_f32_cuda(
        norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float2): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    }
 }

@@ -290,7 +348,7 @@ static void group_norm_f32_cuda(
        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, group_size, ne_elements, eps);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
    }
 }

@@ -300,10 +358,10 @@ static void rms_norm_f32_cuda(
    const dim3 blocks_num(nrows, nchannels, nsamples);
    if (ncols < 1024) {
        const dim3 block_dims(256, 1, 1);
-        rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    }
 }

@@ -346,12 +404,12 @@ static void rms_norm_mul_f32_cuda(const float *  x,
        const uint3 mul_nsamples_packed  = init_fastdiv_values(mul_nsamples);
        if (ncols < 1024) {
            const dim3 block_dims(256, 1, 1);
-            rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
        } else {
            const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
        }
@@ -367,14 +425,14 @@ static void rms_norm_mul_f32_cuda(const float *  x,
        const uint3 add_nsamples_packed  = init_fastdiv_values(add_nsamples);
        if (ncols < 1024) {
            const dim3 block_dims(256, 1, 1);
-            rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
                add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
                add_nchannels_packed, add_nsamples_packed);
        } else {
            const dim3 block_dims(1024, 1, 1);
-            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
+            rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
                x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
                add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
@@ -402,7 +460,7 @@ static void l2_norm_f32_cuda(
        l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    } else {
        const dim3 block_dims(1024, 1, 1);
-        l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        l2_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
    }
 }

--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -47,131 +47,6 @@ static __global__ void quantize_q8_1(
    y[ib].ds = make_half2(d, sum);
 }

-__device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
-    if (!(amax > 0.0f)) {
-        return 0;
-    }
-
-    // FP4 E2M1: max exponent (unbiased) is 2.
-    constexpr int FP4_E2M1_EMAX = 2;
-
-    const float e = log2f(amax);
-
-    // "even" -> round-to-nearest integer, ties-to-even
-    const int e_int = __float2int_rn(e);
-
-    const int shared_exp = e_int - FP4_E2M1_EMAX;
-
-    int biased = shared_exp + 127;
-
-    biased = max(biased, 0);
-    biased = min(biased, 254);
-
-    return static_cast<uint8_t>(biased);
-}
-
-// quantize values in the format mxfp4 is stored which is interleaved nibbles
-// i.e. a block a0-a31 is represented as a0a16,a1a17 ...a15a31
-static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
-                                          const int32_t * __restrict__ ids,
-                                          void * __restrict__ vy,
-                                          const int64_t ne00,
-                                          const int64_t s01,
-                                          const int64_t s02,
-                                          const int64_t s03,
-                                          const int64_t ne0,
-                                          const int     ne1,
-                                          const int     ne2) {
-    constexpr int vals_per_scale = 32;
-    constexpr int vals_per_warp  = 2 * vals_per_scale;  // Each warp processes 2 blocks of 32 = 64 values
-
-    const int warp_id = threadIdx.y;
-    const int lane_id_32 = threadIdx.x;
-
-    const int nwarps = blockDim.y;
-
-    const int64_t warp_start_offset = (blockIdx.y * nwarps + warp_id) * vals_per_warp;
-
-    if (warp_start_offset >= ne0) {
-        return;
-    }
-
-    const int64_t i1 = blockIdx.x;
-    const int64_t i2 = blockIdx.z % ne2;
-    const int64_t i3 = blockIdx.z / ne2;
-
-    const int64_t i01 = ids ? ids[i1] : i1;
-    const int64_t i02 = i2;
-    const int64_t i03 = i3;
-
-    block_fp4_mmq * y = (block_fp4_mmq *) vy;
-
-    const int64_t block_fp4_mmq_size = 8 * QK_MXFP4;  // 256 values
-    const int64_t ib0                = blockIdx.z * ((int64_t) ne1 * (ne0 / block_fp4_mmq_size));
-    const int64_t ib = ib0 + (warp_start_offset / block_fp4_mmq_size) * ne1 + blockIdx.x;
-    const int64_t quad_idx_in_block  = (warp_start_offset % block_fp4_mmq_size) / vals_per_warp;
-
-    const int group_id = lane_id_32 / 4;
-    const int lane_in_group = lane_id_32 % 4;
-    const int base = group_id * 2;
-    char2 * yqs2 = (char2 *) y[ib].qs;
-
-    const int64_t base_pos = i03 * s03 + i02 * s02 + i01 * s01;
-
-    uint8_t scales[2];
-
-#pragma unroll
-    for (int b = 0; b < 2; ++b) {
-        const int64_t i0 = warp_start_offset + b * vals_per_scale + lane_id_32;
-        const float xi = (i0 < ne00) ? x[base_pos + i0] : 0.0f;
-
-        float amax = fabsf(xi);
-#pragma unroll
-        for (int mask = 16; mask > 0; mask >>= 1) {
-            amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
-        }
-
-        const uint8_t e = compute_e8m0_scale(amax);
-        scales[b] = e;
-        const float inv_s = (amax == 0.0f) ? 0.0f : __frcp_rn(ggml_cuda_e8m0_to_fp32(e));
-
-#if CUDART_VERSION >= 12080
-        const float scaled_val = xi * inv_s;
-
-        const float val0 = __shfl_sync(0xFFFFFFFF, scaled_val, base, WARP_SIZE);
-        const float val1 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 16, WARP_SIZE);
-        const float val2 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 1, WARP_SIZE);
-        const float val3 = __shfl_sync(0xFFFFFFFF, scaled_val, base + 17, WARP_SIZE);
-
-        if (lane_in_group == 0) {
-            __nv_fp4x4_e2m1 fp4_packed(make_float4(val0, val1, val2, val3));
-
-            yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = *(char2 *) &fp4_packed;
-        }
-#else
-        // Fallback: manual FP4 conversion using LUT
-        const uint8_t q_val = ggml_cuda_float_to_fp4_e2m1(xi, inv_s);
-
-        const uint8_t q_lo_0 = __shfl_sync(0xFFFFFFFF, q_val, base,      WARP_SIZE);
-        const uint8_t q_lo_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 1,  WARP_SIZE);
-        const uint8_t q_hi_0 = __shfl_sync(0xFFFFFFFF, q_val, base + 16, WARP_SIZE);
-        const uint8_t q_hi_1 = __shfl_sync(0xFFFFFFFF, q_val, base + 17, WARP_SIZE);
-
-        if (lane_in_group == 0) {
-            char2 q;
-            q.x = (q_hi_0 << 4) | q_lo_0;
-            q.y = (q_hi_1 << 4) | q_lo_1;
-            yqs2[quad_idx_in_block * 16 + b * 8 + group_id] = q;
-        }
-#endif // CUDART_VERSION >= 12080
-    }
-
-    if (lane_id_32 == 0) {
-        // Store 2 scales packed into 1 uint32
-        y[ib].d4[quad_idx_in_block] = (scales[1] << 8) | scales[0];
-    }
-}
-
 template <mmq_q8_1_ds_layout ds_layout>
 static __global__ void quantize_mmq_q8_1(
        const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
@@ -315,29 +190,3 @@ void quantize_mmq_q8_1_cuda(
            break;
    }
 }
-
-void quantize_mmq_mxfp4_cuda(const float *                    x,
-                             const int32_t *                  ids,
-                             void *                           vy,
-                             [[maybe_unused]] const ggml_type type_src0,
-                             const int64_t                    ne00,
-                             const int64_t                    s01,
-                             const int64_t                    s02,
-                             const int64_t                    s03,
-                             const int64_t                    ne0,
-                             const int64_t                    ne1,
-                             const int64_t                    ne2,
-                             const int64_t                    ne3,
-                             cudaStream_t                     stream) {
-    GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
-
-    constexpr int nwarps = 8;
-    constexpr int vals_per_warp  = 2 * QK_MXFP4;
-    constexpr int vals_per_block = nwarps * vals_per_warp;
-
-    const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
-    const dim3    num_blocks(ne1, block_num_y, ne2 * ne3);
-    const dim3    block_size(WARP_SIZE, nwarps, 1);
-
-    quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
-}
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -25,17 +25,3 @@ void quantize_mmq_q8_1_cuda(
        const float * x, const int32_t * ids, void * vy,
        ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
        int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
-
-void quantize_mmq_mxfp4_cuda(const float *   x,
-                             const int32_t * ids,
-                             void *          vy,
-                             ggml_type       type_src0,
-                             int64_t         ne00,
-                             int64_t         s01,
-                             int64_t         s02,
-                             int64_t         s03,
-                             int64_t         ne0,
-                             int64_t         ne1,
-                             int64_t         ne2,
-                             int64_t         ne3,
-                             cudaStream_t    stream);
--- a/ggml/src/ggml-cuda/reduce_rows.cuh
+++ b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -28,8 +28,22 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
    }

    // sum up partial sums
-    __shared__ float shared_vals[32];
-    sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);
+    sum = warp_reduce_sum(sum);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float s_sum[32];
+        const int        warp_id = threadIdx.x / WARP_SIZE;
+        const int        lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = sum;
+        }
+        __syncthreads();
+        sum = 0.0f;
+        if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
+            sum = s_sum[lane_id];
+        }
+        sum = warp_reduce_sum(sum);
+    }

    if (col != 0) {
        return;
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -1,14 +1,6 @@
 #include "common.cuh"
 #include "ggml.h"
 #include "softmax.cuh"
-
-#ifdef GGML_USE_HIP
-#include <hip/hip_cooperative_groups.h>
-#else
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
-#endif // GGML_USE_HIP
-
 #include <cstdint>
 #include <utility>

@@ -75,6 +67,9 @@ static __global__ void soft_max_f32(

    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
    const float slope = get_alibi_slope(p.max_bias, i02, p.n_head_log2, p.m0, p.m1);

    extern __shared__ float data_soft_max_f32[];
@@ -99,7 +94,21 @@ static __global__ void soft_max_f32(
    }

    // find the max value in the block
-    max_val = block_reduce<block_reduce_method::MAX, block_size_template>(max_val, buf_iw);
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf_iw[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf_iw[lane_id];
+        max_val = warp_reduce_max(max_val);
+    }

    float tmp = 0.0f; // partial sum

@@ -117,7 +126,22 @@ static __global__ void soft_max_f32(
    }

    // find the sum of exps in the block
-    tmp = block_reduce<block_reduce_method::SUM, block_size_template>(tmp, buf_iw);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __syncthreads();
+        if (warp_id == 0) {
+            buf_iw[lane_id] = 0.0f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf_iw[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }

    if (sinks) {
        tmp += expf(sinks[i02] - max_val);
@@ -136,113 +160,6 @@ static __global__ void soft_max_f32(
        dst[col] = vals[col] * inv_sum;
    }
 }
-
-// TODO: Template to allow keeping ncols in registers if they fit
-static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
-                                                                float * __restrict__ dst,
-                                                                float * __restrict__ tmp_maxs,
-                                                                float * __restrict__ tmp_sums,
-                                                                const soft_max_params p) {
-    namespace cg = cooperative_groups;
-
-    const cg::grid_group g = cg::this_grid();
-
-    const int tid               = threadIdx.x;
-    const int col_start         = blockIdx.x * blockDim.x + tid;
-    const int n_elem_per_thread = 4;
-
-    float     local_vals[n_elem_per_thread] = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
-    float     local_max                     = -INFINITY;
-    const int step_size                     = gridDim.x * blockDim.x;
-    __shared__ float shared_vals[32];
-
-    // Compute thread-local max
-    for (int col = col_start; col < p.ncols;) {
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            local_vals[i] = idx < p.ncols ? x[idx] : -INFINITY;
-        }
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            local_max = fmaxf(local_max, local_vals[i]);
-        }
-        col += step_size * n_elem_per_thread;
-    }
-
-    // Compute CTA-level max
-    local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
-
-    // Store CTA-level max to GMEM
-    if (tid == 0) {
-        tmp_maxs[blockIdx.x] = local_max;
-    }
-    g.sync();
-
-    // Compute compute global max from CTA-level maxs
-    assert(gridDim.x < blockDim.x);  // currently we only support this case
-    if (tid < gridDim.x) {
-        local_max = tmp_maxs[tid];
-    } else {
-        local_max = -INFINITY;
-    }
-    local_max = block_reduce<block_reduce_method::MAX>(local_max, shared_vals);
-
-    // Compute softmax dividends, accumulate divisor
-    float tmp_expf = 0.0f;
-    for (int col = col_start; col < p.ncols;) {
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            local_vals[i] = idx < p.ncols ? x[idx] : -INFINITY;
-        }
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            if (idx < p.ncols) {
-                const float tmp = expf(local_vals[i] - local_max);
-                tmp_expf += tmp;
-                dst[idx] = tmp;
-            }
-        }
-        col += step_size * n_elem_per_thread;
-    }
-
-    // Reduce divisor within CTA
-    tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
-
-    // Store CTA-level sum to GMEM
-    if (tid == 0) {
-        tmp_sums[blockIdx.x] = tmp_expf;
-    }
-    g.sync();
-
-    // Compute global sum from CTA-level sums
-    if (tid < gridDim.x) {
-        tmp_expf = tmp_sums[tid];
-    } else {
-        tmp_expf = 0.0f;
-    }
-    tmp_expf = block_reduce<block_reduce_method::SUM>(tmp_expf, shared_vals);
-
-    // Divide dividend by global sum + store data
-    for (int col = col_start; col < p.ncols;) {
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            local_vals[i] = idx < p.ncols ? dst[idx] : -INFINITY;
-        }
-#pragma unroll
-        for (int i = 0; i < n_elem_per_thread; i++) {
-            const int idx = col + i * step_size;
-            if (idx < p.ncols) {
-                dst[idx] = local_vals[i] / tmp_expf;
-            }
-        }
-        col += step_size * n_elem_per_thread;
-    }
-}
-
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif // __clang__
@@ -299,31 +216,9 @@ static void launch_soft_max_kernels(const float * x, const T * mask, const float
    soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, sinks, dst, p);
 }

-__launch_bounds__(8*WARP_SIZE, 1) static __global__ void soft_max_f32_parallelize_cols(const float * __restrict__ x,
-                                                     float * __restrict__ dst,
-                                                     float * __restrict__ tmp_maxs,
-                                                     float * __restrict__ tmp_sums,
-                                                     const soft_max_params p)
-// We loop over all instead of parallelizing across gridDim.y as cooperative groups
-// currently only support synchronizing the complete grid if not launched as a cluster group
-// (which requires CC > 9.0)
-// https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/device-callable-apis.html#grid-synchronization
-// https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/device-callable-apis.html#class-cluster-group
-{
-    for (int rowx = 0; rowx < p.ne01 * p.ne02 * p.ne03; rowx++) {
-        soft_max_f32_parallelize_cols_single_row(x + int64_t(rowx) * p.ncols, dst + int64_t(rowx) * p.ncols, tmp_maxs,
-                                                 tmp_sums, p);
-    }
-}

-template <typename T>
-static void soft_max_f32_cuda(const float *                                x,
-                              const T *                                    mask,
-                              const float *                                sinks,
-                              float *                                      dst,
-                              const soft_max_params &                      params,
-                              cudaStream_t                                 stream,
-                              [[maybe_unused]] ggml_backend_cuda_context & ctx) {
+template<typename T>
+static void soft_max_f32_cuda(const float * x, const T * mask, const float * sinks, float * dst, const soft_max_params & params, cudaStream_t stream) {
    int nth = WARP_SIZE;
    const int64_t ncols_x = params.ncols;

@@ -341,25 +236,8 @@ static void soft_max_f32_cuda(const float *                                x,
    if (nbytes_shared <= smpbo) {
        launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, sinks, dst, params, stream, block_dims, block_nums, nbytes_shared);
    } else {
-        // Parallelize across SMs for top-p/dist-sampling
-        // The heuristic for parallelizing rows across SMs vs parallelizing single row & looping over all rows was done on the basis of a B6000 GPU and
-        // Can be adapted further for lower-SM-count GPUs, though keeping data in registers should be implemented first as that is the optimal solution.
-        if (ggml_cuda_info().devices[id].supports_cooperative_launch &&
-            ncols_x / (params.ne01 * params.ne02 * params.ne03) > 8192 && mask == nullptr && sinks == nullptr &&
-            params.scale == 1.0f && params.max_bias == 0.0f) {
-            ggml_cuda_pool_alloc<float> tmp_maxs_alloc(ctx.pool(), ggml_cuda_info().devices[id].nsm * sizeof(float));
-            ggml_cuda_pool_alloc<float> tmp_sums_alloc(ctx.pool(), ggml_cuda_info().devices[id].nsm * sizeof(float));
-
-            void * kernel_args[] = { (void *) &x, (void *) &dst, (void *) &tmp_maxs_alloc.ptr,
-                                     (void *) &tmp_sums_alloc.ptr, (void *) const_cast<soft_max_params *>(&params) };
-            CUDA_CHECK(cudaLaunchCooperativeKernel((void *) soft_max_f32_parallelize_cols,
-                                                   dim3(ggml_cuda_info().devices[id].nsm, 1, 1),
-                                                   dim3(WARP_SIZE * 8, 1, 1), kernel_args, 0, stream));
-        } else {
-            const size_t nbytes_shared_low = WARP_SIZE * sizeof(float);
-            soft_max_f32<false, 0, 0>
-                <<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, sinks, dst, params);
-        }
+        const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, sinks, dst, params);
    }
 }

@@ -437,9 +315,9 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    params.m1 = m1;

    if (use_f16) {
-        soft_max_f32_cuda(src0_d, (const half *) src1_d, (const float *) src2_d, dst_d, params, stream, ctx);
+        soft_max_f32_cuda(src0_d, (const half  *) src1_d, (const float *) src2_d, dst_d, params, stream);
    } else {
-        soft_max_f32_cuda(src0_d, (const float *) src1_d, (const float *) src2_d, dst_d, params, stream, ctx);
+        soft_max_f32_cuda(src0_d, (const float *) src1_d, (const float *) src2_d, dst_d, params, stream);
    }
 }

--- a/ggml/src/ggml-cuda/ssm-scan.cu
+++ b/ggml/src/ggml-cuda/ssm-scan.cu
@@ -114,7 +114,7 @@ __global__ void __launch_bounds__(splitD, 1)
 #endif // __clang__

 // assumes as many threads as d_state
-template <int c_factor, int d_state>
+template <int splitH, int d_state>
 __global__ void __launch_bounds__(d_state, 1)
    ssm_scan_f32_group(
        const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
@@ -125,25 +125,20 @@ __global__ void __launch_bounds__(d_state, 1)
        const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
        const int64_t s_off, const int64_t n_head, const int64_t d_head, const int64_t n_group, const int64_t n_tok) {

-    const int warp     = threadIdx.x / WARP_SIZE;
-    const int lane     = threadIdx.x % WARP_SIZE;
-    const int warp_idx = blockIdx.x  * c_factor + warp;
-
-    const int head_idx =  warp_idx / d_head;
-    const int head_off = (warp_idx % d_head) * sizeof(float);
-    const int seq_idx  = blockIdx.y;
+    const int head_idx = (blockIdx.x * splitH) / d_head;
+    const int head_off = ((blockIdx.x * splitH) % d_head) * sizeof(float);
+    const int seq_idx = blockIdx.y;

    const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);

-    // TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase
-    const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
-    const float * x_warp  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
-    const float * dt_warp = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
-    const float * A_warp  = (const float *) ((const char *) src3 + head_idx * src3_nb1);
-    const float * B_warp  = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
-    const float * C_warp  = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
-    float *       y_warp  = dst + (seq_idx * n_tok * n_head * d_head) + warp_idx;
-    float *       s_warp  = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
+    const float * s0_block = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
+    const float * x_block  = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + blockIdx.x * splitH * sizeof(float));
+    const float * dt_block = (const float *) ((const char *) src2 + (seq_idx * src2_nb2) + head_idx * sizeof(float));
+    const float * A_block  = (const float *) ((const char *) src3 + head_idx * src3_nb1);
+    const float * B_block  = (const float *) ((const char *) src4 + (seq_idx * src4_nb3) + (group_off));
+    const float * C_block  = (const float *) ((const char *) src5 + (seq_idx * src5_nb3) + (group_off));
+    float *       y_block  = dst + (seq_idx * n_tok * n_head * d_head) + blockIdx.x * splitH;
+    float *       s_block  = (float *) ((char *) dst + s_off + seq_idx * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);

    // strides across n_seq_tokens
    const int stride_x  = src1_nb2 / sizeof(float);
@@ -152,42 +147,80 @@ __global__ void __launch_bounds__(d_state, 1)
    const int stride_C  = src5_nb2 / sizeof(float);
    const int stride_y  = n_head * d_head;

-    float state[c_factor];
-    float state_sum = 0.0f;
+    float state[splitH];
+    // for the parallel accumulation
+    __shared__ float stateC[splitH * d_state];

 #pragma unroll
-    for (int j = 0; j < c_factor; j++) {
-        state[j] = s0_warp[WARP_SIZE * j + lane];
+    for (int j = 0; j < splitH; j++) {
+        state[j] = s0_block[j * d_state + threadIdx.x];
    }

    for (int64_t i = 0; i < n_tok; i++) {
-        // NOTE: dt_soft_plus, dA and x_dt have the same value for a warp here.
-        // Recalculation is intentional; sharing via shuffles/smem proved slower due to sync overhead.
-        const float dt_soft_plus = (dt_warp[i * stride_dt] <= 20.0f ? log1pf(expf(dt_warp[i * stride_dt])) : dt_warp[i * stride_dt]);
+        // TODO: only calculate dA and dt_soft_plus once per head instead of every splitH head elements
+        // TODO: only calculate B and C once per head group
+        // NOTE: dt_soft_plus, dA and x_dt have the same value across threads here.
+        float dt_soft_plus = dt_block[i * stride_dt];
+        if (dt_soft_plus <= 20.0f) {
+            dt_soft_plus = log1pf(expf(dt_soft_plus));
+        }
+        const float dA = expf(dt_soft_plus * A_block[0]);
+        const float B = B_block[i * stride_B + threadIdx.x];
+        const float C = C_block[i * stride_C + threadIdx.x];

-        state_sum = 0.0f;
-        const float dA   = expf(dt_soft_plus * A_warp[0]);
-        const float x_dt = x_warp[i * stride_x] * dt_soft_plus;
+        // across d_head
 #pragma unroll
-        for (int j = 0; j < c_factor; j++) {
-            const float B_val = B_warp[i * stride_B + WARP_SIZE * j + lane];
-            const float C_val = C_warp[i * stride_C + WARP_SIZE * j + lane];
-            state[j] = (state[j] * dA) + (B_val * x_dt);
-            state_sum += state[j] * C_val;
+        for (int j = 0; j < splitH; j++) {
+            const float x_dt = x_block[i * stride_x + j] * dt_soft_plus;
+
+            state[j] = (state[j] * dA) + (B * x_dt);
+
+            stateC[j * d_state + threadIdx.x] = state[j] * C;
        }

-        // parallel accumulation for output
-        state_sum = warp_reduce_sum(state_sum);
+        __syncthreads();

-        if (lane == 0) {
-            y_warp[i * stride_y] = state_sum;
+        // parallel accumulation for stateC
+        // TODO: simplify
+        {
+            static_assert((d_state & -d_state) == d_state, "the state size has to be a power of 2");
+            static_assert((splitH & -splitH) == splitH, "splitH has to be a power of 2");
+
+            // reduce until w matches the warp size
+            // TODO: does this work even when the physical warp size is 64?
+#pragma unroll
+            for (int w = d_state; w > WARP_SIZE; w >>= 1) {
+                // (assuming there are d_state threads)
+#pragma unroll
+                for (int j = 0; j < ((w >> 1) * splitH + d_state - 1) / d_state; j++) {
+                    // TODO: check for bank conflicts
+                    const int k = (threadIdx.x % (w >> 1)) + (d_state * (threadIdx.x / (w >> 1))) + j * d_state * (d_state / (w >> 1));
+                    stateC[k] += stateC[k + (w >> 1)];
+
+                }
+                __syncthreads();
+            }
+
+            static_assert(splitH >= d_state / WARP_SIZE);
+
+#pragma unroll
+            for (int j = 0; j < splitH / (d_state / WARP_SIZE); j++) {
+                float y = stateC[(threadIdx.x % WARP_SIZE) + d_state * (threadIdx.x / WARP_SIZE) + j * d_state * (d_state / WARP_SIZE)];
+                y = warp_reduce_sum(y);
+
+                // store the above accumulations
+                if (threadIdx.x % WARP_SIZE == 0) {
+                    const int k = threadIdx.x / WARP_SIZE + j * (d_state / WARP_SIZE);
+                    y_block[i * stride_y + k] = y;
+                }
+            }
        }
    }

    // write back the state
 #pragma unroll
-    for (int j = 0; j < c_factor; j++) {
-        s_warp[WARP_SIZE * j + lane] = state[j];
+    for (int j = 0; j < splitH; j++) {
+        s_block[j * d_state + threadIdx.x] = state[j];
    }
 }

@@ -198,24 +231,27 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
                              const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
                              const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
                              cudaStream_t stream) {
+    const int threads = 128;
    // NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
    if (src3_nb1 == sizeof(float)) {
        // Mamba-2
        if (d_state == 128) {
-            constexpr int threads   = 128;
-            constexpr int num_warps = threads/WARP_SIZE;
-
-            const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
-            ssm_scan_f32_group<128/WARP_SIZE, 128><<<blocks, threads, 0, stream>>>(
+            GGML_ASSERT(d_state % threads == 0);
+            // NOTE: can be any power of two between 4 and 64
+            const int splitH = 16;
+            GGML_ASSERT(head_dim % splitH == 0);
+            const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
+            ssm_scan_f32_group<16, 128><<<blocks, threads, 0, stream>>>(
                    src0, src1, src2, src3, src4, src5, src6, dst,
                    src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                    src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
        } else if (d_state == 256) { // Falcon-H1
-            constexpr int threads   = 256;
-            constexpr int num_warps = threads/WARP_SIZE;
-
-            const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
-            ssm_scan_f32_group<256/WARP_SIZE, 256><<<blocks, threads, 0, stream>>>(
+            const int threads = 256;
+            // NOTE: can be any power of two between 8 and 64
+            const int splitH = 16;
+            GGML_ASSERT(head_dim % splitH == 0);
+            const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
+            ssm_scan_f32_group<16, 256><<<blocks, threads, 0, stream>>>(
                    src0, src1, src2, src3, src4, src5, src6, dst,
                    src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                    src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
@@ -224,7 +260,6 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
        }
    } else {
        // Mamba-1
-        constexpr int threads = 128;
        GGML_ASSERT(n_head % threads == 0);
        GGML_ASSERT(head_dim == 1);
        GGML_ASSERT(n_group == 1);
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -1,96 +0,0 @@
-#include "argsort.cuh"
-#include "top-k.cuh"
-
-#ifdef GGML_CUDA_USE_CUB
-#    include <cub/cub.cuh>
-#    if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
-#        include <cuda/iterator>
-#        define CUB_TOP_K_AVAILABLE
-using namespace cub;
-#    endif  // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2
-#endif      // GGML_CUDA_USE_CUB
-
-#ifdef CUB_TOP_K_AVAILABLE
-
-static void top_k_cub(ggml_cuda_pool & pool,
-                      const float *    src,
-                      int *            dst,
-                      const int        ncols,
-                      const int        k,
-                      cudaStream_t     stream) {
-    auto requirements = cuda::execution::require(cuda::execution::determinism::not_guaranteed,
-                                                 cuda::execution::output_ordering::unsorted);
-    auto stream_env   = cuda::stream_ref{ stream };
-    auto env          = cuda::std::execution::env{ stream_env, requirements };
-
-    auto indexes_in = cuda::make_counting_iterator(0);
-
-    size_t temp_storage_bytes = 0;
-    DeviceTopK::MaxPairs(nullptr, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst, ncols, k,
-                         env);
-
-    ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
-    void *                        d_temp_storage = temp_storage_alloc.get();
-
-    DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, src, cuda::discard_iterator(), indexes_in, dst,
-                         ncols, k, env);
-}
-
-#elif defined(GGML_CUDA_USE_CUB)  // CUB_TOP_K_AVAILABLE
-
-static int next_power_of_2(int x) {
-    int n = 1;
-    while (n < x) {
-        n *= 2;
-    }
-    return n;
-}
-
-#endif                            // CUB_TOP_K_AVAILABLE
-
-void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0   = dst->src[0];
-    const float *       src0_d = (const float *) src0->data;
-    int *               dst_d  = (int *) dst->data;
-    cudaStream_t        stream = ctx.stream();
-
-    // are these asserts truly necessary?
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t    ncols = src0->ne[0];
-    const int64_t    nrows = ggml_nrows(src0);
-    const int64_t    k     = dst->ne[0];
-    ggml_cuda_pool & pool  = ctx.pool();
-#ifdef CUB_TOP_K_AVAILABLE
-    // TODO: Switch to `DeviceSegmentedTopK` for multi-row TopK once implemented
-    // https://github.com/NVIDIA/cccl/issues/6391
-    // TODO: investigate if there exists a point where parallelized argsort is faster than sequential top-k
-    for (int i = 0; i < nrows; i++) {
-        top_k_cub(pool, src0_d + i * ncols, dst_d + i * k, ncols, k, stream);
-    }
-#elif defined(GGML_CUDA_USE_CUB)  // CUB_TOP_K_AVAILABLE
-    // Fall back to argsort + copy
-    const int    ncols_pad      = next_power_of_2(ncols);
-    const size_t shared_mem     = ncols_pad * sizeof(int);
-    const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
-
-    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
-    int *                     tmp_dst = temp_dst_alloc.get();
-
-    if (shared_mem > max_shared_mem || ncols > 1024) {
-        argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-    } else {
-        argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-    }
-    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
-                                 cudaMemcpyDeviceToDevice, stream));
-#else                             // GGML_CUDA_USE_CUB
-    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
-    int *                     tmp_dst = temp_dst_alloc.get();
-    argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
-                                 cudaMemcpyDeviceToDevice, stream));
-#endif
-}
--- a/ggml/src/ggml-cuda/top-k.cuh
+++ b/ggml/src/ggml-cuda/top-k.cuh
@@ -1,3 +0,0 @@
-#include "common.cuh"
-
-void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/vendors/cuda.h
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@@ -10,10 +10,6 @@
 #include <cuda_fp8.h>
 #endif // CUDART_VERSION >= 12050

-#if CUDART_VERSION >= 12080
-#include <cuda_fp4.h>
-#endif // CUDART_VERSION >= 12080
-
 #if CUDART_VERSION < 11020
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -45,11 +45,9 @@
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
 #define cublasOperation_t hipblasOperation_t
-#define cudaDevAttrCooperativeLaunch hipDeviceAttributeCooperativeLaunch
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceGetAttribute hipDeviceGetAttribute
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@@ -72,7 +70,6 @@
 #define cudaHostRegisterPortable hipHostRegisterPortable
 #define cudaHostRegisterReadOnly hipHostRegisterReadOnly
 #define cudaHostUnregister hipHostUnregister
-#define cudaLaunchCooperativeKernel hipLaunchCooperativeKernel
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
@@ -138,8 +135,6 @@
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
-#define cudaFuncSetAttribute hipFuncSetAttribute
-#define cudaFuncAttributeMaxDynamicSharedMemorySize hipFuncAttributeMaxDynamicSharedMemorySize
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -61,7 +61,6 @@
 #define cudaHostRegisterPortable musaHostRegisterPortable
 #define cudaHostRegisterReadOnly musaHostRegisterReadOnly
 #define cudaHostUnregister musaHostUnregister
-#define cudaLaunchCooperativeKernel musaLaunchCooperativeKernel
 #define cudaLaunchHostFunc musaLaunchHostFunc
 #define cudaMalloc musaMalloc
 #define cudaMallocHost musaMallocHost