CUDA: skip masked KV slices for all FA kernels (#14924)

2025-07-30 15:46:13 +02:00
parent 00131d6eaf
commit 92b8810ec7
9 changed files with 120 additions and 56 deletions
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -13,6 +13,7 @@ static __global__ void flash_attn_tile_ext_f16(
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
+        const int  * __restrict__ KV_max,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
@@ -90,7 +91,8 @@ static __global__ void flash_attn_tile_ext_f16(

    __syncthreads();

-    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
+    const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        half kqmax_new[ncols/nwarps];