vulkan: Support FA with any multiple of 8 head sizes (#15537)

The scalar FA shader already handled multiples of 8. The coopmat1 FA shader assumed 16x16x16 matrix tiles, so its Q and K shared memory allocations need the HSK dimension padded to a multiple of 16. NVIDIA's coopmat2 implementation requires multiples of 16 for N and K, and needs the matrix dimensions padded and loads clamped. Store the FA pipelines in a map, indexed by the pipeline state.
2025-08-24 04:24:25 -05:00
parent a9c6ffcbfa
commit c9a24fb932
5 changed files with 143 additions and 137 deletions
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
@@ -46,14 +46,14 @@ const uint32_t MatBc = 16;
 shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
 shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];

-const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4
+const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4
 shared f16vec4 Qf[Br * qstride];

 // Avoid padding for hsk==256 to make it fit in 48KB shmem.
 const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br;
 shared ACC_TYPE sfsh[Bc * sfshstride];

-const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4
+const uint32_t kshstride = HSK_pad / 4 + 2; // in units of f16vec4
 shared f16vec4 ksh[Bc * kshstride];

 shared float slope[Br];
@@ -74,6 +74,21 @@ void main() {

 #define tile_row(r) (row_tid * rows_per_thread + (r))

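+    // Zero-initialize shared memory for Q/K when HSK is not a multiple of 16 (HSK_pad > HSK), so the padding columns read by the HSK_pad-wide K * Q^T loop hold zeros instead of uninitialized data.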
+    if ((HSK % 16) != 0) {
+        [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) {
+            if (i + tid < Br * qstride) {
+                Qf[i + tid] = f16vec4(0);
+            }
+        }
+        [[unroll]] for (uint i = 0; i < Bc * kshstride; i += gl_WorkGroupSize.x) {
+            if (i + tid < Bc * kshstride) {
+                ksh[i + tid] = f16vec4(0);
+            }
+        }
+        barrier();
+    }
+
    uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;

    [[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) {
@@ -151,14 +166,14 @@ void main() {
        }
        barrier();

-        // K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br
+        // K * Q^T -> S^T: Bc x HSK_pad * HSK_pad x Br -> Bc x Br
        // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
        // This is written transposed in order to allow for N being 8 if implementations need it
        coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
        coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
        coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;

-        for (uint32_t d = 0; d < HSK / 16; ++d) {
+        for (uint32_t d = 0; d < HSK_pad / 16; ++d) {
            coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);

            uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;