metal : add support for non-padded FA KV (#16148)

* metal : pad K, V and Mask when needed * cont : simplify * cuda : add TODO about KV padding requirement * metal : add comments * metal : remove mask padding requirement
2025-10-07 08:23:30 +03:00
parent 1d6092fc72
commit 0a319bb75e
9 changed files with 460 additions and 72 deletions
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -39,6 +39,7 @@ size_t ggml_metal_op_mul_mat_id_extra_ids(const struct ggml_tensor * op);
 // return true if we should use the FA vector kernel for this op
 bool ggml_metal_op_flash_attn_ext_use_vec(const struct ggml_tensor * op);

+size_t ggml_metal_op_flash_attn_ext_extra_pad(const struct ggml_tensor * op);
 size_t ggml_metal_op_flash_attn_ext_extra_tmp(const struct ggml_tensor * op);

 int ggml_metal_op_concat            (ggml_metal_op_t ctx, int idx);