metal : optimize FA vec for large sequences and BS <= 8 (#15566)

* metal : optimize FA vec for large heads and sequences

* metal : adjust small-batch mul mv kernels

ggml-ci

* batched-bench : fix total speed computation

ggml-ci

* cont : add comments

ggml-ci
This commit is contained in:
Georgi Gerganov
2025-08-26 14:22:14 +03:00
committed by GitHub
parent 79a546220c
commit b3964c1e89
4 changed files with 183 additions and 25 deletions

View File

@@ -249,6 +249,7 @@ typedef struct {
uint64_t nb33;
int32_t ne1;
int32_t ne2;
int32_t ne3;
float scale;
float max_bias;
float m0;
@@ -257,6 +258,11 @@ typedef struct {
float logit_softcap;
} ggml_metal_kargs_flash_attn_ext;
// Plain argument struct holding two 32-bit signed integers.
// NOTE(review): the name suggests these are the arguments for a reduction step
// that accompanies the flash-attention (flash_attn_ext) kernel - confirm against
// the code that fills and consumes this struct.
// NOTE(review): field order and types presumably must match the consumer's view
// of the struct byte-for-byte - do not reorder or resize without checking.
typedef struct {
// NOTE(review): presumably the number of rows to process - confirm with caller.
int32_t nrows;
// NOTE(review): by the neXY naming used in the neighbouring structs this is
// presumably the size of dimension 0 of source tensor 2 - TODO confirm.
int32_t ne20;
} ggml_metal_kargs_flash_attn_ext_reduce;
typedef struct {
int32_t ne00;
int32_t ne02;