metal: optimise GGML_OP_SUM (#16559)

* optimise GGML_OP_SUM

* add non-contiguous tests by permuting the input

* change tests to require full contiguity of OP_SUM

* cuda : add check GGML_OP_SUM

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Sam/Samuel
2025-10-15 23:05:56 +09:00
committed by GitHub
parent 17304cbcc1
commit f4ce81c45e
5 changed files with 71 additions and 11 deletions

View File

@@ -662,6 +662,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_LOG:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SUM:
return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_SOFT_MAX: