ggml: aarch64: Implement SVE F16 kernels for vector functions (#15115)

* Added SVE implementation for vec_dot_fp16 kernel

* removed white spaces

* Added comment

* removed white spaces

* changed GGML_F16x_VEC_FMA for code consistency

* Update vec.h

---------

Co-authored-by: vithulep <p.m.vithule1517@gmail.com>
This commit is contained in:
Prashant Vithule
2025-09-01 23:43:16 +05:30
committed by GitHub
parent 4b20d8b7e3
commit a0c2b207c5
3 changed files with 404 additions and 92 deletions

View File

@@ -215,6 +215,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
// F16 SVE
//
// Half-precision primitives built on SVE intrinsics. The non-_IMPL wrappers
// forward their arguments together with DEFAULT_PG16, so callers never pass a
// predicate themselves.

// Predicates produced by svptrue_b32() / svptrue_b16().
#define DEFAULT_PG32 svptrue_b32()
#define DEFAULT_PG16 svptrue_b16()

// Vector type plus zero / broadcast constructors.
#define GGML_F32Cxt           svfloat16_t
#define GGML_F32Cxt_ZERO      svdup_n_f16(0.0f)
#define GGML_F32Cxt_SET1(val) svdup_n_f16(val)

// Memory access: the pointer argument is cast to (const) __fp16 * before use.
#define GGML_F32Cxt_LOAD(src_ptr)   svld1_f16(DEFAULT_PG16, (const __fp16 *)(src_ptr))
#define GGML_F32Cxt_STORE(out, vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(out), (vec))

// FMA(acc, lhs, rhs): the operands are handed to svmad_f16_x as
// (lhs, rhs, acc), i.e. multiplicands first and the addend last.
#define GGML_F32Cxt_FMA_IMPL(pred, acc, lhs, rhs) svmad_f16_x((pred), (lhs), (rhs), (acc))
#define GGML_F32Cxt_FMA(...)                      GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)

// Element-wise addition and multiplication.
#define GGML_F32Cxt_ADD_IMPL(pred, lhs, rhs) svadd_f16_x((pred), (lhs), (rhs))
#define GGML_F32Cxt_ADD(...)                 GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
#define GGML_F32Cxt_MUL_IMPL(pred, lhs, rhs) svmul_f16_x((pred), (lhs), (rhs))
#define GGML_F32Cxt_MUL(...)                 GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)

// Reduction of four accumulators is delegated to GGML_F16xt_REDUCE_MIXED.
#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
// Generic F16 vector names mapped onto the GGML_F32Cxt primitives.
// LOAD and STORE accept a trailing index argument that is not used in the
// expansion; STORE casts its destination pointer to __fp16 * before forwarding.
#define GGML_F16x_VEC                      GGML_F32Cxt
#define GGML_F16x_VEC_ZERO                 GGML_F32Cxt_ZERO
#define GGML_F16x_VEC_SET1                 GGML_F32Cxt_SET1
#define GGML_F16x_VEC_LOAD(ptr, idx)       GGML_F32Cxt_LOAD(ptr)
#define GGML_F16x_VEC_STORE(ptr, vec, idx) GGML_F32Cxt_STORE((__fp16 *)(ptr), vec)
#define GGML_F16x_VEC_FMA                  GGML_F32Cxt_FMA
#define GGML_F16x_VEC_ADD                  GGML_F32Cxt_ADD
#define GGML_F16x_VEC_MUL                  GGML_F32Cxt_MUL
#define GGML_F16x_VEC_REDUCE               GGML_F32Cxt_REDUCE
// Horizontal reductions for the F16 SVE path.
//
// GGML_F16xt_REDUCE_ONE(a)
//   Expression macro: adds the lanes of one svfloat16_t with svaddv_f16 under
//   DEFAULT_PG16 and yields the scalar result.
#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)

// GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4)
//   Statement macro: folds four svfloat16_t accumulators into one scalar.
//     res        - lvalue that receives the total, converted to ggml_float
//     sum1..sum4 - accumulator lvalues; sum1 and sum3 are OVERWRITTEN with
//                  partial sums (sum1 ends up holding sum1+sum2+sum3+sum4)
//   The pairwise adds and the final svaddv_f16 are all f16 operations; only
//   the resulting scalar is widened to ggml_float.
//   The body is wrapped in do { } while (0) so that "MACRO(...);" is a single
//   statement and stays correct inside an unbraced if/else. Invocations must
//   therefore be terminated with a semicolon.
#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
do { \
    sum1 = svadd_f16_x(pg16, sum1, sum2); \
    sum3 = svadd_f16_x(pg16, sum3, sum4); \
    sum1 = svadd_f16_x(pg16, sum1, sum3); \
    __fp16 ggml_f16xt_reduce_tmp = svaddv_f16(pg16, sum1); \
    (res) = (ggml_float) ggml_f16xt_reduce_tmp; \
} while (0)
#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
// F16 NEON
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)