ggml-cpu : add basic RVV support for vector f32 ops (#15057)

* ggml-cpu : add basic RVV support for vector f32 ops

* ggml-cpu : add RVV support for f32 softmax
This commit is contained in:
xctan
2025-08-27 16:44:22 +08:00
committed by GitHub
parent fcca2182a1
commit 1cf123a343
5 changed files with 168 additions and 19 deletions

View File

@@ -18,6 +18,10 @@
#include <immintrin.h>
#endif
#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
@@ -94,24 +98,15 @@ extern "C" {
}
#elif defined(__riscv) && defined(__riscv_zfhmin)
static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;
__asm__(
"fmv.h.x %[f], %[h]\n\t"
"fcvt.s.h %[f], %[f]"
: [f] "=&f" (f)
: [h] "r" (h)
);
return f;
_Float16 hf;
memcpy(&hf, &h, sizeof(ggml_fp16_t));
return hf;
}
static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
__asm__(
"fcvt.h.s %[f], %[f]\n\t"
"fmv.x.h %[h], %[f]"
: [h] "=&r" (res)
: [f] "f" (f)
);
_Float16 hf = (_Float16)f;
memcpy(&res, &hf, sizeof(ggml_fp16_t));
return res;
}
@@ -1170,6 +1165,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
#elif defined(__riscv_v_intrinsic)
// compatible with vlen >= 128
#define GGML_SIMD
// F32
#define GGML_F32_STEP 16
#define GGML_F32_EPR 4
#define GGML_F32x4 vfloat32m1_t
#define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
#define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
#define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
#endif
// GGML_F32_ARR / GGML_F16_ARR