ggml-cpu : add basic RVV support for vector f32 ops (#15057)
* ggml-cpu : add basic RVV support for vector f32 ops * ggml-cpu : add RVV support for f32 softmax
This commit is contained in:
@@ -18,6 +18,10 @@
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -94,24 +98,15 @@ extern "C" {
|
||||
}
|
||||
#elif defined(__riscv) && defined(__riscv_zfhmin)
|
||||
static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
float f;
|
||||
__asm__(
|
||||
"fmv.h.x %[f], %[h]\n\t"
|
||||
"fcvt.s.h %[f], %[f]"
|
||||
: [f] "=&f" (f)
|
||||
: [h] "r" (h)
|
||||
);
|
||||
return f;
|
||||
_Float16 hf;
|
||||
memcpy(&hf, &h, sizeof(ggml_fp16_t));
|
||||
return hf;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
__asm__(
|
||||
"fcvt.h.s %[f], %[f]\n\t"
|
||||
"fmv.x.h %[h], %[f]"
|
||||
: [h] "=&r" (res)
|
||||
: [f] "f" (f)
|
||||
);
|
||||
_Float16 hf = (_Float16)f;
|
||||
memcpy(&res, &hf, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -1170,6 +1165,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
|
||||
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
|
||||
// compatible with vlen >= 128
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32
|
||||
|
||||
#define GGML_F32_STEP 16
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 vfloat32m1_t
|
||||
#define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
|
||||
#define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
|
||||
#define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
|
||||
#define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
|
||||
#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
|
||||
#define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
|
||||
#define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
#endif
|
||||
|
||||
// GGML_F32_ARR / GGML_F16_ARR
|
||||
|
||||
Reference in New Issue
Block a user