CUDA: GEMM for FP32/FP16/BF16 and ne11 <= 16 (#15131)

* CUDA: GEMM for FP32/FP16/BF16 and ne11 <= 16
This commit is contained in:
Johannes Gäßler
2025-08-07 10:53:21 +02:00
committed by GitHub
parent 20638e4f16
commit 1d72c84188
13 changed files with 750 additions and 225 deletions

View File

@@ -200,6 +200,7 @@
#endif
// Expose hip_bfloat16 under the name nv_bfloat16 so code written against the
// nv_* BF16 type name compiles unchanged (presumably the CUDA spelling; confirm).
typedef hip_bfloat16 nv_bfloat16;
// Stand-in for the packed 2x BF16 type: short2 is used only as a placeholder.
// NOTE(review): presumably chosen for matching size; it provides no BF16
// arithmetic of its own — confirm how callers use nv_bfloat162 before relying on it.
typedef short2 nv_bfloat162; // FIXME there is no 2x BF16 type being defined in bfloat16.h, ad-hoc compilation fix
// 4-element vectors of signed / unsigned 8-bit integers, declared with the
// ext_vector_type compiler extension attribute.
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));