CUDA: fix non-cont. inputs for batched mat mul (#13155)

This commit is contained in:
Johannes Gäßler
2025-04-29 16:00:27 +02:00
committed by GitHub
parent 7d3af70b08
commit cdf76586b2
4 changed files with 94 additions and 42 deletions

View File

@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
using to_t_cuda_t = void (*)(const void * x, T * y, int64_t k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -14,3 +14,13 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type);
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
// TODO more general support for non-contiguous inputs
template<typename T>
using to_t_nc_cuda_t = void (*)(const void * x, T * y,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);
typedef to_t_nc_cuda_t<half> to_fp16_nc_cuda_t;
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);