CUDA: MoE helper in device code, better tile sizes (#15525)

* CUDA: MoE helper in device code, better tile sizes

* reduce superfluous CUDA blocks
This commit is contained in:
Johannes Gäßler
2025-08-25 17:23:40 +02:00
committed by GitHub
parent dfd9b5f6c7
commit 5eff6ec9b1
4 changed files with 221 additions and 68 deletions

View File

@@ -22,7 +22,10 @@
// Maps the CU_-prefixed access-flag identifier to the hip-prefixed one.
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
// Expands `fn` once, stores the result in a hipError_t, and calls GGML_ABORT with the
// hipGetErrorString() text when the result is not hipSuccess.
// NOTE(review): this expands to a bare { } block rather than do { } while (0), so a trailing
// `;` at the call site becomes a separate empty statement (relevant inside an unbraced
// if/else) -- confirm against the call sites before changing the form.
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
// The *_sync names below expand to the corresponding names without the _sync suffix.
// The leading `mask` argument is accepted but does not appear in the expansion; the
// remaining arguments are forwarded unchanged and in the same order.
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_up_sync(mask, var, laneMask, width) __shfl_up(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define __all_sync(mask, var) __all(var)
#define __any_sync(mask, var) __any(var)
// cublas-prefixed names are plain aliases for the hipblas-prefixed names.
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx