CUDA: add option to compile without FlashAttention (#12025)

This commit is contained in:
Johannes Gäßler
2025-02-22 20:44:34 +01:00
committed by GitHub
parent 36c258ee92
commit a28e0d5eb1
13 changed files with 46 additions and 31 deletions

View File

@@ -51,7 +51,7 @@ static __global__ void flash_attn_ext_f16(
const int ne1,
const int ne2,
const int ne3) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
@@ -425,7 +425,7 @@ static __global__ void flash_attn_ext_f16(
}
#else
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // defined(FLASH_ATTN_AVAILABLE) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
}
constexpr int get_max_power_of_2(int x) {