CUDA: FA support for Deepseek (Ampere or newer) (#13306)
* CUDA: FA support for Deepseek (Ampere or newer) * do loop unrolling via C++ template
This commit is contained in:
@@ -118,7 +118,7 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
|
||||
Reference in New Issue
Block a user