* CUDA: use mma PTX instructions for FlashAttention * __shfl_sync workaround for movmatrix * add __shfl_sync to HIP Co-authored-by: Diego Devesa <slarengh@gmail.com>
11 lines
312 B
Plaintext
11 lines
312 B
Plaintext
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
// NOTE(review): changes to the list of cases below presumably belong in that generator
// script rather than in this file -- confirm before editing.

#include "../fattn-mma-f16.cuh"

// One DECL_FATTN_MMA_F16_CASE invocation per case covered by this translation unit.
// The macro is not defined in this file (presumably it comes from the header included
// above -- TODO confirm), so what each invocation expands to is not visible from here.
// NOTE(review): the first argument varies per case (64, 80, 96, 112, 128, 256) while the
// second is always 8; verify the meaning of both against the macro definition.
DECL_FATTN_MMA_F16_CASE(64, 8);
DECL_FATTN_MMA_F16_CASE(80, 8);
DECL_FATTN_MMA_F16_CASE(96, 8);
DECL_FATTN_MMA_F16_CASE(112, 8);
DECL_FATTN_MMA_F16_CASE(128, 8);
DECL_FATTN_MMA_F16_CASE(256, 8);