[AMD] Add silu_and_mul, gelu_and_mul, gelu_tanh_and_mul, and gelu_quick kernels for AMD GPUs (#7135)

Co-authored-by: yiakwy-xpu-ml-framework-team <961186938@qq.com>
Co-authored-by: HAI <hixiao@gmail.com>
This commit is contained in:
Hubert Lu
2025-07-24 23:44:28 -07:00
committed by GitHub
parent 7ad6b766c5
commit af4b9bae95
17 changed files with 1226 additions and 61 deletions

View File

@@ -0,0 +1,177 @@
#pragma once
#if USE_ROCM
#include <hip/hip_bf16.h>
#include <hip/hip_common.h>
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
// Alias the HIP bfloat16 scalar and packed-pair types to the nv_* names that
// the vec_t specializations in this header are written against.
using nv_bfloat16 = __hip_bfloat16;
using nv_bfloat162 = __hip_bfloat162;
// Builds a packed bfloat16 pair whose .x member is `x` and whose .y member is `y`.
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 make_bfloat162(const __hip_bfloat16 x, const __hip_bfloat16 y) {
  __hip_bfloat162 packed;
  packed.x = x;
  packed.y = y;
  return packed;
}
namespace sgl_hip {
// nv_bfloat16 x 1
template <>
struct vec_t<nv_bfloat16, 1> {
  // Storage for the single element.
  nv_bfloat16 data;

  // Element access relative to the address of `data`. The index is not
  // range-checked; only i == 0 refers to storage owned by this object.
  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return (&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return (&data)[i];
  }

  // Address of the stored element.
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return &data;
  }

  // Copies one element from `ptr` into this vector.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *ptr;
  }

  // Copies this vector's element out to `ptr`.
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *ptr = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 1>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// nv_bfloat16 x 2
template <>
struct vec_t<nv_bfloat16, 2> {
  // Both elements are kept in one packed nv_bfloat162 value.
  nv_bfloat162 data;

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return ptr()[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }

  // Reads both elements as one nv_bfloat162. `ptr` is reinterpreted without
  // any check, so the caller must supply an address valid for that type.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *reinterpret_cast<const nv_bfloat162*>(ptr);
  }

  // Writes both elements as one nv_bfloat162; same pointer requirement as load().
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *reinterpret_cast<nv_bfloat162*>(ptr) = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 2>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// nv_bfloat16 x 4
template <>
struct vec_t<nv_bfloat16, 4> {
  // The four elements are stored as the raw bits of a single uint2.
  uint2 data;

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return ptr()[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }

  // Reads all four elements as one uint2 value. `ptr` is reinterpreted without
  // any check, so the caller must supply an address valid for uint2 access.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *reinterpret_cast<const uint2*>(ptr);
  }

  // Writes all four elements as one uint2 value; same pointer requirement as load().
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *reinterpret_cast<uint2*>(ptr) = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 4>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// nv_bfloat16 x 8 or more
template <size_t vec_size>
struct vec_t<nv_bfloat16, vec_size> {
  // Storage is sized as vec_size / 8 uint4 chunks. Any remainder would be
  // silently dropped and operator[] could then index past `data`, so reject
  // such sizes at compile time.
  static_assert(vec_size % 8 == 0, "vec_t<nv_bfloat16, N>: N must be a multiple of 8");

  // Raw storage: one uint4 per group of eight elements.
  uint4 data[vec_size / 8];

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }

  // Reads vec_size elements from `ptr`, one uint4 chunk at a time. `ptr` is
  // reinterpreted without any check, so the caller must supply an address
  // valid for uint4 access.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    const uint4* chunks = reinterpret_cast<const uint4*>(ptr);
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      data[i] = chunks[i];
    }
  }

  // Writes vec_size elements to `ptr`, one uint4 chunk at a time; same pointer
  // requirement as load().
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    uint4* chunks = reinterpret_cast<uint4*>(ptr);
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      chunks[i] = data[i];
    }
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }

  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
} // namespace sgl_hip
#endif

View File

@@ -0,0 +1,129 @@
#pragma once
#if USE_ROCM
#include <hip/hip_common.h>
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
namespace sgl_hip {
// float x 1
template <>
struct vec_t<float, 1> {
  // Storage for the single element.
  float data;

  // Element access relative to the address of `data`. The index is not
  // range-checked; only i == 0 refers to storage owned by this object.
  SGL_HIP_INLINE float& operator[](size_t i) {
    return (&data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return (&data)[i];
  }

  // Address of the stored element.
  SGL_HIP_INLINE float* ptr() {
    return &data;
  }

  // Copies one element from `ptr` into this vector.
  SGL_HIP_INLINE void load(const float* ptr) {
    data = *ptr;
  }

  // Copies this vector's element out to `ptr`.
  SGL_HIP_INLINE void store(float* ptr) const {
    *ptr = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 1>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// float x 2
template <>
struct vec_t<float, 2> {
  // Both elements are kept in one float2 value.
  float2 data;

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE float& operator[](size_t i) {
    return ptr()[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(&data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }

  // Reads both elements as one float2. `ptr` is reinterpreted without any
  // check, so the caller must supply an address valid for float2 access.
  SGL_HIP_INLINE void load(const float* ptr) {
    data = *reinterpret_cast<const float2*>(ptr);
  }

  // Writes both elements as one float2; same pointer requirement as load().
  SGL_HIP_INLINE void store(float* ptr) const {
    *reinterpret_cast<float2*>(ptr) = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 2>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// float x 4 or more
template <size_t vec_size>
struct vec_t<float, vec_size> {
  // Storage is sized as vec_size / 4 float4 chunks. Any remainder would be
  // silently dropped and operator[] could then index past `data`, so reject
  // such sizes at compile time.
  static_assert(vec_size % 4 == 0, "vec_t<float, N>: N must be a multiple of 4");

  // Raw storage: one float4 per group of four elements.
  float4 data[vec_size / 4];

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }

  // Reads vec_size elements from `ptr`, one float4 chunk at a time. `ptr` is
  // reinterpreted without any check, so the caller must supply an address
  // valid for float4 access.
  SGL_HIP_INLINE void load(const float* ptr) {
    const float4* chunks = reinterpret_cast<const float4*>(ptr);
#pragma unroll
    for (size_t i = 0; i < vec_size / 4; ++i) {
      data[i] = chunks[i];
    }
  }

  // Writes vec_size elements to `ptr`, one float4 chunk at a time; same
  // pointer requirement as load().
  SGL_HIP_INLINE void store(float* ptr) const {
    float4* chunks = reinterpret_cast<float4*>(ptr);
#pragma unroll
    for (size_t i = 0; i < vec_size / 4; ++i) {
      chunks[i] = data[i];
    }
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }

  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
} // namespace sgl_hip
#endif

View File

@@ -0,0 +1,172 @@
#pragma once
#if USE_ROCM
#include <hip/hip_common.h>
#include <hip/hip_fp16.h>
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
// Short names for the HIP half-precision scalar and packed-pair types; the
// vec_t specializations in this header are written against these aliases.
using half = __half;
using half2 = __half2;
namespace sgl_hip {
// half x 1
template <>
struct vec_t<half, 1> {
  // Storage for the single element.
  half data;

  // Element access relative to the address of `data`. The index is not
  // range-checked; only i == 0 refers to storage owned by this object.
  SGL_HIP_INLINE half& operator[](size_t i) {
    return (&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return (&data)[i];
  }

  // Address of the stored element.
  SGL_HIP_INLINE half* ptr() {
    return &data;
  }

  // Copies one element from `ptr` into this vector.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *ptr;
  }

  // Copies this vector's element out to `ptr`.
  SGL_HIP_INLINE void store(half* ptr) const {
    *ptr = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 1>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// half x 2
template <>
struct vec_t<half, 2> {
  // Both elements are kept in one packed half2 value.
  half2 data;

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE half& operator[](size_t i) {
    return ptr()[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }

  // Reads both elements as one half2. `ptr` is reinterpreted without any
  // check, so the caller must supply an address valid for half2 access.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *reinterpret_cast<const half2*>(ptr);
  }

  // Writes both elements as one half2; same pointer requirement as load().
  SGL_HIP_INLINE void store(half* ptr) const {
    *reinterpret_cast<half2*>(ptr) = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 2>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// half x 4
template <>
struct vec_t<half, 4> {
  // The four elements are stored as the raw bits of a single uint2.
  uint2 data;

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE half& operator[](size_t i) {
    return ptr()[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }

  // Reads all four elements as one uint2 value. `ptr` is reinterpreted without
  // any check, so the caller must supply an address valid for uint2 access.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *reinterpret_cast<const uint2*>(ptr);
  }

  // Writes all four elements as one uint2 value; same pointer requirement as load().
  SGL_HIP_INLINE void store(half* ptr) const {
    *reinterpret_cast<uint2*>(ptr) = data;
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename SrcT>
  SGL_HIP_INLINE void cast_from(const vec_t<SrcT, 4>& src) {
    cast_from_impl(*this, src);
  }

  template <typename SrcT>
  SGL_HIP_INLINE void cast_load(const SrcT* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename DstT>
  SGL_HIP_INLINE void cast_store(DstT* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
// half x 8 or more
template <size_t vec_size>
struct vec_t<half, vec_size> {
  // Storage is sized as vec_size / 8 uint4 chunks. Any remainder would be
  // silently dropped and operator[] could then index past `data`, so reject
  // such sizes at compile time.
  static_assert(vec_size % 8 == 0, "vec_t<half, N>: N must be a multiple of 8");

  // Raw storage: one uint4 per group of eight elements.
  uint4 data[vec_size / 8];

  // Element access through a scalar view of `data`; `i` is not range-checked.
  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(data)[i];
  }

  // Scalar-typed pointer to the start of the packed storage.
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }

  // Reads vec_size elements from `ptr`, one uint4 chunk at a time. `ptr` is
  // reinterpreted without any check, so the caller must supply an address
  // valid for uint4 access.
  SGL_HIP_INLINE void load(const half* ptr) {
    const uint4* chunks = reinterpret_cast<const uint4*>(ptr);
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      data[i] = chunks[i];
    }
  }

  // Writes vec_size elements to `ptr`, one uint4 chunk at a time; same pointer
  // requirement as load().
  SGL_HIP_INLINE void store(half* ptr) const {
    uint4* chunks = reinterpret_cast<uint4*>(ptr);
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      chunks[i] = data[i];
    }
  }

  // The cast_* members forward to the cast_*_impl helpers, which are defined
  // outside this section of the file.
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }

  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }

  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
} // namespace sgl_hip
#endif