adapt to sglang v0.5.2rc1 on dcu
This commit is contained in:
87
sgl-kernel/include/hip/hip_act_and_mul.cuh
Normal file
87
sgl-kernel/include/hip/hip_act_and_mul.cuh
Normal file
@@ -0,0 +1,87 @@
|
||||
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#define kBitsToLoad 128
|
||||
#define kBytesToLoad (kBitsToLoad / 8)
|
||||
|
||||
// Adapted from
|
||||
// [flashinfer::activation::act_and_mul_kernel](https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/include/flashinfer/activation.cuh#L29)
|
||||
|
||||
namespace sgl_hip {
|
||||
namespace activation {
|
||||
|
||||
// Gated-activation kernel: out[t, i] = Activation(input[t, i]) * input[t, d + i].
//
// Launch layout: one block per token (blockIdx.x = token index), blockDim.x
// threads cooperating over the hidden dimension d. Each input row stores the
// gate half [0, d) and the multiplier half [d, 2d) back-to-back.
//
// Adapted from
// [flashinfer::activation::act_and_mul_kernel](https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/include/flashinfer/activation.cuh#L29)
template <typename T, T (*Activation)(const T&)>
__global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
  // Elements of T carried by one 128-bit vector access.
  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
  const int64_t token = blockIdx.x;
  const int64_t lane = threadIdx.x;
  const int64_t nthreads = blockDim.x;
  const int64_t row_base = token * 2 * d;

  // Vectorized main loop: each iteration handles vec_size contiguous elements.
#pragma unroll 1
  for (uint32_t v = lane; v < d / vec_size; v += nthreads) {
    sgl_hip::vec_t<T, vec_size> gate, mult, result;
    gate.cast_load(input + row_base + v * vec_size);
    mult.cast_load(input + row_base + d + v * vec_size);
#pragma unroll
    for (uint32_t j = 0; j < vec_size; ++j) {
      result[j] = Activation(gate[j]) * mult[j];
    }
    result.cast_store(out + token * d + v * vec_size);
  }

  // Scalar tail for elements not covered by a full vector.
  // NOTE(review): the tail range d % (nthreads * vec_size) can overlap the
  // vectorized range; overlapping writes store identical values, matching
  // the upstream flashinfer kernel.
  const int64_t tail_base = d - d % (nthreads * vec_size);
#pragma unroll 1
  for (int64_t v = lane; v < d % (nthreads * vec_size); v += nthreads) {
    T g = input[row_base + tail_base + v];
    T m = input[row_base + tail_base + d + v];
    out[token * d + tail_base + v] = Activation(g) * m;
  }
}
|
||||
|
||||
// Element-wise activation kernel: out[t, i] = Activation(input[t, i]).
//
// Launch layout: one block per token (blockIdx.x = token index), blockDim.x
// threads cooperating over the hidden dimension d. Unlike act_and_mul_kernel
// there is no gating half: input rows are d elements wide.
//
// Fix: the original declared an unused vec_t y_vec in the vectorized loop;
// it has been removed (dead local, never loaded or read).
template <typename T, T (*Activation)(const T&)>
__global__ void act_only_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
  // Elements of T carried by one 128-bit vector access.
  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
  const int64_t token_idx = blockIdx.x;
  const int64_t thread_idx = threadIdx.x;
  const int64_t stride = blockDim.x;
  const int64_t offset = token_idx * d;

  // Vectorized main loop: each iteration handles vec_size contiguous elements.
#pragma unroll 1
  for (uint32_t idx = thread_idx; idx < d / vec_size; idx += stride) {
    sgl_hip::vec_t<T, vec_size> x_vec, out_vec;
    x_vec.cast_load(input + offset + idx * vec_size);
#pragma unroll
    for (uint32_t i = 0; i < vec_size; ++i) {
      out_vec[i] = Activation(x_vec[i]);
    }
    out_vec.cast_store(out + token_idx * d + idx * vec_size);
  }

  // Scalar tail for elements not covered by a full vector (may overlap the
  // vectorized range with identical values, matching act_and_mul_kernel).
  const int64_t remaining_offset = d - d % (stride * vec_size);
#pragma unroll 1
  for (int64_t idx = thread_idx; idx < d % (stride * vec_size); idx += stride) {
    T x = input[offset + remaining_offset + idx];
    out[token_idx * d + remaining_offset + idx] = Activation(x);
  }
}
|
||||
|
||||
} // namespace activation
|
||||
} // namespace sgl_hip
|
||||
94
sgl-kernel/include/hip/hip_math_def.h
Normal file
94
sgl-kernel/include/hip/hip_math_def.h
Normal file
@@ -0,0 +1,94 @@
|
||||
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef USE_ROCM
|
||||
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_common.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
namespace amdgpu {

// Primary templates — resolved only through the explicit specializations
// below; any other instantiation fails at link time by design.
template <typename T>
__forceinline__ __device__ T shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize);

template <typename srcDtype, typename destDtype>
__forceinline__ __device__ destDtype cast(srcDtype val);

// shfl_xor_sync specializations: the mask is accepted for CUDA source
// compatibility but unused — ROCm's __shfl_xor takes no participation mask.
template <>
__forceinline__ __device__ float shfl_xor_sync(unsigned, float var, int laneMask, int width) {
  return __shfl_xor(var, laneMask, width);
}

template <>
__forceinline__ __device__ int shfl_xor_sync(unsigned, int var, int laneMask, int width) {
  return __shfl_xor(var, laneMask, width);
}

// cast<float, float>: identity.
template <>
__forceinline__ __device__ float cast(float val) {
  return val;
}

// cast<__half, float>: widen half precision to float.
template <>
__forceinline__ __device__ float cast(__half val) {
  return __half2float(val);
}

// cast<__hip_bfloat16, float>: widen bfloat16 to float.
template <>
__forceinline__ __device__ float cast(__hip_bfloat16 val) {
  return __bfloat162float(val);
}

// cast<float, __half>: narrow float to half precision.
template <>
__forceinline__ __device__ __half cast(float fval) {
  return __float2half(fval);
}

// cast<float, __hip_bfloat16>: narrow float to bfloat16.
template <>
__forceinline__ __device__ __hip_bfloat16 cast(float fval) {
  return __float2bfloat16(fval);
}

}  // namespace amdgpu
|
||||
|
||||
// CUDA-compatible warp shuffle-xor shim for flashinfer-style code; forwards
// to the amdgpu backend (the mask is ignored on ROCm).
template <typename T>
__forceinline__ __device__ T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize) {
  return amdgpu::shfl_xor_sync<T>(mask, var, laneMask, width);
}
|
||||
|
||||
// Widen a supported element type (float/__half/__hip_bfloat16) to float via
// the amdgpu::cast specializations.
template <typename srcDtype>
__device__ __forceinline__ float castToFloat(srcDtype val) {
  return amdgpu::cast<srcDtype, float>(val);
}
|
||||
|
||||
// Narrow a float to the requested destination element type via the
// amdgpu::cast specializations.
template <typename dstDtype>
__device__ __forceinline__ dstDtype castFromFloat(float val) {
  return amdgpu::cast<float, dstDtype>(val);
}
|
||||
|
||||
// Scalar __half product — flashinfer expects operator* on __half operands.
// Forwards straight to __hmul (the intermediate copies in the original were
// redundant: __hmul takes its arguments by value).
__host__ __device__ __forceinline__ __half operator*(const __half& x, const __half& y) {
  return __hmul(x, y);
}
|
||||
|
||||
#endif
|
||||
101
sgl-kernel/include/hip/hip_vec_dtypes.h
Normal file
101
sgl-kernel/include/hip/hip_vec_dtypes.h
Normal file
@@ -0,0 +1,101 @@
|
||||
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_common.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
#define SGL_HIP_INLINE inline __attribute__((always_inline)) __device__
|
||||
|
||||
namespace sgl_hip {

// Fixed-size register vector of vec_size elements of type float_t.
// Concrete layouts live in the per-dtype specializations under impl/.
template <typename float_t, size_t vec_size>
struct vec_t;

// Load vec_size elements of srcDtype from src and convert them into dst.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_load_impl(vec_t<dstDtype, vec_size>& dst, const srcDtype* src);

// Convert src and store vec_size elements of dstDtype to dst_ptr.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_store_impl(dstDtype* dst_ptr, const vec_t<srcDtype, vec_size>& src);

// Primary template: declares the member interface every specialization
// provides (element access, raw load/store, and converting load/store).
template <typename float_t, size_t vec_size>
struct vec_t {
  SGL_HIP_INLINE float_t& operator[](size_t i);
  SGL_HIP_INLINE const float_t& operator[](size_t i) const;
  SGL_HIP_INLINE float_t* ptr();

  // Raw (same-type) vector load/store.
  SGL_HIP_INLINE void load(const float_t* ptr);
  SGL_HIP_INLINE void store(float_t* ptr) const;

  // Element-wise conversion from another vec_t of the same width.
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src);
  // Load from T* with conversion (defined out-of-line below).
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr);
  // Store to T* with conversion (defined out-of-line below).
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const;
};

}  // namespace sgl_hip
|
||||
|
||||
// **** impl *****
|
||||
|
||||
namespace sgl_hip {

// cast_load_impl: same-type loads pass straight through; otherwise the data
// is loaded in the source type and converted element-wise.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_load_impl(vec_t<dstDtype, vec_size>& dst, const srcDtype* src_ptr) {
  if constexpr (std::is_same_v<srcDtype, dstDtype>) {
    dst.load(src_ptr);
  } else {
    vec_t<srcDtype, vec_size> staged;
    staged.load(src_ptr);
    dst.cast_from(staged);
  }
}

// cast_store_impl: same-type stores pass straight through; otherwise the
// data is converted into the destination type first, then stored.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_store_impl(dstDtype* dst_ptr, const vec_t<srcDtype, vec_size>& src) {
  if constexpr (std::is_same_v<srcDtype, dstDtype>) {
    src.store(dst_ptr);
  } else {
    vec_t<dstDtype, vec_size> staged;
    staged.cast_from(src);
    staged.store(dst_ptr);
  }
}

// Out-of-line member definitions for the primary vec_t template: both
// converting accessors simply forward to the free-function impls above.
template <typename float_t, size_t vec_size>
template <typename T>
SGL_HIP_INLINE void vec_t<float_t, vec_size>::cast_load(const T* ptr) {
  cast_load_impl(*this, ptr);
}

template <typename float_t, size_t vec_size>
template <typename T>
SGL_HIP_INLINE void vec_t<float_t, vec_size>::cast_store(T* ptr) const {
  cast_store_impl(ptr, *this);
}

}  // namespace sgl_hip
|
||||
|
||||
#include "impl/hip_vec_bf16_impl.h"
|
||||
#include "impl/hip_vec_fp32_impl.h"
|
||||
#include "impl/hip_vec_half_impl.h"
|
||||
#endif
|
||||
177
sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
Normal file
177
sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
Normal file
@@ -0,0 +1,177 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
using nv_bfloat16 = __hip_bfloat16;
|
||||
using nv_bfloat162 = __hip_bfloat162;
|
||||
|
||||
// CUDA-style constructor helper for __hip_bfloat162 (HIP provides no
// make_bfloat162; this mirrors CUDA's make_xxx2 vector constructors).
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 make_bfloat162(const __hip_bfloat16 x, const __hip_bfloat16 y) {
  __hip_bfloat162 pair;
  pair.x = x;
  pair.y = y;
  return pair;
}
|
||||
|
||||
namespace sgl_hip {
|
||||
|
||||
// nv_bfloat16 x 1
|
||||
// vec_t specialization: a single bfloat16 element.
template <>
struct vec_t<nv_bfloat16, 1> {
  nv_bfloat16 data;

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // Scalar load/store (single element).
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *ptr;
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *ptr = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// nv_bfloat16 x 2
|
||||
// vec_t specialization: two bfloat16 elements packed as one __hip_bfloat162.
template <>
struct vec_t<nv_bfloat16, 2> {
  nv_bfloat162 data;

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // 32-bit vector load/store.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *reinterpret_cast<const nv_bfloat162*>(ptr);
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *reinterpret_cast<nv_bfloat162*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// vec_t specialization: four bfloat16 elements packed as one 64-bit uint2.
template <>
struct vec_t<nv_bfloat16, 4> {
  uint2 data;

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // 64-bit vector load/store.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *reinterpret_cast<const uint2*>(ptr);
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *reinterpret_cast<uint2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 4>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// nv_bfloat16 x 8 or more
|
||||
|
||||
// vec_t fallback for 8 or more bfloat16 elements, stored as 16-byte uint4
// chunks (vec_size is expected to be a multiple of 8).
//
// Fix: both loops below were annotated "#pragma unoll" (typo). Unrecognized
// pragmas are silently ignored, so the loops were never unrolled; the fp32
// and half implementations spell "#pragma unroll" correctly.
template <size_t vec_size>
struct vec_t<nv_bfloat16, vec_size> {
  uint4 data[vec_size / 8];

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return ((nv_bfloat16*)data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return ((const nv_bfloat16*)data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // Chunked 128-bit vector load/store.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      data[i] = ((uint4*)ptr)[i];
    }
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      ((uint4*)ptr)[i] = data[i];
    }
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
} // namespace sgl_hip
|
||||
|
||||
#endif
|
||||
129
sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
Normal file
129
sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
Normal file
@@ -0,0 +1,129 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
namespace sgl_hip {
|
||||
|
||||
// vec_t specialization: a single float element.
template <>
struct vec_t<float, 1> {
  float data;

  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(&data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(&data)[i];
  }
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }
  // Scalar load/store (single element).
  SGL_HIP_INLINE void load(const float* ptr) {
    data = *ptr;
  }
  SGL_HIP_INLINE void store(float* ptr) const {
    *ptr = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// float x 2
|
||||
|
||||
// vec_t specialization: two float elements packed as one float2.
template <>
struct vec_t<float, 2> {
  float2 data;

  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(&data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(&data)[i];
  }
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }
  // 64-bit vector load/store.
  SGL_HIP_INLINE void load(const float* ptr) {
    data = *reinterpret_cast<const float2*>(ptr);
  }
  SGL_HIP_INLINE void store(float* ptr) const {
    *reinterpret_cast<float2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// float x 4 or more
|
||||
// vec_t fallback for 4 or more float elements, stored as 16-byte float4
// chunks (vec_size is expected to be a multiple of 4).
template <size_t vec_size>
struct vec_t<float, vec_size> {
  float4 data[vec_size / 4];

  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(data)[i];
  }
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }
  // Chunked 128-bit vector load/store.
  SGL_HIP_INLINE void load(const float* ptr) {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 4; ++chunk) {
      data[chunk] = reinterpret_cast<const float4*>(ptr)[chunk];
    }
  }
  SGL_HIP_INLINE void store(float* ptr) const {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 4; ++chunk) {
      reinterpret_cast<float4*>(ptr)[chunk] = data[chunk];
    }
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
} // namespace sgl_hip
|
||||
|
||||
#endif
|
||||
172
sgl-kernel/include/hip/impl/hip_vec_half_impl.h
Normal file
172
sgl-kernel/include/hip/impl/hip_vec_half_impl.h
Normal file
@@ -0,0 +1,172 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_common.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
using half = __half;
|
||||
using half2 = __half2;
|
||||
|
||||
namespace sgl_hip {
|
||||
|
||||
// half x 1
|
||||
// vec_t specialization: a single half element.
template <>
struct vec_t<half, 1> {
  half data;

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // Scalar load/store (single element).
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *ptr;
  }
  SGL_HIP_INLINE void store(half* ptr) const {
    *ptr = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// half x 2
|
||||
// vec_t specialization: two half elements packed as one __half2.
template <>
struct vec_t<half, 2> {
  half2 data;

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // 32-bit vector load/store.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *reinterpret_cast<const half2*>(ptr);
  }
  SGL_HIP_INLINE void store(half* ptr) const {
    *reinterpret_cast<half2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// half x 4
|
||||
|
||||
// vec_t specialization: four half elements packed as one 64-bit uint2.
template <>
struct vec_t<half, 4> {
  uint2 data;

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // 64-bit vector load/store.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *reinterpret_cast<const uint2*>(ptr);
  }
  SGL_HIP_INLINE void store(half* ptr) const {
    *reinterpret_cast<uint2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 4>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// half x 8 or more
|
||||
|
||||
// vec_t fallback for 8 or more half elements, stored as 16-byte uint4
// chunks (vec_size is expected to be a multiple of 8).
template <size_t vec_size>
struct vec_t<half, vec_size> {
  uint4 data[vec_size / 8];

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // Chunked 128-bit vector load/store.
  SGL_HIP_INLINE void load(const half* ptr) {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 8; ++chunk) {
      data[chunk] = reinterpret_cast<const uint4*>(ptr)[chunk];
    }
  }
  SGL_HIP_INLINE void store(half* ptr) const {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 8; ++chunk) {
      reinterpret_cast<uint4*>(ptr)[chunk] = data[chunk];
    }
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
} // namespace sgl_hip
|
||||
#endif
|
||||
Reference in New Issue
Block a user