adapt to sglang v0.5.2rc1 on dcu
This commit is contained in:
87
sgl-kernel/include/hip/hip_act_and_mul.cuh
Normal file
87
sgl-kernel/include/hip/hip_act_and_mul.cuh
Normal file
@@ -0,0 +1,87 @@
|
||||
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#define kBitsToLoad 128
|
||||
#define kBytesToLoad (kBitsToLoad / 8)
|
||||
|
||||
// Adapted from
|
||||
// [flashinfer::activation::act_and_mul_kernel](https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/include/flashinfer/activation.cuh#L29)
|
||||
|
||||
namespace sgl_hip {
|
||||
namespace activation {
|
||||
|
||||
// Gated-activation kernel: out[t, i] = Activation(input[t, i]) * input[t, d + i].
//
// Launch layout: one block per token (blockIdx.x = token index), blockDim.x
// threads cooperating over the hidden dimension d. Each input row stores the
// gate half [0, d) and the multiplier half [d, 2d) back-to-back.
//
// Adapted from
// [flashinfer::activation::act_and_mul_kernel](https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/include/flashinfer/activation.cuh#L29)
template <typename T, T (*Activation)(const T&)>
__global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
  // Elements of T carried by one 128-bit vector access.
  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
  const int64_t token = blockIdx.x;
  const int64_t lane = threadIdx.x;
  const int64_t nthreads = blockDim.x;
  const int64_t row_base = token * 2 * d;

  // Vectorized main loop: each iteration handles vec_size contiguous elements.
#pragma unroll 1
  for (uint32_t v = lane; v < d / vec_size; v += nthreads) {
    sgl_hip::vec_t<T, vec_size> gate, mult, result;
    gate.cast_load(input + row_base + v * vec_size);
    mult.cast_load(input + row_base + d + v * vec_size);
#pragma unroll
    for (uint32_t j = 0; j < vec_size; ++j) {
      result[j] = Activation(gate[j]) * mult[j];
    }
    result.cast_store(out + token * d + v * vec_size);
  }

  // Scalar tail for elements not covered by a full vector.
  // NOTE(review): the tail range d % (nthreads * vec_size) can overlap the
  // vectorized range; overlapping writes store identical values, matching
  // the upstream flashinfer kernel.
  const int64_t tail_base = d - d % (nthreads * vec_size);
#pragma unroll 1
  for (int64_t v = lane; v < d % (nthreads * vec_size); v += nthreads) {
    T g = input[row_base + tail_base + v];
    T m = input[row_base + tail_base + d + v];
    out[token * d + tail_base + v] = Activation(g) * m;
  }
}
|
||||
|
||||
// Element-wise activation kernel: out[t, i] = Activation(input[t, i]).
//
// Launch layout: one block per token (blockIdx.x = token index), blockDim.x
// threads cooperating over the hidden dimension d. Unlike act_and_mul_kernel
// there is no gating half: input rows are d elements wide.
//
// Fix: the original declared an unused vec_t y_vec in the vectorized loop;
// it has been removed (dead local, never loaded or read).
template <typename T, T (*Activation)(const T&)>
__global__ void act_only_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
  // Elements of T carried by one 128-bit vector access.
  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
  const int64_t token_idx = blockIdx.x;
  const int64_t thread_idx = threadIdx.x;
  const int64_t stride = blockDim.x;
  const int64_t offset = token_idx * d;

  // Vectorized main loop: each iteration handles vec_size contiguous elements.
#pragma unroll 1
  for (uint32_t idx = thread_idx; idx < d / vec_size; idx += stride) {
    sgl_hip::vec_t<T, vec_size> x_vec, out_vec;
    x_vec.cast_load(input + offset + idx * vec_size);
#pragma unroll
    for (uint32_t i = 0; i < vec_size; ++i) {
      out_vec[i] = Activation(x_vec[i]);
    }
    out_vec.cast_store(out + token_idx * d + idx * vec_size);
  }

  // Scalar tail for elements not covered by a full vector (may overlap the
  // vectorized range with identical values, matching act_and_mul_kernel).
  const int64_t remaining_offset = d - d % (stride * vec_size);
#pragma unroll 1
  for (int64_t idx = thread_idx; idx < d % (stride * vec_size); idx += stride) {
    T x = input[offset + remaining_offset + idx];
    out[token_idx * d + remaining_offset + idx] = Activation(x);
  }
}
|
||||
|
||||
} // namespace activation
|
||||
} // namespace sgl_hip
|
||||
94
sgl-kernel/include/hip/hip_math_def.h
Normal file
94
sgl-kernel/include/hip/hip_math_def.h
Normal file
@@ -0,0 +1,94 @@
|
||||
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef USE_ROCM
|
||||
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_common.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
namespace amdgpu {

// Primary templates — resolved only through the explicit specializations
// below; any other instantiation fails at link time by design.
template <typename T>
__forceinline__ __device__ T shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize);

template <typename srcDtype, typename destDtype>
__forceinline__ __device__ destDtype cast(srcDtype val);

// shfl_xor_sync specializations: the mask is accepted for CUDA source
// compatibility but unused — ROCm's __shfl_xor takes no participation mask.
template <>
__forceinline__ __device__ float shfl_xor_sync(unsigned, float var, int laneMask, int width) {
  return __shfl_xor(var, laneMask, width);
}

template <>
__forceinline__ __device__ int shfl_xor_sync(unsigned, int var, int laneMask, int width) {
  return __shfl_xor(var, laneMask, width);
}

// cast<float, float>: identity.
template <>
__forceinline__ __device__ float cast(float val) {
  return val;
}

// cast<__half, float>: widen half precision to float.
template <>
__forceinline__ __device__ float cast(__half val) {
  return __half2float(val);
}

// cast<__hip_bfloat16, float>: widen bfloat16 to float.
template <>
__forceinline__ __device__ float cast(__hip_bfloat16 val) {
  return __bfloat162float(val);
}

// cast<float, __half>: narrow float to half precision.
template <>
__forceinline__ __device__ __half cast(float fval) {
  return __float2half(fval);
}

// cast<float, __hip_bfloat16>: narrow float to bfloat16.
template <>
__forceinline__ __device__ __hip_bfloat16 cast(float fval) {
  return __float2bfloat16(fval);
}

}  // namespace amdgpu
|
||||
|
||||
// CUDA-compatible warp shuffle-xor shim for flashinfer-style code; forwards
// to the amdgpu backend (the mask is ignored on ROCm).
template <typename T>
__forceinline__ __device__ T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize) {
  return amdgpu::shfl_xor_sync<T>(mask, var, laneMask, width);
}
|
||||
|
||||
// Widen a supported element type (float/__half/__hip_bfloat16) to float via
// the amdgpu::cast specializations.
template <typename srcDtype>
__device__ __forceinline__ float castToFloat(srcDtype val) {
  return amdgpu::cast<srcDtype, float>(val);
}
|
||||
|
||||
// Narrow a float to the requested destination element type via the
// amdgpu::cast specializations.
template <typename dstDtype>
__device__ __forceinline__ dstDtype castFromFloat(float val) {
  return amdgpu::cast<float, dstDtype>(val);
}
|
||||
|
||||
// Scalar __half product — flashinfer expects operator* on __half operands.
// Forwards straight to __hmul (the intermediate copies in the original were
// redundant: __hmul takes its arguments by value).
__host__ __device__ __forceinline__ __half operator*(const __half& x, const __half& y) {
  return __hmul(x, y);
}
|
||||
|
||||
#endif
|
||||
101
sgl-kernel/include/hip/hip_vec_dtypes.h
Normal file
101
sgl-kernel/include/hip/hip_vec_dtypes.h
Normal file
@@ -0,0 +1,101 @@
|
||||
/* Copyright 2025 SGLang Team. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_common.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
#define SGL_HIP_INLINE inline __attribute__((always_inline)) __device__
|
||||
|
||||
namespace sgl_hip {

// Fixed-size register vector of vec_size elements of type float_t.
// Concrete layouts live in the per-dtype specializations under impl/.
template <typename float_t, size_t vec_size>
struct vec_t;

// Load vec_size elements of srcDtype from src and convert them into dst.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_load_impl(vec_t<dstDtype, vec_size>& dst, const srcDtype* src);

// Convert src and store vec_size elements of dstDtype to dst_ptr.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_store_impl(dstDtype* dst_ptr, const vec_t<srcDtype, vec_size>& src);

// Primary template: declares the member interface every specialization
// provides (element access, raw load/store, and converting load/store).
template <typename float_t, size_t vec_size>
struct vec_t {
  SGL_HIP_INLINE float_t& operator[](size_t i);
  SGL_HIP_INLINE const float_t& operator[](size_t i) const;
  SGL_HIP_INLINE float_t* ptr();

  // Raw (same-type) vector load/store.
  SGL_HIP_INLINE void load(const float_t* ptr);
  SGL_HIP_INLINE void store(float_t* ptr) const;

  // Element-wise conversion from another vec_t of the same width.
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src);
  // Load from T* with conversion (defined out-of-line below).
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr);
  // Store to T* with conversion (defined out-of-line below).
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const;
};

}  // namespace sgl_hip
|
||||
|
||||
// **** impl *****
|
||||
|
||||
namespace sgl_hip {

// cast_load_impl: same-type loads pass straight through; otherwise the data
// is loaded in the source type and converted element-wise.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_load_impl(vec_t<dstDtype, vec_size>& dst, const srcDtype* src_ptr) {
  if constexpr (std::is_same_v<srcDtype, dstDtype>) {
    dst.load(src_ptr);
  } else {
    vec_t<srcDtype, vec_size> staged;
    staged.load(src_ptr);
    dst.cast_from(staged);
  }
}

// cast_store_impl: same-type stores pass straight through; otherwise the
// data is converted into the destination type first, then stored.
template <typename srcDtype, typename dstDtype, size_t vec_size>
SGL_HIP_INLINE void cast_store_impl(dstDtype* dst_ptr, const vec_t<srcDtype, vec_size>& src) {
  if constexpr (std::is_same_v<srcDtype, dstDtype>) {
    src.store(dst_ptr);
  } else {
    vec_t<dstDtype, vec_size> staged;
    staged.cast_from(src);
    staged.store(dst_ptr);
  }
}

// Out-of-line member definitions for the primary vec_t template: both
// converting accessors simply forward to the free-function impls above.
template <typename float_t, size_t vec_size>
template <typename T>
SGL_HIP_INLINE void vec_t<float_t, vec_size>::cast_load(const T* ptr) {
  cast_load_impl(*this, ptr);
}

template <typename float_t, size_t vec_size>
template <typename T>
SGL_HIP_INLINE void vec_t<float_t, vec_size>::cast_store(T* ptr) const {
  cast_store_impl(ptr, *this);
}

}  // namespace sgl_hip
|
||||
|
||||
#include "impl/hip_vec_bf16_impl.h"
|
||||
#include "impl/hip_vec_fp32_impl.h"
|
||||
#include "impl/hip_vec_half_impl.h"
|
||||
#endif
|
||||
177
sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
Normal file
177
sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
Normal file
@@ -0,0 +1,177 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
using nv_bfloat16 = __hip_bfloat16;
|
||||
using nv_bfloat162 = __hip_bfloat162;
|
||||
|
||||
// CUDA-style constructor helper for __hip_bfloat162 (HIP provides no
// make_bfloat162; this mirrors CUDA's make_xxx2 vector constructors).
__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 make_bfloat162(const __hip_bfloat16 x, const __hip_bfloat16 y) {
  __hip_bfloat162 pair;
  pair.x = x;
  pair.y = y;
  return pair;
}
|
||||
|
||||
namespace sgl_hip {
|
||||
|
||||
// nv_bfloat16 x 1
|
||||
// vec_t specialization: a single bfloat16 element.
template <>
struct vec_t<nv_bfloat16, 1> {
  nv_bfloat16 data;

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // Scalar load/store (single element).
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *ptr;
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *ptr = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// nv_bfloat16 x 2
|
||||
// vec_t specialization: two bfloat16 elements packed as one __hip_bfloat162.
template <>
struct vec_t<nv_bfloat16, 2> {
  nv_bfloat162 data;

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // 32-bit vector load/store.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *reinterpret_cast<const nv_bfloat162*>(ptr);
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *reinterpret_cast<nv_bfloat162*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// vec_t specialization: four bfloat16 elements packed as one 64-bit uint2.
template <>
struct vec_t<nv_bfloat16, 4> {
  uint2 data;

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return reinterpret_cast<nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return reinterpret_cast<const nv_bfloat16*>(&data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // 64-bit vector load/store.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
    data = *reinterpret_cast<const uint2*>(ptr);
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
    *reinterpret_cast<uint2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 4>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// nv_bfloat16 x 8 or more
|
||||
|
||||
// vec_t fallback for 8 or more bfloat16 elements, stored as 16-byte uint4
// chunks (vec_size is expected to be a multiple of 8).
//
// Fix: both loops below were annotated "#pragma unoll" (typo). Unrecognized
// pragmas are silently ignored, so the loops were never unrolled; the fp32
// and half implementations spell "#pragma unroll" correctly.
template <size_t vec_size>
struct vec_t<nv_bfloat16, vec_size> {
  uint4 data[vec_size / 8];

  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
    return ((nv_bfloat16*)data)[i];
  }
  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
    return ((const nv_bfloat16*)data)[i];
  }
  SGL_HIP_INLINE nv_bfloat16* ptr() {
    return reinterpret_cast<nv_bfloat16*>(&data);
  }
  // Chunked 128-bit vector load/store.
  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      data[i] = ((uint4*)ptr)[i];
    }
  }
  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
#pragma unroll
    for (size_t i = 0; i < vec_size / 8; ++i) {
      ((uint4*)ptr)[i] = data[i];
    }
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
} // namespace sgl_hip
|
||||
|
||||
#endif
|
||||
129
sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
Normal file
129
sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
Normal file
@@ -0,0 +1,129 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_common.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
namespace sgl_hip {
|
||||
|
||||
// vec_t specialization: a single float element.
template <>
struct vec_t<float, 1> {
  float data;

  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(&data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(&data)[i];
  }
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }
  // Scalar load/store (single element).
  SGL_HIP_INLINE void load(const float* ptr) {
    data = *ptr;
  }
  SGL_HIP_INLINE void store(float* ptr) const {
    *ptr = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// float x 2
|
||||
|
||||
// vec_t specialization: two float elements packed as one float2.
template <>
struct vec_t<float, 2> {
  float2 data;

  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(&data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(&data)[i];
  }
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }
  // 64-bit vector load/store.
  SGL_HIP_INLINE void load(const float* ptr) {
    data = *reinterpret_cast<const float2*>(ptr);
  }
  SGL_HIP_INLINE void store(float* ptr) const {
    *reinterpret_cast<float2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// float x 4 or more
|
||||
// vec_t fallback for 4 or more float elements, stored as 16-byte float4
// chunks (vec_size is expected to be a multiple of 4).
template <size_t vec_size>
struct vec_t<float, vec_size> {
  float4 data[vec_size / 4];

  SGL_HIP_INLINE float& operator[](size_t i) {
    return reinterpret_cast<float*>(data)[i];
  }
  SGL_HIP_INLINE const float& operator[](size_t i) const {
    return reinterpret_cast<const float*>(data)[i];
  }
  SGL_HIP_INLINE float* ptr() {
    return reinterpret_cast<float*>(&data);
  }
  // Chunked 128-bit vector load/store.
  SGL_HIP_INLINE void load(const float* ptr) {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 4; ++chunk) {
      data[chunk] = reinterpret_cast<const float4*>(ptr)[chunk];
    }
  }
  SGL_HIP_INLINE void store(float* ptr) const {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 4; ++chunk) {
      reinterpret_cast<float4*>(ptr)[chunk] = data[chunk];
    }
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
} // namespace sgl_hip
|
||||
|
||||
#endif
|
||||
172
sgl-kernel/include/hip/impl/hip_vec_half_impl.h
Normal file
172
sgl-kernel/include/hip/impl/hip_vec_half_impl.h
Normal file
@@ -0,0 +1,172 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_ROCM
|
||||
|
||||
#include <hip/hip_common.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
|
||||
|
||||
using half = __half;
|
||||
using half2 = __half2;
|
||||
|
||||
namespace sgl_hip {
|
||||
|
||||
// half x 1
|
||||
// vec_t specialization: a single half element.
template <>
struct vec_t<half, 1> {
  half data;

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // Scalar load/store (single element).
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *ptr;
  }
  SGL_HIP_INLINE void store(half* ptr) const {
    *ptr = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// half x 2
|
||||
// vec_t specialization: two half elements packed as one __half2.
template <>
struct vec_t<half, 2> {
  half2 data;

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // 32-bit vector load/store.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *reinterpret_cast<const half2*>(ptr);
  }
  SGL_HIP_INLINE void store(half* ptr) const {
    *reinterpret_cast<half2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// half x 4
|
||||
|
||||
// vec_t specialization: four half elements packed as one 64-bit uint2.
template <>
struct vec_t<half, 4> {
  uint2 data;

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(&data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(&data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // 64-bit vector load/store.
  SGL_HIP_INLINE void load(const half* ptr) {
    data = *reinterpret_cast<const uint2*>(ptr);
  }
  SGL_HIP_INLINE void store(half* ptr) const {
    *reinterpret_cast<uint2*>(ptr) = data;
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, 4>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
// half x 8 or more
|
||||
|
||||
// vec_t fallback for 8 or more half elements, stored as 16-byte uint4
// chunks (vec_size is expected to be a multiple of 8).
template <size_t vec_size>
struct vec_t<half, vec_size> {
  uint4 data[vec_size / 8];

  SGL_HIP_INLINE half& operator[](size_t i) {
    return reinterpret_cast<half*>(data)[i];
  }
  SGL_HIP_INLINE const half& operator[](size_t i) const {
    return reinterpret_cast<const half*>(data)[i];
  }
  SGL_HIP_INLINE half* ptr() {
    return reinterpret_cast<half*>(&data);
  }
  // Chunked 128-bit vector load/store.
  SGL_HIP_INLINE void load(const half* ptr) {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 8; ++chunk) {
      data[chunk] = reinterpret_cast<const uint4*>(ptr)[chunk];
    }
  }
  SGL_HIP_INLINE void store(half* ptr) const {
#pragma unroll
    for (size_t chunk = 0; chunk < vec_size / 8; ++chunk) {
      reinterpret_cast<uint4*>(ptr)[chunk] = data[chunk];
    }
  }
  template <typename T>
  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
    cast_from_impl(*this, src);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_load(const T* ptr) {
    cast_load_impl(*this, ptr);
  }
  template <typename T>
  SGL_HIP_INLINE void cast_store(T* ptr) const {
    cast_store_impl(ptr, *this);
  }
};
|
||||
|
||||
} // namespace sgl_hip
|
||||
#endif
|
||||
Reference in New Issue
Block a user