[AMD] Add silu_and_mul, gelu_and_mul, gelu_tanh_and_mul, and gelu_quick kernels for AMD GPUs (#7135)

Co-authored-by: yiakwy-xpu-ml-framework-team <961186938@qq.com>
Co-authored-by: HAI <hixiao@gmail.com>
This commit is contained in:
Hubert Lu
2025-07-24 23:44:28 -07:00
committed by GitHub
parent 7ad6b766c5
commit af4b9bae95
17 changed files with 1226 additions and 61 deletions

View File

@@ -0,0 +1,94 @@
/* Copyright 2025 SGLang Team. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#pragma once
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_bf16.h>
#include <hip/hip_common.h>
#include <hip/hip_fp16.h>
// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
namespace amdgpu {
// Warp shuffle-XOR primary template. Intentionally left undefined: only the
// explicit specializations below (float, int) are usable.
template <typename T>
__forceinline__ __device__ T shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize);
// Scalar conversion primary template. Intentionally left undefined: only the
// explicit specializations below (float identity, __half <-> float,
// __hip_bfloat16 <-> float) are usable.
template <typename srcDtype, typename destDtype>
__forceinline__ __device__ destDtype cast(srcDtype val);
// specialization
// float shuffle-XOR. NOTE: `mask` is accepted only for CUDA `__shfl_xor_sync`
// signature compatibility and is ignored — HIP's __shfl_xor takes no
// participant mask.
template <>
__forceinline__ __device__ float shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
return __shfl_xor(var, laneMask, width);
}
// int shuffle-XOR; `mask` likewise ignored (see note above on the float
// specialization — same rationale).
template <>
__forceinline__ __device__ int shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
return __shfl_xor(var, laneMask, width);
}
// float -> float: identity pass-through.
template <>
__forceinline__ __device__ float cast(float val) {
return val;
}
// __half -> float widening conversion.
template <>
__forceinline__ __device__ float cast(__half val) {
return __half2float(val);
}
// __hip_bfloat16 -> float widening conversion.
template <>
__forceinline__ __device__ float cast(__hip_bfloat16 val) {
return __bfloat162float(val);
}
// float -> __half narrowing conversion.
template <>
__forceinline__ __device__ __half cast(float fval) {
return __float2half(fval);
}
// float -> __hip_bfloat16 narrowing conversion.
template <>
__forceinline__ __device__ __hip_bfloat16 cast(float fval) {
return __float2bfloat16(fval);
}
} // namespace amdgpu
// CUDA-compatibility shim: exposes __shfl_xor_sync on HIP by forwarding to
// amdgpu::shfl_xor_sync, which is defined only for the types specialized in
// the amdgpu namespace (float, int). The `mask` argument is forwarded as-is.
template <typename T>
__forceinline__ __device__ T __shfl_xor_sync(unsigned mask, T value, int xorLanes, int width = warpSize) {
return amdgpu::shfl_xor_sync(mask, value, xorLanes, width);
}
// Convert a scalar of type srcDtype to float. Resolves through the
// amdgpu::cast specializations, so srcDtype must be one of: float, __half,
// __hip_bfloat16.
template <typename srcDtype>
__device__ __forceinline__ float castToFloat(srcDtype input) {
return amdgpu::cast<srcDtype, float>(input);
}
// Convert a float to a scalar of type dstDtype. Resolves through the
// amdgpu::cast specializations, so dstDtype must be one of: __half,
// __hip_bfloat16 (float -> float identity also exists).
template <typename dstDtype>
__device__ __forceinline__ dstDtype castFromFloat(float input) {
return amdgpu::cast<float, dstDtype>(input);
}
// operator overload to support flashinfer
// __half multiplication via __hmul. The previous version copied both const-ref
// operands into redundant locals before the call; __hmul takes its operands
// by value, so the arguments are passed directly.
__host__ __device__ __forceinline__ __half operator*(const __half& x, const __half& y) {
return __hmul(x, y);
}
#endif