[AMD] Expand test coverage for AMD CI and enable apply_token_bitmask_inplace_cuda in sgl-kernel (#8268)
This commit is contained in:
@@ -25,19 +25,24 @@
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
|
||||
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
|
||||
|
||||
#if !defined(USE_ROCM) && (!defined(CUDA_VERSION) || CUDA_VERSION < 12040)
|
||||
void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
|
||||
TORCH_CHECK(false, "CUDA version must be >= 12.4 for ApplyTokenBitmaskInplace");
|
||||
}
|
||||
#else
|
||||
|
||||
#ifndef CUDART_INF_FP16
|
||||
#ifndef USE_ROCM
|
||||
#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef CUDART_INF_BF16
|
||||
#ifndef USE_ROCM
|
||||
#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
constexpr int32_t BITS_PER_BLOCK = 32;
|
||||
constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;
|
||||
@@ -49,12 +54,20 @@ __device__ T NegativeInfinity() {
|
||||
|
||||
template <>
|
||||
__device__ __half NegativeInfinity<__half>() {
|
||||
#ifdef USE_ROCM
|
||||
return __float2half(-INFINITY);
|
||||
#else
|
||||
return -CUDART_INF_FP16;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>() {
|
||||
#ifdef USE_ROCM
|
||||
return __nv_bfloat16(-INFINITY);
|
||||
#else
|
||||
return -CUDART_INF_BF16;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename PackedT>
|
||||
|
||||
Reference in New Issue
Block a user