[AMD] Expand test coverage for AMD CI and enable apply_token_bitmask_inplace_cuda in sgl-kernel (#8268)

2025-08-15 12:32:51 -07:00
parent e52c3866eb
commit 9c3e95d98b
6 changed files with 61 additions and 6 deletions
--- a/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.cu
+++ b/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.cu
@@ -25,19 +25,24 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>

-#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
+
+#if !defined(USE_ROCM) && (!defined(CUDA_VERSION) || CUDA_VERSION < 12040)
 void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
  TORCH_CHECK(false, "CUDA version must be >= 12.4 for ApplyTokenBitmaskInplace");
 }
 #else

 #ifndef CUDART_INF_FP16
+#ifndef USE_ROCM
 #define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
 #endif
+#endif

 #ifndef CUDART_INF_BF16
+#ifndef USE_ROCM
 #define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
 #endif
+#endif

 constexpr int32_t BITS_PER_BLOCK = 32;
 constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;
@@ -49,12 +54,20 @@ __device__ T NegativeInfinity() {

 template <>
 __device__ __half NegativeInfinity<__half>() {
+#ifdef USE_ROCM
+  return __float2half(-INFINITY);
+#else
  return -CUDART_INF_FP16;
+#endif
 }

 template <>
 __device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>() {
+#ifdef USE_ROCM
+  return __nv_bfloat16(-INFINITY);
+#else
  return -CUDART_INF_BF16;
+#endif
 }

 template <typename T, typename PackedT>