[AMD] Expand test coverage for AMD CI and enable apply_token_bitmask_inplace_cuda in sgl-kernel (#8268)

This commit is contained in:
Hubert Lu
2025-08-15 12:32:51 -07:00
committed by GitHub
parent e52c3866eb
commit 9c3e95d98b
6 changed files with 61 additions and 6 deletions

View File

@@ -114,6 +114,12 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
"Tensor! retrive_next_sibling, int topk, int depth, int draft_token_num, int tree_mask_mode) -> "
"()");
m.impl("build_tree_kernel_efficient", torch::kCUDA, &build_tree_kernel_efficient);
/*
* From XGrammar
*/
m.def("apply_token_bitmask_inplace_cuda(Tensor logits, Tensor bitmask, Tensor? indices=None) -> ()");
m.impl("apply_token_bitmask_inplace_cuda", &ApplyTokenBitmaskInplace);
}
REGISTER_EXTENSION(common_ops)

View File

@@ -25,19 +25,24 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
#if !defined(USE_ROCM) && (!defined(CUDA_VERSION) || CUDA_VERSION < 12040)
// Fallback stub compiled only when the real XGrammar bitmask kernel is
// unavailable (see the preceding #if guard: non-ROCm builds with
// CUDA < 12.4). Matches the real entry point's signature so the Torch
// library registration above still links; always fails loudly at call time.
void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
TORCH_CHECK(false, "CUDA version must be >= 12.4 for ApplyTokenBitmaskInplace");
}
#else
#ifndef CUDART_INF_FP16
#ifndef USE_ROCM
#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
#endif
#endif
#ifndef CUDART_INF_BF16
#ifndef USE_ROCM
#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
#endif
#endif
constexpr int32_t BITS_PER_BLOCK = 32;
constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;
@@ -49,12 +54,20 @@ __device__ T NegativeInfinity() {
// Negative infinity for fp16 on the device. ROCm does not provide the
// CUDART_INF_FP16 constant, so there we build the value by converting a
// float -INFINITY; on CUDA we negate the library-provided +inf constant.
template <>
__device__ __half NegativeInfinity<__half>() {
#ifndef USE_ROCM
  return -CUDART_INF_FP16;
#else
  return __float2half(-INFINITY);
#endif
}
// Negative infinity for bf16 on the device. ROCm lacks CUDART_INF_BF16,
// so there we construct the value from a float -INFINITY via the
// __nv_bfloat16 converting constructor; on CUDA we negate the +inf constant.
template <>
__device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>() {
#ifndef USE_ROCM
  return -CUDART_INF_BF16;
#else
  return __nv_bfloat16(-INFINITY);
#endif
}
template <typename T, typename PackedT>