[DeepseekV32] Add fast_topk_transform_ragged_fused kernel (#11815)

Signed-off-by: Hao Lu <14827759+hlu1@users.noreply.github.com>
This commit is contained in:
hlu1
2025-10-19 17:13:39 -07:00
committed by GitHub
parent 252dc4e112
commit 3b80232d06
6 changed files with 201 additions and 20 deletions

View File

@@ -113,6 +113,10 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
"fast_topk_transform_fused(Tensor score, Tensor lengths, Tensor dst_page_table, Tensor src_page_table, Tensor "
"cu_seqlens_q) -> ()");
m.impl("fast_topk_transform_fused", torch::kCUDA, &fast_topk_transform_interface);
// Declare the schema of the `fast_topk_transform_ragged_fused` op: four Tensor
// arguments (score, lengths, topk_indices_ragged, topk_indices_offset) and no
// return value (`-> ()`).
// NOTE(review): since nothing is returned, the result is presumably written into
// one of the tensor arguments (likely topk_indices_ragged) — confirm against the
// kernel implementation.
m.def(
"fast_topk_transform_ragged_fused(Tensor score, Tensor lengths, Tensor topk_indices_ragged, Tensor "
"topk_indices_offset) -> ()");
// Register fast_topk_transform_ragged_interface as the implementation of this op
// for the CUDA dispatch key.
m.impl("fast_topk_transform_ragged_fused", torch::kCUDA, &fast_topk_transform_ragged_interface);
/*
* From gguf quantization