1040 lines
48 KiB
Python
1040 lines
48 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
|
|
Combined Top-K and Top-P Triton kernels.
|
|
|
|
Based on the paper "Qrita: High-performance Top-k and Top-p Algorithm for GPUs
|
|
using Pivot-based Truncation and Selection" by Park et al.
|
|
(https://arxiv.org/abs/2602.01518)
|
|
|
|
"""
|
|
|
|
import torch
|
|
|
|
from vllm.triton_utils import tl, triton
|
|
from vllm.utils.math_utils import next_power_of_2
|
|
from vllm.utils.platform_utils import num_compute_units
|
|
|
|
# Lookup tables materialized as tensors, cached per device. The key is the
# logits device itself (not a tuple); the value is the pair
# (normal_cdf_to_sigma_table, percentile_to_std_table).
_TRITON_TABLE_CACHE: dict[torch.device, tuple[torch.Tensor, torch.Tensor]] = {}
# Kernel scratch buffers, keyed by (device, dtype, vocab_size). Each buffer has
# one row of vocab_size elements per launched Triton program.
_TRITON_BUFFER_CACHE: dict[tuple[torch.device, torch.dtype, int], torch.Tensor] = {}
|
|
|
|
# fmt: off
# 200-entry table read by the standalone top-p path of the kernel at index
# int(p * 200), clamped to [0, 199]. The entry is used as a sigma multiplier:
# the candidate-gathering pivot is avg_logit + std_logit * sigma (after sigma
# is lowered by 25% of its magnitude).
# NOTE(review): values are presumably tuned constants from the referenced
# paper — confirm provenance before editing any entry.
_NORMAL_CDF_TO_SIGMA_TABLE = [
    3.656, 3.650, 3.650, 3.650, 3.626, 3.626, 3.626, 3.514, 3.514, 3.503,
    3.503, 3.434, 3.434, 3.428, 3.428, 3.387, 3.380, 3.380, 3.376, 3.373,
    3.373, 3.356, 3.354, 3.354, 3.291, 3.249, 3.234, 3.214, 3.198, 3.198,
    3.185, 3.177, 3.177, 3.165, 3.164, 3.161, 3.138, 3.120, 3.115, 3.113,
    3.093, 3.066, 3.054, 3.043, 3.037, 3.023, 2.993, 2.991, 2.976, 2.970,
    2.952, 2.946, 2.932, 2.908, 2.902, 2.895, 2.886, 2.874, 2.861, 2.844,
    2.836, 2.810, 2.801, 2.790, 2.784, 2.779, 2.767, 2.757, 2.745, 2.733,
    2.723, 2.716, 2.693, 2.678, 2.671, 2.656, 2.649, 2.629, 2.611, 2.595,
    2.592, 2.585, 2.574, 2.550, 2.543, 2.534, 2.521, 2.518, 2.497, 2.485,
    2.468, 2.450, 2.441, 2.430, 2.412, 2.402, 2.389, 2.383, 2.377, 2.364,
    2.349, 2.338, 2.332, 2.319, 2.310, 2.301, 2.282, 2.274, 2.266, 2.250,
    2.242, 2.236, 2.226, 2.215, 2.207, 2.196, 2.179, 2.171, 2.162, 2.147,
    2.135, 2.121, 2.109, 2.095, 2.085, 2.073, 2.063, 2.045, 2.030, 2.016,
    2.003, 1.992, 1.983, 1.972, 1.960, 1.949, 1.940, 1.928, 1.912, 1.897,
    1.881, 1.869, 1.854, 1.838, 1.824, 1.807, 1.792, 1.779, 1.764, 1.751,
    1.739, 1.726, 1.711, 1.697, 1.685, 1.668, 1.652, 1.636, 1.622, 1.603,
    1.585, 1.568, 1.551, 1.534, 1.513, 1.499, 1.480, 1.464, 1.441, 1.422,
    1.394, 1.373, 1.347, 1.320, 1.296, 1.270, 1.246, 1.219, 1.190, 1.163,
    1.135, 1.104, 1.073, 1.041, 1.006, 0.969, 0.931, 0.894, 0.851, 0.806,
    0.757, 0.702, 0.643, 0.574, 0.498, 0.405, 0.288, 0.134, -0.110, -3.813
]

# 200-entry table read by the top-k path of the kernel at index
# int(k / VOCAB_SIZE * 200), clamped to at most 199. The entry is used as a
# sigma multiplier: the candidate-gathering pivot is
# avg_logit + std_logit * sigma (after sigma is lowered by 15% of its
# magnitude).
# NOTE(review): values are presumably tuned constants from the referenced
# paper — confirm provenance before editing any entry.
_PERCENTILE_TO_STD_TABLE = [
    2.576, 2.319, 2.178, 2.064, 1.968, 1.892, 1.819, 1.757, 1.708, 1.659,
    1.616, 1.568, 1.526, 1.492, 1.456, 1.420, 1.382, 1.342, 1.309, 1.280,
    1.249, 1.221, 1.193, 1.169, 1.145, 1.121, 1.095, 1.073, 1.050, 1.030,
    1.008, 0.987, 0.966, 0.945, 0.926, 0.910, 0.891, 0.871, 0.854, 0.837,
    0.819, 0.803, 0.784, 0.767, 0.753, 0.734, 0.719, 0.702, 0.690, 0.675,
    0.658, 0.640, 0.625, 0.609, 0.595, 0.578, 0.564, 0.550, 0.537, 0.521,
    0.509, 0.495, 0.481, 0.466, 0.453, 0.439, 0.424, 0.410, 0.397, 0.383,
    0.370, 0.356, 0.343, 0.330, 0.316, 0.302, 0.289, 0.274, 0.261, 0.247,
    0.235, 0.223, 0.209, 0.196, 0.184, 0.172, 0.159, 0.149, 0.137, 0.124,
    0.112, 0.100, 0.086, 0.074, 0.062, 0.050, 0.035, 0.023, 0.009, -0.003,
    -0.015, -0.027, -0.039, -0.052, -0.063, -0.074, -0.085, -0.097, -0.109, -0.122,
    -0.134, -0.147, -0.158, -0.171, -0.184, -0.196, -0.210, -0.223, -0.235, -0.248,
    -0.261, -0.275, -0.289, -0.302, -0.317, -0.328, -0.341, -0.353, -0.368, -0.382,
    -0.396, -0.410, -0.426, -0.439, -0.452, -0.465, -0.480, -0.493, -0.507, -0.521,
    -0.537, -0.551, -0.568, -0.582, -0.597, -0.614, -0.628, -0.643, -0.658, -0.673,
    -0.691, -0.706, -0.721, -0.738, -0.754, -0.769, -0.789, -0.808, -0.824, -0.838,
    -0.857, -0.877, -0.893, -0.912, -0.929, -0.947, -0.965, -0.983, -1.003, -1.027,
    -1.050, -1.070, -1.092, -1.117, -1.139, -1.162, -1.189, -1.216, -1.241, -1.272,
    -1.300, -1.330, -1.367, -1.404, -1.441, -1.485, -1.523, -1.564, -1.607, -1.658,
    -1.710, -1.778, -1.832, -1.901, -1.978, -2.068, -2.174, -2.325, -2.577, -3.813
]
# fmt: on
|
|
|
|
|
|
@triton.jit
def _topk_topp_kernel(
    LOGITS,
    BUFFER,
    PERCENTILE_TO_STD_TABLE,
    NORMAL_CDF_TO_SIGMA_TABLE,
    K,
    P,
    BATCH_SIZE,
    VOCAB_SIZE: tl.constexpr,
    MASK_VALUE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    BLOCK_SIZE_TRUNC: tl.constexpr,
    TOPK_ENABLED: tl.constexpr,
    TOPP_ENABLED: tl.constexpr,
):
    """Mask rows of LOGITS in place so only top-k and/or top-p entries remain.

    Program `pid` processes rows pid, pid + num_programs, ... For each row it
    derives a scalar `final_pivot` in logit space; the last pass overwrites
    every logit that is not strictly greater than that pivot with MASK_VALUE.
    If no pivot is derived (or the pivot is NaN / not below the row maximum),
    the row is left untouched.

    Args:
        LOGITS: Pointer to the logits. Row r is addressed as
            LOGITS + r * VOCAB_SIZE, so rows must be densely packed.
        BUFFER: Scratch pointer. Program `pid` uses the VOCAB_SIZE elements
            starting at BUFFER + pid * VOCAB_SIZE to hold gathered candidates
            (logits during top-k, probabilities during top-p).
        PERCENTILE_TO_STD_TABLE: Table read at int(k / VOCAB_SIZE * 200),
            clamped to 199, to pick the top-k gathering pivot.
        NORMAL_CDF_TO_SIGMA_TABLE: Table read at int(p * 200), clamped to
            [0, 199], to pick the standalone top-p gathering pivot.
        K: Pointer to per-row k values; only read when TOPK_ENABLED.
        P: Pointer to per-row p values; only read when TOPP_ENABLED.
        BATCH_SIZE: Number of rows.
        VOCAB_SIZE: Row length (compile-time constant).
        MASK_VALUE: Value written to masked-out positions.
        BLOCK_SIZE: Tile width for passes over a full row.
        BLOCK_SIZE_TRUNC: Tile width for passes over the gathered candidates.
        TOPK_ENABLED: Compile-time switch for the top-k stage. Rows with
            k >= VOCAB_SIZE skip it.
        TOPP_ENABLED: Compile-time switch for the top-p stage. Rows with
            p >= 1.0 skip it.
    """
    NUM_TILES: tl.constexpr = (VOCAB_SIZE + BLOCK_SIZE - 1) // BLOCK_SIZE
    pid = tl.program_id(0)
    num_programs = tl.num_programs(0)
    for row_id in tl.range(pid, BATCH_SIZE, num_programs):
        LOGITS_ROW = LOGITS + row_id * VOCAB_SIZE
        # Scratch space is per program (not per row); it is reused for every
        # row this program handles.
        BUFFER_ROW = BUFFER + pid * VOCAB_SIZE

        # -inf means "no pivot found yet": the final pass then masks nothing.
        final_pivot = -float("inf")
        # Tie bookkeeping: of the `num_duplicate_logit` entries equal to
        # `duplicate_logit`, only the first `num_keep` should survive.
        duplicate_logit = float("inf")
        num_duplicate_logit = tl.zeros((), dtype=tl.uint32)
        num_keep = tl.zeros((), dtype=tl.uint32)
        num_kept = tl.zeros((), dtype=tl.uint32)

        max_logit = -float("inf")
        min_logit = float("inf")

        if TOPK_ENABLED:
            k = tl.load(K + row_id)
            if k < VOCAB_SIZE:
                # Zeroth pass: Compute avg and std from a sample block
                offs = tl.arange(0, BLOCK_SIZE)
                mask_n = offs < VOCAB_SIZE
                logits_blk0 = tl.load(
                    LOGITS_ROW + offs, mask=mask_n, other=-float("inf")
                )
                # Exclude -inf values (e.g. from grammar bitmasks) from
                # statistics to avoid NaN in pivot computation.
                finite_mask = (logits_blk0 > -float("inf")) & mask_n
                num_finite = tl.sum(finite_mask)
                finite_logits = tl.where(finite_mask, logits_blk0, 0.0)
                avg_logit = tl.where(
                    num_finite > 0, tl.sum(finite_logits) / num_finite, 0.0
                )
                sq_avg_logit = tl.where(
                    num_finite > 0,
                    tl.sum(finite_logits * finite_logits) / num_finite,
                    0.0,
                )
                # Clamp the variance at zero to guard against rounding error.
                std_logit = tl.sqrt(
                    tl.maximum(sq_avg_logit - avg_logit * avg_logit, 0.0)
                )

                # Calculate outlier pivot t for Gaussian sigma-truncation
                percentile = tl.cast(k / VOCAB_SIZE * 200, tl.uint32)
                percentile = tl.minimum(percentile, 199)
                sigma = tl.load(PERCENTILE_TO_STD_TABLE + percentile)
                # Lower sigma by 15% of its magnitude so the pivot is lower
                # and more candidates are gathered.
                sigma = sigma + tl.abs(sigma) * -0.15
                outlier_pivot = avg_logit + std_logit * sigma
                num_outliers = tl.zeros((), dtype=tl.uint32)

                # First pass: compute max and min logits and gather outliers
                num_finite_total = tl.zeros((), dtype=tl.uint32)
                for i in range(0, NUM_TILES):
                    offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                    mask_n = offs_n < VOCAB_SIZE
                    logits_blk = tl.load(
                        LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                    )

                    max_logit = tl.maximum(max_logit, tl.max(logits_blk))
                    # Exclude -inf from min to keep binary search bounds
                    # finite (avoids NaN pivots).
                    finite_blk_mask = logits_blk > -float("inf")
                    finite_blk = tl.where(finite_blk_mask, logits_blk, float("inf"))
                    min_logit = tl.minimum(min_logit, tl.min(finite_blk))
                    num_finite_total += tl.sum(finite_blk_mask & mask_n)

                    # Compact the logits above the pivot into BUFFER_ROW; the
                    # running cumsum gives each one its output slot.
                    outlier_mask = (logits_blk > outlier_pivot) & mask_n
                    cumulative_pos = tl.cast(
                        tl.cumsum(outlier_mask) - 1 + num_outliers, tl.int32
                    )
                    num_outliers += tl.sum(outlier_mask)
                    write_pos = tl.where(outlier_mask, cumulative_pos, -1)
                    tl.store(BUFFER_ROW + write_pos, logits_blk, mask=outlier_mask)

                # If no finite logits exist (all -inf), clamp min to
                # max so the search converges to -inf (no masking).
                min_logit = tl.minimum(min_logit, max_logit)

                # Second passes: Ternary search for pivots
                num_iters = 0
                k_pivot = float("inf")
                k_pivots_num = tl.zeros((), dtype=tl.uint32)
                min_larger = float("inf")
                num_min_larger = tl.zeros((), dtype=tl.uint32)
                if num_outliers > k:
                    # Enough candidates were gathered: search only BUFFER_ROW.
                    max_range = max_logit
                    min_range = outlier_pivot
                    search_range = tl.cast(num_outliers, tl.int32)
                    search_iters = tl.cast(
                        (num_outliers + BLOCK_SIZE_TRUNC - 1) // BLOCK_SIZE_TRUNC,
                        tl.int32,
                    )
                    found_pivot = 0
                    while found_pivot == 0:
                        k_pivot_0 = (max_range - min_range) * 1.0 / 3.0 + min_range
                        k_pivots_num_0 = tl.zeros((), dtype=tl.uint32)
                        min_larger_0 = float("inf")
                        num_min_larger_0 = tl.zeros((), dtype=tl.uint32)

                        k_pivot_1 = (max_range - min_range) * 2.0 / 3.0 + min_range
                        k_pivots_num_1 = tl.zeros((), dtype=tl.uint32)
                        min_larger_1 = float("inf")
                        num_min_larger_1 = tl.zeros((), dtype=tl.uint32)

                        # First pass: Calculate k_pivots_num and min_larger
                        for i in range(0, search_iters):
                            offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                0, BLOCK_SIZE_TRUNC
                            )
                            mask_n_2 = offs_n < search_range
                            logits_blk2 = tl.load(
                                BUFFER_ROW + offs_n, mask=mask_n_2, other=-float("inf")
                            )

                            k_pivots_num_0 += tl.sum(logits_blk2 > k_pivot_0)
                            k_pivots_num_1 += tl.sum(logits_blk2 > k_pivot_1)

                            # NOTE(review): unlike the top-p searches,
                            # min_larger_* here is not restricted to entries
                            # above the pivot, and padded lanes load -inf.
                            # Confirm this is intended.
                            min_larger_0 = tl.minimum(min_larger_0, tl.min(logits_blk2))
                            min_larger_1 = tl.minimum(min_larger_1, tl.min(logits_blk2))

                        # Second pass: Calculate num_min_larger
                        for i in range(0, search_iters):
                            offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                0, BLOCK_SIZE_TRUNC
                            )
                            mask_n_2 = offs_n < search_range
                            logits_blk2 = tl.load(
                                BUFFER_ROW + offs_n, mask=mask_n_2, other=-float("inf")
                            )

                            num_min_larger_0 += tl.sum(
                                tl.abs(logits_blk2 - min_larger_0) < 1e-9
                            )
                            num_min_larger_1 += tl.sum(
                                tl.abs(logits_blk2 - min_larger_1) < 1e-9
                            )

                        # Check if any of the pivots satisfy termination condition
                        if (
                            k_pivots_num_0 >= k
                            and k_pivots_num_0 - num_min_larger_0 < k
                        ):
                            k_pivot = k_pivot_0
                            k_pivots_num = k_pivots_num_0
                            min_larger = min_larger_0
                            num_min_larger = num_min_larger_0
                            found_pivot = 1
                        if (
                            k_pivots_num_1 >= k
                            and k_pivots_num_1 - num_min_larger_1 < k
                        ):
                            k_pivot = k_pivot_1
                            k_pivots_num = k_pivots_num_1
                            min_larger = min_larger_1
                            num_min_larger = num_min_larger_1
                            found_pivot = 1

                        # Update range
                        if k_pivots_num_1 > k:
                            min_range = k_pivot_1
                        elif k_pivots_num_0 > k:
                            min_range = k_pivot_0

                        if k_pivots_num_0 < k:
                            max_range = k_pivot_0
                        elif k_pivots_num_1 < k:
                            max_range = k_pivot_1

                        # Give up after 18 rounds or once the range collapses
                        # and fall back to the midpoint of the range.
                        num_iters += 1
                        if num_iters >= 18 or tl.abs(min_range - max_range) < 1e-9:
                            k_pivot = (max_range + min_range) / 2.0
                            found_pivot = 1
                else:
                    # If top-k outlier gathering failed, search whole logit space
                    max_range = max_logit
                    min_range = min_logit
                    found_pivot = 0
                    while found_pivot == 0:
                        k_pivot_0 = (max_range - min_range) * 1.0 / 4.0 + min_range
                        k_pivots_num_0 = tl.zeros((), dtype=tl.uint32)
                        min_larger_0 = float("inf")
                        num_min_larger_0 = tl.zeros((), dtype=tl.uint32)

                        k_pivot_1 = (max_range - min_range) * 2.0 / 4.0 + min_range
                        k_pivots_num_1 = tl.zeros((), dtype=tl.uint32)
                        min_larger_1 = float("inf")
                        num_min_larger_1 = tl.zeros((), dtype=tl.uint32)

                        # First pass: Calculate k_pivots_num and min_larger
                        for i in range(0, NUM_TILES):
                            offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                            mask_n = offs_n < VOCAB_SIZE
                            logits_blk2 = tl.load(
                                LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                            )

                            k_pivots_num_0 += tl.sum(logits_blk2 > k_pivot_0)
                            k_pivots_num_1 += tl.sum(logits_blk2 > k_pivot_1)

                            # Exclude -inf from min_larger to avoid
                            # poisoning the convergence check.
                            finite_blk2 = tl.where(
                                logits_blk2 > -float("inf"), logits_blk2, float("inf")
                            )
                            min_larger_0 = tl.minimum(min_larger_0, tl.min(finite_blk2))
                            min_larger_1 = tl.minimum(min_larger_1, tl.min(finite_blk2))

                        # Second pass: Calculate num_min_larger
                        for i in range(0, NUM_TILES):
                            offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                            mask_n = offs_n < VOCAB_SIZE
                            logits_blk2 = tl.load(
                                LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                            )

                            num_min_larger_0 += tl.sum(
                                tl.abs(logits_blk2 - min_larger_0) < 1e-9
                            )
                            num_min_larger_1 += tl.sum(
                                tl.abs(logits_blk2 - min_larger_1) < 1e-9
                            )

                        # Check if any of the pivots satisfy termination condition
                        if (
                            k_pivots_num_0 >= k
                            and k_pivots_num_0 - num_min_larger_0 < k
                        ):
                            k_pivot = k_pivot_0
                            k_pivots_num = k_pivots_num_0
                            min_larger = min_larger_0
                            num_min_larger = num_min_larger_0
                            found_pivot = 1
                        if (
                            k_pivots_num_1 >= k
                            and k_pivots_num_1 - num_min_larger_1 < k
                        ):
                            k_pivot = k_pivot_1
                            k_pivots_num = k_pivots_num_1
                            min_larger = min_larger_1
                            num_min_larger = num_min_larger_1
                            found_pivot = 1

                        # Update range
                        if k_pivots_num_1 > k:
                            min_range = k_pivot_1
                        elif k_pivots_num_0 > k:
                            min_range = k_pivot_0

                        if k_pivots_num_0 < k:
                            max_range = k_pivot_0
                        elif k_pivots_num_1 < k:
                            max_range = k_pivot_1

                        num_iters += 1
                        if num_iters >= 18 or tl.abs(min_range - max_range) < 1e-9:
                            k_pivot = (max_range + min_range) / 2.0
                            found_pivot = 1

                # Of the tied entries at `min_larger`, keep only as many as
                # are needed to reach exactly k survivors.
                duplicate_logit = min_larger
                num_duplicate_logit = num_min_larger
                num_keep = num_duplicate_logit - (k_pivots_num - k)
                num_kept = tl.zeros((), dtype=tl.uint32)

                # Top-k only path. If there are fewer finite values
                # than k (e.g. grammar mask), keep everything.
                final_pivot = k_pivot if num_finite_total > k else -float("inf")

                if TOPP_ENABLED and num_finite_total > k:
                    #### TOP-P SAMPLING AFTER TOP-K ####
                    p = tl.load(P + row_id)
                    if p < 1.0:
                        min_logit = k_pivot
                        sum_exp_logits = 0.0
                        num_outliers_2 = tl.zeros((), dtype=tl.uint32)
                        search_range = tl.cast(num_outliers, tl.int32)
                        search_iters = tl.cast(
                            (num_outliers + BLOCK_SIZE_TRUNC - 1) // BLOCK_SIZE_TRUNC,
                            tl.int32,
                        )

                        # Third pass: Calculate exp logits and sum, gather outliers
                        if num_outliers > k:
                            for i in range(0, search_iters):
                                offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                    0, BLOCK_SIZE_TRUNC
                                )
                                mask_n_2 = offs_n < search_range

                                probs_blk = tl.load(
                                    BUFFER_ROW + offs_n,
                                    mask=mask_n_2,
                                    other=-float("inf"),
                                )

                                outlier_mask = (probs_blk > min_logit) & mask_n_2

                                # Duplicate logit handling for Top-k
                                if num_keep < num_duplicate_logit:
                                    duplicate_mask = (
                                        tl.abs(probs_blk - duplicate_logit) < 1e-9
                                    )
                                    duplicate_count = (
                                        tl.cumsum(duplicate_mask) + num_kept
                                    )
                                    duplicate_keep_mask = (
                                        duplicate_count <= num_keep
                                    ) & duplicate_mask
                                    duplicate_remove_mask = (
                                        duplicate_mask & ~duplicate_keep_mask
                                    )
                                    outlier_mask = outlier_mask & (
                                        ~duplicate_remove_mask
                                    )
                                    num_kept += tl.sum(duplicate_keep_mask)

                                # Entries outside the top-k set contribute
                                # exp(-inf) = 0 to the softmax denominator.
                                probs_blk = tl.where(
                                    outlier_mask, probs_blk, -float("inf")
                                )
                                probs_blk = probs_blk - max_logit
                                probs_blk = tl.exp(probs_blk)
                                sum_exp_logits += tl.sum(probs_blk)

                            # Fourth pass: Calculate BUFFER and get outliers
                            for i in range(0, search_iters):
                                offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                    0, BLOCK_SIZE_TRUNC
                                )
                                mask_n_2 = offs_n < search_range

                                probs_blk = tl.load(
                                    BUFFER_ROW + offs_n,
                                    mask=mask_n_2,
                                    other=-float("inf"),
                                )

                                probs_blk = probs_blk - max_logit
                                probs_blk = tl.exp(probs_blk)
                                probs_blk = probs_blk / sum_exp_logits
                                tl.store(BUFFER_ROW + offs_n, probs_blk, mask=mask_n_2)
                        else:
                            # If top-k outlier gathering failed,
                            # retry gathering using top-k pivot
                            for i in range(0, NUM_TILES):
                                offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                                mask_n = offs_n < VOCAB_SIZE

                                probs_blk = tl.load(
                                    LOGITS_ROW + offs_n,
                                    mask=mask_n,
                                    other=-float("inf"),
                                )

                                outlier_mask = (probs_blk > min_logit) & mask_n

                                # Duplicate logit handling for Top-k
                                duplicate_mask = (
                                    tl.abs(probs_blk - duplicate_logit) < 1e-9
                                )
                                duplicate_count = tl.cumsum(duplicate_mask) + num_kept
                                duplicate_keep_mask = (
                                    duplicate_count <= num_keep
                                ) & duplicate_mask
                                duplicate_remove_mask = (
                                    duplicate_mask & ~duplicate_keep_mask
                                )
                                outlier_mask = outlier_mask & (~duplicate_remove_mask)
                                num_kept += tl.sum(duplicate_keep_mask)

                                probs_blk = tl.where(
                                    outlier_mask, probs_blk, -float("inf")
                                )
                                probs_blk = probs_blk - max_logit
                                probs_blk = tl.exp(probs_blk)
                                sum_exp_logits += tl.sum(probs_blk)

                                # Compact the unnormalized exp values of the
                                # surviving entries into BUFFER_ROW.
                                cumulative_pos = tl.cast(
                                    tl.cumsum(outlier_mask) - 1 + num_outliers_2,
                                    tl.int32,
                                )
                                num_outliers_2 += tl.sum(outlier_mask)
                                write_pos = tl.where(outlier_mask, cumulative_pos, -1)
                                tl.store(
                                    BUFFER_ROW + write_pos, probs_blk, mask=outlier_mask
                                )

                            search_range = tl.cast(num_outliers_2, tl.int32)
                            search_iters = tl.cast(
                                (num_outliers_2 + BLOCK_SIZE_TRUNC - 1)
                                // BLOCK_SIZE_TRUNC,
                                tl.int32,
                            )

                            # Fourth pass: Calculate BUFFER and get outliers
                            for i in range(0, search_iters):
                                offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                    0, BLOCK_SIZE_TRUNC
                                )
                                mask_n_2 = offs_n < search_range

                                probs_blk = tl.load(
                                    BUFFER_ROW + offs_n, mask=mask_n_2, other=0.0
                                )
                                probs_blk = probs_blk / sum_exp_logits
                                tl.store(BUFFER_ROW + offs_n, probs_blk, mask=mask_n_2)

                        # Search bounds in probability space: the probability
                        # of the max logit and of the top-k pivot.
                        max_range = tl.exp(max_logit - max_logit) / sum_exp_logits
                        min_range = tl.exp(min_logit - max_logit) / sum_exp_logits

                        p_pivot = 1.0
                        num_iters = 0
                        min_larger_prob = 1.0
                        num_min_larger = tl.zeros((), dtype=tl.uint32)
                        p_pivots_sum = 0.0

                        # Fifth passes: Search for p_pivot
                        found_pivot = 0
                        while found_pivot == 0:
                            p_pivot_0 = (max_range - min_range) * 1.0 / 3.0 + min_range
                            p_pivots_sum_0 = 0.0
                            min_larger_0 = 1.0
                            num_min_larger_0 = tl.zeros((), dtype=tl.uint32)

                            p_pivot_1 = (max_range - min_range) * 2.0 / 3.0 + min_range
                            p_pivots_sum_1 = 0.0
                            min_larger_1 = 1.0
                            num_min_larger_1 = tl.zeros((), dtype=tl.uint32)

                            # First pass: Calculate p_pivots_sum and min_larger
                            for i in range(0, search_iters):
                                offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                    0, BLOCK_SIZE_TRUNC
                                )
                                mask_n_2 = offs_n < search_range
                                probs_blk = tl.load(
                                    BUFFER_ROW + offs_n, mask=mask_n_2, other=0.0
                                )

                                p_pivots_sum_0 += tl.sum(
                                    probs_blk * (probs_blk > p_pivot_0)
                                )
                                masked_larger_0 = tl.where(
                                    probs_blk > p_pivot_0, probs_blk, 1.0
                                )
                                min_larger_0 = tl.minimum(
                                    min_larger_0, tl.min(masked_larger_0)
                                )

                                p_pivots_sum_1 += tl.sum(
                                    probs_blk * (probs_blk > p_pivot_1)
                                )
                                masked_larger_1 = tl.where(
                                    probs_blk > p_pivot_1, probs_blk, 1.0
                                )
                                min_larger_1 = tl.minimum(
                                    min_larger_1, tl.min(masked_larger_1)
                                )

                            # Second pass: Calculate num_min_larger
                            for i in range(0, search_iters):
                                offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                    0, BLOCK_SIZE_TRUNC
                                )
                                mask_n_2 = offs_n < search_range
                                probs_blk = tl.load(
                                    BUFFER_ROW + offs_n, mask=mask_n_2, other=0.0
                                )

                                num_min_larger_0 += tl.sum(
                                    tl.abs(probs_blk - min_larger_0) < 1e-9
                                )
                                num_min_larger_1 += tl.sum(
                                    tl.abs(probs_blk - min_larger_1) < 1e-9
                                )

                            # Check if any of the pivots satisfy termination condition
                            if p_pivots_sum_1 >= p and (
                                p_pivots_sum_1 - (min_larger_1 * num_min_larger_1) < p
                            ):
                                p_pivot = p_pivot_1
                                min_larger_prob = min_larger_1
                                num_min_larger = num_min_larger_1
                                p_pivots_sum = p_pivots_sum_1
                                found_pivot = 1
                            if p_pivots_sum_0 >= p and (
                                p_pivots_sum_0 - (min_larger_0 * num_min_larger_0) < p
                            ):
                                p_pivot = p_pivot_0
                                min_larger_prob = min_larger_0
                                num_min_larger = num_min_larger_0
                                p_pivots_sum = p_pivots_sum_0
                                found_pivot = 1

                            # Update range
                            if p_pivots_sum_1 > p:
                                min_range = p_pivot_1
                            elif p_pivots_sum_0 > p:
                                min_range = p_pivot_0

                            if p_pivots_sum_0 < p:
                                max_range = p_pivot_0
                            elif p_pivots_sum_1 < p:
                                max_range = p_pivot_1

                            num_iters += 1
                            if (max_range - min_range) < 1e-9 or num_iters >= 18:
                                p_pivot = (max_range + min_range) / 2.0
                                found_pivot = 1

                        # Map the tied probability back to logit space
                        # (probabilities here are relative to max_logit).
                        duplicate_logit = (
                            tl.log(min_larger_prob * sum_exp_logits) + max_logit
                        )
                        num_duplicate_logit = num_min_larger
                        num_keep = num_duplicate_logit - tl.cast(
                            (p_pivots_sum - p) / min_larger_prob, tl.uint32
                        )
                        num_kept = tl.zeros((), dtype=tl.uint32)

                        # Top-k + Top-p path
                        final_pivot = tl.log(p_pivot * sum_exp_logits) + max_logit

        # Runs when top-k produced no pivot for this row: top-k disabled,
        # k >= VOCAB_SIZE, or not more than k finite logits.
        if TOPP_ENABLED and final_pivot == -float("inf"):
            #### STANDALONE TOP-P SAMPLING ####
            p = tl.load(P + row_id)
            if p < 1.0:
                # Zeroth pass: Compute avg and std from a sample block
                offs = tl.arange(0, BLOCK_SIZE)
                mask_n = offs < VOCAB_SIZE
                logits_blk0 = tl.load(
                    LOGITS_ROW + offs, mask=mask_n, other=-float("inf")
                )
                # Exclude -inf values (e.g. from grammar bitmasks) from
                # statistics to avoid NaN in pivot computation.
                finite_mask = (logits_blk0 > -float("inf")) & mask_n
                num_finite = tl.sum(finite_mask)
                finite_logits = tl.where(finite_mask, logits_blk0, 0.0)
                avg_logit = tl.where(
                    num_finite > 0, tl.sum(finite_logits) / num_finite, 0.0
                )
                sq_avg_logit = tl.where(
                    num_finite > 0,
                    tl.sum(finite_logits * finite_logits) / num_finite,
                    0.0,
                )
                std_logit = tl.sqrt(
                    tl.maximum(sq_avg_logit - avg_logit * avg_logit, 0.0)
                )
                # Shift used for exp() on this path instead of the true row
                # maximum, which is only known after the first pass.
                max_sample = avg_logit + std_logit * 10.0
                sum_exp_logits = 0.0

                # First pass: compute max and min logits and sum_exp_logits
                for i in range(0, NUM_TILES):
                    offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                    mask_n = offs_n < VOCAB_SIZE
                    logits_blk = tl.load(
                        LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                    )
                    max_logit = tl.maximum(max_logit, tl.max(logits_blk))
                    # Exclude -inf from min to keep binary search bounds
                    # finite (avoids NaN pivots).
                    finite_blk = tl.where(
                        logits_blk > -float("inf"), logits_blk, float("inf")
                    )
                    min_logit = tl.minimum(min_logit, tl.min(finite_blk))

                    probs_blk = tl.exp(logits_blk - max_sample)
                    probs_blk = tl.where(mask_n, probs_blk, 0.0)
                    sum_exp_logits += tl.sum(probs_blk)

                # If no finite logits exist (all -inf), clamp min to
                # max so the search converges to -inf (no masking).
                min_logit = tl.minimum(min_logit, max_logit)

                idx = tl.cast(p * 200, tl.int32)
                idx = tl.maximum(0, tl.minimum(idx, 199))
                sigma = tl.load(NORMAL_CDF_TO_SIGMA_TABLE + idx)
                # Lower sigma by 25% of its magnitude so the pivot is lower
                # and more candidates are gathered.
                sigma = sigma + tl.abs(sigma) * -0.25
                outlier_pivot = avg_logit + std_logit * sigma

                outlier_prob = tl.exp(outlier_pivot - max_sample) / sum_exp_logits
                sum_outlier_probs = 0.0
                num_outliers = tl.zeros((), dtype=tl.uint32)

                # Second pass: Calculate softmax and gather outliers
                for i in range(0, NUM_TILES):
                    offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                    mask_n = offs_n < VOCAB_SIZE

                    probs_blk = tl.load(
                        LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                    )
                    probs_blk = tl.exp(probs_blk - max_sample)
                    probs_blk = probs_blk / sum_exp_logits

                    outlier_mask = (probs_blk > outlier_prob) & mask_n
                    sum_outlier_probs += tl.sum(outlier_mask * probs_blk)
                    cumulative_pos = tl.cast(
                        tl.cumsum(outlier_mask) - 1 + num_outliers, tl.int32
                    )
                    num_outliers += tl.sum(outlier_mask)
                    write_pos = tl.where(outlier_mask, cumulative_pos, -1)
                    tl.store(BUFFER_ROW + write_pos, probs_blk, mask=outlier_mask)

                max_range = tl.exp(max_logit - max_sample) / sum_exp_logits
                min_range = tl.exp(min_logit - max_sample) / sum_exp_logits

                p_pivot = 1.0
                num_iters = 0
                min_larger_prob = 1.0
                num_min_larger = tl.zeros((), dtype=tl.uint32)
                p_pivots_sum = 0.0

                # Third pass: Search for p_pivot
                if sum_outlier_probs > p:
                    # The gathered candidates already cover probability mass
                    # p, so the search can stay inside BUFFER_ROW.
                    min_range = outlier_prob
                    search_range = tl.cast(num_outliers, tl.int32)
                    search_iters = tl.cast(
                        (num_outliers + BLOCK_SIZE_TRUNC - 1) // BLOCK_SIZE_TRUNC,
                        tl.int32,
                    )

                    found_pivot = 0
                    while found_pivot == 0:
                        p_pivot_0 = (max_range - min_range) * 1.0 / 3.0 + min_range
                        p_pivots_sum_0 = 0.0
                        min_larger_0 = 1.0
                        num_min_larger_0 = tl.zeros((), dtype=tl.uint32)

                        p_pivot_1 = (max_range - min_range) * 2.0 / 3.0 + min_range
                        p_pivots_sum_1 = 0.0
                        min_larger_1 = 1.0
                        num_min_larger_1 = tl.zeros((), dtype=tl.uint32)

                        # First pass: Calculate p_pivots_sum and min_larger
                        for i in range(0, search_iters):
                            offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                0, BLOCK_SIZE_TRUNC
                            )
                            mask_n_2 = offs_n < search_range
                            probs_blk = tl.load(
                                BUFFER_ROW + offs_n, mask=mask_n_2, other=0.0
                            )

                            p_pivots_sum_0 += tl.sum(
                                probs_blk * (probs_blk > p_pivot_0)
                            )
                            masked_larger_0 = tl.where(
                                probs_blk > p_pivot_0, probs_blk, 1.0
                            )
                            min_larger_0 = tl.minimum(
                                min_larger_0, tl.min(masked_larger_0)
                            )

                            p_pivots_sum_1 += tl.sum(
                                probs_blk * (probs_blk > p_pivot_1)
                            )
                            masked_larger_1 = tl.where(
                                probs_blk > p_pivot_1, probs_blk, 1.0
                            )
                            min_larger_1 = tl.minimum(
                                min_larger_1, tl.min(masked_larger_1)
                            )

                        # Second pass: Calculate num_min_larger
                        for i in range(0, search_iters):
                            offs_n = i * BLOCK_SIZE_TRUNC + tl.arange(
                                0, BLOCK_SIZE_TRUNC
                            )
                            mask_n_2 = offs_n < search_range
                            probs_blk = tl.load(
                                BUFFER_ROW + offs_n, mask=mask_n_2, other=0.0
                            )

                            num_min_larger_0 += tl.sum(
                                tl.abs(probs_blk - min_larger_0) < 1e-9
                            )
                            num_min_larger_1 += tl.sum(
                                tl.abs(probs_blk - min_larger_1) < 1e-9
                            )

                        # Check if any of the pivots satisfy termination condition
                        if (
                            p_pivots_sum_1 >= p
                            and p_pivots_sum_1 - (min_larger_1 * num_min_larger_1) < p
                        ):
                            p_pivot = p_pivot_1
                            min_larger_prob = min_larger_1
                            num_min_larger = num_min_larger_1
                            p_pivots_sum = p_pivots_sum_1
                            found_pivot = 1
                        if (
                            p_pivots_sum_0 >= p
                            and p_pivots_sum_0 - (min_larger_0 * num_min_larger_0) < p
                        ):
                            p_pivot = p_pivot_0
                            min_larger_prob = min_larger_0
                            num_min_larger = num_min_larger_0
                            p_pivots_sum = p_pivots_sum_0
                            found_pivot = 1

                        # Update range
                        if p_pivots_sum_1 > p:
                            min_range = p_pivot_1
                        elif p_pivots_sum_0 > p:
                            min_range = p_pivot_0

                        if p_pivots_sum_0 < p:
                            max_range = p_pivot_0
                        elif p_pivots_sum_1 < p:
                            max_range = p_pivot_1

                        num_iters += 1
                        if (max_range - min_range) < 1e-9 or num_iters >= 18:
                            p_pivot = (max_range + min_range) / 2.0
                            found_pivot = 1
                else:
                    # Re-populate the buffer with full softmax probabilities
                    for i in range(0, NUM_TILES):
                        offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                        mask_n = offs_n < VOCAB_SIZE

                        probs_blk = tl.load(
                            LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                        )
                        probs_blk = tl.exp(probs_blk - max_sample)
                        probs_blk = probs_blk / sum_exp_logits
                        tl.store(BUFFER_ROW + offs_n, probs_blk, mask=mask_n)

                    found_pivot = 0
                    while found_pivot == 0:
                        p_pivot_0 = (max_range - min_range) * 1.0 / 3.0 + min_range
                        p_pivots_sum_0 = 0.0
                        min_larger_0 = 1.0
                        num_min_larger_0 = tl.zeros((), dtype=tl.uint32)

                        p_pivot_1 = (max_range - min_range) * 2.0 / 3.0 + min_range
                        p_pivots_sum_1 = 0.0
                        min_larger_1 = 1.0
                        num_min_larger_1 = tl.zeros((), dtype=tl.uint32)

                        # First pass: Calculate p_pivots_sum and min_larger
                        for i in range(0, NUM_TILES):
                            offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                            mask_n = offs_n < VOCAB_SIZE
                            probs_blk = tl.load(
                                BUFFER_ROW + offs_n, mask=mask_n, other=0.0
                            )

                            p_pivots_sum_0 += tl.sum(
                                probs_blk * (probs_blk > p_pivot_0)
                            )
                            masked_larger_0 = tl.where(
                                probs_blk > p_pivot_0, probs_blk, 1.0
                            )
                            min_larger_0 = tl.minimum(
                                min_larger_0, tl.min(masked_larger_0)
                            )

                            p_pivots_sum_1 += tl.sum(
                                probs_blk * (probs_blk > p_pivot_1)
                            )
                            masked_larger_1 = tl.where(
                                probs_blk > p_pivot_1, probs_blk, 1.0
                            )
                            min_larger_1 = tl.minimum(
                                min_larger_1, tl.min(masked_larger_1)
                            )

                        # Second pass: Calculate num_min_larger
                        for i in range(0, NUM_TILES):
                            offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                            mask_n = offs_n < VOCAB_SIZE
                            probs_blk = tl.load(
                                BUFFER_ROW + offs_n, mask=mask_n, other=0.0
                            )

                            num_min_larger_0 += tl.sum(
                                tl.abs(probs_blk - min_larger_0) < 1e-9
                            )
                            num_min_larger_1 += tl.sum(
                                tl.abs(probs_blk - min_larger_1) < 1e-9
                            )

                        # Check if any of the pivots satisfy termination condition
                        if (
                            p_pivots_sum_1 >= p
                            and p_pivots_sum_1 - (min_larger_1 * num_min_larger_1) < p
                        ):
                            p_pivot = p_pivot_1
                            min_larger_prob = min_larger_1
                            num_min_larger = num_min_larger_1
                            p_pivots_sum = p_pivots_sum_1
                            found_pivot = 1
                        if (
                            p_pivots_sum_0 >= p
                            and p_pivots_sum_0 - (min_larger_0 * num_min_larger_0) < p
                        ):
                            p_pivot = p_pivot_0
                            min_larger_prob = min_larger_0
                            num_min_larger = num_min_larger_0
                            p_pivots_sum = p_pivots_sum_0
                            found_pivot = 1

                        # Update range
                        if p_pivots_sum_1 > p:
                            min_range = p_pivot_1
                        elif p_pivots_sum_0 > p:
                            min_range = p_pivot_0

                        if p_pivots_sum_0 < p:
                            max_range = p_pivot_0
                        elif p_pivots_sum_1 < p:
                            max_range = p_pivot_1

                        num_iters += 1
                        if (max_range - min_range) < 1e-9 or num_iters >= 18:
                            p_pivot = (max_range + min_range) / 2.0
                            found_pivot = 1

                # Probabilities on this path are exp(logit - max_sample) /
                # sum_exp_logits, so the inverse mapping must add max_sample
                # (as final_pivot below does). Adding max_logit instead gave a
                # value that matches no logit in the row.
                duplicate_logit = tl.log(min_larger_prob * sum_exp_logits) + max_sample
                num_duplicate_logit = num_min_larger
                num_keep = num_duplicate_logit - tl.cast(
                    (p_pivots_sum - p) / min_larger_prob, tl.uint32
                )
                num_kept = tl.zeros((), dtype=tl.uint32)

                # Top-p only path
                final_pivot = tl.log(p_pivot * sum_exp_logits) + max_sample

        # Sixth pass: Apply mask and store final output.
        # If the pivot >= max logit (or is NaN), no token would
        # survive the strict `>` keep_mask. Skip masking.
        # Using `not <` instead of `>=` so that NaN is also caught.
        if not (final_pivot < max_logit):
            final_pivot = -float("inf")
        elif final_pivot != -float("inf"):
            for i in range(0, NUM_TILES):
                offs_n = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
                mask_n = offs_n < VOCAB_SIZE
                logits_blk = tl.load(
                    LOGITS_ROW + offs_n, mask=mask_n, other=-float("inf")
                )
                keep_mask = (logits_blk > final_pivot) & mask_n

                # Duplicate logit handling
                if num_keep < num_duplicate_logit:
                    duplicate_mask = (
                        tl.abs(logits_blk - duplicate_logit) < 1e-9
                    ) & mask_n
                    duplicate_count = tl.cumsum(duplicate_mask) + num_kept
                    # Keep only the first `num_keep` tied entries, the same
                    # rule the gathering passes use. Comparing against
                    # num_duplicate_logit would keep every tie and make this
                    # branch a no-op.
                    duplicate_keep_mask = (
                        duplicate_count <= num_keep
                    ) & duplicate_mask
                    duplicate_remove_mask = duplicate_mask & ~duplicate_keep_mask
                    num_kept += tl.sum(duplicate_keep_mask)
                    keep_mask = keep_mask & (~duplicate_remove_mask)

                logits_blk = tl.where(keep_mask, logits_blk, MASK_VALUE)
                tl.store(LOGITS_ROW + offs_n, logits_blk, mask=mask_n)
|
|
|
|
|
|
def apply_top_k_top_p_triton(
|
|
logits: torch.Tensor,
|
|
k: torch.Tensor | None,
|
|
p: torch.Tensor | None,
|
|
mask_value: float = float("-inf"),
|
|
) -> torch.Tensor:
|
|
"""
|
|
Apply combined top-k and top-p masking using Triton.
|
|
|
|
Top-k is applied first (by logit value), then top-p is applied
|
|
to the remaining k values (by probability).
|
|
|
|
Args:
|
|
logits: [batch_size, vocab_size] float32 tensor, modified in-place
|
|
k: [batch_size] int32 tensor of top-k values per row, or None to disable top-k
|
|
p: [batch_size] float32 tensor of top-p values per row (0 to 1),
|
|
or None to disable top-p
|
|
mask_value: Value for masked positions (default: -inf)
|
|
|
|
Returns:
|
|
The logits tensor (modified in-place)
|
|
"""
|
|
assert logits.ndim == 2
|
|
assert logits.dtype == torch.float32
|
|
|
|
batch_size, vocab_size = logits.shape
|
|
|
|
topk_enabled = k is not None
|
|
topp_enabled = p is not None
|
|
|
|
if batch_size == 0 or not (topk_enabled or topp_enabled):
|
|
return logits
|
|
|
|
if k is not None:
|
|
assert k.ndim == 1 and k.shape[0] == batch_size
|
|
k_ptr = k.to(torch.int32)
|
|
else:
|
|
k_ptr = logits # Dummy pointer (won't be read)
|
|
|
|
if p is not None:
|
|
assert p.ndim == 1 and p.shape[0] == batch_size
|
|
p_ptr = p.to(torch.float32)
|
|
else:
|
|
p_ptr = logits # Dummy pointer (won't be read)
|
|
|
|
num_sm = num_compute_units(logits.device.index)
|
|
NUM_PROGRAMS = min(num_sm, batch_size)
|
|
|
|
# Cache per-Triton Program buffer on each device.
|
|
buf_key = (logits.device, logits.dtype, vocab_size)
|
|
buffer = _TRITON_BUFFER_CACHE.get(buf_key)
|
|
if buffer is None or buffer.shape[0] < NUM_PROGRAMS:
|
|
size = min(next_power_of_2(NUM_PROGRAMS), num_sm)
|
|
buffer = logits.new_empty((size, vocab_size))
|
|
_TRITON_BUFFER_CACHE[buf_key] = buffer
|
|
if buffer.shape[0] > NUM_PROGRAMS:
|
|
buffer = buffer[:NUM_PROGRAMS]
|
|
|
|
# Cache lookup table entries on each device.
|
|
tables = _TRITON_TABLE_CACHE.get(logits.device)
|
|
if tables is None:
|
|
normal_cdf_to_sigma_table = logits.new_tensor(_NORMAL_CDF_TO_SIGMA_TABLE)
|
|
percentile_to_std_table = logits.new_tensor(_PERCENTILE_TO_STD_TABLE)
|
|
_TRITON_TABLE_CACHE[logits.device] = (
|
|
normal_cdf_to_sigma_table,
|
|
percentile_to_std_table,
|
|
)
|
|
else:
|
|
normal_cdf_to_sigma_table, percentile_to_std_table = tables
|
|
|
|
_topk_topp_kernel[(NUM_PROGRAMS,)](
|
|
logits,
|
|
buffer,
|
|
percentile_to_std_table,
|
|
normal_cdf_to_sigma_table,
|
|
k_ptr,
|
|
p_ptr,
|
|
BATCH_SIZE=batch_size,
|
|
MASK_VALUE=mask_value,
|
|
VOCAB_SIZE=vocab_size,
|
|
BLOCK_SIZE=8192,
|
|
BLOCK_SIZE_TRUNC=4096,
|
|
TOPK_ENABLED=topk_enabled,
|
|
TOPP_ENABLED=topp_enabled,
|
|
)
|
|
|
|
return logits
|
|
|
|
|
|
def reset_buffer_cache():
    """Empty the module-level buffer and table caches, then release CUDA cache."""
    # Drop our references first so the allocator can actually return the
    # cached tensors' memory when empty_cache() runs.
    for cache in (_TRITON_BUFFER_CACHE, _TRITON_TABLE_CACHE):
        cache.clear()
    torch.cuda.empty_cache()
|