Files
enginex-bi_150-vllm/v1/sample/ops/penalties.py
2026-03-05 18:06:10 +08:00

58 lines
1.9 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.utils import apply_penalties
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad
def apply_all_penalties(
logits: torch.Tensor,
prompt_token_ids: torch.Tensor,
presence_penalties: torch.Tensor,
frequency_penalties: torch.Tensor,
repetition_penalties: torch.Tensor,
output_token_ids: list[list[int]],
) -> torch.Tensor:
"""
Applies presence, frequency and repetition penalties to the logits.
"""
_, vocab_size = logits.shape
output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, logits.device)
# In the async scheduling case, rows that won't have penalties applied may contain
# -1 placeholder token ids. We must replace these with valid token ids so that the
# scatter done in apply_penalties is valid.
# NOTE(nick): The penalties implementation is currently quite inefficient and
# will be reworked anyhow.
output_tokens_t.masked_fill_(output_tokens_t == -1, vocab_size)
return apply_penalties(
logits,
prompt_token_ids,
output_tokens_t,
presence_penalties,
frequency_penalties,
repetition_penalties,
)
def _convert_to_tensors(
output_token_ids: list[list[int]], vocab_size: int, device: torch.device
) -> torch.Tensor:
"""
Convert the different list data structures to tensors.
"""
output_tokens_tensor = make_tensor_with_pad(
output_token_ids,
# Use the value of vocab_size as a pad since we don't have a
# token_id of this value.
pad=vocab_size,
device="cpu",
dtype=torch.int64,
pin_memory=is_pin_memory_available(),
)
return output_tokens_tensor.to(device, non_blocking=True)