# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np import torch from vllm.sampling_params import SamplingParams from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p from vllm.v1.worker.gpu.buffer_utils import UvaBackedTensor from vllm.v1.worker.gpu.sample.gumbel import apply_temperature from vllm.v1.worker.gpu.sample.min_p import apply_min_p NO_LOGPROBS = -1 _NP_INT64_MIN = np.iinfo(np.int64).min _NP_INT64_MAX = np.iinfo(np.int64).max class SamplingStates: def __init__(self, max_num_reqs: int, vocab_size: int): self.max_num_reqs = max_num_reqs self.vocab_size = vocab_size self.temperature = UvaBackedTensor(max_num_reqs, dtype=torch.float32) self.top_k = UvaBackedTensor(max_num_reqs, dtype=torch.int32) self.top_p = UvaBackedTensor(max_num_reqs, dtype=torch.float32) self.min_p = UvaBackedTensor(max_num_reqs, dtype=torch.float32) self.seeds = UvaBackedTensor(max_num_reqs, dtype=torch.int64) # Initialize top_k and top_p manually because 0 is an invalid value for them. self.top_k.np.fill(self.vocab_size) self.top_k.copy_to_uva() self.top_p.np.fill(1.0) self.top_p.copy_to_uva() self.num_logprobs = np.empty(self.max_num_reqs, dtype=np.int32) # -1 means no logprobs are requested. self.num_logprobs.fill(NO_LOGPROBS) def add_request(self, req_idx: int, sampling_params: SamplingParams) -> None: self.temperature.np[req_idx] = sampling_params.temperature self.top_p.np[req_idx] = sampling_params.top_p top_k = sampling_params.top_k if top_k <= 0 or top_k > self.vocab_size: top_k = self.vocab_size self.top_k.np[req_idx] = top_k self.min_p.np[req_idx] = sampling_params.min_p seed = sampling_params.seed if seed is None: seed = np.random.randint(_NP_INT64_MIN, _NP_INT64_MAX) self.seeds.np[req_idx] = seed num_logprobs = sampling_params.logprobs if num_logprobs is None: num_logprobs = NO_LOGPROBS self.num_logprobs[req_idx] = num_logprobs def apply_staged_writes(self) -> None: self.temperature.copy_to_uva() self.top_p.copy_to_uva() self.top_k.copy_to_uva() self.min_p.copy_to_uva() self.seeds.copy_to_uva() def apply_temperature( self, logits: torch.Tensor, expanded_idx_mapping: torch.Tensor, idx_mapping_np: np.ndarray, ) -> None: temp_np = self.temperature.np[idx_mapping_np] if np.all((temp_np == 0.0) | (temp_np == 1.0)): # No request requires temperature. Skip the kernel launch. return apply_temperature(logits, expanded_idx_mapping, self.temperature.gpu) def apply_min_p( self, logits: torch.Tensor, expanded_idx_mapping: torch.Tensor, idx_mapping_np: np.ndarray, ) -> None: if np.all(self.min_p.np[idx_mapping_np] == 0.0): # No request uses min_p. Skip the kernel launch. return apply_min_p(logits, expanded_idx_mapping, self.min_p.gpu) def apply_top_k_top_p( self, logits: torch.Tensor, expanded_idx_mapping: torch.Tensor, idx_mapping_np: np.ndarray, ) -> torch.Tensor: do_top_k = np.any(self.top_k.np[idx_mapping_np] != self.vocab_size) do_top_p = np.any(self.top_p.np[idx_mapping_np] != 1.0) if not (do_top_k or do_top_p): return logits top_k = self.top_k.gpu[expanded_idx_mapping] if do_top_k else None top_p = self.top_p.gpu[expanded_idx_mapping] if do_top_p else None return apply_top_k_top_p(logits, top_k, top_p) def max_num_logprobs(self, idx_mapping_np: np.ndarray) -> int: return int(np.max(self.num_logprobs[idx_mapping_np]))