Fuse more ops & Simplify token mapping (#1758)

Author: Lianmin Zheng
Date: 2024-10-22 23:20:43 -07:00
Committer: GitHub
Parent: 17536e7e3d
Commit: ad4125d1a9
9 changed files with 99 additions and 75 deletions

@@ -33,56 +33,61 @@ class Sampler(nn.Module):
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits
 
-        # Post process logits
-        logits = logits.contiguous()
-        logits.div_(sampling_info.temperatures)
-        probs = torch.softmax(logits, dim=-1)
-        logits = None
-        del logits
-
-        if self.use_nan_detectioin and torch.any(torch.isnan(probs)):
-            logger.warning("Detected errors during sampling! NaN in the probability.")
-            probs = torch.where(
-                torch.isnan(probs), torch.full_like(probs, 1e-10), probs
+        if self.use_nan_detectioin and torch.any(torch.isnan(logits)):
+            logger.warning("Detected errors during sampling! NaN in the logits.")
+            logits = torch.where(
+                torch.isnan(logits), torch.full_like(logits, -1e5), logits
             )
 
         if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
-            batch_next_token_ids = torch.argmax(probs, -1)
-        elif global_server_args_dict["sampling_backend"] == "flashinfer":
-            max_top_k_round, batch_size = 32, probs.shape[0]
-            uniform_samples = torch.rand(
-                (max_top_k_round, batch_size), device=probs.device
-            )
-            if sampling_info.need_min_p_sampling:
-                probs = top_k_renorm_prob(probs, sampling_info.top_ks)
-                probs = top_p_renorm_prob(probs, sampling_info.top_ps)
-                batch_next_token_ids, success = min_p_sampling_from_probs(
-                    probs, uniform_samples, sampling_info.min_ps
-                )
-            else:
-                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                    probs,
-                    uniform_samples,
-                    sampling_info.top_ks,
-                    sampling_info.top_ps,
-                    filter_apply_order="joint",
-                )
-
-            if not torch.all(success):
-                logger.warning("Detected errors during sampling!")
-                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
-        elif global_server_args_dict["sampling_backend"] == "pytorch":
-            # Here we provide a slower fallback implementation.
-            batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
-                probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
-            )
-        else:
-            raise ValueError(
-                f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-            )
+            batch_next_token_ids = torch.argmax(logits, -1)
+        else:
+            # Post process logits
+            logits.div_(sampling_info.temperatures)
+            probs = torch.softmax(logits, dim=-1)
+            logits = None
+            del logits
+
+            if global_server_args_dict["sampling_backend"] == "flashinfer":
+                max_top_k_round, batch_size = 32, probs.shape[0]
+                uniform_samples = torch.rand(
+                    (max_top_k_round, batch_size), device=probs.device
+                )
+                if sampling_info.need_min_p_sampling:
+                    probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                    probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                    batch_next_token_ids, success = min_p_sampling_from_probs(
+                        probs, uniform_samples, sampling_info.min_ps
+                    )
+                else:
+                    batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                        probs,
+                        uniform_samples,
+                        sampling_info.top_ks,
+                        sampling_info.top_ps,
+                        filter_apply_order="joint",
+                    )
+
+                if not torch.all(success):
+                    logger.warning("Detected errors during sampling!")
+                    batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+            elif global_server_args_dict["sampling_backend"] == "pytorch":
+                # A slower fallback implementation with torch native operations.
+                batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
+                    probs,
+                    sampling_info.top_ks,
+                    sampling_info.top_ps,
+                    sampling_info.min_ps,
+                )
+            else:
+                raise ValueError(
+                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
+                )
 
-        return batch_next_token_ids
+        return batch_next_token_ids.to(torch.int32)
 
def top_k_top_p_min_p_sampling_from_probs_torch(
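
Note on the change above (this note and the snippets below are not part of the commit; they are illustrative sketches with made-up shapes and helper names). When every request in the batch uses greedy sampling, the new code no longer pays for temperature scaling or softmax: dividing by a positive temperature and applying softmax are both monotonic per row, so argmax over the raw logits selects the same token as argmax over the scaled probabilities, and the all-greedy branch can return immediately. A minimal standalone check:

    import torch

    # Arbitrary shapes for illustration: 4 requests, vocab size 32000.
    logits = torch.randn(4, 32000)
    temperatures = torch.rand(4, 1) + 0.5  # positive per-request temperatures

    # Greedy pick straight from the logits (the new code path).
    greedy_from_logits = torch.argmax(logits, dim=-1)

    # Greedy pick after temperature scaling and softmax (the old code path).
    probs = torch.softmax(logits / temperatures, dim=-1)
    greedy_from_probs = torch.argmax(probs, dim=-1)

    # Both transforms are monotone, so the per-row argmax is unchanged
    # (ignoring exact floating-point ties).
    assert torch.equal(greedy_from_logits, greedy_from_probs)

For non-greedy requests, the flashinfer backend keeps the fused top-k/top-p/min-p kernels, and the "pytorch" backend falls back to top_k_top_p_min_p_sampling_from_probs_torch, whose body sits below this hunk. The sketch below only illustrates what such a torch-native filter-then-sample fallback does in general; the function name, masking order, and example tensors are assumptions, not the repository's implementation:

    import torch

    def topk_topp_minp_sample_sketch(probs, top_ks, top_ps, min_ps):
        # Sort each row's probabilities, highest first.
        sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
        cum_probs = sorted_probs.cumsum(dim=-1)

        # top-p: drop tokens once the mass of strictly better tokens exceeds top_p.
        top_p_mask = (cum_probs - sorted_probs) > top_ps.unsqueeze(-1)
        # top-k: drop tokens ranked at or beyond the per-request top_k.
        ranks = torch.arange(probs.shape[-1], device=probs.device)
        top_k_mask = ranks.unsqueeze(0) >= top_ks.unsqueeze(-1)
        # min-p: drop tokens below min_p times the per-row maximum probability.
        min_p_mask = sorted_probs < sorted_probs[:, :1] * min_ps.unsqueeze(-1)

        # Zero out filtered tokens, renormalize, sample, and map back to vocab ids.
        sorted_probs = sorted_probs.masked_fill(top_p_mask | top_k_mask | min_p_mask, 0.0)
        sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
        sampled = torch.multinomial(sorted_probs, num_samples=1)
        return sorted_idx.gather(-1, sampled).squeeze(-1)

    probs = torch.softmax(torch.randn(4, 32000), dim=-1)
    token_ids = topk_topp_minp_sample_sketch(
        probs,
        top_ks=torch.tensor([50, 50, 1, 20]),
        top_ps=torch.tensor([0.9, 0.8, 1.0, 0.95]),
        min_ps=torch.tensor([0.0, 0.05, 0.0, 0.1]),
    )

Either way the ids are cast to torch.int32 on return, so callers see one dtype regardless of which branch produced them (torch.argmax yields int64, while the sampling kernels may already emit int32).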