@@ -1145,9 +1145,7 @@ class ModelRunner:
|
||||
[self.sample(values, forward_batch) for values in logits_output],
|
||||
axis=-1,
|
||||
)
|
||||
sampling_info = forward_batch.sampling_info
|
||||
if sampling_info.thinking_budgets is not None:
|
||||
sampling_info.apply_thinking_budgets(logits_output.next_token_logits)
|
||||
|
||||
self._preprocess_logits(logits_output, forward_batch.sampling_info)
|
||||
|
||||
# Sample the next tokens
|
||||
@@ -1158,8 +1156,6 @@ class ModelRunner:
|
||||
forward_batch.top_logprobs_nums,
|
||||
forward_batch.token_ids_logprobs,
|
||||
)
|
||||
if sampling_info.thinking_budgets is not None:
|
||||
sampling_info.update_thinking_budgets(next_token_ids)
|
||||
return next_token_ids
|
||||
|
||||
@property
|
||||
|
||||
@@ -529,7 +529,6 @@ def v1_generate_request(
|
||||
"temperature": request.temperature,
|
||||
"max_new_tokens": request.max_tokens,
|
||||
"min_new_tokens": request.min_tokens,
|
||||
"thinking_budget": request.thinking_budget,
|
||||
"stop": request.stop,
|
||||
"stop_token_ids": request.stop_token_ids,
|
||||
"top_p": request.top_p,
|
||||
@@ -1102,7 +1101,6 @@ def v1_chat_generate_request(
|
||||
"temperature": request.temperature,
|
||||
"max_new_tokens": request.max_tokens or request.max_completion_tokens,
|
||||
"min_new_tokens": request.min_tokens,
|
||||
"thinking_budget": request.thinking_budget,
|
||||
"stop": stop,
|
||||
"stop_token_ids": request.stop_token_ids,
|
||||
"top_p": request.top_p,
|
||||
|
||||
@@ -172,7 +172,6 @@ class CompletionRequest(BaseModel):
|
||||
top_k: int = -1
|
||||
min_p: float = 0.0
|
||||
min_tokens: int = 0
|
||||
thinking_budget: Optional[int] = None
|
||||
json_schema: Optional[str] = None
|
||||
regex: Optional[str] = None
|
||||
ebnf: Optional[str] = None
|
||||
@@ -351,13 +350,6 @@ class ChatCompletionRequest(BaseModel):
|
||||
description="The maximum number of completion tokens for a chat completion request, "
|
||||
"including visible output tokens and reasoning tokens. Input tokens are not included. ",
|
||||
)
|
||||
thinking_budget: Optional[int] = Field(
|
||||
default=None,
|
||||
description="The maximum number of reasoning tokens that can be generated for a request. "
|
||||
"This setting of does not affect the thinking process of models. "
|
||||
"If the number of tokens generated by the model's thinking process exceeds thinking_budget, "
|
||||
"the reasoning content will be truncated and the final response content will be generated immediately.",
|
||||
)
|
||||
n: int = 1
|
||||
presence_penalty: float = 0.0
|
||||
response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
|
||||
|
||||
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
|
||||
One-time parsing: Detects and parses reasoning sections in the provided text.
|
||||
Returns both reasoning content and normal text separately.
|
||||
"""
|
||||
text = text.replace(self.think_start_token, "")
|
||||
text = text.replace(self.think_start_token, "").strip()
|
||||
if self.think_end_token not in text:
|
||||
# Assume reasoning was truncated before `</think>` token
|
||||
return StreamingParseResult(reasoning_text=text)
|
||||
@@ -73,7 +73,7 @@ class BaseReasoningFormatDetector:
|
||||
normal_text = current_text[end_idx + len(self.think_end_token) :]
|
||||
|
||||
return StreamingParseResult(
|
||||
normal_text=normal_text, reasoning_text=reasoning_text
|
||||
normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
|
||||
)
|
||||
|
||||
# Continue with reasoning content
|
||||
|
||||
@@ -30,13 +30,8 @@ class SamplingBatchInfo:
|
||||
# Whether any request needs min_p sampling
|
||||
need_min_p_sampling: bool
|
||||
|
||||
# Use thinking_budget to truncate thinking
|
||||
num_thinking_tokens: Optional[torch.Tensor] = None
|
||||
think_end_ids: Optional[torch.Tensor] = None
|
||||
thinking_budgets: Optional[torch.Tensor] = None
|
||||
|
||||
# Masking tensors for grammar-guided structured outputs
|
||||
vocab_size: int = 0
|
||||
vocab_size: int
|
||||
grammars: Optional[List] = None
|
||||
vocab_mask: Optional[torch.Tensor] = None
|
||||
apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
|
||||
@@ -81,22 +76,7 @@ class SamplingBatchInfo:
|
||||
min_ps = torch.tensor(
|
||||
[r.sampling_params.min_p for r in reqs], dtype=torch.float
|
||||
).to(device, non_blocking=True)
|
||||
if any(hasattr(r.tokenizer, "think_end_id") for r in reqs):
|
||||
think_end_ids = torch.tensor(
|
||||
[getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
|
||||
dtype=torch.int64,
|
||||
).to(device, non_blocking=True)
|
||||
num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
|
||||
device, non_blocking=True
|
||||
)
|
||||
thinking_budgets = torch.tensor(
|
||||
[r.sampling_params.thinking_budget or -1 for r in reqs],
|
||||
dtype=torch.int64,
|
||||
).to(device, non_blocking=True)
|
||||
else:
|
||||
think_end_ids = None
|
||||
num_thinking_tokens = None
|
||||
thinking_budgets = None
|
||||
|
||||
# Check if any request has custom logit processor
|
||||
has_custom_logit_processor = (
|
||||
batch.enable_custom_logit_processor # check the flag first.
|
||||
@@ -152,9 +132,6 @@ class SamplingBatchInfo:
|
||||
top_ps=top_ps,
|
||||
top_ks=top_ks,
|
||||
min_ps=min_ps,
|
||||
think_end_ids=think_end_ids,
|
||||
num_thinking_tokens=num_thinking_tokens,
|
||||
thinking_budgets=thinking_budgets,
|
||||
is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
|
||||
need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
|
||||
vocab_size=vocab_size,
|
||||
@@ -169,35 +146,6 @@ class SamplingBatchInfo:
|
||||
def __len__(self):
|
||||
return len(self.temperatures)
|
||||
|
||||
def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
    """Force the think-end token for requests that exhausted their budget.

    Every call advances ``self.num_thinking_tokens`` by one for each row
    whose ``self.thinking_budgets`` entry is positive.  A row is then
    forced to stop thinking once ``num_thinking_tokens - 1`` exceeds its
    budget: all of its logits are set to ``-inf`` except the entry at its
    ``self.think_end_ids`` index, which is set to ``0.0``.

    Args:
        next_token_logits: 2-D logits indexed as ``[row, token_id]``, with
            one row per entry of ``self.thinking_budgets``.  Modified in
            place.

    Returns:
        None.  ``self.num_thinking_tokens`` and ``next_token_logits`` are
        updated in place.
    """
    has_budget = self.thinking_budgets > 0
    if not has_budget.any():
        return

    # In-place so the counter tensor keeps its identity across steps.
    self.num_thinking_tokens.add_(has_budget.to(self.num_thinking_tokens.dtype))

    should_stop = (
        has_budget
        & (self.num_thinking_tokens - 1 > self.thinking_budgets)
        # -1 is the placeholder for tokenizers without a think-end token;
        # indexing with it would force the *last* vocabulary entry.
        & (self.think_end_ids >= 0)
    )

    # The mask must be (batch, 1) so it broadcasts across the vocabulary
    # dimension; a (1, batch) mask either fails to broadcast or masks
    # columns instead of rows.
    next_token_logits.masked_fill_(should_stop.unsqueeze(-1), float("-inf"))

    batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
    if batch_indices.numel() > 0:
        end_token_indices = self.think_end_ids[batch_indices]
        next_token_logits[batch_indices, end_token_indices] = 0.0
|
||||
|
||||
def update_thinking_budgets(self, next_token_ids: torch.Tensor):
    """Clear the budget of every request that just emitted its think-end token.

    Rows where ``next_token_ids`` equals ``self.think_end_ids`` get their
    ``self.thinking_budgets`` entry overwritten with ``-1``; all other rows
    are left unchanged.  Nothing happens when no row has a positive budget.

    Args:
        next_token_ids: Sampled token ids, compared element-wise against
            ``self.think_end_ids``.
    """
    budgets = self.thinking_budgets
    if not (budgets > 0).any():
        return

    finished_thinking = next_token_ids == self.think_end_ids
    # In-place write keeps the same tensor object on ``self``.
    budgets.masked_fill_(finished_thinking, -1)
|
||||
|
||||
def update_regex_vocab_mask(self):
|
||||
if not self.grammars:
|
||||
self.vocab_mask = None
|
||||
|
||||
@@ -30,7 +30,6 @@ class SamplingParams:
|
||||
def __init__(
|
||||
self,
|
||||
max_new_tokens: int = 128,
|
||||
thinking_budget: Optional[int] = None,
|
||||
stop: Optional[Union[str, List[str]]] = None,
|
||||
stop_token_ids: Optional[List[int]] = None,
|
||||
temperature: float = 1.0,
|
||||
@@ -58,7 +57,6 @@ class SamplingParams:
|
||||
self.stop_token_ids = set(stop_token_ids)
|
||||
else:
|
||||
self.stop_token_ids = None
|
||||
self.thinking_budget = thinking_budget
|
||||
self.temperature = temperature
|
||||
self.top_p = top_p
|
||||
self.top_k = top_k
|
||||
|
||||
Reference in New Issue
Block a user