109 lines
4.0 KiB
Python
109 lines
4.0 KiB
Python
"""
|
|
Reasoning parser for Qwen3 / Qwen3.5 / Qwen3.6 model family.
|
|
Adapted from vllm-original/vllm/reasoning/qwen3_reasoning_parser.py.
|
|
|
|
The model uses <think>...</think> to wrap chain-of-thought output.
|
|
For Qwen3.5+ the chat template injects <think> into the prompt, so only
|
|
</think> appears in the generated tokens; older templates generate <think>
|
|
themselves. Both styles are handled.
|
|
"""
|
|
|
|
from typing import Optional, Sequence, Any
|
|
|
|
from vllm.reasoning.abs_reasoning_parsers import (
|
|
BaseThinkingReasoningParser,
|
|
ReasoningParserManager,
|
|
)
|
|
|
|
|
|
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
|
|
|
|
def __init__(self, tokenizer: Any, *args, **kwargs):
|
|
super().__init__(tokenizer, *args, **kwargs)
|
|
chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
|
|
self.thinking_enabled = chat_kwargs.get("enable_thinking", True)
|
|
|
|
@property
|
|
def start_token(self) -> str:
|
|
return "<think>"
|
|
|
|
@property
|
|
def end_token(self) -> str:
|
|
return "</think>"
|
|
|
|
def extract_reasoning(
|
|
self, model_output: str, request: Any
|
|
) -> "tuple[Optional[str], Optional[str]]":
|
|
# Strip <think> if the model generated it (old template / edge case).
|
|
parts = model_output.partition(self.start_token)
|
|
model_output = parts[2] if parts[1] else parts[0]
|
|
|
|
if self.end_token not in model_output:
|
|
if not self.thinking_enabled:
|
|
return None, model_output
|
|
# Thinking enabled but output truncated before </think>.
|
|
return model_output, None
|
|
|
|
reasoning, _, content = model_output.partition(self.end_token)
|
|
return reasoning, content or None
|
|
|
|
def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int:
|
|
token_ids = list(token_ids)
|
|
if self.start_token_id in token_ids:
|
|
# Old-style template: model generates <think> itself.
|
|
# Use depth-counting from the base class.
|
|
return super().count_reasoning_tokens(token_ids)
|
|
elif self.end_token_id in token_ids:
|
|
# New-style template (Qwen3.5+): <think> is injected into the
|
|
# prompt, so output starts already inside the thinking block.
|
|
# Every token before </think> is a reasoning token.
|
|
return token_ids.index(self.end_token_id)
|
|
else:
|
|
# No </think> in output: either truncated (all reasoning)
|
|
# or thinking disabled (none).
|
|
return len(token_ids) if self.thinking_enabled else 0
|
|
|
|
def extract_reasoning_streaming(
|
|
self,
|
|
previous_text: str,
|
|
current_text: str,
|
|
delta_text: str,
|
|
previous_token_ids: Sequence[int],
|
|
current_token_ids: Sequence[int],
|
|
delta_token_ids: Sequence[int],
|
|
):
|
|
from vllm.entrypoints.openai.protocol import DeltaMessage
|
|
|
|
if not self.thinking_enabled:
|
|
return DeltaMessage(content=delta_text) if delta_text else None
|
|
|
|
# Strip <think> from delta if the model generates it itself.
|
|
if self.start_token_id in delta_token_ids:
|
|
start_idx = delta_text.find(self.start_token)
|
|
if start_idx >= 0:
|
|
delta_text = delta_text[start_idx + len(self.start_token):]
|
|
|
|
if self.end_token_id in delta_token_ids:
|
|
end_idx = delta_text.find(self.end_token)
|
|
if end_idx >= 0:
|
|
reasoning = delta_text[:end_idx]
|
|
content = delta_text[end_idx + len(self.end_token):]
|
|
if not reasoning and not content:
|
|
return None
|
|
return DeltaMessage(
|
|
reasoning_content=reasoning or None,
|
|
content=content or None,
|
|
)
|
|
return None
|
|
|
|
if not delta_text:
|
|
return None
|
|
elif self.end_token_id in previous_token_ids:
|
|
return DeltaMessage(content=delta_text)
|
|
else:
|
|
return DeltaMessage(reasoning_content=delta_text)
|
|
|
|
|
|
# Register immediately when this module is imported.
|
|
ReasoningParserManager.register_module("qwen3", Qwen3ReasoningParser)
|