""" Reasoning parser for Qwen3 / Qwen3.5 / Qwen3.6 model family. Adapted from vllm-original/vllm/reasoning/qwen3_reasoning_parser.py. The model uses ... to wrap chain-of-thought output. For Qwen3.5+ the chat template injects into the prompt, so only appears in the generated tokens; older templates generate themselves. Both styles are handled. """ from typing import Optional, Sequence, Any from vllm.reasoning.abs_reasoning_parsers import ( BaseThinkingReasoningParser, ReasoningParserManager, ) class Qwen3ReasoningParser(BaseThinkingReasoningParser): def __init__(self, tokenizer: Any, *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {} self.thinking_enabled = chat_kwargs.get("enable_thinking", True) @property def start_token(self) -> str: return "" @property def end_token(self) -> str: return "" def extract_reasoning( self, model_output: str, request: Any ) -> "tuple[Optional[str], Optional[str]]": # Strip if the model generated it (old template / edge case). parts = model_output.partition(self.start_token) model_output = parts[2] if parts[1] else parts[0] if self.end_token not in model_output: if not self.thinking_enabled: return None, model_output # Thinking enabled but output truncated before . return model_output, None reasoning, _, content = model_output.partition(self.end_token) return reasoning, content or None def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int: token_ids = list(token_ids) if self.start_token_id in token_ids: # Old-style template: model generates itself. # Use depth-counting from the base class. return super().count_reasoning_tokens(token_ids) elif self.end_token_id in token_ids: # New-style template (Qwen3.5+): is injected into the # prompt, so output starts already inside the thinking block. # Every token before is a reasoning token. return token_ids.index(self.end_token_id) else: # No in output: either truncated (all reasoning) # or thinking disabled (none). return len(token_ids) if self.thinking_enabled else 0 def extract_reasoning_streaming( self, previous_text: str, current_text: str, delta_text: str, previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], ): from vllm.entrypoints.openai.protocol import DeltaMessage if not self.thinking_enabled: return DeltaMessage(content=delta_text) if delta_text else None # Strip from delta if the model generates it itself. if self.start_token_id in delta_token_ids: start_idx = delta_text.find(self.start_token) if start_idx >= 0: delta_text = delta_text[start_idx + len(self.start_token):] if self.end_token_id in delta_token_ids: end_idx = delta_text.find(self.end_token) if end_idx >= 0: reasoning = delta_text[:end_idx] content = delta_text[end_idx + len(self.end_token):] if not reasoning and not content: return None return DeltaMessage( reasoning_content=reasoning or None, content=content or None, ) return None if not delta_text: return None elif self.end_token_id in previous_token_ids: return DeltaMessage(content=delta_text) else: return DeltaMessage(reasoning_content=delta_text) # Register immediately when this module is imported. ReasoningParserManager.register_module("qwen3", Qwen3ReasoningParser)