# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import TokenizerLike


class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3/Qwen3.5 model family.

    The Qwen3 model family uses <think>...</think> tokens to denote reasoning
    text. Starting with Qwen3.5, the chat template places <think> in the
    prompt, so only </think> appears in the generated output.

    The model provides a strict switch to disable reasoning output via the
    'enable_thinking=False' parameter. When thinking is disabled, the template
    places <think>\\n\\n</think>\\n\\n in the prompt. The serving layer detects
    this via prompt_is_reasoning_end and routes deltas as content without
    calling the streaming parser.

    NOTE: Models up to the 2507 release (e.g.,
    Qwen/Qwen3-235B-A22B-Instruct-2507) use an older chat template where the
    model generates <think> itself. This parser handles both styles: if
    <think> appears in the generated output, it is stripped before extraction
    (non-streaming) or skipped (streaming).
    """

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
        # Qwen3 defaults to thinking enabled; only treat output as
        # pure content when the user explicitly disables it.
        self.thinking_enabled = chat_kwargs.get("enable_thinking", True)

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.

        The <think> token is placed in the prompt by the chat template, so
        typically only </think> appears in the generated output. If <think>
        is present (e.g. from a different template), it is stripped before
        extraction.

        When thinking is explicitly disabled and no </think> appears, all
        output is content: returns (None, model_output). Otherwise (thinking
        enabled, the default), a missing </think> means the output was
        truncated and everything is reasoning: returns (model_output, None).

        Returns:
            tuple[str | None, str | None]: reasoning content and content
        """
        # Strip <think> if present in the generated output.
        model_output_parts = model_output.partition(self.start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )

        if self.end_token not in model_output:
            if not self.thinking_enabled:
                # Thinking explicitly disabled: treat everything as content.
                return None, model_output
            # Thinking enabled but no </think>: output was truncated.
            # Everything generated so far is reasoning.
            return model_output, None

        # Split reasoning from content at the first </think>.
        reasoning, _, content = model_output.partition(self.end_token)
        final_content = content or None
        return reasoning, final_content

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a streaming delta.
        Since <think> is placed in the prompt by the chat template, all
        generated tokens before </think> are reasoning and tokens after it
        are content.

        NOTE: When thinking is disabled, no think tokens appear in the
        generated output. The serving layer detects this via
        prompt_is_reasoning_end and routes deltas as content without calling
        this method.
        """
        # Strip <think> from the delta if present (old template / edge case
        # where the model generates <think> itself).
        if self.start_token_id in delta_token_ids:
            start_idx = delta_text.find(self.start_token)
            if start_idx >= 0:
                delta_text = delta_text[start_idx + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
            # End token in this delta: split reasoning from content.
            end_index = delta_text.find(self.end_token)
            if end_index >= 0:
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                if not reasoning and not content:
                    return None
                return DeltaMessage(
                    reasoning=reasoning if reasoning else None,
                    content=content if content else None,
                )
            # end_token_id is in the IDs but not in the text (already
            # stripped).
            return None

        # No end token in this delta.
        if not delta_text:
            # Nothing left after stripping the start token.
            return None
        elif self.end_token_id in previous_token_ids:
            # End token already passed: everything is content now.
            return DeltaMessage(content=delta_text)
        else:
            # No end token yet: still in the reasoning phase.
            return DeltaMessage(reasoning=delta_text)
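

# ---------------------------------------------------------------------------
# Illustrative sketch only (not used by vLLM): the non-streaming split that
# extract_reasoning performs, reduced to plain string operations so it can be
# checked without a tokenizer or request object. The _example_split name and
# the hard-coded token strings are for demonstration; the parser itself reads
# them from the start_token/end_token properties above.
def _example_split(
    model_output: str, thinking_enabled: bool = True
) -> tuple[str | None, str | None]:
    start, end = "<think>", "</think>"
    # Drop a leading <think> if an older template let the model emit it.
    before, sep, after = model_output.partition(start)
    model_output = after if sep else before
    if end not in model_output:
        # No </think>: content if thinking is disabled, otherwise truncated
        # reasoning.
        return (None, model_output) if not thinking_enabled else (model_output, None)
    reasoning, _, content = model_output.partition(end)
    return reasoning, content or None


if __name__ == "__main__":
    # ("I should greet.", "Hello!")
    print(_example_split("I should greet.</think>Hello!"))
    # Truncated output with thinking enabled: everything is reasoning.
    print(_example_split("Still thinking"))
    # Thinking explicitly disabled: everything is content.
    print(_example_split("Just an answer.", thinking_enabled=False))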