diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 4ccb2197c..4694b0823 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -101,19 +101,16 @@ "\n", "**Reasoning Parser Options:**\n", "- `--reasoning-parser deepseek-r1`: For DeepSeek-R1 family models (R1, R1-0528, R1-Distill)\n", - "- `--reasoning-parser qwen3`: For standard Qwen3 models that support `enable_thinking` parameter\n", - "- `--reasoning-parser qwen3-thinking`: For Qwen3-Thinking models (e.g., Qwen/Qwen3-235B-A22B-Thinking-2507) that always generate thinking content\n", + "- `--reasoning-parser qwen3`: For both standard Qwen3 models that support `enable_thinking` parameter and Qwen3-Thinking models\n", + "- `--reasoning-parser qwen3-thinking`: For Qwen3-Thinking models, force reasoning version of qwen3 parser\n", "- `--reasoning-parser kimi`: For Kimi thinking models\n", "\n", "Here's an example demonstrating how to enable thinking and retrieve the reasoning content separately (using `separate_reasoning: True`):\n", "\n", "```python\n", - "# For standard Qwen3 models with enable_thinking support:\n", + "# For Qwen3 models with enable_thinking support:\n", "# python3 -m sglang.launch_server --model-path QwQ/Qwen3-32B-250415 --reasoning-parser qwen3 ...\n", "\n", - "# For Qwen3-Thinking models that always think:\n", - "# python3 -m sglang.launch_server --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 --reasoning-parser qwen3-thinking ...\n", - "\n", "from openai import OpenAI\n", "\n", "# Modify OpenAI's API key and API base to use SGLang's API server.\n", @@ -132,7 +129,7 @@ " model=model,\n", " messages=messages,\n", " extra_body={\n", - " \"chat_template_kwargs\": {\"enable_thinking\": True}, # Only for standard Qwen3 models\n", + " \"chat_template_kwargs\": {\"enable_thinking\": True},\n", " \"separate_reasoning\": True\n", " }\n", ")\n", @@ -158,7 +155,7 @@ "\n", "Setting `\"enable_thinking\": 
False` (or omitting it) will result in `reasoning_content` being `None`.\n", "\n", - "**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. When using `--reasoning-parser qwen3-thinking`, the model will always produce reasoning content regardless of the `enable_thinking` setting.\n", + "**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. Use `--reasoning-parser qwen3-thinking` or `--reasoning-parser qwen3` to parse the thinking content.\n", "\n", "Here is an example of a detailed chat completion request using standard OpenAI parameters:" ] diff --git a/docs/backend/separate_reasoning.ipynb b/docs/backend/separate_reasoning.ipynb index cd0ab23c4..aae7dcef9 100644 --- a/docs/backend/separate_reasoning.ipynb +++ b/docs/backend/separate_reasoning.ipynb @@ -14,7 +14,7 @@ "|---------|-----------------------------|------------------|-------|\n", "| [DeepSeek‑R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `` … `` | `deepseek-r1` | Supports all variants (R1, R1-0528, R1-Distill) |\n", "| [Standard Qwen3 models](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `` … `` | `qwen3` | Supports `enable_thinking` parameter |\n", - "| [Qwen3-Thinking models](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) | `` … `` | `qwen3-thinking` | Always generates thinking content |\n", + "| [Qwen3-Thinking models](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) | `` … `` | `qwen3` or `qwen3-thinking` | Always generates thinking content |\n", "| [Kimi models](https://huggingface.co/collections/MoonshotAI/kimi-675e30c072b7ba7e79833be7) | `◁think▷` … `◁/think▷` | `kimi` | Uses special thinking delimiters |\n", "\n", "### Model-Specific Behaviors\n", @@ -26,7 +26,10 @@ "\n", "**Qwen3 Family:**\n", "- Standard Qwen3 (e.g., Qwen3-2507): Use 
`qwen3` parser, supports `enable_thinking` in chat templates\n", - "- Qwen3-Thinking (e.g., Qwen3-235B-A22B-Thinking-2507): Use `qwen3-thinking` parser, always thinks" + "- Qwen3-Thinking (e.g., Qwen3-235B-A22B-Thinking-2507): Use `qwen3` or `qwen3-thinking` parser, always thinks\n", + "\n", + "**Kimi:**\n", + "- Kimi: Uses special `◁think▷` and `◁/think▷` tags" ] }, { diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py index c8918ed4c..db7bc252c 100644 --- a/python/sglang/srt/entrypoints/openai/serving_chat.py +++ b/python/sglang/srt/entrypoints/openai/serving_chat.py @@ -332,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase): prompt = prompt[: -len(conv.sep2)] else: prompt = conv.get_prompt() + if self._get_enable_thinking_from_request(request): + prompt += "" # Note(Xinyuan): hard code thinking token image_data = conv.image_data if conv.image_data else None video_data = conv.video_data if conv.video_data else None @@ -840,7 +842,9 @@ class OpenAIServingChat(OpenAIServingBase): if reasoning_parser and request.separate_reasoning: try: parser = ReasoningParser( - model_type=reasoning_parser, stream_reasoning=False + model_type=reasoning_parser, + stream_reasoning=False, + force_reasoning=self.template_manager.force_reasoning, ) reasoning_text, text = parser.parse_non_stream(text) except Exception as e: @@ -1006,11 +1010,12 @@ class OpenAIServingChat(OpenAIServingBase): reasoning_parser_dict[index] = ReasoningParser( self.tokenizer_manager.server_args.reasoning_parser, request.stream_reasoning, + self.template_manager.force_reasoning, ) reasoning_parser = reasoning_parser_dict[index] return reasoning_parser.parse_stream_chunk(delta) - def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool: + def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool: """Extracts the 'enable_thinking' flag from request chat_template_kwargs. 
NOTE: This parameter is only useful for models that support enable_thinking @@ -1019,7 +1024,7 @@ class OpenAIServingChat(OpenAIServingBase): Args: request_obj: The request object (or an item from a list of requests). Returns: - The boolean value of 'enable_thinking' if found and not True, otherwise True. + The boolean value of 'enable_thinking' if found, otherwise False. """ if ( hasattr(request, "chat_template_kwargs") @@ -1027,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase): and request.chat_template_kwargs.get("enable_thinking") is not None ): return request.chat_template_kwargs.get("enable_thinking") - return True + return False async def _process_tool_call_stream( self, diff --git a/python/sglang/srt/managers/template_manager.py b/python/sglang/srt/managers/template_manager.py index c98e3d14a..2327f942b 100644 --- a/python/sglang/srt/managers/template_manager.py +++ b/python/sglang/srt/managers/template_manager.py @@ -21,6 +21,7 @@ and code completion templates, eliminating global state and improving modularity import json import logging import os +import re from typing import Optional from sglang.srt.code_completion_parser import ( @@ -54,6 +55,7 @@ class TemplateManager: self._chat_template_name: Optional[str] = None self._completion_template_name: Optional[str] = None self._jinja_template_content_format: Optional[str] = "openai" + self._force_reasoning: bool = False @property def chat_template_name(self) -> Optional[str]: @@ -70,6 +72,31 @@ class TemplateManager: """Get the detected template content format ('string' or 'openai' or None).""" return self._jinja_template_content_format + @property + def force_reasoning(self) -> bool: + """ + Check if the current chat template enforces reasoning/thinking. + + Returns: + True if the template contains reasoning patterns like tags + """ + return self._force_reasoning + + def _detect_reasoning_pattern(self, template: str) -> bool: + """ + Detect if the chat template contains reasoning/thinking patterns. 
+ """ + if template is None: + return False + + force_reasoning_pattern = r"<\|im_start\|>assistant\\n\\n" + has_reasoning = re.search(force_reasoning_pattern, template) is not None + + if has_reasoning: + logger.info("Detected the force reasoning pattern in chat template.") + + return has_reasoning + def load_chat_template( self, tokenizer_manager, chat_template_arg: Optional[str], model_path: str ) -> None: @@ -93,7 +120,8 @@ class TemplateManager: hf_template = self._resolve_hf_chat_template(tokenizer_manager) if hf_template: # override the chat template - tokenizer_manager.tokenizer.chat_template = hf_template + if tokenizer_manager.tokenizer: + tokenizer_manager.tokenizer.chat_template = hf_template self._jinja_template_content_format = ( detect_jinja_template_content_format(hf_template) ) @@ -106,6 +134,12 @@ class TemplateManager: self._jinja_template_content_format = "string" logger.info("No chat template found, defaulting to 'string' content format") + # Detect reasoning pattern from chat template + if tokenizer_manager.tokenizer: + self._force_reasoning = self._detect_reasoning_pattern( + tokenizer_manager.tokenizer.chat_template + ) + def _load_explicit_chat_template( self, tokenizer_manager, chat_template_arg: str ) -> None: diff --git a/python/sglang/srt/reasoning_parser.py b/python/sglang/srt/reasoning_parser.py index a2561a18d..9e96fa92d 100644 --- a/python/sglang/srt/reasoning_parser.py +++ b/python/sglang/srt/reasoning_parser.py @@ -131,7 +131,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector): If True, streams reasoning content as it arrives. 
""" - def __init__(self, stream_reasoning: bool = True): + def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True): # DeepSeek-R1 is assumed to be reasoning until `` token super().__init__( "", @@ -144,7 +144,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector): class Qwen3Detector(BaseReasoningFormatDetector): """ - Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B). + Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B). Assumes reasoning format: ()*(.*) @@ -153,47 +153,16 @@ class Qwen3Detector(BaseReasoningFormatDetector): - enable_thinking=True: "reasoning contentThe answer is 42." - enable_thinking=False: "The answer is 42." (no thinking tokens) - This detector handles both cases. - - NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507). - Those models always generate thinking content without start tags. - Use "qwen3-thinking" parser type for those models instead. - Args: stream_reasoning (bool): If False, accumulates reasoning content until the end tag. If True, streams reasoning content as it arrives. """ - def __init__(self, stream_reasoning: bool = True): + def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False): super().__init__( "", "", - force_reasoning=False, - stream_reasoning=stream_reasoning, - ) - - -class Qwen3ThinkingDetector(BaseReasoningFormatDetector): - """ - Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507). - Assumes reasoning format: - *(.*) - - These models always generate thinking content without start tag. - They do not support the enable_thinking parameter and always think. - - Format: "I need to think about this...The answer is 42." - - Args: - stream_reasoning (bool): If False, accumulates reasoning content until the end tag. - If True, streams reasoning content as it arrives. 
- """ - - def __init__(self, stream_reasoning: bool = True): - super().__init__( - "", - "", - force_reasoning=True, + force_reasoning=force_reasoning, stream_reasoning=stream_reasoning, ) @@ -207,7 +176,7 @@ class KimiDetector(BaseReasoningFormatDetector): and the rest of the text as `normal_text`. """ - def __init__(self, stream_reasoning: bool = True): + def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False): super().__init__( "◁think▷", "◁/think▷", @@ -230,13 +199,18 @@ class ReasoningParser: DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = { "deepseek-r1": DeepSeekR1Detector, "qwen3": Qwen3Detector, - "qwen3-thinking": Qwen3ThinkingDetector, + "qwen3-thinking": Qwen3Detector, "glm45": Qwen3Detector, "kimi": KimiDetector, "step3": DeepSeekR1Detector, } - def __init__(self, model_type: Optional[str] = None, stream_reasoning: bool = True): + def __init__( + self, + model_type: Optional[str] = None, + stream_reasoning: bool = True, + force_reasoning: bool = False, + ): if not model_type: raise ValueError("Model type must be specified") @@ -244,7 +218,12 @@ class ReasoningParser: if not detector_class: raise ValueError(f"Unsupported model type: {model_type}") - self.detector = detector_class(stream_reasoning=stream_reasoning) + if model_type.lower() == "qwen3-thinking": + force_reasoning = True + + self.detector = detector_class( + stream_reasoning=stream_reasoning, force_reasoning=force_reasoning + ) def parse_non_stream(self, full_text: str) -> Tuple[str, str]: """Non-streaming call: one-time parsing""" diff --git a/test/srt/test_reasoning_parser.py b/test/srt/test_reasoning_parser.py index 97eea82b4..dca314d35 100644 --- a/test/srt/test_reasoning_parser.py +++ b/test/srt/test_reasoning_parser.py @@ -5,7 +5,6 @@ from sglang.srt.reasoning_parser import ( DeepSeekR1Detector, KimiDetector, Qwen3Detector, - Qwen3ThinkingDetector, ReasoningParser, StreamingParseResult, ) @@ -216,19 +215,19 @@ class 
TestQwen3Detector(CustomTestCase): self.assertEqual(result.reasoning_text, "") -class TestQwen3ThinkingDetector(CustomTestCase): +class TestQwen3ForcedReasoningDetector(CustomTestCase): def setUp(self): - self.detector = Qwen3ThinkingDetector() + self.detector = Qwen3Detector(force_reasoning=True) def test_init(self): - """Test Qwen3ThinkingDetector initialization.""" + """Test Qwen3ForcedReasoningDetector initialization.""" self.assertEqual(self.detector.think_start_token, "") self.assertEqual(self.detector.think_end_token, "") self.assertTrue(self.detector._in_reasoning) # force_reasoning=True self.assertTrue(self.detector.stream_reasoning) - def test_detect_and_parse_qwen3_thinking_format(self): - """Test parsing Qwen3-Thinking format (no start tag).""" + def test_detect_and_parse_qwen3_forced_reasoning_format(self): + """Test parsing Qwen3-ForcedReasoning format (no start tag).""" text = "I need to think about this step by step.The answer is 42." result = self.detector.detect_and_parse(text) self.assertEqual( @@ -237,15 +236,15 @@ class TestQwen3ThinkingDetector(CustomTestCase): self.assertEqual(result.normal_text, "The answer is 42.") def test_detect_and_parse_with_start_token(self): - """Test parsing Qwen3-Thinking with optional start tag.""" + """Test parsing Qwen3-ForcedReasoning with optional start tag.""" text = "I need to think about this.The answer is 42." 
result = self.detector.detect_and_parse(text) # Should work because base class logic handles both force_reasoning=True OR start token self.assertEqual(result.reasoning_text, "I need to think about this.") self.assertEqual(result.normal_text, "The answer is 42.") - def test_streaming_qwen3_thinking_format(self): - """Test streaming parse of Qwen3-Thinking format.""" + def test_streaming_qwen3_forced_reasoning_format(self): + """Test streaming parse of Qwen3-ForcedReasoning format.""" # First chunk without start result = self.detector.parse_streaming_increment("I need to") self.assertEqual(result.reasoning_text, "I need to") @@ -320,9 +319,6 @@ class TestReasoningParser(CustomTestCase): parser = ReasoningParser("qwen3") self.assertIsInstance(parser.detector, Qwen3Detector) - parser = ReasoningParser("qwen3-thinking") - self.assertIsInstance(parser.detector, Qwen3ThinkingDetector) - parser = ReasoningParser("kimi") self.assertIsInstance(parser.detector, KimiDetector) @@ -370,13 +366,11 @@ class TestReasoningParser(CustomTestCase): """Test case insensitive model type matching.""" parser1 = ReasoningParser("DeepSeek-R1") parser2 = ReasoningParser("QWEN3") - parser3 = ReasoningParser("QWEN3-THINKING") - parser4 = ReasoningParser("Kimi") + parser3 = ReasoningParser("Kimi") self.assertIsInstance(parser1.detector, DeepSeekR1Detector) self.assertIsInstance(parser2.detector, Qwen3Detector) - self.assertIsInstance(parser3.detector, Qwen3ThinkingDetector) - self.assertIsInstance(parser4.detector, KimiDetector) + self.assertIsInstance(parser3.detector, KimiDetector) def test_stream_reasoning_parameter(self): """Test stream_reasoning parameter is passed correctly.""" @@ -458,9 +452,9 @@ class TestIntegrationScenarios(CustomTestCase): self.assertEqual(reasoning, "") self.assertEqual(normal, "Just the answer.") - def test_qwen3_thinking_complete_response(self): - """Test complete Qwen3-Thinking response parsing.""" - parser = ReasoningParser("qwen3-thinking") + def 
test_qwen3_forced_reasoning_complete_response(self): + """Test complete Qwen3-ForcedReasoning response parsing.""" + parser = ReasoningParser("qwen3", force_reasoning=True) text = "Let me solve this step by step. The equation is x + 2 = 5. Subtracting 2 from both sides gives x = 3.</think>The solution is x = 3." reasoning, normal = parser.parse_non_stream(text) @@ -468,9 +462,9 @@ class TestIntegrationScenarios(CustomTestCase): self.assertIn("x = 3", reasoning) self.assertEqual(normal, "The solution is x = 3.") - def test_qwen3_thinking_streaming_scenario(self): - """Test Qwen3-Thinking streaming scenario.""" - parser = ReasoningParser("qwen3-thinking") + def test_qwen3_forced_reasoning_streaming_scenario(self): + """Test Qwen3-ForcedReasoning streaming scenario.""" + parser = ReasoningParser("qwen3", force_reasoning=True) chunks = [ "I need to analyze",