Enables force reasoning based on chat template for Qwen3-Thinking (#8369)

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Chang Su <csu272@usc.edu>
This commit is contained in:
Xinyuan Tong
2025-08-06 20:02:47 -07:00
committed by GitHub
parent 6210e2c4f0
commit 3fa3c6cd6a
6 changed files with 88 additions and 76 deletions

View File

@@ -332,6 +332,8 @@ class OpenAIServingChat(OpenAIServingBase):
prompt = prompt[: -len(conv.sep2)]
else:
prompt = conv.get_prompt()
if self._get_enable_thinking_from_request(request):
prompt += "<think>" # Note(Xinyuan): hard code thinking token
image_data = conv.image_data if conv.image_data else None
video_data = conv.video_data if conv.video_data else None
@@ -840,7 +842,9 @@ class OpenAIServingChat(OpenAIServingBase):
if reasoning_parser and request.separate_reasoning:
try:
parser = ReasoningParser(
model_type=reasoning_parser, stream_reasoning=False
model_type=reasoning_parser,
stream_reasoning=False,
force_reasoning=self.template_manager.force_reasoning,
)
reasoning_text, text = parser.parse_non_stream(text)
except Exception as e:
@@ -1006,11 +1010,12 @@ class OpenAIServingChat(OpenAIServingBase):
reasoning_parser_dict[index] = ReasoningParser(
self.tokenizer_manager.server_args.reasoning_parser,
request.stream_reasoning,
self.template_manager.force_reasoning,
)
reasoning_parser = reasoning_parser_dict[index]
return reasoning_parser.parse_stream_chunk(delta)
def _get_enable_thinking_from_request(request: ChatCompletionRequest) -> bool:
def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
NOTE: This parameter is only useful for models that support enable_thinking
@@ -1019,7 +1024,7 @@ class OpenAIServingChat(OpenAIServingBase):
Args:
request: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
The boolean value of 'enable_thinking' if found, otherwise False.
"""
if (
hasattr(request, "chat_template_kwargs")
@@ -1027,7 +1032,7 @@ class OpenAIServingChat(OpenAIServingBase):
and request.chat_template_kwargs.get("enable_thinking") is not None
):
return request.chat_template_kwargs.get("enable_thinking")
return True
return False
async def _process_tool_call_stream(
self,