fix: correct stream response when enable_thinking is set to false (#5881)

This commit is contained in:
mlmz
2025-05-01 10:44:37 +08:00
committed by GitHub
parent 9f21e75453
commit 256c4c2519
4 changed files with 211 additions and 17 deletions

View File

@@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
return response
def _get_enable_thinking_from_request(request_obj):
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
"""
if (
hasattr(request_obj, "chat_template_kwargs")
and request_obj.chat_template_kwargs
and request_obj.chat_template_kwargs.get("enable_thinking") is not None
):
return request_obj.chat_template_kwargs.get("enable_thinking")
return True
def v1_chat_generate_request(
all_requests: List[ChatCompletionRequest],
tokenizer_manager,
@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
tool_calls = None
text = ret_item["text"]
enable_thinking = True
if isinstance(request, list):
tool_choice = request[idx].tool_choice
tools = request[idx].tools
separate_reasoning = request[idx].separate_reasoning
if (
request[idx].chat_template_kwargs
and request[idx].chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request[idx].chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request[idx])
else:
tool_choice = request.tool_choice
tools = request.tools
separate_reasoning = request.separate_reasoning
if (
request.chat_template_kwargs
and request.chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request.chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request)
reasoning_text = None
if reasoning_parser and separate_reasoning and enable_thinking:
@@ -1526,9 +1529,12 @@ async def v1_chat_completions(
delta = text[len(stream_buffer) :]
new_stream_buffer = stream_buffer + delta
enable_thinking = _get_enable_thinking_from_request(request)
if (
tokenizer_manager.server_args.reasoning_parser
and request.separate_reasoning
and enable_thinking
):
if index not in reasoning_parser_dict:
reasoning_parser_dict[index] = ReasoningParser(

View File

@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
# AWQ-quantized MoE model used by quantization tests.
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
)
# Model used by tests exercising the 'enable_thinking' chat-template flag
# (per its name a Qwen3 model — the family this flag applies to).
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
# Nightly tests
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"