fix: correct stream response when enable_thinking is set to false (#5881)
@@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     return response
 
 
+def _get_enable_thinking_from_request(request_obj):
+    """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
+
+    Args:
+        request_obj: The request object (or an item from a list of requests).
+
+    Returns:
+        The boolean value of 'enable_thinking' if found and not None, otherwise True.
+    """
+    if (
+        hasattr(request_obj, "chat_template_kwargs")
+        and request_obj.chat_template_kwargs
+        and request_obj.chat_template_kwargs.get("enable_thinking") is not None
+    ):
+        return request_obj.chat_template_kwargs.get("enable_thinking")
+    return True
+
+
 def v1_chat_generate_request(
     all_requests: List[ChatCompletionRequest],
     tokenizer_manager,
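
A minimal sketch of the helper's contract, for review purposes: missing or empty chat_template_kwargs falls back to True, while an explicit value is returned as-is. The SimpleNamespace objects below are illustrative stand-ins for real ChatCompletionRequest objects, not part of the commit:

    from types import SimpleNamespace

    # Illustrative stand-ins for ChatCompletionRequest objects.
    no_kwargs = SimpleNamespace(chat_template_kwargs=None)
    thinking_off = SimpleNamespace(chat_template_kwargs={"enable_thinking": False})
    thinking_on = SimpleNamespace(chat_template_kwargs={"enable_thinking": True})

    assert _get_enable_thinking_from_request(no_kwargs) is True      # default
    assert _get_enable_thinking_from_request(thinking_off) is False  # explicit opt-out
    assert _get_enable_thinking_from_request(thinking_on) is True    # explicit opt-in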
@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
         tool_calls = None
         text = ret_item["text"]
 
-        enable_thinking = True
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools
             separate_reasoning = request[idx].separate_reasoning
-
-            if (
-                request[idx].chat_template_kwargs
-                and request[idx].chat_template_kwargs.get("enable_thinking") is not None
-            ):
-                enable_thinking = request[idx].chat_template_kwargs.get(
-                    "enable_thinking", True
-                )
+            enable_thinking = _get_enable_thinking_from_request(request[idx])
         else:
             tool_choice = request.tool_choice
             tools = request.tools
             separate_reasoning = request.separate_reasoning
-
-            if (
-                request.chat_template_kwargs
-                and request.chat_template_kwargs.get("enable_thinking") is not None
-            ):
-                enable_thinking = request.chat_template_kwargs.get(
-                    "enable_thinking", True
-                )
+            enable_thinking = _get_enable_thinking_from_request(request)
 
         reasoning_text = None
         if reasoning_parser and separate_reasoning and enable_thinking:
@@ -1526,9 +1529,12 @@ async def v1_chat_completions(
                     delta = text[len(stream_buffer) :]
                     new_stream_buffer = stream_buffer + delta
 
+                    enable_thinking = _get_enable_thinking_from_request(request)
+
                     if (
                         tokenizer_manager.server_args.reasoning_parser
                         and request.separate_reasoning
+                        and enable_thinking
                     ):
                         if index not in reasoning_parser_dict:
                             reasoning_parser_dict[index] = ReasoningParser(
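
The intended effect of this hunk: all three conditions must now hold before a ReasoningParser is created for a choice index, so a request that disables thinking never yields reasoning_content deltas in the stream. A hedged illustration of the gate (the boolean values are hypothetical; the names mirror the diff):

    # Illustration only: the gate added in the streaming path.
    server_has_parser = True    # tokenizer_manager.server_args.reasoning_parser
    wants_separation = True     # request.separate_reasoning
    enable_thinking = False     # from chat_template_kwargs via the new helper

    # Before this commit the third condition was absent, so reasoning content
    # could leak into stream responses even when thinking was disabled.
    parse_reasoning = server_has_parser and wants_separation and enable_thinking
    assert parse_reasoning is False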
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -59,6 +59,7 @@ suites = {
        TestFile("test_pytorch_sampling_backend.py", 66),
        TestFile("test_radix_attention.py", 167),
        TestFile("test_reasoning_content.py", 89),
+       TestFile("test_enable_thinking.py", 70),
        TestFile("test_regex_constrained.py", 64),
        TestFile("test_release_memory_occupation.py", 44),
        TestFile("test_request_length_validation.py", 31),
test/srt/test_enable_thinking.py (new file, 186 lines)
@@ -0,0 +1,186 @@
"""
Usage:
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
"""

import asyncio
import json
import os
import sys
import time
import unittest

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)


class TestEnableThinking(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-1234"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--reasoning-parser",
                "qwen3",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_chat_completion_with_reasoning(self):
        # Test non-streaming with "enable_thinking": True; reasoning_content should not be empty
        client = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "chat_template_kwargs": {"enable_thinking": True},
            },
        )

        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
        data = client.json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])
        self.assertIn("reasoning_content", data["choices"][0]["message"])
        self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])

    def test_chat_completion_without_reasoning(self):
        # Test non-streaming with "enable_thinking": False; reasoning_content should be empty
        client = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "chat_template_kwargs": {"enable_thinking": False},
            },
        )

        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
        data = client.json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])

        if "reasoning_content" in data["choices"][0]["message"]:
            self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])

    def test_stream_chat_completion_with_reasoning(self):
        # Test streaming with "enable_thinking": True; reasoning_content should not be empty
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "stream": True,
                "chat_template_kwargs": {"enable_thinking": True},
            },
            stream=True,
        )

        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")

        has_reasoning = False
        has_content = False

        print("\n=== Stream With Reasoning ===")
        for line in response.iter_lines():
            if line:
                line = line.decode("utf-8")
                if line.startswith("data:") and not line.startswith("data: [DONE]"):
                    data = json.loads(line[6:])
                    if "choices" in data and len(data["choices"]) > 0:
                        delta = data["choices"][0].get("delta", {})

                        if "reasoning_content" in delta and delta["reasoning_content"]:
                            has_reasoning = True

                        if "content" in delta and delta["content"]:
                            has_content = True

        self.assertTrue(
            has_reasoning,
            "The reasoning content is not included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )

    def test_stream_chat_completion_without_reasoning(self):
        # Test streaming with "enable_thinking": False; reasoning_content should be empty
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "stream": True,
                "chat_template_kwargs": {"enable_thinking": False},
            },
            stream=True,
        )

        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")

        has_reasoning = False
        has_content = False

        print("\n=== Stream Without Reasoning ===")
        for line in response.iter_lines():
            if line:
                line = line.decode("utf-8")
                if line.startswith("data:") and not line.startswith("data: [DONE]"):
                    data = json.loads(line[6:])
                    if "choices" in data and len(data["choices"]) > 0:
                        delta = data["choices"][0].get("delta", {})

                        if "reasoning_content" in delta and delta["reasoning_content"]:
                            has_reasoning = True

                        if "content" in delta and delta["content"]:
                            has_content = True

        self.assertFalse(
            has_reasoning,
            "The reasoning content should not be included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )


if __name__ == "__main__":
    unittest.main()