diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index adf2cebc8..74333e0ce 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request): return response +def _get_enable_thinking_from_request(request_obj): + """Extracts the 'enable_thinking' flag from request chat_template_kwargs. + + Args: + request_obj: The request object (or an item from a list of requests). + + Returns: + The 'enable_thinking' value if it is explicitly set (not None), otherwise True. + """ + if ( + hasattr(request_obj, "chat_template_kwargs") + and request_obj.chat_template_kwargs + and request_obj.chat_template_kwargs.get("enable_thinking") is not None + ): + return request_obj.chat_template_kwargs.get("enable_thinking") + return True + + def v1_chat_generate_request( all_requests: List[ChatCompletionRequest], tokenizer_manager, @@ -1263,31 +1281,16 @@ def v1_chat_generate_response( tool_calls = None text = ret_item["text"] - enable_thinking = True if isinstance(request, list): tool_choice = request[idx].tool_choice tools = request[idx].tools separate_reasoning = request[idx].separate_reasoning - - if ( - request[idx].chat_template_kwargs - and request[idx].chat_template_kwargs.get("enable_thinking") is not None - ): - enable_thinking = request[idx].chat_template_kwargs.get( - "enable_thinking", True - ) + enable_thinking = _get_enable_thinking_from_request(request[idx]) else: tool_choice = request.tool_choice tools = request.tools separate_reasoning = request.separate_reasoning - - if ( - request.chat_template_kwargs - and request.chat_template_kwargs.get("enable_thinking") is not None - ): - enable_thinking = request.chat_template_kwargs.get( - "enable_thinking", True - ) + enable_thinking = _get_enable_thinking_from_request(request) reasoning_text = None if reasoning_parser and separate_reasoning and enable_thinking: @@
-1526,9 +1529,12 @@ async def v1_chat_completions( delta = text[len(stream_buffer) :] new_stream_buffer = stream_buffer + delta + enable_thinking = _get_enable_thinking_from_request(request) + if ( tokenizer_manager.server_args.reasoning_parser and request.separate_reasoning + and enable_thinking ): if index not in reasoning_parser_dict: reasoning_parser_dict[index] = ReasoningParser( diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 7bf35582b..79c43d5c3 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = ( "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4" ) +DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B" # Nightly tests DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 9f593ca9f..4afb76e00 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -59,6 +59,7 @@ suites = { TestFile("test_pytorch_sampling_backend.py", 66), TestFile("test_radix_attention.py", 167), TestFile("test_reasoning_content.py", 89), + TestFile("test_enable_thinking.py", 70), TestFile("test_regex_constrained.py", 64), TestFile("test_release_memory_occupation.py", 44), TestFile("test_request_length_validation.py", 31), diff --git a/test/srt/test_enable_thinking.py b/test/srt/test_enable_thinking.py new file mode 100644 index 000000000..f258f3262 --- /dev/null +++ b/test/srt/test_enable_thinking.py @@ -0,0 +1,186 @@ +""" +Usage: +python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning +python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning +python3 -m unittest 
test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning +python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning +""" + +import asyncio +import json +import os +import sys +import time +import unittest + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.test_utils import ( + DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + + +class TestEnableThinking(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-1234" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=[ + "--reasoning-parser", + "qwen3", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_chat_completion_with_reasoning(self): + # Test non-streaming with "enable_thinking": True, reasoning_content should not be empty + client = requests.post( + f"{self.base_url}/v1/chat/completions", + headers={"Authorization": f"Bearer {self.api_key}"}, + json={ + "model": self.model, + "messages": [{"role": "user", "content": "Hello"}], + "temperature": 0, + "separate_reasoning": True, + "chat_template_kwargs": {"enable_thinking": True}, + }, + ) + + self.assertEqual(client.status_code, 200, f"Failed with: {client.text}") + data = client.json() + + self.assertIn("choices", data) + self.assertTrue(len(data["choices"]) > 0) + self.assertIn("message", data["choices"][0]) + self.assertIn("reasoning_content", data["choices"][0]["message"]) + self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"]) + + def test_chat_completion_without_reasoning(self): + # Test non-streaming with "enable_thinking": False, reasoning_content should be empty + 
client = requests.post( + f"{self.base_url}/v1/chat/completions", + headers={"Authorization": f"Bearer {self.api_key}"}, + json={ + "model": self.model, + "messages": [{"role": "user", "content": "Hello"}], + "temperature": 0, + "separate_reasoning": True, + "chat_template_kwargs": {"enable_thinking": False}, + }, + ) + + self.assertEqual(client.status_code, 200, f"Failed with: {client.text}") + data = client.json() + + self.assertIn("choices", data) + self.assertTrue(len(data["choices"]) > 0) + self.assertIn("message", data["choices"][0]) + + if "reasoning_content" in data["choices"][0]["message"]: + self.assertIsNone(data["choices"][0]["message"]["reasoning_content"]) + + def test_stream_chat_completion_with_reasoning(self): + # Test streaming with "enable_thinking": True, reasoning_content should not be empty + response = requests.post( + f"{self.base_url}/v1/chat/completions", + headers={"Authorization": f"Bearer {self.api_key}"}, + json={ + "model": self.model, + "messages": [{"role": "user", "content": "Hello"}], + "temperature": 0, + "separate_reasoning": True, + "stream": True, + "chat_template_kwargs": {"enable_thinking": True}, + }, + stream=True, + ) + + self.assertEqual(response.status_code, 200, f"Failed with: {response.text}") + + has_reasoning = False + has_content = False + + print("\n=== Stream With Reasoning ===") + for line in response.iter_lines(): + if line: + line = line.decode("utf-8") + if line.startswith("data:") and not line.startswith("data: [DONE]"): + data = json.loads(line[6:]) + if "choices" in data and len(data["choices"]) > 0: + delta = data["choices"][0].get("delta", {}) + + if "reasoning_content" in delta and delta["reasoning_content"]: + has_reasoning = True + + if "content" in delta and delta["content"]: + has_content = True + + self.assertTrue( + has_reasoning, + "The reasoning content is not included in the stream response", + ) + self.assertTrue( + has_content, "The stream response does not contain normal content" + ) + + def 
test_stream_chat_completion_without_reasoning(self): + # Test streaming with "enable_thinking": False, reasoning_content should be empty + response = requests.post( + f"{self.base_url}/v1/chat/completions", + headers={"Authorization": f"Bearer {self.api_key}"}, + json={ + "model": self.model, + "messages": [{"role": "user", "content": "Hello"}], + "temperature": 0, + "separate_reasoning": True, + "stream": True, + "chat_template_kwargs": {"enable_thinking": False}, + }, + stream=True, + ) + + self.assertEqual(response.status_code, 200, f"Failed with: {response.text}") + + has_reasoning = False + has_content = False + + print("\n=== Stream Without Reasoning ===") + for line in response.iter_lines(): + if line: + line = line.decode("utf-8") + if line.startswith("data:") and not line.startswith("data: [DONE]"): + data = json.loads(line[6:]) + if "choices" in data and len(data["choices"]) > 0: + delta = data["choices"][0].get("delta", {}) + + if "reasoning_content" in delta and delta["reasoning_content"]: + has_reasoning = True + + if "content" in delta and delta["content"]: + has_content = True + + self.assertFalse( + has_reasoning, + "The reasoning content should not be included in the stream response", + ) + self.assertTrue( + has_content, "The stream response does not contain normal content" + ) + + +if __name__ == "__main__": + unittest.main()