fix: correct stream response when enable_thinking is set to false (#5881)
@@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     return response
 
 
+def _get_enable_thinking_from_request(request_obj):
+    """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
+
+    Args:
+        request_obj: The request object (or an item from a list of requests).
+
+    Returns:
+        The boolean value of 'enable_thinking' if found and not None, otherwise True.
+    """
+    if (
+        hasattr(request_obj, "chat_template_kwargs")
+        and request_obj.chat_template_kwargs
+        and request_obj.chat_template_kwargs.get("enable_thinking") is not None
+    ):
+        return request_obj.chat_template_kwargs.get("enable_thinking")
+    return True
+
+
 def v1_chat_generate_request(
     all_requests: List[ChatCompletionRequest],
     tokenizer_manager,
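
A minimal sketch of the helper's contract, for review purposes: missing or empty chat_template_kwargs falls back to True, while an explicit value is returned as-is. The SimpleNamespace objects below are illustrative stand-ins for real ChatCompletionRequest objects, not part of the commit:

    from types import SimpleNamespace

    # Illustrative stand-ins for ChatCompletionRequest objects.
    no_kwargs = SimpleNamespace(chat_template_kwargs=None)
    thinking_off = SimpleNamespace(chat_template_kwargs={"enable_thinking": False})
    thinking_on = SimpleNamespace(chat_template_kwargs={"enable_thinking": True})

    assert _get_enable_thinking_from_request(no_kwargs) is True      # default
    assert _get_enable_thinking_from_request(thinking_off) is False  # explicit opt-out
    assert _get_enable_thinking_from_request(thinking_on) is True    # explicit opt-in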
@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
         tool_calls = None
         text = ret_item["text"]
 
-        enable_thinking = True
         if isinstance(request, list):
             tool_choice = request[idx].tool_choice
             tools = request[idx].tools
             separate_reasoning = request[idx].separate_reasoning
-
-            if (
-                request[idx].chat_template_kwargs
-                and request[idx].chat_template_kwargs.get("enable_thinking") is not None
-            ):
-                enable_thinking = request[idx].chat_template_kwargs.get(
-                    "enable_thinking", True
-                )
+            enable_thinking = _get_enable_thinking_from_request(request[idx])
         else:
             tool_choice = request.tool_choice
             tools = request.tools
             separate_reasoning = request.separate_reasoning
-
-            if (
-                request.chat_template_kwargs
-                and request.chat_template_kwargs.get("enable_thinking") is not None
-            ):
-                enable_thinking = request.chat_template_kwargs.get(
-                    "enable_thinking", True
-                )
+            enable_thinking = _get_enable_thinking_from_request(request)
 
         reasoning_text = None
         if reasoning_parser and separate_reasoning and enable_thinking:
@@ -1526,9 +1529,12 @@ async def v1_chat_completions(
                     delta = text[len(stream_buffer) :]
                     new_stream_buffer = stream_buffer + delta
 
+                    enable_thinking = _get_enable_thinking_from_request(request)
+
                     if (
                         tokenizer_manager.server_args.reasoning_parser
                         and request.separate_reasoning
+                        and enable_thinking
                     ):
                         if index not in reasoning_parser_dict:
                             reasoning_parser_dict[index] = ReasoningParser(
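
The intended effect of this hunk: all three conditions must now hold before a ReasoningParser is created for a choice index, so a request that disables thinking never yields reasoning_content deltas in the stream. A hedged illustration of the gate (the boolean values are hypothetical; the names mirror the diff):

    # Illustration only: the gate added in the streaming path.
    server_has_parser = True    # tokenizer_manager.server_args.reasoning_parser
    wants_separation = True     # request.separate_reasoning
    enable_thinking = False     # from chat_template_kwargs via the new helper

    # Before this commit the third condition was absent, so reasoning content
    # could leak into stream responses even when thinking was disabled.
    parse_reasoning = server_has_parser and wants_separation and enable_thinking
    assert parse_reasoning is False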
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -59,6 +59,7 @@ suites = {
        TestFile("test_pytorch_sampling_backend.py", 66),
        TestFile("test_radix_attention.py", 167),
        TestFile("test_reasoning_content.py", 89),
+       TestFile("test_enable_thinking.py", 70),
        TestFile("test_regex_constrained.py", 64),
        TestFile("test_release_memory_occupation.py", 44),
        TestFile("test_request_length_validation.py", 31),
test/srt/test_enable_thinking.py (new file, 186 lines)
@@ -0,0 +1,186 @@
"""
Usage:
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
"""

import asyncio
import json
import os
import sys
import time
import unittest

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)


class TestEnableThinking(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-1234"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--reasoning-parser",
                "qwen3",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_chat_completion_with_reasoning(self):
        # Test non-streaming with "enable_thinking": True; reasoning_content should not be empty
        client = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "chat_template_kwargs": {"enable_thinking": True},
            },
        )

        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
        data = client.json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])
        self.assertIn("reasoning_content", data["choices"][0]["message"])
        self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])

    def test_chat_completion_without_reasoning(self):
        # Test non-streaming with "enable_thinking": False; reasoning_content should be empty
        client = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "chat_template_kwargs": {"enable_thinking": False},
            },
        )

        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
        data = client.json()

        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])

        if "reasoning_content" in data["choices"][0]["message"]:
            self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])

    def test_stream_chat_completion_with_reasoning(self):
        # Test streaming with "enable_thinking": True; reasoning_content should not be empty
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "stream": True,
                "chat_template_kwargs": {"enable_thinking": True},
            },
            stream=True,
        )

        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")

        has_reasoning = False
        has_content = False

        print("\n=== Stream With Reasoning ===")
        for line in response.iter_lines():
            if line:
                line = line.decode("utf-8")
                if line.startswith("data:") and not line.startswith("data: [DONE]"):
                    data = json.loads(line[6:])
                    if "choices" in data and len(data["choices"]) > 0:
                        delta = data["choices"][0].get("delta", {})

                        if "reasoning_content" in delta and delta["reasoning_content"]:
                            has_reasoning = True

                        if "content" in delta and delta["content"]:
                            has_content = True

        self.assertTrue(
            has_reasoning,
            "The reasoning content is not included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )

    def test_stream_chat_completion_without_reasoning(self):
        # Test streaming with "enable_thinking": False; reasoning_content should be empty
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": "Hello"}],
                "temperature": 0,
                "separate_reasoning": True,
                "stream": True,
                "chat_template_kwargs": {"enable_thinking": False},
            },
            stream=True,
        )

        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")

        has_reasoning = False
        has_content = False

        print("\n=== Stream Without Reasoning ===")
        for line in response.iter_lines():
            if line:
                line = line.decode("utf-8")
                if line.startswith("data:") and not line.startswith("data: [DONE]"):
                    data = json.loads(line[6:])
                    if "choices" in data and len(data["choices"]) > 0:
                        delta = data["choices"][0].get("delta", {})

                        if "reasoning_content" in delta and delta["reasoning_content"]:
                            has_reasoning = True

                        if "content" in delta and delta["content"]:
                            has_content = True

        self.assertFalse(
            has_reasoning,
            "The reasoning content should not be included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )


if __name__ == "__main__":
    unittest.main()