fix: correct stream response when enable_thinking is set to false (#5881)

This commit is contained in:
mlmz
2025-05-01 10:44:37 +08:00
committed by GitHub
parent 9f21e75453
commit 256c4c2519
4 changed files with 211 additions and 17 deletions

View File

@@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
return response
def _get_enable_thinking_from_request(request_obj):
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
"""
if (
hasattr(request_obj, "chat_template_kwargs")
and request_obj.chat_template_kwargs
and request_obj.chat_template_kwargs.get("enable_thinking") is not None
):
return request_obj.chat_template_kwargs.get("enable_thinking")
return True
def v1_chat_generate_request(
all_requests: List[ChatCompletionRequest],
tokenizer_manager,
@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
tool_calls = None
text = ret_item["text"]
enable_thinking = True
if isinstance(request, list):
tool_choice = request[idx].tool_choice
tools = request[idx].tools
separate_reasoning = request[idx].separate_reasoning
if (
request[idx].chat_template_kwargs
and request[idx].chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request[idx].chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request[idx])
else:
tool_choice = request.tool_choice
tools = request.tools
separate_reasoning = request.separate_reasoning
if (
request.chat_template_kwargs
and request.chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request.chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request)
reasoning_text = None
if reasoning_parser and separate_reasoning and enable_thinking:
@@ -1526,9 +1529,12 @@ async def v1_chat_completions(
delta = text[len(stream_buffer) :]
new_stream_buffer = stream_buffer + delta
enable_thinking = _get_enable_thinking_from_request(request)
if (
tokenizer_manager.server_args.reasoning_parser
and request.separate_reasoning
and enable_thinking
):
if index not in reasoning_parser_dict:
reasoning_parser_dict[index] = ReasoningParser(

View File

@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
# AWQ-quantized MoE model used by quantization tests.
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
)
# Model used by tests exercising the 'enable_thinking' chat-template flag
# (per its name a Qwen3 model — the family this flag applies to).
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
# Nightly tests
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"