[releases/v0.18.0][Platform][BugFix] Guard forced tool choice with empty content (#8400)
### What this PR does / why we need it? This backports the forced-tool-choice `content=None` guard to the `releases/v0.18.0` compatibility layer. Upstream vLLM still has forced named tool-choice branches that assert `content is not None` after reasoning extraction. Some reasoning parsers can legally consume the full output and return `(reasoning, None)`, which makes the assert reachable and can surface as a server-side failure. This PR follows the same compatibility-patch pattern used by: - `7314bbe2` fix(platform): reimplement MiniMax usage accounting patch (#7835) - `f83cb0e6` [Bugfix][Platform] Fix GLM47 tool-call finish backfill (#7710) The patch is intentionally narrow: - normalize `content=None` to `""` only for forced named tool choice - patch both chat-completions and responses parser entry points - keep the rest of upstream behavior unchanged Upstream tracking: - issue: vllm-project/vllm#40147 - PR: vllm-project/vllm#40148 ### Does this PR introduce _any_ user-facing change? Yes. Forced named tool choice becomes robust when the reasoning parser returns no post-reasoning content, avoiding an internal assertion failure and emitting an empty-argument function call instead. ### How was this patch tested? Unit tests: ```bash pytest -sv tests/ut/patch/platform/test_patch_tool_choice_none_content.py \ tests/ut/patch/platform/test_patch_glm_tool_call_parser.py \ tests/ut/patch/platform/test_patch_minimax_usage_accounting.py ``` Result: 22 passed. --------- Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com> Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
|
||||
from vllm.parser.abstract_parser import DelegatingParser
|
||||
|
||||
from vllm_ascend.patch.platform import patch_tool_choice_none_content # noqa: F401
|
||||
|
||||
|
||||
class _DummyDelegatingParser(DelegatingParser):
    """Minimal DelegatingParser stub used by the forced-tool-choice tests.

    It performs no reasoning extraction and no free-form tool-call parsing,
    so the forced named-tool-choice code path is exercised in isolation.
    """

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # The stub never treats the output as containing a reasoning section.
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # Every token id counts as content.
        return input_ids

    def extract_reasoning(self, model_output: str, request):
        # No reasoning text; the whole model output is content.
        return (None, model_output)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: list[int],
        current_token_ids: list[int],
        delta_token_ids: list[int],
    ):
        # Streaming reasoning extraction is a no-op for this stub.
        return None

    def extract_tool_calls(self, model_output: str, request):
        # Tool-call extraction from free-form content is disabled.
        return None
|
||||
|
||||
|
||||
def test_parse_tool_calls_from_content_allows_named_tool_choice_with_none_content():
    """Forced named tool choice must tolerate ``content=None``.

    A reasoning parser may legally consume the full model output and hand
    back no post-reasoning content; the patched entry point should then emit
    an empty-argument function call instead of tripping an assertion.
    """
    payload = {
        "model": "test-model",
        "messages": [{"role": "user", "content": "test"}],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "parameters": {"type": "object", "properties": {}},
                },
            }
        ],
        "tool_choice": {"type": "function", "function": {"name": "get_weather"}},
    }
    request = ChatCompletionRequest.model_validate(payload)

    tool_calls, content = OpenAIServing._parse_tool_calls_from_content(
        request=request,
        tokenizer=None,
        enable_auto_tools=True,
        tool_parser_cls=None,
        content=None,
    )

    # The guard must not fabricate visible content.
    assert content is None
    assert tool_calls is not None
    assert len(tool_calls) == 1
    only_call = tool_calls[0]
    assert only_call.name == "get_weather"
    assert only_call.arguments == ""
|
||||
|
||||
|
||||
def test_responses_parser_allows_named_tool_choice_with_none_content():
    """Responses-API parser must also tolerate ``content=None``.

    Mirrors the chat-completions test for the ``DelegatingParser`` entry
    point patched by ``patch_tool_choice_none_content``.
    """
    payload = {
        "model": "test-model",
        "input": "test",
        "tools": [
            {
                "type": "function",
                "name": "get_weather",
                "parameters": {"type": "object", "properties": {}},
            }
        ],
        "tool_choice": {"type": "function", "name": "get_weather"},
    }
    request = ResponsesRequest.model_validate(payload)
    parser = _DummyDelegatingParser(tokenizer=None)

    tool_calls, content = parser._parse_tool_calls(
        request=request,
        content=None,
        enable_auto_tools=False,
    )

    # The guard must not fabricate visible content.
    assert content is None
    assert len(tool_calls) == 1
    only_call = tool_calls[0]
    assert only_call.name == "get_weather"
    assert only_call.arguments == ""
|
||||
@@ -238,6 +238,26 @@
|
||||
# finish-backfill fix are present in the runtime vLLM version used by
|
||||
# vllm-ascend.
|
||||
#
|
||||
# ** 11. File: platform/patch_tool_choice_none_content.py**
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.entrypoints.openai.engine.serving.OpenAIServing`
|
||||
# `vllm.parser.abstract_parser.DelegatingParser`
|
||||
# Why:
|
||||
# Some reasoning parsers can consume the full model output and return
|
||||
# `content=None`. On the release runtime, forced named tool choice still
|
||||
# asserts that content is present before constructing a function call,
|
||||
# which can surface as a server-side failure instead of an empty-argument
|
||||
# tool call.
|
||||
# How:
|
||||
# Monkey-patch the forced-tool-choice parsing entry points to normalize
|
||||
# `content=None` to `""` before delegating back to the original upstream
|
||||
# implementations.
|
||||
# Related PR (if no, explain why):
|
||||
# https://github.com/vllm-project/vllm/pull/40148
|
||||
# Future Plan:
|
||||
# Remove this patch once the upstream forced-tool-choice fix is included
|
||||
# in the runtime vLLM version used by vllm-ascend.
|
||||
#
|
||||
# * Worker Patch:
|
||||
# ===============
|
||||
#
|
||||
|
||||
@@ -30,6 +30,7 @@ import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
||||
import vllm_ascend.patch.platform.patch_torch_accelerator # noqa
|
||||
import vllm_ascend.patch.platform.patch_minimax_usage_accounting # noqa
|
||||
import vllm_ascend.patch.platform.patch_glm_tool_call_parser # noqa
|
||||
import vllm_ascend.patch.platform.patch_tool_choice_none_content # noqa
|
||||
|
||||
if envs.VLLM_ASCEND_BALANCE_SCHEDULING:
|
||||
import vllm_ascend.patch.platform.patch_balance_schedule # noqa
|
||||
|
||||
86
vllm_ascend/patch/platform/patch_tool_choice_none_content.py
Normal file
86
vllm_ascend/patch/platform/patch_tool_choice_none_content.py
Normal file
@@ -0,0 +1,86 @@
|
||||
#
|
||||
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# OpenAI forced tool choice: tolerate None content after reasoning extraction.
|
||||
#
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from openai.types.responses import ToolChoiceFunction
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionNamedToolChoiceParam,
|
||||
)
|
||||
from vllm.entrypoints.openai.engine.serving import OpenAIServing
|
||||
from vllm.parser.abstract_parser import DelegatingParser
|
||||
|
||||
|
||||
def _normalize_tool_choice_content(
    request,
    content: str | None,
) -> str | None:
    """Map ``content=None`` to ``""`` when the request forces a named tool.

    Some reasoning parsers can consume the entire model output and return no
    post-reasoning content. Upstream's forced named-tool-choice branches
    assert that content is present, so this helper substitutes an empty
    string for that single case and leaves every other combination of
    request/content untouched.
    """
    choice = getattr(request, "tool_choice", None)
    forced_named = isinstance(
        choice,
        (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam),
    )
    # Only the (None content, forced named tool) pair is rewritten.
    if content is None and forced_named:
        return ""
    return content
|
||||
|
||||
|
||||
# Keep a reference to the upstream implementation so the wrapper can delegate.
_original_parse_tool_calls_from_content = OpenAIServing._parse_tool_calls_from_content


def _patched_parse_tool_calls_from_content(
    request,
    tokenizer,
    enable_auto_tools: bool,
    tool_parser_cls,
    content: str | None = None,
):
    """Upstream ``_parse_tool_calls_from_content`` with the None-content guard.

    Normalizes ``content=None`` to ``""`` for forced named tool choice, then
    defers everything else to the original upstream implementation.
    """
    normalized = _normalize_tool_choice_content(request, content)
    return _original_parse_tool_calls_from_content(
        request=request,
        tokenizer=tokenizer,
        enable_auto_tools=enable_auto_tools,
        tool_parser_cls=tool_parser_cls,
        content=normalized,
    )


# Install the guarded wrapper; staticmethod() mirrors how the hook is
# declared upstream.
OpenAIServing._parse_tool_calls_from_content = staticmethod(
    _patched_parse_tool_calls_from_content
)
|
||||
|
||||
# Preserve the upstream DelegatingParser._parse_tool_calls before replacing it.
_original_delegating_parse_tool_calls = DelegatingParser._parse_tool_calls


def _patched_delegating_parse_tool_calls(
    self,
    request,
    content: str | None,
    enable_auto_tools: bool,
):
    """``DelegatingParser._parse_tool_calls`` with the None-content guard.

    Applies the same forced-named-tool-choice normalization as the
    chat-completions entry point, then delegates to upstream unchanged.
    """
    guarded = _normalize_tool_choice_content(request, content)
    return _original_delegating_parse_tool_calls(
        self,
        request,
        guarded,
        enable_auto_tools,
    )


DelegatingParser._parse_tool_calls = _patched_delegating_parse_tool_calls
|
||||
Reference in New Issue
Block a user