From d81101acdd3683cdf5289f4f7dfff0610044921d Mon Sep 17 00:00:00 2001 From: jack Date: Thu, 23 Apr 2026 16:46:10 +0800 Subject: [PATCH] [releases/v0.18.0][Platform][BugFix] Guard forced tool choice with empty content (#8400) ### What this PR does / why we need it? This backports the forced-tool-choice `content=None` guard to the `releases/v0.18.0` compatibility layer. Upstream vLLM still has forced named tool-choice branches that assert `content is not None` after reasoning extraction. Some reasoning parsers can legally consume the full output and return `(reasoning, None)`, which makes the assert reachable and can surface as a server-side failure. This PR follows the same compatibility-patch pattern used by: - `7314bbe2` fix(platform): reimplement MiniMax usage accounting patch (#7835) - `f83cb0e6` [Bugfix][Platform] Fix GLM47 tool-call finish backfill (#7710) The patch is intentionally narrow: - normalize `content=None` to `""` only for forced named tool choice - patch both chat-completions and responses parser entry points - keep the rest of upstream behavior unchanged Upstream tracking: - issue: vllm-project/vllm#40147 - PR: vllm-project/vllm#40148 ### Does this PR introduce _any_ user-facing change? Yes. Forced named tool choice becomes robust when the reasoning parser returns no post-reasoning content, avoiding an internal assertion failure and emitting an empty-argument function call instead. ### How was this patch tested? Unit tests: ```bash pytest -sv tests/ut/patch/platform/test_patch_tool_choice_none_content.py \ tests/ut/patch/platform/test_patch_glm_tool_call_parser.py \ tests/ut/patch/platform/test_patch_minimax_usage_accounting.py ``` Result: 22 passed. --------- Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com> Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com> --- .../test_patch_tool_choice_none_content.py | 95 +++++++++++++++++++ vllm_ascend/patch/__init__.py | 20 ++++ vllm_ascend/patch/platform/__init__.py | 1 + .../patch_tool_choice_none_content.py | 86 +++++++++++++++++ 4 files changed, 202 insertions(+) create mode 100644 tests/ut/patch/platform/test_patch_tool_choice_none_content.py create mode 100644 vllm_ascend/patch/platform/patch_tool_choice_none_content.py diff --git a/tests/ut/patch/platform/test_patch_tool_choice_none_content.py b/tests/ut/patch/platform/test_patch_tool_choice_none_content.py new file mode 100644 index 00000000..5e6e0424 --- /dev/null +++ b/tests/ut/patch/platform/test_patch_tool_choice_none_content.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.serving import OpenAIServing +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest +from vllm.parser.abstract_parser import DelegatingParser + +from vllm_ascend.patch.platform import patch_tool_choice_none_content # noqa: F401 + + +class _DummyDelegatingParser(DelegatingParser): + def is_reasoning_end(self, input_ids: list[int]) -> bool: + return False + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + return input_ids + + def extract_reasoning(self, model_output: str, request): + return None, model_output + + def extract_reasoning_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: list[int], + current_token_ids: list[int], + delta_token_ids: list[int], + ): + return None + + def extract_tool_calls(self, model_output: str, request): + 
return None + + +def test_parse_tool_calls_from_content_allows_named_tool_choice_with_none_content(): + request = ChatCompletionRequest.model_validate( + { + "model": "test-model", + "messages": [{"role": "user", "content": "test"}], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "parameters": {"type": "object", "properties": {}}, + }, + } + ], + "tool_choice": {"type": "function", "function": {"name": "get_weather"}}, + } + ) + + tool_calls, content = OpenAIServing._parse_tool_calls_from_content( + request=request, + tokenizer=None, + enable_auto_tools=True, + tool_parser_cls=None, + content=None, + ) + + assert content is None + assert tool_calls is not None + assert len(tool_calls) == 1 + assert tool_calls[0].name == "get_weather" + assert tool_calls[0].arguments == "" + + +def test_responses_parser_allows_named_tool_choice_with_none_content(): + request = ResponsesRequest.model_validate( + { + "model": "test-model", + "input": "test", + "tools": [ + { + "type": "function", + "name": "get_weather", + "parameters": {"type": "object", "properties": {}}, + } + ], + "tool_choice": {"type": "function", "name": "get_weather"}, + } + ) + parser = _DummyDelegatingParser(tokenizer=None) + + tool_calls, content = parser._parse_tool_calls( + request=request, + content=None, + enable_auto_tools=False, + ) + + assert content is None + assert len(tool_calls) == 1 + assert tool_calls[0].name == "get_weather" + assert tool_calls[0].arguments == "" diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 4d8c086d..4c719e40 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -238,6 +238,26 @@ # finish-backfill fix are present in the runtime vLLM version used by # vllm-ascend. # +# ** 11. File: platform/patch_tool_choice_none_content.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.entrypoints.openai.engine.serving.OpenAIServing` +# `vllm.parser.abstract_parser.DelegatingParser` +# Why: +# Some reasoning parsers can consume the full model output and return +# `content=None`. On the release runtime, forced named tool choice still +# asserts that content is present before constructing a function call, +# which can surface as a server-side failure instead of an empty-argument +# tool call. +# How: +# Monkey-patch the forced-tool-choice parsing entry points to normalize +# `content=None` to `""` before delegating back to the original upstream +# implementations. +# Related PR (if no, explain why): +# https://github.com/vllm-project/vllm/pull/40148 +# Future Plan: +# Remove this patch once the upstream forced-tool-choice fix is included +# in the runtime vLLM version used by vllm-ascend. 
+# # * Worker Patch: # =============== # diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 306a63e3..bbcd1df2 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -30,6 +30,7 @@ import vllm_ascend.patch.platform.patch_sched_yield # noqa import vllm_ascend.patch.platform.patch_torch_accelerator # noqa import vllm_ascend.patch.platform.patch_minimax_usage_accounting # noqa import vllm_ascend.patch.platform.patch_glm_tool_call_parser # noqa +import vllm_ascend.patch.platform.patch_tool_choice_none_content # noqa if envs.VLLM_ASCEND_BALANCE_SCHEDULING: import vllm_ascend.patch.platform.patch_balance_schedule # noqa diff --git a/vllm_ascend/patch/platform/patch_tool_choice_none_content.py b/vllm_ascend/patch/platform/patch_tool_choice_none_content.py new file mode 100644 index 00000000..a64ddf42 --- /dev/null +++ b/vllm_ascend/patch/platform/patch_tool_choice_none_content.py @@ -0,0 +1,86 @@ +# +# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# OpenAI forced tool choice: tolerate None content after reasoning extraction. +# + +from __future__ import annotations + +from openai.types.responses import ToolChoiceFunction +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionNamedToolChoiceParam, +) +from vllm.entrypoints.openai.engine.serving import OpenAIServing +from vllm.parser.abstract_parser import DelegatingParser + + +def _normalize_tool_choice_content( + request, + content: str | None, +) -> str | None: + if content is not None: + return content + + tool_choice = getattr(request, "tool_choice", None) + if isinstance( + tool_choice, + (ToolChoiceFunction, ChatCompletionNamedToolChoiceParam), + ): + return "" + return content + + +_original_parse_tool_calls_from_content = OpenAIServing._parse_tool_calls_from_content + + +def _patched_parse_tool_calls_from_content( + request, + tokenizer, + enable_auto_tools: bool, + tool_parser_cls, + content: str | None = None, +): + content = _normalize_tool_choice_content(request, content) + return _original_parse_tool_calls_from_content( + request=request, + tokenizer=tokenizer, + enable_auto_tools=enable_auto_tools, + tool_parser_cls=tool_parser_cls, + content=content, + ) + + +OpenAIServing._parse_tool_calls_from_content = staticmethod(_patched_parse_tool_calls_from_content) + +_original_delegating_parse_tool_calls = DelegatingParser._parse_tool_calls + + +def _patched_delegating_parse_tool_calls( + self, + request, + content: str | None, + enable_auto_tools: bool, +): + content = _normalize_tool_choice_content(request, content) + return _original_delegating_parse_tool_calls( + self, + request, + content, + enable_auto_tools, + ) + + +DelegatingParser._parse_tool_calls = _patched_delegating_parse_tool_calls
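
For reference, a minimal client-side sketch of the behavior described under "Does this PR introduce _any_ user-facing change?". It is not part of the patch or its tests; the server URL, API key, and model name are placeholders, and the `get_weather` tool mirrors the one used in the unit tests. Under these assumptions, a forced named tool choice whose post-reasoning content is empty should come back as an empty-argument tool call instead of a server-side assertion failure.

```python
# Minimal sketch (assumptions: base_url, api_key, and model are placeholders;
# any reasoning-capable model served by vllm-ascend would exercise this path).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="my-reasoning-model",  # placeholder model name
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "parameters": {"type": "object", "properties": {}},
            },
        }
    ],
    # Forced named tool choice: the code path guarded by this patch.
    tool_choice={"type": "function", "function": {"name": "get_weather"}},
)

tool_call = completion.choices[0].message.tool_calls[0]
# With the guard in place, a fully-consumed reasoning output yields an
# empty-argument function call rather than an internal assertion failure.
print(tool_call.function.name, repr(tool_call.function.arguments))
```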