[Bugfix][Platform] Fix GLM47 tool-call finish backfill (#7710)
### What this PR does / why we need it?
This rebases the GLM47 tool-call parser fix onto `releases/v0.18.0` after the MiniMax usage-accounting patch merged upstream on March 27, 2026. It fixes OpenAI chat tool-call streaming for GLM47 by:
- draining terminal parser chunks that contain both the final argument text and the closing `</tool_call>` suffix
- computing finish backfill from the tool argument bytes actually emitted to the client, instead of trusting parser-internal buffered state (see the sketch after this description)
- adding focused regression tests for finish backfill and terminal chunk handling

### Does this PR introduce _any_ user-facing change?
Yes. GLM47 OpenAI-compatible streaming tool-call responses now emit correct final chunks and argument payloads on `releases/v0.18.0`.

### How was this patch tested?
- `pytest -q tests/ut/patch/platform/test_patch_glm_tool_call_parser.py tests/ut/patch/platform/test_patch_minimax_usage_accounting.py`
- `python -m pre_commit run --files vllm_ascend/patch/platform/patch_glm_tool_call_parser.py tests/ut/patch/platform/test_patch_glm_tool_call_parser.py vllm_ascend/patch/platform/__init__.py vllm_ascend/patch/__init__.py`

---------

Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
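For readers of the tests below, here is a minimal, hypothetical sketch of the emitted-bytes backfill idea: serialize the expected arguments compactly, compare against a whitespace-normalized view of the bytes already sent, and emit only the missing suffix. `compute_remaining_args` and `_strip_json_ws` are illustrative names, not the patched vLLM helpers; the sketch merely reproduces the behavior the new unit tests pin down.

```python
import json


def _strip_json_ws(text: str) -> str:
    """Drop whitespace outside JSON string literals, so prefixes emitted
    by parsers with mixed separator styles compare byte-for-byte."""
    out, in_str, escaped = [], False, False
    for ch in text:
        if in_str:
            out.append(ch)
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_str = False
        elif ch == '"':
            in_str = True
            out.append(ch)
        elif not ch.isspace():
            out.append(ch)
    return "".join(out)


def compute_remaining_args(expected_args, streamed_args: str) -> str:
    """Suffix still owed to the client when the model signals finish."""
    if isinstance(expected_args, str):
        try:
            expected_args = json.loads(expected_args)
        except json.JSONDecodeError:
            return ""
    if not streamed_args:
        # Nothing reached the client yet: backfill the whole payload.
        return json.dumps(expected_args, ensure_ascii=False)
    compact = json.dumps(expected_args, ensure_ascii=False,
                         separators=(",", ":"))
    streamed = _strip_json_ws(streamed_args)
    if compact.startswith(streamed):
        return compact[len(streamed):]
    # Streamed bytes are not a recognizable prefix: emit nothing rather
    # than append a corrupt suffix.
    return ""
```

With the `todos` example from the tests, the streamed prefix `'{"todos":[{"content": "A"'` yields the suffix `',"activeForm":"B","status":"in_progress"}]}'`.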
314 tests/ut/patch/platform/test_patch_glm_tool_call_parser.py (new file)
@@ -0,0 +1,314 @@
# SPDX-License-Identifier: Apache-2.0

from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
)
from vllm.tool_parsers.glm4_moe_tool_parser import Glm4MoeModelToolParser
from vllm.tool_parsers.glm47_moe_tool_parser import Glm47MoeModelToolParser


class FakeTokenizer:
    def get_vocab(self):
        return {
            "<tool_call>": 1,
            "</tool_call>": 2,
            "<arg_key>": 3,
            "</arg_key>": 4,
            "<arg_value>": 5,
            "</arg_value>": 6,
        }


def _reset_streaming_state(parser):
    parser._buffer = ""
    parser._in_tool_call = False
    parser.current_tool_name_sent = False
    parser._current_tool_name = None
    parser._pending_key = None
    parser._streaming_string_value = False
    parser.prev_tool_call_arr = []
    parser.current_tool_id = -1
    parser.streamed_args_for_tool = []
    parser._tool_call_ids = []
    parser._args_started = []
    parser._args_closed = []
    parser._seen_keys = []


def test_create_remaining_args_delta_uses_fallback_metadata_for_args_only_delta():
    original_delta = DeltaMessage(
        tool_calls=[
            DeltaToolCall(
                index=0,
                function=DeltaFunctionCall(arguments='{"files":['),
            )
        ]
    )

    result = OpenAIServingChat._create_remaining_args_delta(
        original_delta,
        '{"files":[{"filepath":"HumanEval-X/README.md"}]}',
        0,
        fallback_tool_call_id="call_files",
        fallback_tool_call_type="function",
        fallback_tool_call_name="builtin_read_many_files",
    )

    tc = result.tool_calls[0]
    assert tc.index == 0
    assert tc.id == "call_files"
    assert tc.type == "function"
    assert tc.function.name == "builtin_read_many_files"
    assert tc.function.arguments == '{"files":[{"filepath":"HumanEval-X/README.md"}]}'


def test_create_remaining_args_delta_prefers_current_metadata_over_fallback():
    original_delta = DeltaMessage(
        tool_calls=[
            DeltaToolCall(
                index=0,
                id="call_current",
                type="function",
                function=DeltaFunctionCall(
                    name="current_name",
                    arguments='{"files":[',
                ),
            )
        ]
    )

    result = OpenAIServingChat._create_remaining_args_delta(
        original_delta,
        "]}",
        0,
        fallback_tool_call_id="call_fallback",
        fallback_tool_call_type="function",
        fallback_tool_call_name="fallback_name",
    )

    tc = result.tool_calls[0]
    assert tc.id == "call_current"
    assert tc.type == "function"
    assert tc.function.name == "current_name"
    assert tc.function.arguments == "]}"


def test_record_streamed_tool_args_tracks_emitted_bytes():
    streamed_tool_args = {0: '{"files":['}
    delta_message = DeltaMessage(
        tool_calls=[
            DeltaToolCall(
                index=0,
                function=DeltaFunctionCall(arguments='{"filepath":"HumanEval-X/README.md"}]}'),
            )
        ]
    )

    OpenAIServingChat._record_streamed_tool_args(delta_message, streamed_tool_args)

    assert streamed_tool_args[0] == '{"files":[{"filepath":"HumanEval-X/README.md"}]}'


def test_compute_remaining_tool_args_handles_compact_prefix():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args={"a": 1},
        streamed_args='{"a":1',
    )

    assert remaining == "}"


def test_compute_remaining_tool_args_handles_stringified_expected_args():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args='{"a":1}',
        streamed_args='{"a":1',
    )

    assert remaining == "}"


def test_compute_remaining_tool_args_handles_glm_mixed_whitespace_prefix():
    expected_args = {
        "todos": [
            {
                "content": "A",
                "activeForm": "B",
                "status": "in_progress",
            }
        ]
    }

    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args=expected_args,
        streamed_args='{"todos":[{"content": "A", "activeForm": "B", "status": "in_progress"}]',
    )

    assert remaining == "}"


def test_compute_remaining_tool_args_backfills_missing_suffix_for_glm_partial_prefix():
    expected_args = {
        "todos": [
            {
                "content": "A",
                "activeForm": "B",
                "status": "in_progress",
            }
        ]
    }

    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args=expected_args,
        streamed_args='{"todos":[{"content": "A"',
    )

    assert remaining == ',"activeForm":"B","status":"in_progress"}]}'


def test_compute_remaining_tool_args_returns_empty_for_non_matching_prefix():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args={"a": 1},
        streamed_args="not-json",
    )

    assert remaining == ""


def test_compute_remaining_tool_args_returns_full_call_when_no_args_were_sent():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args={
            "todos": "- [x] 分析项目结构和代码\n- [ ] 添加单元测试框架",
        },
        streamed_args="",
    )

    assert remaining == '{"todos": "- [x] 分析项目结构和代码\\n- [ ] 添加单元测试框架"}'


def test_glm_streaming_final_chunk_emits_inline_string_value():
    parser = Glm4MoeModelToolParser(FakeTokenizer())
    _reset_streaming_state(parser)

    request = ChatCompletionRequest(
        model="zai-org/GLM-4.7",
        messages=[],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "builtin_get_problems",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "filepath": {"type": "string"},
                        },
                    },
                },
            }
        ],
    )

    chunks = [
        "<tool_call>",
        "builtin_get_problems\n",
        "<arg_key>filepath</arg_key>",
        "<arg_value>pong.py</arg_value></tool_call>",
    ]

    last_tool_delta = None
    for chunk in chunks:
        result = parser.extract_tool_calls_streaming(
            previous_text="",
            current_text="",
            delta_text=chunk,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=request,
        )
        if result is not None and result.tool_calls:
            last_tool_delta = result

    assert last_tool_delta is not None
    assert last_tool_delta.tool_calls[0].function.arguments == '{"filepath":"pong.py"}'
    assert parser.streamed_args_for_tool == ['{"filepath":"pong.py"}']
    assert parser.prev_tool_call_arr == [
        {
            "name": "builtin_get_problems",
            "arguments": {"filepath": "pong.py"},
        }
    ]


def test_glm47_streaming_delta_serializes_tool_call_fields():
    parser = Glm47MoeModelToolParser(FakeTokenizer())
    _reset_streaming_state(parser)

    request = ChatCompletionRequest(
        model="GLM-5",
        messages=[],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "builtin_get_problems",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "filepath": {"type": "string"},
                        },
                    },
                },
            }
        ],
    )

    chunks = [
        "<tool_call>",
        "builtin_get_problems\n",
        "<arg_key>filepath</arg_key>",
        "<arg_value>pong.py</arg_value></tool_call>",
    ]

    serialized_deltas = []
    for chunk in chunks:
        result = parser.extract_tool_calls_streaming(
            previous_text="",
            current_text="",
            delta_text=chunk,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=request,
        )
        if result is None:
            continue

        choice = ChatCompletionResponseStreamChoice(
            index=0,
            delta=result,
            logprobs=None,
            finish_reason=None,
        )
        response = ChatCompletionStreamResponse(
            id="chatcmpl-test",
            created=0,
            model="GLM-5",
            choices=[choice],
        )
        serialized_deltas.append(response.model_dump(exclude_unset=True)["choices"][0]["delta"])

    assert len(serialized_deltas) == 2
    assert serialized_deltas[0]["tool_calls"][0]["type"] == "function"
    assert serialized_deltas[0]["tool_calls"][0]["function"]["name"] == "builtin_get_problems"
    assert serialized_deltas[-1] != {}
    assert serialized_deltas[-1]["tool_calls"][0]["index"] == 0
    assert serialized_deltas[-1]["tool_calls"][0]["function"]["arguments"] == '{"filepath":"pong.py"}'
vllm_ascend/patch/__init__.py
@@ -211,6 +211,33 @@
#       Remove this patch once the upstream MiniMax usage-accounting fix is in
#       the runtime vLLM version used by vllm-ascend.
#
# ** 10. File: platform/patch_glm_tool_call_parser.py**
#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    1. `vllm.entrypoints.openai.chat_completion.serving.OpenAIServingChat`
#       `vllm.tool_parsers.glm4_moe_tool_parser.Glm4MoeModelToolParser`
#    Why:
#       GLM-4.7 / GLM-4.5 tool-call streaming on the release runtime still has
#       two independent finish-path bugs:
#       1. the parser can leave a terminal `<arg_value>...</tool_call>` chunk
#          partially undrained, and
#       2. finish backfill trusts the parser's internal accumulated arguments
#          instead of the argument bytes actually sent to the client.
#       Together these can drop a full string value or emit only a suffix like
#       `"}` in the final SSE chunk even when non-stream output is correct.
#    How:
#       Monkey-patch the GLM parser to keep draining a single chunk through
#       terminal state transitions, and monkey-patch chat streaming to track
#       per-tool arguments actually emitted to the client before computing the
#       finish-chunk suffix. The suffix logic still tolerates mixed JSON
#       whitespace styles from GLM tool parsers.
#    Related PR (if no, explain why):
#       https://github.com/vllm-project/vllm/pull/37845
#       https://github.com/vllm-project/vllm/pull/33218
#    Future Plan:
#       Remove this patch once both the GLM parser drain fix and the serving
#       finish-backfill fix are present in the runtime vLLM version used by
#       vllm-ascend.
#
# * Worker Patch:
# ===============
#
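For orientation, here is a minimal, self-contained sketch of the drain loop described in the How note above. It is illustrative control flow only, not the patched parser: the `drain` helper and its closing-tag list are hypothetical names, and the real patch works on the GLM parser's internal state machine rather than raw strings.

```python
def drain(buffer: str) -> tuple[list[str], str]:
    """Split buffered text into every transition it completes.

    A terminal GLM delta such as '<arg_value>pong.py</arg_value></tool_call>'
    closes two states at once; handling only one transition per call strands
    '</tool_call>' (and with it the final '}' of the arguments) in the buffer.
    """
    closers = ("</arg_value>", "</tool_call>")
    events: list[str] = []
    while True:
        # Find the closing tag that ends earliest in the buffer, if any.
        ends = [buffer.find(t) + len(t) for t in closers if t in buffer]
        if not ends:
            return events, buffer  # incomplete tail stays buffered
        cut = min(ends)
        events.append(buffer[:cut])
        buffer = buffer[cut:]


# drain("<arg_value>pong.py</arg_value></tool_call>") returns
# (['<arg_value>pong.py</arg_value>', '</tool_call>'], ''),
# so the final argument text and the closing suffix are both emitted
# from a single chunk instead of being left behind at finish time.
```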
vllm_ascend/patch/platform/__init__.py
@@ -30,6 +30,7 @@ import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
import vllm_ascend.patch.platform.patch_sched_yield  # noqa
import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
import vllm_ascend.patch.platform.patch_minimax_usage_accounting  # noqa
import vllm_ascend.patch.platform.patch_glm_tool_call_parser  # noqa

if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
    import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
1061 vllm_ascend/patch/platform/patch_glm_tool_call_parser.py (new file; diff suppressed because it is too large)