[Bugfix][Platform] Fix GLM47 tool-call finish backfill (#7710)
### What this PR does / why we need it?
This rebases the GLM47 tool-call parser fix onto `releases/v0.18.0` after the MiniMax usage-accounting patch merged upstream on March 27, 2026. It fixes OpenAI chat tool-call streaming for GLM47 by:
- draining terminal parser chunks that contain both the final argument text and the closing `</tool_call>` suffix
- computing finish backfill from the tool argument bytes actually emitted to the client, instead of trusting parser-internal buffered state (see the sketch after this description)
- adding focused regression tests for finish backfill and terminal chunk handling

### Does this PR introduce _any_ user-facing change?
Yes. GLM47 OpenAI-compatible streaming tool-call responses now emit correct final chunks and argument payloads on `releases/v0.18.0`.

### How was this patch tested?
- `pytest -q tests/ut/patch/platform/test_patch_glm_tool_call_parser.py tests/ut/patch/platform/test_patch_minimax_usage_accounting.py`
- `python -m pre_commit run --files vllm_ascend/patch/platform/patch_glm_tool_call_parser.py tests/ut/patch/platform/test_patch_glm_tool_call_parser.py vllm_ascend/patch/platform/__init__.py vllm_ascend/patch/__init__.py`

---------

Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
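For readers of the tests below, here is a minimal, hypothetical sketch of the emitted-bytes backfill idea: serialize the expected arguments compactly, compare against a whitespace-normalized view of the bytes already sent, and emit only the missing suffix. `compute_remaining_args` and `_strip_json_ws` are illustrative names, not the patched vLLM helpers; the sketch merely reproduces the behavior the new unit tests pin down.

```python
import json


def _strip_json_ws(text: str) -> str:
    """Drop whitespace outside JSON string literals, so prefixes emitted
    by parsers with mixed separator styles compare byte-for-byte."""
    out, in_str, escaped = [], False, False
    for ch in text:
        if in_str:
            out.append(ch)
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_str = False
        elif ch == '"':
            in_str = True
            out.append(ch)
        elif not ch.isspace():
            out.append(ch)
    return "".join(out)


def compute_remaining_args(expected_args, streamed_args: str) -> str:
    """Suffix still owed to the client when the model signals finish."""
    if isinstance(expected_args, str):
        try:
            expected_args = json.loads(expected_args)
        except json.JSONDecodeError:
            return ""
    if not streamed_args:
        # Nothing reached the client yet: backfill the whole payload.
        return json.dumps(expected_args, ensure_ascii=False)
    compact = json.dumps(expected_args, ensure_ascii=False,
                         separators=(",", ":"))
    streamed = _strip_json_ws(streamed_args)
    if compact.startswith(streamed):
        return compact[len(streamed):]
    # Streamed bytes are not a recognizable prefix: emit nothing rather
    # than append a corrupt suffix.
    return ""
```

With the `todos` example from the tests, the streamed prefix `'{"todos":[{"content": "A"'` yields the suffix `',"activeForm":"B","status":"in_progress"}]}'`.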
314 tests/ut/patch/platform/test_patch_glm_tool_call_parser.py (new file)
@@ -0,0 +1,314 @@
# SPDX-License-Identifier: Apache-2.0

from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
)
from vllm.tool_parsers.glm4_moe_tool_parser import Glm4MoeModelToolParser
from vllm.tool_parsers.glm47_moe_tool_parser import Glm47MoeModelToolParser


class FakeTokenizer:
    def get_vocab(self):
        return {
            "<tool_call>": 1,
            "</tool_call>": 2,
            "<arg_key>": 3,
            "</arg_key>": 4,
            "<arg_value>": 5,
            "</arg_value>": 6,
        }


def _reset_streaming_state(parser):
    parser._buffer = ""
    parser._in_tool_call = False
    parser.current_tool_name_sent = False
    parser._current_tool_name = None
    parser._pending_key = None
    parser._streaming_string_value = False
    parser.prev_tool_call_arr = []
    parser.current_tool_id = -1
    parser.streamed_args_for_tool = []
    parser._tool_call_ids = []
    parser._args_started = []
    parser._args_closed = []
    parser._seen_keys = []


def test_create_remaining_args_delta_uses_fallback_metadata_for_args_only_delta():
    original_delta = DeltaMessage(
        tool_calls=[
            DeltaToolCall(
                index=0,
                function=DeltaFunctionCall(arguments='{"files":['),
            )
        ]
    )

    result = OpenAIServingChat._create_remaining_args_delta(
        original_delta,
        '{"files":[{"filepath":"HumanEval-X/README.md"}]}',
        0,
        fallback_tool_call_id="call_files",
        fallback_tool_call_type="function",
        fallback_tool_call_name="builtin_read_many_files",
    )

    tc = result.tool_calls[0]
    assert tc.index == 0
    assert tc.id == "call_files"
    assert tc.type == "function"
    assert tc.function.name == "builtin_read_many_files"
    assert tc.function.arguments == '{"files":[{"filepath":"HumanEval-X/README.md"}]}'


def test_create_remaining_args_delta_prefers_current_metadata_over_fallback():
    original_delta = DeltaMessage(
        tool_calls=[
            DeltaToolCall(
                index=0,
                id="call_current",
                type="function",
                function=DeltaFunctionCall(
                    name="current_name",
                    arguments='{"files":[',
                ),
            )
        ]
    )

    result = OpenAIServingChat._create_remaining_args_delta(
        original_delta,
        "]}",
        0,
        fallback_tool_call_id="call_fallback",
        fallback_tool_call_type="function",
        fallback_tool_call_name="fallback_name",
    )

    tc = result.tool_calls[0]
    assert tc.id == "call_current"
    assert tc.type == "function"
    assert tc.function.name == "current_name"
    assert tc.function.arguments == "]}"


def test_record_streamed_tool_args_tracks_emitted_bytes():
    streamed_tool_args = {0: '{"files":['}
    delta_message = DeltaMessage(
        tool_calls=[
            DeltaToolCall(
                index=0,
                function=DeltaFunctionCall(arguments='{"filepath":"HumanEval-X/README.md"}]}'),
            )
        ]
    )

    OpenAIServingChat._record_streamed_tool_args(delta_message, streamed_tool_args)

    assert streamed_tool_args[0] == '{"files":[{"filepath":"HumanEval-X/README.md"}]}'


def test_compute_remaining_tool_args_handles_compact_prefix():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args={"a": 1},
        streamed_args='{"a":1',
    )

    assert remaining == "}"


def test_compute_remaining_tool_args_handles_stringified_expected_args():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args='{"a":1}',
        streamed_args='{"a":1',
    )

    assert remaining == "}"


def test_compute_remaining_tool_args_handles_glm_mixed_whitespace_prefix():
    expected_args = {
        "todos": [
            {
                "content": "A",
                "activeForm": "B",
                "status": "in_progress",
            }
        ]
    }

    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args=expected_args,
        streamed_args='{"todos":[{"content": "A", "activeForm": "B", "status": "in_progress"}]',
    )

    assert remaining == "}"


def test_compute_remaining_tool_args_backfills_missing_suffix_for_glm_partial_prefix():
    expected_args = {
        "todos": [
            {
                "content": "A",
                "activeForm": "B",
                "status": "in_progress",
            }
        ]
    }

    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args=expected_args,
        streamed_args='{"todos":[{"content": "A"',
    )

    assert remaining == ',"activeForm":"B","status":"in_progress"}]}'


def test_compute_remaining_tool_args_returns_empty_for_non_matching_prefix():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args={"a": 1},
        streamed_args="not-json",
    )

    assert remaining == ""


def test_compute_remaining_tool_args_returns_full_call_when_no_args_were_sent():
    remaining = OpenAIServingChat._compute_remaining_tool_args(
        expected_args={
            "todos": "- [x] 分析项目结构和代码\n- [ ] 添加单元测试框架",
        },
        streamed_args="",
    )

    assert remaining == '{"todos": "- [x] 分析项目结构和代码\\n- [ ] 添加单元测试框架"}'


def test_glm_streaming_final_chunk_emits_inline_string_value():
    parser = Glm4MoeModelToolParser(FakeTokenizer())
    _reset_streaming_state(parser)

    request = ChatCompletionRequest(
        model="zai-org/GLM-4.7",
        messages=[],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "builtin_get_problems",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "filepath": {"type": "string"},
                        },
                    },
                },
            }
        ],
    )

    chunks = [
        "<tool_call>",
        "builtin_get_problems\n",
        "<arg_key>filepath</arg_key>",
        "<arg_value>pong.py</arg_value></tool_call>",
    ]

    last_tool_delta = None
    for chunk in chunks:
        result = parser.extract_tool_calls_streaming(
            previous_text="",
            current_text="",
            delta_text=chunk,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=request,
        )
        if result is not None and result.tool_calls:
            last_tool_delta = result

    assert last_tool_delta is not None
    assert last_tool_delta.tool_calls[0].function.arguments == '{"filepath":"pong.py"}'
    assert parser.streamed_args_for_tool == ['{"filepath":"pong.py"}']
    assert parser.prev_tool_call_arr == [
        {
            "name": "builtin_get_problems",
            "arguments": {"filepath": "pong.py"},
        }
    ]


def test_glm47_streaming_delta_serializes_tool_call_fields():
    parser = Glm47MoeModelToolParser(FakeTokenizer())
    _reset_streaming_state(parser)

    request = ChatCompletionRequest(
        model="GLM-5",
        messages=[],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "builtin_get_problems",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "filepath": {"type": "string"},
                        },
                    },
                },
            }
        ],
    )

    chunks = [
        "<tool_call>",
        "builtin_get_problems\n",
        "<arg_key>filepath</arg_key>",
        "<arg_value>pong.py</arg_value></tool_call>",
    ]

    serialized_deltas = []
    for chunk in chunks:
        result = parser.extract_tool_calls_streaming(
            previous_text="",
            current_text="",
            delta_text=chunk,
            previous_token_ids=[],
            current_token_ids=[],
            delta_token_ids=[],
            request=request,
        )
        if result is None:
            continue

        choice = ChatCompletionResponseStreamChoice(
            index=0,
            delta=result,
            logprobs=None,
            finish_reason=None,
        )
        response = ChatCompletionStreamResponse(
            id="chatcmpl-test",
            created=0,
            model="GLM-5",
            choices=[choice],
        )
        serialized_deltas.append(response.model_dump(exclude_unset=True)["choices"][0]["delta"])

    assert len(serialized_deltas) == 2
    assert serialized_deltas[0]["tool_calls"][0]["type"] == "function"
    assert serialized_deltas[0]["tool_calls"][0]["function"]["name"] == "builtin_get_problems"
    assert serialized_deltas[-1] != {}
    assert serialized_deltas[-1]["tool_calls"][0]["index"] == 0
    assert serialized_deltas[-1]["tool_calls"][0]["function"]["arguments"] == '{"filepath":"pong.py"}'
vllm_ascend/patch/__init__.py
@@ -211,6 +211,33 @@
#       Remove this patch once the upstream MiniMax usage-accounting fix is in
#       the runtime vLLM version used by vllm-ascend.
#
# ** 10. File: platform/patch_glm_tool_call_parser.py**
#    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    1. `vllm.entrypoints.openai.chat_completion.serving.OpenAIServingChat`
#       `vllm.tool_parsers.glm4_moe_tool_parser.Glm4MoeModelToolParser`
#    Why:
#       GLM-4.7 / GLM-4.5 tool-call streaming on the release runtime still has
#       two independent finish-path bugs:
#       1. the parser can leave a terminal `<arg_value>...</tool_call>` chunk
#          partially undrained, and
#       2. finish backfill trusts the parser's internal accumulated arguments
#          instead of the argument bytes actually sent to the client.
#       Together these can drop a full string value or emit only a suffix like
#       `"}` in the final SSE chunk even when non-stream output is correct.
#    How:
#       Monkey-patch the GLM parser to keep draining a single chunk through
#       terminal state transitions, and monkey-patch chat streaming to track
#       per-tool arguments actually emitted to the client before computing the
#       finish-chunk suffix. The suffix logic still tolerates mixed JSON
#       whitespace styles from GLM tool parsers.
#    Related PR (if no, explain why):
#       https://github.com/vllm-project/vllm/pull/37845
#       https://github.com/vllm-project/vllm/pull/33218
#    Future Plan:
#       Remove this patch once both the GLM parser drain fix and the serving
#       finish-backfill fix are present in the runtime vLLM version used by
#       vllm-ascend.
#
# * Worker Patch:
# ===============
#
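For orientation, here is a minimal, self-contained sketch of the drain loop described in the How note above. It is illustrative control flow only, not the patched parser: the `drain` helper and its closing-tag list are hypothetical names, and the real patch works on the GLM parser's internal state machine rather than raw strings.

```python
def drain(buffer: str) -> tuple[list[str], str]:
    """Split buffered text into every transition it completes.

    A terminal GLM delta such as '<arg_value>pong.py</arg_value></tool_call>'
    closes two states at once; handling only one transition per call strands
    '</tool_call>' (and with it the final '}' of the arguments) in the buffer.
    """
    closers = ("</arg_value>", "</tool_call>")
    events: list[str] = []
    while True:
        # Find the closing tag that ends earliest in the buffer, if any.
        ends = [buffer.find(t) + len(t) for t in closers if t in buffer]
        if not ends:
            return events, buffer  # incomplete tail stays buffered
        cut = min(ends)
        events.append(buffer[:cut])
        buffer = buffer[cut:]


# drain("<arg_value>pong.py</arg_value></tool_call>") returns
# (['<arg_value>pong.py</arg_value>', '</tool_call>'], ''),
# so the final argument text and the closing suffix are both emitted
# from a single chunk instead of being left behind at finish time.
```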
vllm_ascend/patch/platform/__init__.py
@@ -30,6 +30,7 @@ import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
import vllm_ascend.patch.platform.patch_sched_yield  # noqa
import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
import vllm_ascend.patch.platform.patch_minimax_usage_accounting  # noqa
import vllm_ascend.patch.platform.patch_glm_tool_call_parser  # noqa

if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
    import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
1061 vllm_ascend/patch/platform/patch_glm_tool_call_parser.py (new file; diff suppressed because it is too large)