fix(platform): reimplement MiniMax usage accounting patch (#7835)

## Summary
- replace the MiniMax usage accounting monkey patch with a runtime
wrapper implementation instead of source-text rewriting
- preserve MiniMax reasoning-token semantics when `</think>` is missing
by counting the emitted output as reasoning tokens
- add unit coverage for usage tracking helpers and MiniMax
reasoning-token counting

## Why
The previous implementation rewrote `OpenAIServingChat` by matching
exact source blocks. That was brittle against `vllm` source drift and
could crash during early plugin initialization with:
`RuntimeError: Failed to locate expected block while patching
OpenAIServingChat usage accounting.`

This change keeps the usage-accounting backport, but applies it by
wrapping the original stream/full generators and tracking output token
ids at runtime.

For MiniMax reasoning counting, a missing `</think>` should not be
treated as zero reasoning tokens. It can mean the whole output is still
in thinking mode, or that generation stopped before the closing token
was produced. In that case, the emitted output should still be counted
as reasoning.

## Validation
- `pytest -q
tests/ut/patch/platform/test_patch_minimax_usage_accounting.py`
- `vllm serve --help`

Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
This commit is contained in:
jack
2026-03-31 16:27:00 +08:00
committed by GitHub
parent 4f259d4fd8
commit 7314bbe2df
2 changed files with 196 additions and 239 deletions

View File

@@ -1,12 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
from types import SimpleNamespace
import pytest
from vllm.reasoning.minimax_m2_reasoning_parser import (
MiniMaxM2AppendThinkReasoningParser,
MiniMaxM2ReasoningParser,
)
from vllm_ascend.patch.platform import (
patch_minimax_usage_accounting as minimax_usage_patch,
)
class FakeTokenizer:
def get_vocab(self):
@@ -35,13 +40,13 @@ class FakeTokenizer:
MiniMaxM2ReasoningParser,
[10, 11, 20],
3,
id="minimax-all-tokens-are-reasoning-before-end-token",
id="minimax-no-end-token-means-all-output-is-reasoning",
),
pytest.param(
MiniMaxM2AppendThinkReasoningParser,
[10, 11, 20],
3,
id="append-think-all-tokens-are-reasoning-before-end-token",
id="append-think-no-end-token-means-all-output-is-reasoning",
),
pytest.param(
MiniMaxM2ReasoningParser,
@@ -65,3 +70,73 @@ def test_count_reasoning_tokens(
parser = parser_cls(FakeTokenizer())
assert parser.count_reasoning_tokens(token_ids) == expected_reasoning_tokens
def test_update_usage_tracking_state_tracks_prompt_and_completion_tokens():
    """State accumulates prompt/cached counts and per-choice output token ids."""
    tracking = minimax_usage_patch._create_usage_tracking_state(
        num_choices=2,
        reasoning_parser=None,
    )
    request_output = SimpleNamespace(
        prompt_token_ids=[1, 2],
        encoder_prompt_token_ids=[3],
        num_cached_tokens=4,
        outputs=[
            SimpleNamespace(index=0, token_ids=(10, 11)),
            SimpleNamespace(index=1, token_ids=[20]),
        ],
    )

    minimax_usage_patch._update_usage_tracking_state(tracking, request_output)

    # Prompt count = decoder prompt (2 ids) + encoder prompt (1 id).
    assert tracking.num_prompt_tokens == 3
    assert tracking.num_cached_tokens == 4
    assert tracking.completion_tokens == [2, 1]
    assert tracking.raw_output_token_ids == [[10, 11], [20]]
def test_make_usage_info_injects_reasoning_token_details():
    """Usage payload exposes reasoning- and cached-token detail sections."""
    serving = SimpleNamespace(enable_prompt_tokens_details=True)

    usage = minimax_usage_patch._make_usage_info(
        serving,
        prompt_tokens=3,
        completion_tokens=4,
        num_cached_tokens=1,
        reasoning_tokens=2,
    )
    dumped = usage.model_dump(exclude_none=True)

    assert dumped["completion_tokens_details"]["reasoning_tokens"] == 2
    assert dumped["prompt_tokens_details"]["cached_tokens"] == 1
def test_make_full_response_usage_sums_reasoning_tokens():
    """Full-response usage totals completion tokens and per-choice reasoning."""

    class StubServing:
        enable_prompt_tokens_details = False

        def _make_usage_info(self, **kwargs):
            return minimax_usage_patch._make_usage_info(self, **kwargs)

    class StubReasoningParser:
        # Only output streams containing token id 2 report reasoning tokens.
        def count_reasoning_tokens(self, token_ids):
            return 2 if 2 in token_ids else 0

    tracking = minimax_usage_patch._create_usage_tracking_state(
        num_choices=2,
        reasoning_parser=StubReasoningParser(),
    )
    tracking.num_prompt_tokens = 3
    tracking.num_cached_tokens = 1
    tracking.final_res = SimpleNamespace(num_cached_tokens=1)
    tracking.completion_tokens = [4, 2]
    tracking.raw_output_token_ids = [[10, 11, 2, 20], [30, 31]]

    usage = minimax_usage_patch._make_full_response_usage(StubServing(), tracking)

    assert usage.prompt_tokens == 3
    assert usage.completion_tokens == 6
    assert usage.total_tokens == 9
    assert usage.completion_tokens_details.reasoning_tokens == 2
    assert usage.prompt_tokens_details is None