fix(platform): reimplement MiniMax usage accounting patch (#7835)

## Summary
- replace the MiniMax usage accounting monkey patch with a runtime
wrapper implementation instead of source-text rewriting
- preserve MiniMax reasoning-token semantics when `</think>` is missing
by counting the emitted output as reasoning tokens
- add unit coverage for usage tracking helpers and MiniMax
reasoning-token counting

## Why
The previous implementation rewrote `OpenAIServingChat` by matching
exact source blocks. That was brittle against `vllm` source drift and
could crash during early plugin initialization with:
`RuntimeError: Failed to locate expected block while patching
OpenAIServingChat usage accounting.`

This change keeps the usage-accounting backport, but applies it by
wrapping the original stream/full generators and tracking output token
ids at runtime.

For MiniMax reasoning counting, a missing `</think>` should not be
treated as zero reasoning tokens. It can mean the whole output is still
in thinking mode, or that generation stopped before the closing token
was produced. In that case, the emitted output should still be counted
as reasoning.

## Validation
- `pytest -q
tests/ut/patch/platform/test_patch_minimax_usage_accounting.py`
- `vllm serve --help`

Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
Co-authored-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
This commit is contained in:
jack
2026-03-31 16:27:00 +08:00
committed by GitHub
parent 4f259d4fd8
commit 7314bbe2df
2 changed files with 196 additions and 239 deletions

View File

@@ -1,12 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
from types import SimpleNamespace
import pytest
from vllm.reasoning.minimax_m2_reasoning_parser import (
MiniMaxM2AppendThinkReasoningParser,
MiniMaxM2ReasoningParser,
)
from vllm_ascend.patch.platform import (
patch_minimax_usage_accounting as minimax_usage_patch,
)
class FakeTokenizer:
def get_vocab(self):
@@ -35,13 +40,13 @@ class FakeTokenizer:
MiniMaxM2ReasoningParser,
[10, 11, 20],
3,
id="minimax-all-tokens-are-reasoning-before-end-token",
id="minimax-no-end-token-means-all-output-is-reasoning",
),
pytest.param(
MiniMaxM2AppendThinkReasoningParser,
[10, 11, 20],
3,
id="append-think-all-tokens-are-reasoning-before-end-token",
id="append-think-no-end-token-means-all-output-is-reasoning",
),
pytest.param(
MiniMaxM2ReasoningParser,
@@ -65,3 +70,73 @@ def test_count_reasoning_tokens(
parser = parser_cls(FakeTokenizer())
assert parser.count_reasoning_tokens(token_ids) == expected_reasoning_tokens
def test_update_usage_tracking_state_tracks_prompt_and_completion_tokens():
    """State accumulates prompt/cached counts and per-choice output token ids."""
    tracking = minimax_usage_patch._create_usage_tracking_state(
        num_choices=2,
        reasoning_parser=None,
    )
    request_output = SimpleNamespace(
        prompt_token_ids=[1, 2],
        encoder_prompt_token_ids=[3],
        num_cached_tokens=4,
        outputs=[
            SimpleNamespace(index=0, token_ids=(10, 11)),
            SimpleNamespace(index=1, token_ids=[20]),
        ],
    )

    minimax_usage_patch._update_usage_tracking_state(tracking, request_output)

    # Prompt count = decoder prompt (2 ids) + encoder prompt (1 id).
    assert tracking.num_prompt_tokens == 3
    assert tracking.num_cached_tokens == 4
    assert tracking.completion_tokens == [2, 1]
    assert tracking.raw_output_token_ids == [[10, 11], [20]]
def test_make_usage_info_injects_reasoning_token_details():
    """Usage payload exposes reasoning- and cached-token detail sections."""
    serving = SimpleNamespace(enable_prompt_tokens_details=True)

    usage = minimax_usage_patch._make_usage_info(
        serving,
        prompt_tokens=3,
        completion_tokens=4,
        num_cached_tokens=1,
        reasoning_tokens=2,
    )
    dumped = usage.model_dump(exclude_none=True)

    assert dumped["completion_tokens_details"]["reasoning_tokens"] == 2
    assert dumped["prompt_tokens_details"]["cached_tokens"] == 1
def test_make_full_response_usage_sums_reasoning_tokens():
    """Full-response usage totals completion tokens and per-choice reasoning."""

    class StubServing:
        enable_prompt_tokens_details = False

        def _make_usage_info(self, **kwargs):
            return minimax_usage_patch._make_usage_info(self, **kwargs)

    class StubReasoningParser:
        # Only output streams containing token id 2 report reasoning tokens.
        def count_reasoning_tokens(self, token_ids):
            return 2 if 2 in token_ids else 0

    tracking = minimax_usage_patch._create_usage_tracking_state(
        num_choices=2,
        reasoning_parser=StubReasoningParser(),
    )
    tracking.num_prompt_tokens = 3
    tracking.num_cached_tokens = 1
    tracking.final_res = SimpleNamespace(num_cached_tokens=1)
    tracking.completion_tokens = [4, 2]
    tracking.raw_output_token_ids = [[10, 11, 2, 20], [30, 31]]

    usage = minimax_usage_patch._make_full_response_usage(StubServing(), tracking)

    assert usage.prompt_tokens == 3
    assert usage.completion_tokens == 6
    assert usage.total_tokens == 9
    assert usage.completion_tokens_details.reasoning_tokens == 2
    assert usage.prompt_tokens_details is None