Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,421 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
# Create a concrete test implementation of BaseThinkingReasoningParser
class TestThinkingReasoningParser(BaseThinkingReasoningParser):
    """Concrete test implementation of BaseThinkingReasoningParser.

    Uses the custom "<test:think>"/"</test:think>" delimiters that the
    ``test_tokenizer`` fixture registers in the tokenizer vocabulary.
    """

    @property
    def start_token(self) -> str:
        # Opening delimiter of the reasoning section.
        return "<test:think>"

    @property
    def end_token(self) -> str:
        # Closing delimiter of the reasoning section.
        return "</test:think>"
class TestThinkingReasoningParserAlt(BaseThinkingReasoningParser):
    """Alternative test implementation with different delimiter tokens.

    Exists so tests can verify that two parser subclasses with distinct
    tokens operate independently of each other.
    """

    @property
    def start_token(self) -> str:
        return "<alt:start>"

    @property
    def end_token(self) -> str:
        return "<alt:end>"
# Use a test model
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
@pytest.fixture(scope="module")
def test_tokenizer():
    """Module-scoped tokenizer with the custom test think-tokens registered."""
    tok = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    vocab = tok.get_vocab()
    # Only register delimiters the vocabulary does not already contain.
    missing = [
        t
        for t in ("<test:think>", "</test:think>", "<alt:start>", "<alt:end>")
        if t not in vocab
    ]
    if missing:
        tok.add_tokens(missing)
    return tok
class TestBaseThinkingReasoningParserInit:
    """
    Test initialization and basic properties of
    BaseThinkingReasoningParser.
    """

    def test_successful_initialization(self, test_tokenizer):
        """Test successful initialization with valid tokens."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        assert parser.start_token == "<test:think>"
        assert parser.end_token == "</test:think>"
        # Ids resolve because the fixture added both tokens to the vocab.
        assert parser.start_token_id is not None
        assert parser.end_token_id is not None

    def test_initialization_with_missing_tokenizer(self):
        """Test that initialization fails without tokenizer."""
        with pytest.raises(ValueError, match="model tokenizer must be passed"):
            TestThinkingReasoningParser(None)

    def test_initialization_with_missing_tokens(self, test_tokenizer):
        """Test that initialization fails when tokens are not in vocabulary."""

        # Parser whose delimiters were never added to the tokenizer vocab.
        class MissingTokenParser(BaseThinkingReasoningParser):
            @property
            def start_token(self) -> str:
                return "<missing:start>"

            @property
            def end_token(self) -> str:
                return "<missing:end>"

        with pytest.raises(
            RuntimeError, match="could not locate think start/end tokens"
        ):
            MissingTokenParser(test_tokenizer)

    def test_initialization_with_empty_tokens(self, test_tokenizer):
        """Test that initialization fails with empty token strings."""

        class EmptyTokenParser(BaseThinkingReasoningParser):
            @property
            def start_token(self) -> str:
                return ""

            @property
            def end_token(self) -> str:
                return ""

        with pytest.raises(
            ValueError, match="start_token and end_token must be defined"
        ):
            EmptyTokenParser(test_tokenizer)
class TestBaseThinkingReasoningParserMethods:
    """Test the methods of BaseThinkingReasoningParser."""

    def test_is_reasoning_end(self, test_tokenizer):
        """Test the is_reasoning_end method."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        end_token_id = parser.end_token_id
        start_token_id = parser.start_token_id
        # Test with end token present
        assert parser.is_reasoning_end([1, 2, end_token_id, 4]) is True
        # Test without end token
        assert parser.is_reasoning_end([1, 2, 3, 4]) is False
        # Test with empty list
        assert parser.is_reasoning_end([]) is False
        # Interleaved thinking: reasoning only counts as ended when the most
        # recent start token is followed by a matching end token.
        assert parser.is_reasoning_end([1, start_token_id, 2, end_token_id]) is True
        assert parser.is_reasoning_end([1, start_token_id, 2, 3]) is False
        assert (
            parser.is_reasoning_end(
                [1, start_token_id, 2, end_token_id, 2, 2, start_token_id]
            )
            is False
        )

    def test_is_reasoning_end_streaming(self, test_tokenizer):
        """Test the is_reasoning_end_streaming method."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        end_token_id = parser.end_token_id
        start_token_id = parser.start_token_id
        assert (
            parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id])
            is True
        )
        assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False
        assert parser.is_reasoning_end_streaming([], []) is False
        assert (
            parser.is_reasoning_end_streaming(
                [1, start_token_id, 2, end_token_id], [end_token_id]
            )
            is True
        )
        assert (
            parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False
        )
        assert (
            parser.is_reasoning_end_streaming(
                [1, start_token_id, 2, end_token_id, 2, start_token_id, 2],
                [2],
            )
            is False
        )
        assert (
            parser.is_reasoning_end_streaming(
                [1, start_token_id, 2, end_token_id, 2, 2], [2]
            )
            is False
        )

    def test_extract_content_ids(self, test_tokenizer):
        """Test the extract_content_ids method."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        end_token_id = parser.end_token_id
        # End token in the middle: everything after it is content.
        input_ids = [1, 2, end_token_id, 4, 5]
        content_ids = parser.extract_content_ids(input_ids)
        assert content_ids == [4, 5]
        # End token as the last element: nothing follows, so no content.
        input_ids = [1, 2, 3, end_token_id]
        content_ids = parser.extract_content_ids(input_ids)
        assert content_ids == []
        # No end token at all: no content is extracted.
        input_ids = [1, 2, 3, 4]
        content_ids = parser.extract_content_ids(input_ids)
        assert content_ids == []
        # NOTE(review): the original repeated the trailing-end-token case a
        # second time under a different comment; the duplicate was removed.
class TestBaseThinkingReasoningParserExtraction:
    """Test reasoning content extraction methods."""

    def test_extract_reasoning_with_both_tokens(self, test_tokenizer):
        """Test extraction when both start and end tokens are present."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        request = ChatCompletionRequest(messages=[], model="test-model")
        model_output = "<test:think>This is reasoning</test:think>This is content"
        reasoning, content = parser.extract_reasoning(model_output, request)
        assert reasoning == "This is reasoning"
        assert content == "This is content"

    def test_extract_reasoning_only_end_token(self, test_tokenizer):
        """Test extraction when only end token is present."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        request = ChatCompletionRequest(messages=[], model="test-model")
        # Start token may be absent, e.g. forced by the chat template.
        model_output = "This is reasoning</test:think>This is content"
        reasoning, content = parser.extract_reasoning(model_output, request)
        assert reasoning == "This is reasoning"
        assert content == "This is content"

    def test_extract_reasoning_no_end_token(self, test_tokenizer):
        """Test extraction when no end token is present."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        request = ChatCompletionRequest(messages=[], model="test-model")
        # Without an end token, the entire output is treated as reasoning.
        model_output = "This is just content"
        reasoning, content = parser.extract_reasoning(model_output, request)
        assert reasoning == "This is just content"
        assert content is None

    def test_extract_reasoning_empty_output(self, test_tokenizer):
        """Test extraction with empty output."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        request = ChatCompletionRequest(messages=[], model="test-model")
        model_output = ""
        reasoning, content = parser.extract_reasoning(model_output, request)
        assert reasoning == ""
        assert content is None

    def test_extract_reasoning_only_tokens(self, test_tokenizer):
        """Test extraction with only tokens and no content."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        request = ChatCompletionRequest(messages=[], model="test-model")
        model_output = "<test:think></test:think>"
        reasoning, content = parser.extract_reasoning(model_output, request)
        assert reasoning == ""
        assert content is None
class TestBaseThinkingReasoningParserStreaming:
    """Test streaming functionality of BaseThinkingReasoningParser."""

    @pytest.mark.parametrize("streaming", [True, False])
    def test_simple_reasoning_extraction(self, test_tokenizer, streaming):
        """
        Test basic reasoning extraction in both
        streaming and non-streaming modes.
        """
        parser = TestThinkingReasoningParser(test_tokenizer)
        model_output = [
            "<test:think>",
            "Some ",
            "reasoning ",
            "content",
            "</test:think>",
            "Final ",
            "answer",
        ]
        reasoning, content = run_reasoning_extraction(
            parser, model_output, streaming=streaming
        )
        assert reasoning == "Some reasoning content"
        assert content == "Final answer"

    def test_streaming_with_incremental_deltas(self, test_tokenizer):
        """Test streaming processing with small incremental deltas."""
        # NOTE(review): same deltas as test_simple_reasoning_extraction's
        # streaming case; retained for parity with the original suite.
        parser = TestThinkingReasoningParser(test_tokenizer)
        deltas = [
            "<test:think>",
            "Some ",
            "reasoning ",
            "content",
            "</test:think>",
            "Final ",
            "answer",
        ]
        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
        assert reasoning == "Some reasoning content"
        assert content == "Final answer"

    def test_streaming_with_start_token(self, test_tokenizer):
        """Test streaming with start token included."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        deltas = [
            "<test:think>",
            "Some ",
            "reasoning",
            "</test:think>",
            "Answer",
        ]
        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
        assert reasoning == "Some reasoning"
        assert content == "Answer"

    def test_streaming_no_end_token(self, test_tokenizer):
        """Test streaming when no end token is encountered."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        deltas = [
            "<test:think>",
            "Some ",
            "reasoning ",
            "without ",
            "end",
        ]
        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
        # With no end token, everything streamed remains reasoning.
        assert reasoning == "Some reasoning without end"
        assert content is None

    def test_streaming_only_end_token(self, test_tokenizer):
        """Test streaming when only end token appears."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        deltas = [
            "<test:think>",
            "Reasoning ",
            "content",
            "</test:think>",
            "Final",
        ]
        reasoning, content = run_reasoning_extraction(parser, deltas, streaming=True)
        assert reasoning == "Reasoning content"
        assert content == "Final"
class TestBaseThinkingReasoningParserMultipleImplementations:
    """
    Test that multiple implementations of
    BaseThinkingReasoningParser work correctly.
    """

    def test_different_token_implementations(self, test_tokenizer):
        """
        Test that different implementations
        with different tokens work independently.
        """
        parser1 = TestThinkingReasoningParser(test_tokenizer)
        parser2 = TestThinkingReasoningParserAlt(test_tokenizer)
        # Test parser1 (end token only; start may be implicit)
        model_output1 = "Reasoning1</test:think>Content1"
        reasoning1, content1 = run_reasoning_extraction(parser1, [model_output1])
        assert reasoning1 == "Reasoning1"
        assert content1 == "Content1"
        # Test parser2 with its own delimiter set
        model_output2 = "Reasoning2<alt:end>Content2"
        reasoning2, content2 = run_reasoning_extraction(parser2, [model_output2])
        assert reasoning2 == "Reasoning2"
        assert content2 == "Content2"
        # Verify the two implementations use distinct tokens and ids
        assert parser1.start_token != parser2.start_token
        assert parser1.end_token != parser2.end_token
        assert parser1.start_token_id != parser2.start_token_id
        assert parser1.end_token_id != parser2.end_token_id
class TestBaseThinkingReasoningParserEdgeCases:
    """Test edge cases and error conditions."""

    def test_multiple_end_tokens(self, test_tokenizer):
        """Test behavior with multiple end tokens."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        model_output = "First</test:think>Middle</test:think>Last"
        reasoning, content = run_reasoning_extraction(parser, [model_output])
        # Should stop at first end token; the second one is plain content.
        assert reasoning == "First"
        assert content == "Middle</test:think>Last"

    def test_nested_tokens(self, test_tokenizer):
        """Test behavior with nested-like token patterns."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        model_output = "<test:think>Outer<test:think>Inner</test:think>Content"
        reasoning, content = run_reasoning_extraction(parser, [model_output])
        # Should process normally, start from first start token; the inner
        # start token is kept verbatim inside the reasoning text.
        assert reasoning == "Outer<test:think>Inner"
        assert content == "Content"

    def test_malformed_tokens(self, test_tokenizer):
        """Test behavior with malformed token-like strings."""
        parser = TestThinkingReasoningParser(test_tokenizer)
        model_output = "<test:thinking>Not a real token</test:thinking>Content"
        reasoning, content = run_reasoning_extraction(parser, [model_output])
        # Should treat as regular content since tokens don't match exactly
        assert reasoning == ("<test:thinking>Not a real token</test:thinking>Content")
        assert content is None

View File

@@ -0,0 +1,288 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "deepseek_r1"
start_token = "<think>"
end_token = "</think>"
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
@pytest.fixture(scope="module")
def deepseek_r1_qwen_tokenizer():
    """Module-scoped HF tokenizer for the DeepSeek-R1 distilled Qwen model."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
SIMPLE_REASONING = {
"output": "This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
COMPLETE_REASONING = {
"output": "This is a reasoning section</think>",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}
NO_CONTENT = {
"output": "This is content",
"reasoning": "This is content",
"content": None,
"is_reasoning_end": False,
}
NO_REASONING_STREAMING = {
"output": "This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
MULTIPLE_LINES = {
"output": "This\nThat</think>This is the rest\nThat",
"reasoning": "This\nThat",
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
SHORTEST_REASONING_NO_STREAMING = {
"output": "</think>This is the rest",
"reasoning": "",
"content": "This is the rest",
"is_reasoning_end": True,
}
SHORTEST_REASONING = {
"output": "</think>This is the rest",
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": True,
}
REASONING_WITH_THINK = {
"output": "<think>This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
COMPLETE_REASONING_WITH_THINK = {
"output": "<think>This is a reasoning section</think>",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}
MULTIPLE_LINES_WITH_THINK = {
"output": "<think>This\nThat</think>This is the rest\nThat",
"reasoning": "This\nThat",
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
"output": "</think>This is the rest",
"reasoning": "",
"content": "This is the rest",
"is_reasoning_end": True,
}
SHORTEST_REASONING_WITH_THINK = {
"output": "</think>This is the rest",
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": True,
}
THINK_NO_END = {
"output": "<think>This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
EMPTY = {
"output": "",
"reasoning": "",
"content": None,
"is_reasoning_end": False,
}
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
NEW_LINE = {
"output": "\n<think>This is a reasoning section</think>\nThis is the rest",
"reasoning": "This is a reasoning section",
"content": "\nThis is the rest",
"is_reasoning_end": True,
}
# Streaming cannot handle new lines at the beginning of the output
# because we need to support <think>...</think> and </think>...
# We cannot know if the text before <think> is reasoning content
# or not.
NEW_LINE_STREAMING = {
"output": "\n<think>This is a reasoning section</think>\nThis is the rest",
"reasoning": "\nThis is a reasoning section",
"content": "\nThis is the rest",
"is_reasoning_end": True,
}
TEST_CASES = [
pytest.param(
False,
SIMPLE_REASONING,
id="simple_reasoning",
),
pytest.param(
True,
SIMPLE_REASONING,
id="simple_reasoning_streaming",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_streaming",
),
pytest.param(
False,
NO_CONTENT,
id="no_content_token",
),
pytest.param(
True,
NO_REASONING_STREAMING,
id="no_reasoning_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
True,
SHORTEST_REASONING,
id="shortest",
),
pytest.param(
False,
SHORTEST_REASONING_NO_STREAMING,
id="shortest_streaming",
),
pytest.param(
False,
REASONING_WITH_THINK,
id="reasoning_with_think",
),
pytest.param(
True,
REASONING_WITH_THINK,
id="reasoning_with_think_streaming",
),
pytest.param(
False,
COMPLETE_REASONING_WITH_THINK,
id="complete_reasoning_with_think",
),
pytest.param(
True,
COMPLETE_REASONING_WITH_THINK,
id="complete_reasoning_with_think_streaming",
),
pytest.param(
False,
MULTIPLE_LINES_WITH_THINK,
id="multiple_lines_with_think",
),
pytest.param(
True,
MULTIPLE_LINES_WITH_THINK,
id="multiple_lines_with_think_streaming",
),
pytest.param(
False,
SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
id="shortest_with_think",
),
pytest.param(
True,
SHORTEST_REASONING_WITH_THINK,
id="shortest_with_think_streaming",
),
pytest.param(
False,
THINK_NO_END,
id="think_no_end",
),
pytest.param(
True,
THINK_NO_END,
id="think_no_end_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
NEW_LINE,
id="new_line",
),
pytest.param(
True,
NEW_LINE_STREAMING,
id="new_line_streaming",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    deepseek_r1_qwen_tokenizer,
):
    """End-to-end check of the deepseek_r1 reasoning parser.

    Verifies extracted reasoning/content text, ``is_reasoning_end``, and
    ``extract_content_ids`` against the expectations in ``param_dict``.
    """
    output = deepseek_r1_qwen_tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens so streaming mode sees realistic deltas
    output_tokens: list[str] = [
        deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        deepseek_r1_qwen_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
    # Test is_reasoning_end
    output_ids = deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
    # Test extract_content_ids. BUG FIX: the original passed the token
    # *strings* (``output``) in the no-content branch, but the method takes
    # token ids; pass ``output_ids`` in both branches. Also use a distinct
    # name instead of shadowing ``content`` above.
    content_ids = parser.extract_content_ids(output_ids)
    if param_dict["content"] is not None:
        assert content_ids == deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(
            deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"])
        )
    else:
        assert content_ids == []

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1"
@pytest.fixture(scope="module")
def tokenizer():
    """Module-scoped HF tokenizer for the DeepSeek-V3.1 model."""
    tok = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tok
@pytest.mark.parametrize(
    "thinking,expected_parser_type",
    [
        (True, DeepSeekR1ReasoningParser),
        (False, IdentityReasoningParser),
    ],
)
def test_parser_selection(tokenizer, thinking, expected_parser_type):
    """V3 parser delegates to R1 parsing when thinking=True, else identity."""
    parser = DeepSeekV3ReasoningParser(
        tokenizer, chat_template_kwargs={"thinking": thinking}
    )
    # The delegate is stored on the private ``_parser`` attribute.
    assert isinstance(parser._parser, expected_parser_type)
def test_identity_reasoning_parser_basic(tokenizer):
    """IdentityReasoningParser passes output through with no reasoning split."""
    parser = IdentityReasoningParser(tokenizer)
    # Test is_reasoning_end always returns True
    input_text = "This is some output"
    input_tokens = tokenizer.tokenize(input_text)
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    assert parser.is_reasoning_end(input_ids) is True
    assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True
    # Test extract_content_ids returns all input_ids
    assert parser.extract_content_ids(input_ids) == input_ids
    # Test extract_reasoning returns (None, model_output)
    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
    reasoning, content = parser.extract_reasoning(input_text, request)
    assert reasoning is None
    assert content == input_text
    # Test extract_reasoning_streaming returns DeltaMessage or None
    result = parser.extract_reasoning_streaming(
        previous_text="",
        current_text="Hello world",
        delta_text="Hello world",
        previous_token_ids=[],
        current_token_ids=input_ids,
        delta_token_ids=input_ids,
    )
    assert isinstance(result, DeltaMessage)
    assert result.content == "Hello world"
    # If delta_text is empty, should return None
    result_none = parser.extract_reasoning_streaming(
        previous_text="Hello world",
        current_text="Hello world",
        delta_text="",
        previous_token_ids=input_ids,
        current_token_ids=input_ids,
        delta_token_ids=[],
    )
    assert result_none is None

View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "ernie45"
REASONING_MODEL_NAME = "baidu/ERNIE-4.5-21B-A3B-Thinking"
@pytest.fixture(scope="module")
def ernie45_tokenizer():
    """Module-scoped HF tokenizer for the ERNIE-4.5 thinking model."""
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# With </think>, non-streaming
WITH_THINK = {
"output": "abc</think>def",
"reasoning": "abc",
"content": "def",
}
# With </think>, streaming
WITH_THINK_STREAM = {
"output": "abc</think>def",
"reasoning": "abc",
"content": "def",
}
# without </think>, all is reasoning
WITHOUT_THINK = {
"output": "abc",
"reasoning": "abc",
"content": None,
}
# without </think>, all is reasoning
WITHOUT_THINK_STREAM = {
"output": "abc",
"reasoning": "abc",
"content": None,
}
COMPLETE_REASONING = {
"output": "abc</think>",
"reasoning": "abc",
"content": None,
}
MULTILINE_REASONING = {
"output": "abc\nABC</think>def\nDEF",
"reasoning": "abc\nABC",
"content": "def\nDEF",
}
TEST_CASES = [
pytest.param(
False,
WITH_THINK,
id="with_think",
),
pytest.param(
True,
WITH_THINK_STREAM,
id="with_think_stream",
),
pytest.param(
False,
WITHOUT_THINK,
id="without_think",
),
pytest.param(
True,
WITHOUT_THINK_STREAM,
id="without_think_stream",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_stream",
),
pytest.param(
False,
MULTILINE_REASONING,
id="multiline_reasoning",
),
pytest.param(
True,
MULTILINE_REASONING,
id="multiline_reasoning_stream",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    ernie45_tokenizer,
):
    """Check ernie45 reasoning extraction in streaming and non-streaming modes."""
    output = ernie45_tokenizer.tokenize(param_dict["output"])
    # Decode token-by-token; drop tokens that decode to an empty string so
    # streaming mode never receives empty deltas.
    output_tokens: list[str] = []
    for token in output:
        one_token = ernie45_tokenizer.convert_tokens_to_string([token])
        if one_token:
            output_tokens.append(one_token)
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        ernie45_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )
    # Removed a stray debug ``print()`` that was left in the original.
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

View File

@@ -0,0 +1,205 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "glm45"
start_token = "<think>"
end_token = "</think>"
REASONING_MODEL_NAME = "zai-org/GLM-4.5"
@pytest.fixture(scope="module")
def glm45_tokenizer():
    """Module-scoped HF tokenizer for the GLM-4.5 model."""
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
WITH_THINK = {
"output": "<think>This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
WITH_THINK_STREAM = {
"output": "<think>This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
WITHOUT_THINK = {
"output": "This is the rest",
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": False,
}
WITHOUT_THINK_STREAM = {
"output": "This is the rest",
"reasoning": None,
"content": "This is the rest",
"is_reasoning_end": False,
}
COMPLETE_REASONING = {
"output": "<think>This is a reasoning section</think>",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}
MULTILINE_REASONING = {
"output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
"reasoning": "This is a reasoning\nsection",
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
ONLY_OPEN_TAG = {
"output": "<think>This is a reasoning section",
"reasoning": None,
"content": "<think>This is a reasoning section",
"is_reasoning_end": False,
}
ONLY_OPEN_TAG_STREAM = {
"output": "<think>This is a reasoning section",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": False,
}
TEST_CASES = [
pytest.param(
False,
WITH_THINK,
id="with_think",
),
pytest.param(
True,
WITH_THINK_STREAM,
id="with_think_stream",
),
pytest.param(
False,
WITHOUT_THINK,
id="without_think",
),
pytest.param(
True,
WITHOUT_THINK_STREAM,
id="without_think_stream",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_stream",
),
pytest.param(
False,
MULTILINE_REASONING,
id="multiline_reasoning",
),
pytest.param(
True,
MULTILINE_REASONING,
id="multiline_reasoning_stream",
),
pytest.param(
False,
ONLY_OPEN_TAG,
id="only_open_tag",
),
pytest.param(
True,
ONLY_OPEN_TAG_STREAM,
id="only_open_tag_stream",
),
]
STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think>The user is asking for the capital of"""
DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think>The user is asking for the capital of France.</think>
The capital of France is Paris."""
MULTI_TURN_STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think></think>
The capital of France is Paris.<|user|>
What about Chile?<|assistant|>
<think>The user is asking for the capital of"""
MULTI_TURN_DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
You are a helpful assistant.<|user|>
What is the capital of France?<|assistant|>
<think></think>
The capital of France is Paris.<|user|>
What about Chile?<|assistant|>
<think>The user is asking for the capital of Chile.</think>
The capital of Chile is Santiago."""
REASONING_END_TEST_CASES = [
pytest.param(STILL_REASONING_PROMPT, False, id="still_reasoning"),
pytest.param(DONE_REASONING_PROMPT, True, id="done_reasoning"),
pytest.param(
MULTI_TURN_STILL_REASONING_PROMPT, False, id="multi_turn_still_reasoning"
),
pytest.param(
MULTI_TURN_DONE_REASONING_PROMPT, True, id="multi_turn_done_reasoning"
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    glm45_tokenizer,
):
    """Check glm45 reasoning extraction and ``is_reasoning_end``."""
    output = glm45_tokenizer.tokenize(param_dict["output"])
    # Decode token-by-token so streaming mode receives realistic deltas.
    output_tokens: list[str] = [
        glm45_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        glm45_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
    # is_reasoning_end works on token ids, not token strings.
    output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
@pytest.mark.parametrize("prompt, is_reasoning_end", REASONING_END_TEST_CASES)
def test_is_reasoning_end_full_prompt(
    prompt: str, is_reasoning_end: bool, glm45_tokenizer
):
    """``is_reasoning_end`` on full (multi-turn) chat prompts for glm45."""
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        glm45_tokenizer
    )
    tokens = glm45_tokenizer.tokenize(prompt)
    token_ids = glm45_tokenizer.convert_tokens_to_ids(tokens)
    check_is_reasoning_end = parser.is_reasoning_end(token_ids)
    assert check_is_reasoning_end == is_reasoning_end

View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from vllm.reasoning import ReasoningParser
from vllm.reasoning.gptoss_reasoning_parser import GptOssReasoningParser
REASONING_MODEL_NAME = "openai/gpt-oss-120b"
@pytest.fixture(scope="module")
def gpt_oss_tokenizer():
    """Module-scoped HF tokenizer for the gpt-oss-120b model."""
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
USER_MESSAGE_START = "<|start|>user<|message|>"
REASONING_SECTION_START = "<|end|><|start|>assistant<|channel|>analysis<|message|>"
ASSISTANT_CONTENT_START_PREFIX = "<|end|><|start|>assistant<|channel|>final"
ASSISTANT_CONTENT_START_SUFFIX = "<|message|>"
ASSISTANT_CONTENT_START = (
ASSISTANT_CONTENT_START_PREFIX + ASSISTANT_CONTENT_START_SUFFIX
)
BASIC_CONTENT = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START
+ "This is the rest",
"is_reasoning_end": True,
}
BASIC_REASONING_ONLY = {
"output": REASONING_SECTION_START + "This is reasoning" + "<|end|>",
"is_reasoning_end": False,
}
BASIC_NO_REASONING_NO_ASSISTANT = {
"output": USER_MESSAGE_START + "This is a user message",
"is_reasoning_end": False,
}
# Edge-case where the model omits the assistant tag entirely.
BASIC_NO_REASONING_ASSISTANT = {
"output": USER_MESSAGE_START + "This is a user message<|end|><|channel|>final",
"is_reasoning_end": True,
}
COMPLEX_CONTENT_INCOMPLETE_PREFIX_ONLY = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START_PREFIX,
"is_reasoning_end": False,
}
COMPLEX_CONTENT_SUFFIX_ONLY = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START_SUFFIX,
"is_reasoning_end": False,
}
COMPLEX_CONTENT_1_NO_SUFFIX = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START_PREFIX
+ "<|constrain|> JSON ",
"is_reasoning_end": False,
}
COMPLEX_CONTENT_1 = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START_PREFIX
+ "<|constrain|> JSON "
+ ASSISTANT_CONTENT_START_SUFFIX,
"is_reasoning_end": True,
}
COMPLEX_CONTENT_1_WITH_CONTENT = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START_PREFIX
+ "<|constrain|> JSON "
+ ASSISTANT_CONTENT_START_SUFFIX
+ "This is the rest",
"is_reasoning_end": True,
}
COMPLEX_CONTENT_2 = {
"output": REASONING_SECTION_START
+ "This is reasoning"
+ ASSISTANT_CONTENT_START_PREFIX
+ "<|constrain|>ReplyAction "
+ ASSISTANT_CONTENT_START_SUFFIX
+ "This is the rest",
"is_reasoning_end": True,
}
TEST_CASES = [
BASIC_CONTENT,
BASIC_REASONING_ONLY,
COMPLEX_CONTENT_INCOMPLETE_PREFIX_ONLY,
COMPLEX_CONTENT_SUFFIX_ONLY,
COMPLEX_CONTENT_1_NO_SUFFIX,
COMPLEX_CONTENT_1,
COMPLEX_CONTENT_1_WITH_CONTENT,
COMPLEX_CONTENT_2,
]
@pytest.mark.parametrize(
    "output, is_reasoning_end",
    [(t["output"], t["is_reasoning_end"]) for t in TEST_CASES],
)
def test_gptoss_is_reasoning_end(
    output,
    is_reasoning_end,
    gpt_oss_tokenizer,
):
    """``is_reasoning_end`` on gpt-oss channel-structured outputs."""
    # Use distinct names instead of rebinding ``output`` to the token list.
    tokens = gpt_oss_tokenizer.tokenize(output)
    token_ids = gpt_oss_tokenizer.convert_tokens_to_ids(tokens)
    parser: ReasoningParser = GptOssReasoningParser(gpt_oss_tokenizer)
    assert parser.is_reasoning_end(token_ids) == is_reasoning_end

View File

@@ -0,0 +1,344 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Registry name of the parser under test.
parser_name = "granite"
# Granite marks the reasoning/response sections with plain-text sentinels
# rather than dedicated special tokens.
START_REASONING = "Here is my thought process:"
START_RESPONSE = "Here is my response:"
# Case: reasoning section followed by a response section.
SIMPLE_REASONING = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# Case: output ends right after the response sentinel; no content follows.
COMPLETE_REASONING = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
    "reasoning": "This is a reasoning section",
    "content": None,
}
# Case: no sentinels at all; the whole output is content.
NO_REASONING = {
    "output": "This is content",
    "reasoning": None,
    "content": "This is content",
}
# Case: newlines inside both sections must survive extraction.
MULTIPLE_LINES = {
    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
    "reasoning": "This\nThat",
    "content": "This is the rest\nThat",
}
# NOTE(review): the *_WITH_THINK cases below duplicate the cases above;
# they appear kept to mirror the shared case layout of other parser tests.
REASONING_WITH_THINK = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
COMPLETE_REASONING_WITH_THINK = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
    "reasoning": "This is a reasoning section",
    "content": None,
}
MULTIPLE_LINES_WITH_THINK = {
    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
    "reasoning": "This\nThat",
    "content": "This is the rest\nThat",
}
# (streaming?, case) pairs: every case runs in both non-streaming and
# streaming mode.
TEST_CASES = [
    pytest.param(
        False,
        SIMPLE_REASONING,
        id="simple_reasoning",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        id="complete_reasoning",
    ),
    pytest.param(
        False,
        NO_REASONING,
        id="no_reasoning",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES,
        id="multiple_lines",
    ),
    pytest.param(
        False,
        REASONING_WITH_THINK,
        id="reasoning_with_think",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING_WITH_THINK,
        id="complete_reasoning_with_think",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES_WITH_THINK,
        id="multiple_lines_with_think",
    ),
    pytest.param(
        True,
        SIMPLE_REASONING,
        id="simple_reasoning_streaming",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        id="complete_reasoning_streaming",
    ),
    pytest.param(
        True,
        NO_REASONING,
        id="no_reasoning_streaming",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES,
        id="multiple_lines_streaming",
    ),
    pytest.param(
        True,
        REASONING_WITH_THINK,
        id="reasoning_with_think_streaming",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING_WITH_THINK,
        id="complete_reasoning_with_think_streaming",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES_WITH_THINK,
        id="multiple_lines_with_think_streaming",
    ),
]
# Global tokenizer initialization to avoid repeated loading
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
):
    """Extract reasoning/content with the granite parser and compare."""
    # Re-decode each token individually so streaming mode receives one
    # string chunk per token.
    pieces: list[str] = []
    for token in tokenizer.tokenize(param_dict["output"]):
        pieces.append(tokenizer.convert_tokens_to_string([token]))
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)
    reasoning, content = run_reasoning_extraction(parser, pieces, streaming=streaming)
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
# Additional tests for verifying the correctness of granite streaming; this
# is complicated because granite uses multiple tokens to indicate when thinking
# is starting / when it's starting its response, so skipping special tokens
# is awkward.
### Handling the start of reasoning
# "Here" may be the start of the reasoning sentinel, so nothing is emitted yet.
STREAMING_1 = {
    "previous_text": None,
    "current_text": "Here",
    "delta_text": "Here",
    "reasoning": None,
    "content": None,
}
# When we fail, we should give what was previously being silenced first
STREAMING_2 = {
    "previous_text": "Here is my thought",
    "current_text": "Here is my thought failure",
    "delta_text": " failure",
    "reasoning": None,
    "content": "Here is my thought failure",
}
# But then after the first one, we should only add the delta text to content
STREAMING_3 = {
    "previous_text": "Here wrong",
    "current_text": " words",
    "delta_text": " Here wrong words",
    "reasoning": None,
    "content": " words",
}
# But then after the first one, we should only add the delta text to content
STREAMING_4 = {
    "previous_text": "Here is my thought",
    "current_text": "Here is my thought process:",
    "delta_text": " process:",
    "reasoning": None,
    "content": None,
}
# Reasoning started successfully; parse reasoning content
STREAMING_5 = {
    "previous_text": "Here is my thought process:",
    "current_text": "Here is my thought process: foo",
    "delta_text": " foo",
    "reasoning": " foo",
    "content": None,
}
# Response special sequence has started, but not finished.
STREAMING_6 = {
    "previous_text": "Here is my thought process: foo",
    "current_text": "Here is my thought process: foo Here is",
    "delta_text": " Here is",
    "reasoning": " ",
    "content": None,
}
# Response special sequence started, but was broken; the reasoning
# content should be the content that was previously unused.
STREAMING_7 = {
    "previous_text": "Here is my thought process: foo Here is",
    "current_text": "Here is my thought process: foo Here is Here",
    "delta_text": " Here",
    "reasoning": "Here is ",
    "content": None,
}
# Response special sequence is ongoing
STREAMING_8 = {
    "previous_text": "Here is my thought process: foo Here is my response:",
    "current_text": "Here is my thought process: foo Here is my response: bar",
    "delta_text": " bar",
    "reasoning": None,
    "content": " bar",
}
# The delta text has everything; we should be able to correctly parse both
STREAMING_9 = {
    "previous_text": None,
    "current_text": "Here is my thought process: foo Here is my response: bar",
    "delta_text": "Here is my thought process: foo Here is my response: bar",
    "reasoning": " foo ",
    "content": " bar",
}
## The Response is ongoing, and the delta mixes reasoning content / content
STREAMING_10 = {
    "previous_text": "Here is my thought process: foo",
    "current_text": "Here is my thought process: foo bar Here is my response: baz",
    "delta_text": " bar Here is my response: baz",
    "reasoning": " bar ",
    "content": " baz",
}
# The delta text starts a new substring that might be a response special seq
STREAMING_11 = {
    "previous_text": "Here is my thought process: This is a reasoning section ",
    "current_text": "Here is my thought process: This is a reasoning section Here",
    "delta_text": "Here",
    "reasoning": None,
    "content": None,
}
# The delta text is finishing the response special seq
STREAMING_12 = {
    "previous_text": "Here is my thought process: foo Here is my response",
    "current_text": "Here is my thought process: foo Here is my response:",
    "delta_text": ":",
    "reasoning": None,
    "content": None,
}
# The delta breaks a partially-matched response sentinel; the held-back
# "Here" is flushed back into reasoning together with the delta.
STREAMING_13 = {
    "previous_text": "Here is my thought process: foo Here",
    "current_text": "Here is my thought process: foo Here was",
    "delta_text": " was",
    "reasoning": "Here was",
    "content": None,
}
# One pytest.param per single streaming step scenario defined above.
STREAMING_SUBCASES = [
    pytest.param(
        STREAMING_1,
        id="Starting reasoning special sequence",
    ),
    pytest.param(
        STREAMING_2,
        id="Unexpected start reasoning sequence",
    ),
    pytest.param(
        STREAMING_3,
        id="Continuing unexpected start reasoning sequence",
    ),
    pytest.param(
        STREAMING_4,
        id="Only start reasoning sequence and nothing else",
    ),
    pytest.param(
        STREAMING_5,
        id="Reasoning content has started",
    ),
    pytest.param(
        STREAMING_6,
        id="Response special sequence has started",
    ),
    pytest.param(
        STREAMING_7,
        id="Response special sequence reset",
    ),
    pytest.param(
        STREAMING_8,
        id="Response text has started",
    ),
    pytest.param(
        STREAMING_9,
        id="Delta contains everything",
    ),
    pytest.param(
        STREAMING_10,
        id="Delta contains some reasoning and response",
    ),
    pytest.param(
        STREAMING_11,
        id="Delta starts response sequence",
    ),
    pytest.param(
        STREAMING_12,
        id="Delta finishes response sequence",
    ),
    pytest.param(
        # Typo fixed in the test id: "responise" -> "response".
        STREAMING_13,
        id="Delta breaks potential response sequence",
    ),
]
@pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
def test_streaming_subcases(param_dict):
    """Exercise one streaming step of the granite reasoning parser."""
    previous_text = param_dict["previous_text"]
    current_text = param_dict["current_text"]
    delta_text = param_dict["delta_text"]
    # Encode each text chunk to token ids; an absent previous text maps to
    # an empty id list.
    previous_token_ids = [] if previous_text is None else tokenizer.encode(previous_text)
    current_token_ids = tokenizer.encode(current_text)
    delta_token_ids = tokenizer.encode(delta_text)
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)
    response = parser.extract_reasoning_streaming(
        previous_text=previous_text,
        current_text=current_text,
        delta_text=delta_text,
        previous_token_ids=previous_token_ids,
        current_token_ids=current_token_ids,
        delta_token_ids=delta_token_ids,
    )
    expected_reasoning = param_dict["reasoning"]
    expected_content = param_dict["content"]
    # Streaming currently expects at least one of reasoning content / content,
    # so the response should return None in that case.
    if expected_reasoning is None and expected_content is None:
        assert response is None
        return
    assert isinstance(response, DeltaMessage)
    assert response.reasoning == expected_reasoning
    assert response.content == expected_content

View File

@@ -0,0 +1,188 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
# Holo2 model whose tokenizer supplies the </think> token used below.
REASONING_MODEL_NAME = "HCompany/Holo2-4B"


@pytest.fixture(scope="module")
def tokenizer():
    # Module-scoped so the tokenizer is downloaded/loaded only once.
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
@pytest.mark.parametrize(
    "thinking,expected_parser_type",
    [
        (True, DeepSeekR1ReasoningParser),
        (False, IdentityReasoningParser),
    ],
)
def test_parser_selection(tokenizer, thinking, expected_parser_type):
    """Holo2 delegates to a parser chosen by the `thinking` template kwarg."""
    kwargs = {"thinking": thinking}
    parser = Holo2ReasoningParser(tokenizer, chat_template_kwargs=kwargs)
    delegate = parser._parser
    assert isinstance(delegate, expected_parser_type)
def test_holo2_default_parser_is_deepseekr1(tokenizer):
    """Without chat_template_kwargs, Holo2 delegates to the DeepSeek-R1 parser."""
    default_delegate = Holo2ReasoningParser(tokenizer)._parser
    assert isinstance(default_delegate, DeepSeekR1ReasoningParser)
def test_holo2_supports_structured_output(tokenizer):
    """is_reasoning_end must work without chat_template_kwargs.

    The structured-output manager instantiates the reasoning parser without
    chat_template_kwargs and relies on is_reasoning_end to decide when the
    reasoning section has finished before applying the grammar, so that exact
    construction path is exercised here.
    """
    parser = Holo2ReasoningParser(tokenizer)
    end_token_id = tokenizer.encode("</think>", add_special_tokens=False)[0]
    prefix = [1, 2, 4]
    assert not parser.is_reasoning_end(prefix)
    assert parser.is_reasoning_end(prefix + [end_token_id])
    assert parser.is_reasoning_end(prefix + [end_token_id, 5])
# thinking is True, non-streaming
# thinking is True, non-streaming
WITH_THINK = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# thinking is True, streaming
WITH_THINK_STREAM = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# thinking is False, non-streaming
THINKING_DISABLED = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# thinking is False, streaming
THINKING_DISABLED_STREAM = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# thinking is False but the model output </think>, non-streaming
THINKING_DISABLED_WITH_CLOSE_TAG = {
    "output": "</think>This is the rest",
    "reasoning": None,
    "content": "</think>This is the rest",
}
# thinking is False but the model output </think>, streaming
THINKING_DISABLED_WITH_CLOSE_TAG_STREAM = {
    "output": "some text</think>This is the rest",
    "reasoning": None,
    "content": "some text</think>This is the rest",
}
# Output ends right at </think>; no content follows.
COMPLETE_REASONING = {
    "output": "This is a reasoning section</think>",
    "reasoning": "This is a reasoning section",
    "content": None,
}
# (streaming?, case, chat_template_kwargs) triples; None kwargs exercises
# the default parser selection.
TEST_CASES = [
    pytest.param(
        False,
        WITH_THINK,
        None,
        id="with_think",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        None,
        id="with_think_stream",
    ),
    pytest.param(
        False,
        WITH_THINK,
        {"thinking": True},
        id="with_think_enabled",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        {"thinking": True},
        id="with_think_stream_enabled",
    ),
    pytest.param(
        False,
        THINKING_DISABLED,
        {"thinking": False},
        id="thinking_disabled",
    ),
    pytest.param(
        True,
        THINKING_DISABLED_STREAM,
        {"thinking": False},
        id="thinking_disabled_stream",
    ),
    pytest.param(
        False,
        THINKING_DISABLED_WITH_CLOSE_TAG,
        {"thinking": False},
        id="thinking_disabled_with_close_tag",
    ),
    pytest.param(
        True,
        THINKING_DISABLED_WITH_CLOSE_TAG_STREAM,
        {"thinking": False},
        id="thinking_disabled_with_close_tag_stream",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        None,
        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        None,
        id="complete_reasoning_stream",
    ),
]
@pytest.mark.parametrize("streaming, param_dict, chat_template_kwargs", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    chat_template_kwargs: dict | None,
    tokenizer,
):
    """Extract reasoning/content with the holo2 parser and compare."""
    # One decoded string per token so the streaming path sees per-token chunks.
    pieces: list[str] = []
    for token in tokenizer.tokenize(param_dict["output"]):
        pieces.append(tokenizer.convert_tokens_to_string([token]))
    parser_cls = ReasoningParserManager.get_reasoning_parser("holo2")
    parser: ReasoningParser = parser_cls(
        tokenizer,
        chat_template_kwargs=chat_template_kwargs,
    )
    reasoning, content = run_reasoning_extraction(parser, pieces, streaming=streaming)
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

View File

@@ -0,0 +1,168 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Registry name of the parser under test.
parser_name = "hunyuan_a13b"
# Hunyuan wraps reasoning in <think>...</think> and the answer in <answer>.
START_REASONING = "<think>\n"
START_RESPONSE = "\n</think>\n<answer>\n"
END_RESPONSE = "\n</answer>"
# Case: empty reasoning section, straight to the answer.
# NOTE(review): identifier misspells "THOUGHT"; kept as-is because the
# test-case table below references it by this name.
NO_REASONING_QUICK_THROUGHT = {
    "output": f"{START_REASONING}{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
    "reasoning": None,
    "content": "This is the rest",
}
# Case: reasoning then a fully-delimited answer.
SIMPLE_REASONING = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest{END_RESPONSE}",  # noqa: E501
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# Case: output ends right after the response delimiter; no content.
COMPLETE_REASONING = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
    "reasoning": "This is a reasoning section",
    "content": None,
}
# Case: punctuation at the end of the reasoning section survives.
COMPLETE_REASONING_WITH_SYMBOL = {
    "output": f"{START_REASONING}This is a reasoning section!{START_RESPONSE}",
    "reasoning": "This is a reasoning section!",
    "content": None,
}
# Case: no delimiters at all; everything is content.
NO_REASONING = {
    "output": "This is content",
    "reasoning": None,
    "content": "This is content",
}
# Case: newlines inside both sections survive extraction.
MULTIPLE_LINES = {
    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
    "reasoning": "This\nThat",
    "content": "This is the rest\nThat",
}
# NOTE(review): the *_WITH_THINK cases below duplicate earlier cases
# (modulo the missing END_RESPONSE); kept to mirror other parser tests.
REASONING_WITH_THINK = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  # noqa: E501
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
COMPLETE_REASONING_WITH_THINK = {
    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
    "reasoning": "This is a reasoning section",
    "content": None,
}
MULTIPLE_LINES_WITH_THINK = {
    "output": f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
    "reasoning": "This\nThat",
    "content": "This is the rest\nThat",
}
# (streaming?, case) pairs: every case runs in both modes.
TEST_CASES = [
    pytest.param(
        False,
        SIMPLE_REASONING,
        id="simple_reasoning",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        id="complete_reasoning",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING_WITH_SYMBOL,
        id="complete_reasoning_with_symbol",
    ),
    pytest.param(
        False,
        NO_REASONING,
        id="no_reasoning",
    ),
    pytest.param(False, NO_REASONING_QUICK_THROUGHT, id="no_reasoning_quick"),
    pytest.param(
        False,
        MULTIPLE_LINES,
        id="multiple_lines",
    ),
    pytest.param(
        False,
        REASONING_WITH_THINK,
        id="reasoning_with_think",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING_WITH_THINK,
        id="complete_reasoning_with_think",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES_WITH_THINK,
        id="multiple_lines_with_think",
    ),
    pytest.param(
        True,
        SIMPLE_REASONING,
        id="simple_reasoning_streaming",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        id="complete_reasoning_streaming",
    ),
    pytest.param(
        True,
        NO_REASONING,
        id="no_reasoning_streaming",
    ),
    pytest.param(True, NO_REASONING_QUICK_THROUGHT, id="no_reasoning_quick_stream"),
    pytest.param(
        True,
        MULTIPLE_LINES,
        id="multiple_lines_streaming",
    ),
    pytest.param(
        True,
        REASONING_WITH_THINK,
        id="reasoning_with_think_streaming",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING_WITH_THINK,
        id="complete_reasoning_with_think_streaming",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES_WITH_THINK,
        id="multiple_lines_with_think_streaming",
    ),
]
# Global tokenizer initialization to avoid repeated loading
tokenizer = AutoTokenizer.from_pretrained(
    "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
)
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
):
    """Extract reasoning/content with the hunyuan_a13b parser and compare."""
    # One decoded string per token so streaming receives per-token chunks.
    pieces: list[str] = []
    for token in tokenizer.tokenize(param_dict["output"]):
        pieces.append(tokenizer.convert_tokens_to_string([token]))
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)
    reasoning, content = run_reasoning_extraction(parser, pieces, streaming=streaming)
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

View File

@@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Registry name of the parser under test.
parser_name = "minimax_m2_append_think"
# MiniMax M2 emits only the closing think tag (see the behavior note below).
end_token = "</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"


@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    # Module-scoped so the tokenizer is downloaded/loaded only once.
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# =============================================================================
# MiniMaxM2AppendThinkReasoningParser behavior:
# - Prepends <think> to the beginning of the output
# - Does NOT separate reasoning and content
# - Returns everything as content (with <think> prepended)
# - reasoning is always None
#
# This parser is used when you want to keep the raw output with <think> added
# =============================================================================
# Case: simple output with end token
SIMPLE_OUTPUT = {
"output": "This is reasoning</think>This is response",
"reasoning": None,
"content": "<think>This is reasoning</think>This is response",
"is_reasoning_end": True,
}
# Case: output without end token (reasoning in progress)
NO_END_TOKEN = {
"output": "This is reasoning in progress",
"reasoning": None,
"content": "<think>This is reasoning in progress",
"is_reasoning_end": False,
}
# Case: only end token
ONLY_END_TOKEN = {
"output": "</think>This is response",
"reasoning": None,
"content": "<think></think>This is response",
"is_reasoning_end": True,
}
# Case: multiple lines
MULTIPLE_LINES = {
"output": "Line 1\nLine 2</think>Response 1\nResponse 2",
"reasoning": None,
"content": "<think>Line 1\nLine 2</think>Response 1\nResponse 2",
"is_reasoning_end": True,
}
# Case: empty output (non-streaming prepends <think>)
EMPTY = {
"output": "",
"reasoning": None,
"content": "<think>",
"is_reasoning_end": False,
}
# Case: empty output streaming (no tokens = no output)
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
# Case: special characters
SPECIAL_CHARS = {
"output": "Let me think... 1+1=2</think>Yes!",
"reasoning": None,
"content": "<think>Let me think... 1+1=2</think>Yes!",
"is_reasoning_end": True,
}
# Case: code in output
CODE_OUTPUT = {
"output": "```python\nprint('hi')\n```</think>Here's the code.",
"reasoning": None,
"content": "<think>```python\nprint('hi')\n```</think>Here's the code.",
"is_reasoning_end": True,
}
TEST_CASES = [
pytest.param(
False,
SIMPLE_OUTPUT,
id="simple_output",
),
pytest.param(
True,
SIMPLE_OUTPUT,
id="simple_output_streaming",
),
pytest.param(
False,
NO_END_TOKEN,
id="no_end_token",
),
pytest.param(
True,
NO_END_TOKEN,
id="no_end_token_streaming",
),
pytest.param(
False,
ONLY_END_TOKEN,
id="only_end_token",
),
pytest.param(
True,
ONLY_END_TOKEN,
id="only_end_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
SPECIAL_CHARS,
id="special_chars",
),
pytest.param(
True,
SPECIAL_CHARS,
id="special_chars_streaming",
),
pytest.param(
False,
CODE_OUTPUT,
id="code_output",
),
pytest.param(
True,
CODE_OUTPUT,
id="code_output_streaming",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check minimax_m2_append_think extraction and is_reasoning_end."""
    tokens = minimax_m2_tokenizer.tokenize(param_dict["output"])
    # One decoded string per token so streaming receives per-token chunks.
    pieces: list[str] = []
    for token in tokens:
        pieces.append(minimax_m2_tokenizer.convert_tokens_to_string([token]))
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(minimax_m2_tokenizer)
    reasoning, content = run_reasoning_extraction(parser, pieces, streaming=streaming)
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
    # is_reasoning_end works on token ids, not on the decoded strings.
    token_ids = minimax_m2_tokenizer.convert_tokens_to_ids(tokens)
    assert parser.is_reasoning_end(token_ids) == param_dict["is_reasoning_end"]

View File

@@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Registry name of the parser under test.
parser_name = "minimax_m2"
# MiniMax M2 emits only the closing think tag (see the behavior note below).
end_token = "</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"


@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    # Module-scoped so the tokenizer is downloaded/loaded only once.
    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# =============================================================================
# MiniMax M2 specific behavior:
# - Model does NOT generate <think> start token
# - Model only generates </think> end token
# - All content before </think> is reasoning
# - All content after </think> is the actual response (content)
# =============================================================================
# Case: reasoning + end token + content (typical case)
SIMPLE_REASONING = {
"output": "This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
# Case: reasoning + end token only (no content after)
COMPLETE_REASONING = {
"output": "This is a reasoning section</think>",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}
# Case: no end token yet (streaming in progress, all is reasoning)
NO_END_TOKEN = {
"output": "This is reasoning in progress",
"reasoning": "This is reasoning in progress",
"content": None,
"is_reasoning_end": False,
}
# Case: multiple lines of reasoning
MULTIPLE_LINES = {
"output": "First line\nSecond line</think>Response first line\nResponse second",
"reasoning": "First line\nSecond line",
"content": "Response first line\nResponse second",
"is_reasoning_end": True,
}
# Case: only end token (empty reasoning, immediate response)
SHORTEST_REASONING_NO_STREAMING = {
"output": "</think>This is the response",
"reasoning": "",
"content": "This is the response",
"is_reasoning_end": True,
}
# Case: only end token streaming (reasoning is None because it's just the token)
SHORTEST_REASONING_STREAMING = {
"output": "</think>This is the response",
"reasoning": None,
"content": "This is the response",
"is_reasoning_end": True,
}
# Case: empty output
EMPTY = {
"output": "",
"reasoning": "",
"content": None,
"is_reasoning_end": False,
}
# Case: empty streaming
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
# Case: long reasoning with special characters
SPECIAL_CHARS = {
"output": "Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
"reasoning": "Let me think... 1+1=2, right?",
"content": "Yes, 1+1=2.",
"is_reasoning_end": True,
}
# Case: reasoning with code blocks
CODE_IN_REASONING = {
"output": "```python\nprint('hello')\n```</think>Here is the code.",
"reasoning": "```python\nprint('hello')\n```",
"content": "Here is the code.",
"is_reasoning_end": True,
}
TEST_CASES = [
# Core cases: no start token (MiniMax M2 actual behavior)
pytest.param(
False,
SIMPLE_REASONING,
id="simple_reasoning",
),
pytest.param(
True,
SIMPLE_REASONING,
id="simple_reasoning_streaming",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_streaming",
),
pytest.param(
False,
NO_END_TOKEN,
id="no_end_token",
),
pytest.param(
True,
NO_END_TOKEN,
id="no_end_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
False,
SHORTEST_REASONING_NO_STREAMING,
id="shortest_reasoning",
),
pytest.param(
True,
SHORTEST_REASONING_STREAMING,
id="shortest_reasoning_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
SPECIAL_CHARS,
id="special_chars",
),
pytest.param(
True,
SPECIAL_CHARS,
id="special_chars_streaming",
),
pytest.param(
False,
CODE_IN_REASONING,
id="code_in_reasoning",
),
pytest.param(
True,
CODE_IN_REASONING,
id="code_in_reasoning_streaming",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """End-to-end check of the MiniMax-M2 reasoning parser.

    Verifies reasoning/content extraction (streaming and non-streaming),
    is_reasoning_end, and extract_content_ids for each test case.
    """
    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
    output_tokens: list[str] = [
        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        minimax_m2_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
    # Test is_reasoning_end
    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
    # Test extract_content_ids. Use a dedicated name so the extraction result
    # checked above is not clobbered, and always pass token *ids*: the
    # original code passed the token-string list `output` in the
    # empty-content branch, so that branch never exercised the real id path.
    content_ids = parser.extract_content_ids(output_ids)
    if param_dict["content"] is not None:
        assert content_ids == minimax_m2_tokenizer.convert_tokens_to_ids(
            minimax_m2_tokenizer.tokenize(param_dict["content"])
        )
    else:
        assert content_ids == []

View File

@@ -0,0 +1,348 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.reasoning.utils import run_reasoning_extraction_mistral
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers.mistral import MistralTokenizer
# Registry name of the parser under test.
parser_name = "mistral"


@pytest.fixture(scope="module")
def mistral_tokenizer():
    """Module-scoped Magistral tokenizer shared by every mistral parser test."""
    return MistralTokenizer.from_pretrained("mistralai/Magistral-Small-2509")
# "INVALID" cases: the output contains [/THINK] without a preceding [THINK],
# so no reasoning is extracted and the tag is dropped from the content.
INVALID_SIMPLE_REASONING = {
    "output": "This is a reasoning section[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is a reasoning sectionThis is the rest",
    "is_reasoning_end": False,
}
INVALID_COMPLETE_REASONING = {
    "output": "This is a reasoning section[/THINK]",
    "reasoning": None,
    "content": "This is a reasoning section",
    "is_reasoning_end": False,
}
# Opened [THINK] with no closing tag: everything after it is reasoning.
NO_CONTENT = {
    "output": "[THINK]This is reasoning",
    "reasoning": "This is reasoning",
    "content": None,
    "is_reasoning_end": False,
}
# No tags at all: plain content.
NO_REASONING = {
    "output": "This is content",
    "reasoning": None,
    "content": "This is content",
    "is_reasoning_end": False,
}
NO_REASONING_STREAMING = {
    "output": "This is a reasoning section",
    "reasoning": None,
    "content": "This is a reasoning section",
    "is_reasoning_end": False,
}
INVALID_MULTIPLE_LINES = {
    "output": "This\nThat[/THINK]This is the rest\nThat",
    "reasoning": None,
    "content": "This\nThatThis is the rest\nThat",
    "is_reasoning_end": False,
}
INVALID_SHORTEST_REASONING_NO_STREAMING = {
    "output": "[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is the rest",
    "is_reasoning_end": False,
}
INVALID_SHORTEST_REASONING = {
    "output": "[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is the rest",
    "is_reasoning_end": False,
}
# Well-formed [THINK]...[/THINK] pairs: reasoning and content both extracted.
REASONING_WITH_THINK = {
    "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
COMPLETE_REASONING_WITH_THINK = {
    "output": "[THINK]This is a reasoning section[/THINK]",
    "reasoning": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": True,
}
MULTIPLE_LINES_WITH_THINK = {
    "output": "[THINK]This\nThat[/THINK]This is the rest\nThat",
    "reasoning": "This\nThat",
    "content": "This is the rest\nThat",
    "is_reasoning_end": True,
}
INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
    "output": "[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is the rest",
    "is_reasoning_end": False,
}
INVALID_SHORTEST_REASONING_WITH_THINK = {
    "output": "[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is the rest",
    "is_reasoning_end": False,
}
THINK_NO_END = {
    "output": "[THINK]This is a reasoning section",
    "reasoning": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": False,
}
EMPTY = {
    "output": "",
    "reasoning": None,
    "content": "",
    "is_reasoning_end": False,
}
EMPTY_STREAMING = {
    "output": "",
    "reasoning": None,
    "content": None,
    "is_reasoning_end": False,
}
# Text can precede [THINK]; that prefix is joined with the post-[/THINK] text.
NEW_LINE = {
    "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
    "reasoning": "This is a reasoning section",
    "content": "Before\n\nThis is the rest",
    "is_reasoning_end": True,
}
NEW_LINE_STREAMING = {
    "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
    "reasoning": "This is a reasoning section",
    "content": "Before\n\nThis is the rest",
    "is_reasoning_end": True,
}
# (streaming?, case) pairs consumed by test_mistral_reasoning.
TEST_CASES = [
    pytest.param(
        False,
        INVALID_SIMPLE_REASONING,
        id="invalid_simple_reasoning",
    ),
    pytest.param(
        True,
        INVALID_SIMPLE_REASONING,
        id="invalid_simple_reasoning_streaming",
    ),
    pytest.param(
        False,
        INVALID_COMPLETE_REASONING,
        id="invalid_complete_reasoning",
    ),
    pytest.param(
        True,
        INVALID_COMPLETE_REASONING,
        id="invalid_complete_reasoning_streaming",
    ),
    pytest.param(
        False,
        NO_CONTENT,
        id="no_content",
    ),
    pytest.param(
        False,
        NO_REASONING,
        id="no_reasoning",
    ),
    pytest.param(
        True,
        NO_REASONING_STREAMING,
        id="no_reasoning_token_streaming",
    ),
    pytest.param(
        False,
        INVALID_MULTIPLE_LINES,
        id="invalid_multiple_lines",
    ),
    pytest.param(
        True,
        INVALID_MULTIPLE_LINES,
        id="invalid_multiple_lines_streaming",
    ),
    pytest.param(
        True,
        INVALID_SHORTEST_REASONING,
        id="invalid_shortest",
    ),
    pytest.param(
        False,
        INVALID_SHORTEST_REASONING_NO_STREAMING,
        id="invalid_shortest_streaming",
    ),
    pytest.param(
        False,
        REASONING_WITH_THINK,
        id="reasoning_with_think",
    ),
    pytest.param(
        True,
        REASONING_WITH_THINK,
        id="reasoning_with_think_streaming",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING_WITH_THINK,
        id="complete_reasoning_with_think",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING_WITH_THINK,
        id="complete_reasoning_with_think_streaming",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES_WITH_THINK,
        id="multiple_lines_with_think",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES_WITH_THINK,
        id="multiple_lines_with_think_streaming",
    ),
    pytest.param(
        False,
        INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
        id="invalid_shortest_with_think",
    ),
    pytest.param(
        True,
        INVALID_SHORTEST_REASONING_WITH_THINK,
        id="invalid_shortest_with_think_streaming",
    ),
    pytest.param(
        False,
        THINK_NO_END,
        id="think_no_end",
    ),
    pytest.param(
        True,
        THINK_NO_END,
        id="think_no_end_streaming",
    ),
    pytest.param(
        False,
        EMPTY,
        id="empty",
    ),
    pytest.param(
        True,
        EMPTY_STREAMING,
        id="empty_streaming",
    ),
    pytest.param(
        False,
        NEW_LINE,
        id="new_line",
    ),
    pytest.param(
        True,
        NEW_LINE_STREAMING,
        id="new_line_streaming",
    ),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_mistral_reasoning(
    streaming: bool,
    param_dict: dict,
    mistral_tokenizer: MistralTokenizer,
):
    """End-to-end check of the Mistral reasoning parser.

    The case string in ``param_dict["output"]`` is re-encoded into token
    ids, replacing the literal ``[THINK]``/``[/THINK]`` markers with the
    tokenizer's special BEGIN_THINK/END_THINK ids. The ids are then run
    through the parser (streaming or non-streaming) and the resulting
    (reasoning, content) split, ``is_reasoning_end`` flag, and
    ``extract_content_ids`` result are compared to the expectations.
    """
    output = param_dict["output"]
    index_think = output.find("[THINK]")
    len_think = len("[THINK]")
    index_end_think = output.find("[/THINK]")
    len_end_think = len("[/THINK]")
    # encode everything to tokens ids
    output_tokens = []
    if index_think != -1:
        # Text before [THINK] is encoded verbatim, then the special id.
        output_before_think = output[:index_think]
        output_tokens += mistral_tokenizer.tokenizer.encode(
            output_before_think, False, False
        )
        output_tokens += [mistral_tokenizer.instruct.BEGIN_THINK]
        if index_end_think != -1:
            # Both markers present: middle text, END_THINK id, then tail.
            output_middle = output[index_think + len_think : index_end_think]
            output_after_think = output[index_end_think + len_end_think :]
            output_tokens += mistral_tokenizer.tokenizer.encode(
                output_middle, False, False
            )
            output_tokens += [mistral_tokenizer.instruct.END_THINK]
            output_tokens += mistral_tokenizer.tokenizer.encode(
                output_after_think, False, False
            )
        else:
            # Unterminated thinking: everything after [THINK] is encoded as-is.
            output_middle = output[index_think + len_think :]
            output_tokens += mistral_tokenizer.tokenizer.encode(
                output_middle, False, False
            )
    elif index_end_think != -1:
        # Only [/THINK]: text before it, END_THINK id, then text after it.
        output_before_think = output[:index_end_think]
        output_after_think = output[index_end_think + len_end_think :]
        output_tokens += mistral_tokenizer.tokenizer.encode(
            output_before_think, False, False
        )
        output_tokens += [mistral_tokenizer.instruct.END_THINK]
        output_tokens += mistral_tokenizer.tokenizer.encode(
            output_after_think, False, False
        )
    else:
        # No markers at all: encode the whole string unchanged.
        output_tokens += mistral_tokenizer.tokenizer.encode(output, False, False)
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        mistral_tokenizer
    )
    reasoning, content = run_reasoning_extraction_mistral(
        parser, output_tokens, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]
    # Test is_reasoning_end
    is_reasoning_end = parser.is_reasoning_end(output_tokens)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
    # Test extract_content_ids
    if param_dict["content"] is not None:
        # Handle the case where there are tokens outputted before Thinking.
        # This should not occur if the model is well trained and prompted.
        if "[THINK]" in param_dict["output"] and not param_dict["output"].startswith(
            "[THINK]"
        ):
            before_content = param_dict["output"].split("[THINK]")[0]
            before_token_ids = mistral_tokenizer.tokenizer.encode(
                before_content, bos=False, eos=False
            )
            left_to_encode = param_dict["content"][len(before_content) :]
        # Normal situation.
        else:
            before_token_ids = []
            left_to_encode = param_dict["content"]
        content_tokens = parser.extract_content_ids(output_tokens)
        expected_token_ids = before_token_ids + mistral_tokenizer.tokenizer.encode(
            left_to_encode, bos=False, eos=False
        )
        assert content_tokens == expected_token_ids
    else:
        content = parser.extract_content_ids(output_tokens)
        assert content == []

View File

@@ -0,0 +1,152 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Parser under test and the reasoning delimiters it recognizes.
parser_name = "olmo3"
START_REASONING = "<think>"
END_REASONING = "</think>"
# Case fixtures: each maps a raw model output to the expected
# (reasoning, content) split. reasoning=None means no reasoning text.
NO_REASONING = {
    "output": f"{START_REASONING}{END_REASONING}No thoughts, head empty!",
    "reasoning": None,
    "content": "No thoughts, head empty!",
}
NO_REASONING_WITH_NEWLINE = {
    "output": f"{START_REASONING}\n{END_REASONING}\n\nNo thoughts, head empty!",
    "reasoning": "\n",
    "content": "\n\nNo thoughts, head empty!",
}
SIMPLE_REASONING = {
    "output": f"{START_REASONING}This is a reasoning section{END_REASONING}This is the rest",  # noqa: E501
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
SIMPLE_REASONING_WITH_NEWLINE = {
    "output": f"{START_REASONING} Look!\n\nI'm thinking...{END_REASONING}\nThis is the rest",  # noqa: E501
    "reasoning": " Look!\n\nI'm thinking...",
    "content": "\nThis is the rest",
}
# Whitespace inside and around the tags must be preserved verbatim.
SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES = {
    "output": f"{START_REASONING}\nLook!\nI'm thinking...\n\n{END_REASONING}\n\n\nThis is the rest",  # noqa: E501
    "reasoning": "\nLook!\nI'm thinking...\n\n",
    "content": "\n\n\nThis is the rest",
}
# A bare end tag with no start tag still terminates the reasoning section.
NO_REASONING_ONLY_END_THINK = {
    "output": f"{END_REASONING}\n\nNo thoughts, head empty!",
    "reasoning": None,
    "content": "\n\nNo thoughts, head empty!",
}
REASONING_ONLY_END_THINK = {
    "output": f"The user is asking me not to think.{END_REASONING}No thoughts!",
    "reasoning": "The user is asking me not to think.",
    "content": "No thoughts!",
}
# (streaming, case) pairs; ids ending in "_streaming" run the parser
# delta-by-delta instead of on the full string.
TEST_CASES = [
    pytest.param(
        False,  # not streaming
        NO_REASONING,
        id="no_reasoning",
    ),
    pytest.param(
        False,  # not streaming
        NO_REASONING_WITH_NEWLINE,
        id="no_reasoning_with_newline",
    ),
    pytest.param(
        False,  # not streaming
        SIMPLE_REASONING,
        id="simple_reasoning",
    ),
    pytest.param(
        False,  # not streaming
        SIMPLE_REASONING_WITH_NEWLINE,
        id="simple_reasoning_with_newline",
    ),
    pytest.param(
        True,  # enable streaming
        SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES,
        id="simple_reasoning_with_multiple_newlines",
    ),
    pytest.param(
        False,  # not streaming
        NO_REASONING_ONLY_END_THINK,
        id="no_reasoning_only_end_think",
    ),
    pytest.param(
        False,  # not streaming
        REASONING_ONLY_END_THINK,
        id="yes_reasoning_only_end_think",
    ),
    pytest.param(
        True,  # enable streaming
        NO_REASONING,
        id="no_reasoning_streaming",
    ),
    pytest.param(
        True,  # enable streaming
        NO_REASONING_WITH_NEWLINE,
        id="no_reasoning_with_newline_streaming",
    ),
    pytest.param(
        True,  # enable streaming
        SIMPLE_REASONING,
        id="simple_reasoning_streaming",
    ),
    pytest.param(
        True,  # enable streaming
        SIMPLE_REASONING_WITH_NEWLINE,
        id="simple_reasoning_with_newline_streaming",
    ),
    pytest.param(
        True,  # enable streaming
        SIMPLE_REASONING_WITH_MULTIPLE_NEWLINES,
        id="simple_reasoning_with_multiple_newlines_streaming",
    ),
    pytest.param(
        True,  # enable streaming
        NO_REASONING_ONLY_END_THINK,
        id="no_reasoning_only_end_think_streaming",
    ),
    pytest.param(
        True,  # enable streaming
        REASONING_ONLY_END_THINK,
        id="yes_reasoning_only_end_think_streaming",
    ),
]
# Global tokenizer initialization to avoid repeated loading.
# NOTE(review): this loads (and may download) the tokenizer at import
# time; a module-scoped fixture would defer the cost — confirm intent.
tokenizer = AutoTokenizer.from_pretrained("allenai/dolma2-tokenizer")
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict[str, str],
):
    """Check the olmo3 parser's reasoning/content split for one case."""
    # Re-detokenize piece by piece so the streaming path sees realistic
    # per-token deltas rather than one big string.
    pieces: list[str] = []
    for tok in tokenizer.tokenize(param_dict["output"]):
        pieces.append(tokenizer.convert_tokens_to_string([tok]))
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name
    )(tokenizer)
    reasoning, content = run_reasoning_extraction(
        reasoning_parser=parser, model_output=pieces, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

View File

@@ -0,0 +1,142 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Parser under test and the thinking delimiters it recognizes.
parser_name = "qwen3"
start_token = "<think>"
end_token = "</think>"
# Tokenizer source used to round-trip the case strings into deltas.
REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def qwen3_tokenizer():
    """Load the Qwen3 tokenizer once for the whole test module."""
    tok = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tok
# With <think></think> tags, non-streaming
WITH_THINK = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# With <think></think> tags, streaming
WITH_THINK_STREAM = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
}
# Without <think></think> tags, non-streaming
WITHOUT_THINK = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# Without <think></think> tags, streaming
WITHOUT_THINK_STREAM = {
    "output": "This is the rest",
    "reasoning": None,
    "content": "This is the rest",
}
# Closed reasoning block with nothing after it -> no content.
COMPLETE_REASONING = {
    "output": "<think>This is a reasoning section</think>",
    "reasoning": "This is a reasoning section",
    "content": None,
}
# Newlines inside reasoning and content must be preserved.
MULTILINE_REASONING = {
    "output": "<think>This is a reasoning\nsection</think>This is the rest\nThat",
    "reasoning": "This is a reasoning\nsection",
    "content": "This is the rest\nThat",
}
# Unclosed <think>: non-streaming treats the whole output as content...
ONLY_OPEN_TAG = {
    "output": "<think>This is a reasoning section",
    "reasoning": None,
    "content": "<think>This is a reasoning section",
}
# ...while streaming treats it as reasoning still in progress.
ONLY_OPEN_TAG_STREAM = {
    "output": "<think>This is a reasoning section",
    "reasoning": "This is a reasoning section",
    "content": None,
}
# (streaming, case) pairs; "_stream" ids feed the parser delta-by-delta.
TEST_CASES = [
    pytest.param(
        False,
        WITH_THINK,
        id="with_think",
    ),
    pytest.param(
        True,
        WITH_THINK_STREAM,
        id="with_think_stream",
    ),
    pytest.param(
        False,
        WITHOUT_THINK,
        id="without_think",
    ),
    pytest.param(
        True,
        WITHOUT_THINK_STREAM,
        id="without_think_stream",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        id="complete_reasoning_stream",
    ),
    pytest.param(
        False,
        MULTILINE_REASONING,
        id="multiline_reasoning",
    ),
    pytest.param(
        True,
        MULTILINE_REASONING,
        id="multiline_reasoning_stream",
    ),
    pytest.param(
        False,
        ONLY_OPEN_TAG,
        id="only_open_tag",
    ),
    pytest.param(
        True,
        ONLY_OPEN_TAG_STREAM,
        id="only_open_tag_stream",
    ),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    qwen3_tokenizer,
):
    """Round-trip one case string through the qwen3 reasoning parser."""
    # Rebuild the text token-by-token so streaming sees realistic deltas.
    pieces: list[str] = []
    for tok in qwen3_tokenizer.tokenize(param_dict["output"]):
        pieces.append(qwen3_tokenizer.convert_tokens_to_string([tok]))
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(qwen3_tokenizer)
    reasoning, content = run_reasoning_extraction(
        parser, pieces, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

View File

@@ -0,0 +1,236 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, cast
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Parser under test and the SeedOSS thinking delimiters.
parser_name = "seed_oss"
start_token = "<seed:think>"
end_token = "</seed:think>"
# Base tokenizer for the tests; the fixture below registers the SeedOSS
# tokens if the vocabulary does not already contain them.
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
@pytest.fixture(scope="module")
def seedoss_tokenizer():
    """Tokenizer with the SeedOSS thinking tokens registered."""
    tok = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    # Only extend the vocabulary when the custom tokens are missing.
    if start_token not in tok.get_vocab():
        tok.add_tokens([start_token, end_token])
    return tok
# Case fixtures: raw model output plus the expected (reasoning, content)
# split and whether the parser should report the reasoning as finished.
SIMPLE_REASONING: dict[str, Any] = {
    "output": "This is a reasoning section</seed:think>This is the rest",
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
# Reasoning closed but nothing after the end token -> content is None.
COMPLETE_REASONING: dict[str, Any] = {
    "output": "This is a reasoning section</seed:think>",
    "reasoning": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": True,
}
# No end token at all: everything is treated as reasoning.
NO_CONTENT: dict[str, Any] = {
    "output": "This is content",
    "reasoning": "This is content",
    "content": None,
    "is_reasoning_end": False,
}
NO_REASONING_STREAMING: dict[str, Any] = {
    "output": "This is a reasoning section",
    "reasoning": "This is a reasoning section",
    "content": None,
    "is_reasoning_end": False,
}
# Newlines in reasoning and content must survive the split.
MULTIPLE_LINES: dict[str, Any] = {
    "output": "This\nThat</seed:think>This is the rest\nThat",
    "reasoning": "This\nThat",
    "content": "This is the rest\nThat",
    "is_reasoning_end": True,
}
# An explicit start token is stripped from the extracted reasoning.
WITH_START_TOKEN: dict[str, Any] = {
    "output": ("<seed:think>This is a reasoning section</seed:think>This is the rest"),
    "reasoning": "This is a reasoning section",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
# Only the end token (typical SeedOSS output shape).
ONLY_END_TOKEN: dict[str, Any] = {
    "output": "Some reasoning</seed:think>This is the rest",
    "reasoning": "Some reasoning",
    "content": "This is the rest",
    "is_reasoning_end": True,
}
NO_TOKENS: dict[str, Any] = {
    "output": "This is just content without any reasoning tokens",
    "reasoning": "This is just content without any reasoning tokens",
    "content": None,
    "is_reasoning_end": False,
}
def test_seedoss_reasoning_parser_creation(seedoss_tokenizer):
    """The seed_oss parser is registered and exposes the expected tokens."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    assert isinstance(parser, ReasoningParser)
    assert parser.start_token == start_token
    assert parser.end_token == end_token
@pytest.mark.parametrize("streaming", [True, False])
def test_simple_reasoning(seedoss_tokenizer, streaming):
    """Reasoning followed by content splits at the end token."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, SIMPLE_REASONING["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        SIMPLE_REASONING["reasoning"],
        SIMPLE_REASONING["content"],
    )
@pytest.mark.parametrize("streaming", [True, False])
def test_complete_reasoning(seedoss_tokenizer, streaming):
    """Nothing after the end token yields content=None."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, COMPLETE_REASONING["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        COMPLETE_REASONING["reasoning"],
        COMPLETE_REASONING["content"],
    )
@pytest.mark.parametrize("streaming", [True, False])
def test_no_content(seedoss_tokenizer, streaming):
    """Without an end token the whole output counts as reasoning."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, NO_CONTENT["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        NO_CONTENT["reasoning"],
        NO_CONTENT["content"],
    )
@pytest.mark.parametrize("streaming", [True, False])
def test_multiple_lines(seedoss_tokenizer, streaming):
    """Multiline reasoning and content are preserved verbatim."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, MULTIPLE_LINES["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        MULTIPLE_LINES["reasoning"],
        MULTIPLE_LINES["content"],
    )
@pytest.mark.parametrize("streaming", [True, False])
def test_with_start_token(seedoss_tokenizer, streaming):
    """An explicit start token is stripped from the extracted reasoning."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, WITH_START_TOKEN["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        WITH_START_TOKEN["reasoning"],
        WITH_START_TOKEN["content"],
    )
@pytest.mark.parametrize("streaming", [True, False])
def test_only_end_token(seedoss_tokenizer, streaming):
    """Only the end token is present (typical SeedOSS output shape)."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, ONLY_END_TOKEN["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        ONLY_END_TOKEN["reasoning"],
        ONLY_END_TOKEN["content"],
    )
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tokens(seedoss_tokenizer, streaming):
    """With no special tokens, the whole output is treated as reasoning."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    reasoning, content = run_reasoning_extraction(
        parser, [cast(str, NO_TOKENS["output"])], streaming=streaming
    )
    assert (reasoning, content) == (
        NO_TOKENS["reasoning"],
        NO_TOKENS["content"],
    )
def test_is_reasoning_end(seedoss_tokenizer):
    """is_reasoning_end is True iff the end token id occurs in the ids."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    end_id = parser.end_token_id
    # End token anywhere in the sequence -> reasoning has finished.
    assert parser.is_reasoning_end([1, 2, end_id, 4]) is True
    # No end token -> still reasoning.
    assert parser.is_reasoning_end([1, 2, 3, 4]) is False
def test_extract_content_ids(seedoss_tokenizer):
    """extract_content_ids returns only the ids after the end token."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    end_id = parser.end_token_id
    # End token mid-sequence: the tail is content.
    assert parser.extract_content_ids([1, 2, end_id, 4, 5]) == [4, 5]
    # End token last: nothing follows it.
    assert parser.extract_content_ids([1, 2, 3, end_id]) == []
    # No end token: no content at all.
    assert parser.extract_content_ids([1, 2, 3, 4]) == []
def test_streaming_delta_processing(seedoss_tokenizer):
    """Small streamed deltas accumulate into the right reasoning/content."""
    parser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        seedoss_tokenizer
    )
    chunks = ["Some ", "reasoning ", "content", "</seed:think>", "Final ", "answer"]
    reasoning, content = run_reasoning_extraction(parser, chunks, streaming=True)
    assert (reasoning, content) == ("Some reasoning content", "Final answer")

160
tests/reasoning/utils.py Normal file
View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.reasoning import ReasoningParser
from vllm.tokenizers.mistral import MistralTokenizer
class StreamingReasoningReconstructor:
    """Accumulates streamed DeltaMessage chunks into full strings.

    ``reasoning`` collects the reasoning deltas and ``other_content``
    collects the regular content deltas; both start as None until the
    first matching delta arrives.
    """

    def __init__(self):
        self.reasoning = None
        self.other_content = None

    def append_delta(self, delta: DeltaMessage):
        # A single delta may carry content or reasoning, never both.
        assert delta.content is None or delta.reasoning is None, (
            "Both content and reasoning content are present in the delta message"
        )
        # The legacy field must mirror the new one.
        assert delta.reasoning == delta.reasoning_content, (
            "reasoning_content should be present for backwards compatibility"
        )
        if delta.content is not None:
            self.other_content = (
                delta.content
                if self.other_content is None
                else self.other_content + delta.content
            )
        else:
            self.reasoning = (
                delta.reasoning
                if self.reasoning is None
                else self.reasoning + delta.reasoning
            )
def run_reasoning_extraction(
    reasoning_parser: ReasoningParser,
    model_output: list[str],
    request: ChatCompletionRequest | None = None,
    streaming: bool = False,
) -> tuple[str | None, str | None]:
    """Extract (reasoning, content) via the streaming or one-shot path."""
    if not streaming:
        return run_reasoning_extraction_nonstreaming(
            reasoning_parser, model_output, request
        )
    rec = run_reasoning_extraction_streaming(
        reasoning_parser,
        model_output,
        request,
    )
    # Normalize empty accumulated content to None for easy comparison.
    return rec.reasoning, rec.other_content or None
def run_reasoning_extraction_mistral(
    reasoning_parser: ReasoningParser,
    model_output: list[int],
    request: ChatCompletionRequest | None = None,
    streaming: bool = False,
) -> tuple[str | None, str | None]:
    """Extract (reasoning, content) from Mistral token ids.

    Streaming feeds the ids one at a time; otherwise the ids are decoded
    to token strings and parsed in a single non-streaming call.
    """
    tokenizer = reasoning_parser.model_tokenizer
    assert isinstance(tokenizer, MistralTokenizer), type(tokenizer)
    if streaming:
        rec = run_reasoning_extraction_streaming_mistral(
            reasoning_parser,
            model_output,
            request,
        )
        # Normalize empty accumulated content to None for easy comparison.
        return rec.reasoning, rec.other_content or None
    token_strings = tokenizer.convert_ids_to_tokens(model_output)
    return run_reasoning_extraction_nonstreaming(
        reasoning_parser, token_strings, request
    )
def run_reasoning_extraction_nonstreaming(
    reasoning_parser: ReasoningParser,
    model_output: list[str],
    request: ChatCompletionRequest | None = None,
) -> tuple[str | None, str | None]:
    """Join the deltas and parse them in a single non-streaming call."""
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    joined = "".join(model_output)
    return reasoning_parser.extract_reasoning(model_output=joined, request=request)
def run_reasoning_extraction_streaming(
    reasoning_parser: ReasoningParser,
    model_deltas: list[str],
    request: ChatCompletionRequest | None = None,
) -> StreamingReasoningReconstructor:
    """Feed text deltas through the streaming parser, accumulating output."""
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    rec = StreamingReasoningReconstructor()
    seen_text = ""
    seen_tokens: list[int] = []
    for chunk in model_deltas:
        # Map the chunk back to token ids, skipping out-of-vocab pieces.
        vocab = reasoning_parser.vocab
        chunk_tokens = [
            vocab.get(t)
            for t in reasoning_parser.model_tokenizer.tokenize(chunk)
            if t in vocab
        ]
        new_text = seen_text + chunk
        new_tokens = seen_tokens + chunk_tokens
        msg = reasoning_parser.extract_reasoning_streaming(
            seen_text,
            new_text,
            chunk,
            seen_tokens,
            new_tokens,
            chunk_tokens,
        )
        if msg is not None:
            rec.append_delta(msg)
        seen_text = new_text
        seen_tokens = new_tokens
    return rec
def run_reasoning_extraction_streaming_mistral(
    reasoning_parser: ReasoningParser,
    model_deltas: list[int],
    request: ChatCompletionRequest | None = None,
) -> StreamingReasoningReconstructor:
    """Feed Mistral token ids one at a time through the streaming parser."""
    tokenizer = reasoning_parser.model_tokenizer
    assert isinstance(tokenizer, MistralTokenizer), type(tokenizer)
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    rec = StreamingReasoningReconstructor()
    seen_text = ""
    seen_tokens: list[int] = []
    for token_id in model_deltas:
        # Each delta is exactly one token id plus its decoded text piece.
        piece = tokenizer.convert_ids_to_tokens([token_id])[0]
        new_text = seen_text + piece
        new_tokens = seen_tokens + [token_id]
        msg = reasoning_parser.extract_reasoning_streaming(
            seen_text,
            new_text,
            piece,
            seen_tokens,
            new_tokens,
            [token_id],
        )
        if msg is not None:
            rec.append_delta(msg)
        seen_text = new_text
        seen_tokens = new_tokens
    return rec