Add minimal vLLM 0.16.1 build repo for BI-V150

This commit is contained in:
2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions

100
vllm/reasoning/__init__.py Normal file
View File

@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
__all__ = [
    "ReasoningParser",
    "ReasoningParserManager",
]

# NOTE: this used to be a free-floating string literal (a no-op expression
# that looked like a docstring but documented nothing). Kept as a comment:
#
# Register a lazy module mapping.
# Example:
#     ReasoningParserManager.register_lazy_module(
#         name="qwen3",
#         module_path="vllm.reasoning.qwen3_reasoning_parser",
#         class_name="Qwen3ReasoningParser",
#     )
# Built-in parsers, keyed by user-facing name. Each value is
# (file name under vllm.reasoning, class name inside that module).
# Several model families deliberately reuse the DeepSeek-V3
# "with thinking" parser.
_REASONING_PARSERS_TO_REGISTER = {
    "deepseek_r1": ("deepseek_r1_reasoning_parser", "DeepSeekR1ReasoningParser"),
    "deepseek_v3": ("deepseek_v3_reasoning_parser", "DeepSeekV3ReasoningParser"),
    "ernie45": ("ernie45_reasoning_parser", "Ernie45ReasoningParser"),
    "glm45": ("deepseek_v3_reasoning_parser", "DeepSeekV3ReasoningWithThinkingParser"),
    "openai_gptoss": ("gptoss_reasoning_parser", "GptOssReasoningParser"),
    "granite": ("granite_reasoning_parser", "GraniteReasoningParser"),
    "holo2": ("deepseek_v3_reasoning_parser", "DeepSeekV3ReasoningWithThinkingParser"),
    "hunyuan_a13b": ("hunyuan_a13b_reasoning_parser", "HunyuanA13BReasoningParser"),
    "kimi_k2": ("deepseek_v3_reasoning_parser", "DeepSeekV3ReasoningWithThinkingParser"),
    "minimax_m2": ("minimax_m2_reasoning_parser", "MiniMaxM2ReasoningParser"),
    "minimax_m2_append_think": (
        "minimax_m2_reasoning_parser",
        "MiniMaxM2AppendThinkReasoningParser",
    ),
    "mistral": ("mistral_reasoning_parser", "MistralReasoningParser"),
    "olmo3": ("olmo3_reasoning_parser", "Olmo3ReasoningParser"),
    "qwen3": ("qwen3_reasoning_parser", "Qwen3ReasoningParser"),
    "seed_oss": ("seedoss_reasoning_parser", "SeedOSSReasoningParser"),
    "step3": ("step3_reasoning_parser", "Step3ReasoningParser"),
    "step3p5": ("step3p5_reasoning_parser", "Step3p5ReasoningParser"),
}
def register_lazy_reasoning_parsers():
    """Lazily register every built-in reasoning parser with the manager."""
    for parser_name, (module_file, cls_name) in _REASONING_PARSERS_TO_REGISTER.items():
        ReasoningParserManager.register_lazy_module(
            parser_name, f"vllm.reasoning.{module_file}", cls_name
        )


# Populate the registry at import time so lookups by name work immediately.
register_lazy_reasoning_parsers()

View File

@@ -0,0 +1,341 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import os
from abc import abstractmethod
from collections.abc import Callable, Iterable, Sequence
from functools import cached_property
from typing import TYPE_CHECKING, Any
from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.logger import init_logger
from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import import_from_path
if TYPE_CHECKING:
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.tokenizers import TokenizerLike
else:
ChatCompletionRequest = Any
DeltaMessage = Any
ResponsesRequest = Any
TokenizerLike = Any
logger = init_logger(__name__)
class ReasoningParser:
    """Abstract base contract for extracting reasoning from model output.

    Not meant to be instantiated directly; derived classes implement the
    abstract methods below to split a model response into reasoning
    content and regular content.
    """

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        # Kept as an attribute so subclasses can query the vocabulary.
        self.model_tokenizer = tokenizer

    @cached_property
    def vocab(self) -> dict[str, int]:
        # .get_vocab() exists on every tokenizer, whereas the .vocab
        # attribute is only guaranteed on PreTrainedTokenizerFast.
        return self.model_tokenizer.get_vocab()

    @abstractmethod
    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Return True when reasoning content has ended in ``input_ids``.

        Structured-output engines such as ``xgrammar`` call this to detect
        the end of the reasoning section in the model output.

        Parameters:
            input_ids: The token ids of the model output.

        Returns:
            True if the reasoning content ends in ``input_ids``.
        """

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        """Streaming variant of :meth:`is_reasoning_end`.

        ``input_ids`` holds the entire model output so far, while
        ``delta_ids`` holds only the tokens produced in the current decode
        step. The default implementation re-checks the full sequence.

        Parameters:
            input_ids: The entire model output.
            delta_ids: Tokens computed at the current decode step.

        Returns:
            True if the reasoning content ends on this decode step.
        """
        return self.is_reasoning_end(input_ids)

    @abstractmethod
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return the token ids of the non-reasoning content.

        Parameters:
            input_ids: The token ids of the model output.

        Returns:
            The extracted content token ids from ``input_ids``.
        """

    def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int:
        """Count the reasoning tokens in a generated sequence.

        Text-based reasoning models usually wrap their chain-of-thought in
        special start/end tokens (e.g. ``<think> ... </think>``). Parsers
        that can detect those spans should override this; the default of
        ``0`` keeps parsers that do not opt in unchanged.

        Args:
            token_ids: Sequence of generated token ids (excluding prompt).

        Returns:
            int: Number of tokens that belong to reasoning content.
        """
        return 0

    @abstractmethod
    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Split a complete model response into (reasoning, content).

        Used on the non-streaming path, where the whole response is
        available before anything is sent to the client.

        Parameters:
            model_output: The complete model-generated string.
            request: The request that produced ``model_output``.

        Returns:
            A (reasoning content, content) tuple; either part may be None.
        """

    @abstractmethod
    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract reasoning from a partial (streaming) response.

        Must be an instance method: implementations need both the current
        tokens/diffs and state about what was previously parsed (see the
        constructor).
        """

    def prepare_structured_tag(
        self,
        original_tag: str | None,
        tool_server: ToolServer | None,
    ) -> str | None:
        """Return a structured tag for this parser, or None if unsupported."""
        return None
class ReasoningParserManager:
    """
    Central registry for ReasoningParser implementations.

    Supports two registration modes:
    - Eager registration via `register_module` (a class object is supplied).
    - Lazy registration via `register_lazy_module` (module path + class name;
      the import is deferred until the parser is first requested).

    Each reasoning parser must inherit from `ReasoningParser`.
    """

    # name -> parser class (eagerly registered, or cached after a lazy load)
    reasoning_parsers: dict[str, type[ReasoningParser]] = {}
    lazy_parsers: dict[str, tuple[str, str]] = {}  # name -> (module_path, class_name)

    @classmethod
    def get_reasoning_parser(cls, name: str) -> type[ReasoningParser]:
        """
        Retrieve a registered or lazily registered ReasoningParser class.

        If the parser is lazily registered, it will be imported and cached
        on first access.

        Raises:
            KeyError: if no parser is found under the given name.
        """
        if name in cls.reasoning_parsers:
            return cls.reasoning_parsers[name]
        if name in cls.lazy_parsers:
            return cls._load_lazy_parser(name)
        registered = ", ".join(cls.list_registered())
        raise KeyError(
            f"Reasoning parser '{name}' not found. Available parsers: {registered}"
        )

    @classmethod
    def list_registered(cls) -> list[str]:
        """Return names of all eagerly and lazily registered reasoning parsers."""
        return sorted(set(cls.reasoning_parsers.keys()) | set(cls.lazy_parsers.keys()))

    @classmethod
    def _load_lazy_parser(cls, name: str) -> type[ReasoningParser]:
        """Import and register a lazily loaded reasoning parser.

        Raises:
            TypeError: if the imported attribute is not a ReasoningParser
                subclass.
            Exception: re-raises any import/lookup failure after logging.
        """
        module_path, class_name = cls.lazy_parsers[name]
        try:
            mod = importlib.import_module(module_path)
            parser_cls = getattr(mod, class_name)
            if not issubclass(parser_cls, ReasoningParser):
                raise TypeError(
                    f"{class_name} in {module_path} is not a ReasoningParser subclass."
                )
            # Cache so later lookups skip the import machinery entirely.
            cls.reasoning_parsers[name] = parser_cls
            return parser_cls
        except Exception as e:
            logger.exception(
                "Failed to import lazy reasoning parser '%s' from %s: %s",
                name,
                module_path,
                e,
            )
            raise

    @classmethod
    def _register_module(
        cls,
        module: type[ReasoningParser],
        module_name: str | list[str] | None = None,
        force: bool = True,
    ) -> None:
        """Register a ReasoningParser class immediately.

        Args:
            module: The parser class to register.
            module_name: Name(s) to register under; defaults to the class
                name when None.
            force: When False, an existing registration under the same name
                raises KeyError instead of being overwritten.
        """
        if not issubclass(module, ReasoningParser):
            raise TypeError(
                f"module must be subclass of ReasoningParser, but got {type(module)}"
            )
        if module_name is None:
            module_names = [module.__name__]
        elif isinstance(module_name, str):
            module_names = [module_name]
        elif is_list_of(module_name, str):
            module_names = module_name
        else:
            raise TypeError("module_name must be str, list[str], or None.")
        for name in module_names:
            if not force and name in cls.reasoning_parsers:
                existed = cls.reasoning_parsers[name]
                raise KeyError(f"{name} is already registered at {existed.__module__}")
            cls.reasoning_parsers[name] = module

    @classmethod
    def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None:
        """
        Register a lazy module mapping for delayed import.

        Example:
            ReasoningParserManager.register_lazy_module(
                name="qwen3",
                module_path="vllm.reasoning.parsers.qwen3_reasoning_parser",
                class_name="Qwen3ReasoningParser",
            )
        """
        cls.lazy_parsers[name] = (module_path, class_name)

    @classmethod
    def register_module(
        cls,
        name: str | list[str] | None = None,
        force: bool = True,
        module: type[ReasoningParser] | None = None,
    ) -> (
        type[ReasoningParser] | Callable[[type[ReasoningParser]], type[ReasoningParser]]
    ):
        """
        Register module with the given name or name list. It can be used as a
        decorator (with module as None) or a normal function (with module not
        None).

        Raises:
            TypeError: if ``force`` is not a bool.
            KeyError: if ``force`` is False and a name is already registered.
        """
        if not isinstance(force, bool):
            raise TypeError(f"force must be a boolean, but got {type(force)}")
        # Immediate registration (explicit call)
        if module is not None:
            cls._register_module(module=module, module_name=name, force=force)
            return module

        # Decorator usage: record a lazy mapping from the decorated class.
        def _decorator(obj: type[ReasoningParser]) -> type[ReasoningParser]:
            module_path = obj.__module__
            class_name = obj.__name__
            if isinstance(name, str):
                names = [name]
            elif is_list_of(name, str):
                names = name
            else:
                names = [class_name]
            for n in names:
                # Fix: honor `force` on the decorator path too, mirroring the
                # eager `_register_module` semantics (previously `force=False`
                # was silently ignored here and registrations were clobbered).
                if not force and (n in cls.reasoning_parsers or n in cls.lazy_parsers):
                    raise KeyError(f"{n} is already registered.")
                cls.lazy_parsers[n] = (module_path, class_name)
            return obj

        return _decorator

    @classmethod
    def import_reasoning_parser(cls, plugin_path: str) -> None:
        """
        Import a user-defined reasoning parser by the path
        of the reasoning parser definition file.

        Failures are logged and swallowed (best-effort plugin loading).
        """
        module_name = os.path.splitext(os.path.basename(plugin_path))[0]
        try:
            import_from_path(module_name, plugin_path)
        except Exception:
            logger.exception(
                "Failed to load module '%s' from %s.", module_name, plugin_path
            )
            return

View File

@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import abstractmethod
from collections.abc import Iterable, Sequence
from itertools import islice
from typing import TYPE_CHECKING, Any
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.tokenizers import TokenizerLike
if TYPE_CHECKING:
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
else:
ChatCompletionRequest = Any
ResponsesRequest = Any
class BaseThinkingReasoningParser(ReasoningParser):
    """
    Base class for reasoning parsers that use thinking tokens.
    This class provides common functionality for parsers that use start and
    end tokens to delimit reasoning content (
    e.g., <think>...</think>, <seed:think>...</seed:think>).
    Subclasses must implement the start and end tokens via abstract
    properties.
    """

    @property
    @abstractmethod
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        raise NotImplementedError

    @property
    @abstractmethod
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        raise NotImplementedError

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )
        if not self.start_token or not self.end_token:
            raise ValueError("start_token and end_token must be defined in subclasses")
        # Both markers must be single tokens in the vocabulary; the
        # streaming logic below relies on matching token ids, not text.
        self.start_token_id = self.vocab.get(self.start_token)
        self.end_token_id = self.vocab.get(self.end_token)
        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                f"{self.__class__.__name__} reasoning parser could not locate "
                "think start/end tokens in the tokenizer!"
            )

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        # Scan backwards: the most recent marker decides. A start token seen
        # before any end token means we are still inside a reasoning span.
        start_token_id = self.start_token_id
        end_token_id = self.end_token_id
        for i in range(len(input_ids) - 1, -1, -1):
            if input_ids[i] == start_token_id:
                return False
            if input_ids[i] == end_token_id:
                return True
        # Neither marker seen: reasoning has not ended.
        return False

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        # On a decode step it suffices to check whether the end marker
        # appeared among the newly generated tokens.
        end_token_id = self.end_token_id
        return end_token_id in delta_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content after the end tokens
        """
        # Only consider an end token that appears before the final position:
        # a trailing end token with nothing after it yields no content.
        if self.end_token_id not in islice(input_ids, 0, max(0, len(input_ids) - 1)):
            return []
        else:
            # Everything after the FIRST end token is content.
            return input_ids[input_ids.index(self.end_token_id) + 1 :]

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        """
        # A delta consisting solely of a start/end marker carries no text
        # worth emitting; skip it.
        if len(delta_token_ids) == 1 and (
            delta_token_ids[0] in [self.start_token_id, self.end_token_id]
        ):
            return None
        # Check if start token is present in previous or delta.
        # Keep compatibility with models that don't generate start tokens.
        if self.start_token_id in previous_token_ids:
            if self.end_token_id in delta_token_ids:
                # Start token seen earlier, end token arrives in this delta:
                # split the delta around the end marker.
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
                    reasoning=reasoning, content=content if content else None
                )
            elif self.end_token_id in previous_token_ids:
                # Reasoning already closed in a previous step; everything in
                # this delta is regular content.
                return DeltaMessage(content=delta_text)
            else:
                # Start token seen earlier, no end token yet:
                # reasoning content continues.
                return DeltaMessage(reasoning=delta_text)
        elif self.start_token_id in delta_token_ids:
            if self.end_token_id in delta_token_ids:
                # Both markers arrive in this delta: extract the span between
                # them as reasoning, the remainder as content.
                start_index = delta_text.find(self.start_token)
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[start_index + len(self.start_token) : end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
                    reasoning=reasoning, content=content if content else None
                )
            else:
                # Start token in this delta, no end token yet:
                # reasoning content continues.
                return DeltaMessage(reasoning=delta_text)
        else:
            # No start token seen anywhere: treat the delta as plain content
            # (models that skip the start marker are handled by subclasses).
            return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.
        This is the base implementation that works for most models.
        Subclasses can override this method for specific behavior.
        """
        # Check if the start token is present in the model output, remove it
        # if it is present.
        model_output_parts = model_output.partition(self.start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )
        # For models that may not generate start token,
        # assume the reasoning content is always at the start.
        if self.end_token not in model_output:
            return model_output, None
        else:
            reasoning, _, content = model_output.partition(self.end_token)
            # If generation stops right after end-of-think, return null content
            final_content = content or None
            return reasoning, final_content

    def count_reasoning_tokens(self, token_ids: Sequence[int]) -> int:
        """Count tokens that fall within start/end thinking markers.
        Uses a depth counter so nested spans are handled safely and stray end
        tokens do not drive the counter negative.
        """
        count = 0
        depth = 0
        for token_id in token_ids:
            if token_id == self.start_token_id:
                depth += 1
                continue
            if token_id == self.end_token_id:
                # Stray end tokens (depth already 0) are ignored.
                if depth > 0:
                    depth -= 1
                continue
            # Marker tokens themselves are not counted, only tokens inside.
            if depth > 0:
                count += 1
        return count

View File

@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
    """Reasoning parser for the DeepSeek R1 model.

    DeepSeek R1 wraps its chain-of-thought in <think>...</think>. The base
    class covers the common cases; this subclass additionally handles
    streams where the model never emitted the <think> start token at all.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        base_result = super().extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
        saw_start = (
            self.start_token_id in previous_token_ids
            or self.start_token_id in delta_token_ids
        )
        if base_result is None or saw_start:
            return base_result
        # No <think> was ever produced: the model entered reasoning mode
        # implicitly, so re-interpret the stream accordingly.
        if self.end_token_id in delta_token_ids:
            # </think> arrives in this delta: split the text around it.
            split_at = delta_text.find(self.end_token)
            trailing = delta_text[split_at + len(self.end_token) :]
            return DeltaMessage(
                reasoning=delta_text[:split_at],
                content=trailing if trailing else None,
            )
        if self.end_token_id in previous_token_ids:
            # Reasoning already closed in an earlier step; plain content now.
            return DeltaMessage(content=delta_text)
        # Still inside the implicit reasoning section.
        return DeltaMessage(reasoning=delta_text)

View File

@@ -0,0 +1,88 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Sequence
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .identity_reasoning_parser import IdentityReasoningParser
logger = init_logger(__name__)
class DeepSeekV3ReasoningParser(ReasoningParser):
    """Delegating reasoning parser for DeepSeek V3.

    The underlying implementation is chosen once, at construction time:
    DeepSeekR1ReasoningParser when thinking is enabled via the
    `chat_template_kwargs` keys `thinking`/`enable_thinking`, otherwise
    IdentityReasoningParser. Every parsing call simply forwards to it.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        template_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
        wants_thinking = bool(template_kwargs.get("thinking", False)) or bool(
            template_kwargs.get("enable_thinking", False)
        )
        delegate_cls = (
            DeepSeekR1ReasoningParser if wants_thinking else IdentityReasoningParser
        )
        self._parser = delegate_cls(tokenizer, *args, **kwargs)

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        return self._parser.is_reasoning_end(input_ids)

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return self._parser.extract_content_ids(input_ids)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        return self._parser.extract_reasoning(model_output, request)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        return self._parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
class DeepSeekV3ReasoningWithThinkingParser(DeepSeekV3ReasoningParser):
    """
    DeepSeekV3ReasoningParser that defaults to thinking mode.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        template_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
        caller_choice = (
            template_kwargs.get("thinking", None),
            template_kwargs.get("enable_thinking", None),
        )
        # Only force thinking on when the caller expressed no preference
        # through either flag.
        if caller_choice == (None, None):
            template_kwargs["thinking"] = True
            template_kwargs["enable_thinking"] = True
        kwargs["chat_template_kwargs"] = template_kwargs
        super().__init__(tokenizer, *args, **kwargs)

View File

@@ -0,0 +1,168 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
logger = init_logger(__name__)
class Ernie45ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for Ernie45 thinking model.
    The Ernie45 thinking model output format is
    abc\n</think>\n\n<response>\ndef\n</response>\n
    or abc\n</think>\ndef
    """

    # Optional wrapper tokens around the content (response) section.
    response_start_token: str = "<response>"
    response_end_token: str = "</response>"
    # Byte-level token spelling of "\n" in the tokenizer vocabulary.
    newline_token: str = "<0x0A>"

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )
        # NOTE(review): start/end token ids are re-derived here even though
        # the base class already set them; harmless but redundant.
        self.start_token_id = self.vocab.get(self.start_token)
        self.end_token_id = self.vocab.get(self.end_token)
        # Response wrapper and newline ids may be None if absent from the
        # vocabulary; only the think markers are mandatory (checked below).
        self.response_start_token_id = self.vocab.get(self.response_start_token)
        self.response_end_token_id = self.vocab.get(self.response_end_token)
        self.newline_token_id = self.vocab.get(self.newline_token)
        # Tokens after which a leading newline should be stripped.
        self.parser_token_ids = [self.end_token_id, self.response_end_token_id]
        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                "Ernie45 reasoning parser could not locate think start/end "
                "tokens in the tokenizer!"
            )

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        The Ernie45 thinking model output format is
        abc\n</think>\n\n<response>\ndef\n</response>\n
        or abc\n</think>\ndef
        - 'abc' goes to reasoning
        - 'def' goes to content
        """
        # A delta that is exactly one special marker token carries no text.
        if len(delta_token_ids) == 1 and (
            delta_token_ids[0]
            in [
                self.start_token_id,
                self.end_token_id,
                self.response_start_token_id,
                self.response_end_token_id,
            ]
        ):
            return None
        # No <think> in previous or delta, also need to check for </think>.
        # Because the model may have generated </think> without <think>
        if self.end_token_id in delta_token_ids:
            # </think> in delta with more tokens,
            # extract reasoning content and content
            think_end_index = delta_text.find(self.end_token)
            reasoning = delta_text[:think_end_index]
            content = delta_text[think_end_index + len(self.end_token) :]
            content = content.lstrip("\n")
            # Strip an optional <response>...</response> wrapper from the
            # content portion of this delta.
            response_start_idx = content.find(self.response_start_token)
            response_end_idx = content.rfind(self.response_end_token)
            if response_start_idx != -1:
                content = content[response_start_idx + len(self.response_start_token) :]
            if response_end_idx != -1:
                content = content[:response_end_idx]
            return DeltaMessage(
                reasoning=reasoning,
                content=content if content else None,
            )
        elif self.end_token_id in previous_token_ids:
            # </think> in previous, thinking content ends
            content = delta_text
            if self.response_start_token_id in delta_token_ids:
                content = content.lstrip("\n")
                response_start_idx = content.find(self.response_start_token)
                content = content[response_start_idx + len(self.response_start_token) :]
                # if have </response>, remove it
                response_end_idx = content.rfind(self.response_end_token)
                if response_end_idx != -1:
                    content = content[:response_end_idx]
            elif self.response_end_token_id in delta_token_ids:
                response_end_idx = content.rfind(self.response_end_token)
                content = content[:response_end_idx]
            # remove \n after </think> or </response>
            if previous_token_ids[-1] in self.parser_token_ids and (
                len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
            ):
                content = content.lstrip("\n")
            # remove \n after </think>\n
            if (
                len(previous_token_ids) > 1
                and previous_token_ids[-2] == self.end_token_id
            ) and (
                len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
            ):
                content = content.lstrip("\n")
            return DeltaMessage(content=content if content else None)
        else:
            # no </think> in previous or delta, reasoning content continues
            return DeltaMessage(reasoning=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.
        The Ernie45 thinking model output format is
        abc\n</think>\n\n\n<response>\ndef\n</response>\n
        or abc\n</think>\ndef
        - 'abc' goes to reasoning
        - 'def' goes to content
        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """
        # Base class splits on <think>/</think>; only the <response> wrapper
        # handling is Ernie45-specific.
        reasoning, content = super().extract_reasoning(model_output, request)
        if content:
            start_idx = content.find(self.response_start_token)
            end_idx = content.rfind(self.response_end_token)
            # Simultaneously existing and in the correct order
            if start_idx != -1 and end_idx != -1 and start_idx < end_idx:
                content = content[start_idx + len(self.response_start_token) : end_idx]
        final_content = content or None
        return reasoning, final_content

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import Sequence
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
logger = init_logger(__name__)
# Structural tag constraining the harmony "analysis" (reasoning) channel when
# no builtin tools are available. NOTE: the misspelled name is kept as-is
# because other code refers to it.
_ANALYSIS_TAG = {
    "begin": "<|channel|>analysis<|message|>",
    "content": {"type": "any_text"},
    "end": "<|end|>",
}

no_func_reaonsing_tag = {
    "type": "structural_tag",
    "format": {
        "type": "triggered_tags",
        "tags": [_ANALYSIS_TAG],
        "triggers": ["<|channel|>analysis"],
        "stop_after_first": False,
    },
}
def from_builtin_tool_to_tag(tool: str) -> list[dict]:
    """Build the structural-tag entries for one builtin tool.

    Both the commentary and analysis channels may address a builtin tool,
    so each tool contributes two tag entries (commentary first).
    """
    return [
        {
            "begin": f"<|channel|>{channel} to={tool}",
            "content": {"type": "any_text"},
            "end": "<|end|>",
        }
        for channel in ("commentary", "analysis")
    ]
def tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list: list[str]) -> dict:
    """Return a copy of the base tag extended with builtin-tool tag entries.

    The input is deep-copied so the module-level constant is never mutated.
    """
    import copy

    extended = copy.deepcopy(no_func_reaonsing_tag)
    fmt = extended["format"]
    fmt["triggers"].append("<|channel|>commentary to=")
    for builtin_tool in builtin_tool_list:
        fmt["tags"].extend(from_builtin_tool_to_tag(builtin_tool))
    return extended
class GptOssReasoningParser(ReasoningParser):
"""
Reasoning parser for GptOss model.
The GptOss model uses harmony to extract reasoning content and this parser
is only used for detecting the end of the reasoning content.
"""
def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
# The model can output some special tokens between "final" and "<|message|>"
# So we need to look for both sequences to determine the end of reasoning.
self.reasoning_end_token_ids_prefix = self.model_tokenizer.encode(
"<|channel|>final"
)
self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>")
# We also need to check for the <|end|> token to avoid false positives from
# previous messages in multi-turn conversations.
self.eom_token_id = self.model_tokenizer.vocab["<|end|>"]
self.reasoning_max_num_between_tokens = 20
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
end_token_ids_prefix = self.reasoning_end_token_ids_prefix
end_token_ids_suffix = self.reasoning_end_token_ids_suffix
assert len(end_token_ids_prefix) > 0, "reasoning_end_token_ids_prefix is empty"
assert len(end_token_ids_suffix) > 0, "reasoning_end_token_ids_suffix is empty"
# Check if the end sequence is present in the input_ids.
# We search from the end of input_ids to find the last match.
for i in range(len(input_ids) - len(end_token_ids_prefix), -1, -1):
if input_ids[i] == self.eom_token_id:
# We looped backwards far enough to find the end of a previous message,
# which means we have searched the entirety of the current message
# and can exit early without searching further back into prior
# messages of the conversation.
return False
if input_ids[i : i + len(end_token_ids_prefix)] == end_token_ids_prefix:
# We have found the prefix, now we look for the suffix after the prefix.
suffix_start = i + len(end_token_ids_prefix)
for j in range(
suffix_start, len(input_ids) - len(end_token_ids_suffix) + 1
):
if j - suffix_start >= self.reasoning_max_num_between_tokens:
break
if (
input_ids[j : j + len(end_token_ids_suffix)]
== end_token_ids_suffix
):
return True
return False
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
_, content, _ = parse_chat_output(input_ids)
if content is None:
return []
return self.model_tokenizer.encode(content)
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """Diff the parsed previous/current token streams into a delta message.

    Both token streams are parsed with the Harmony parser; the delta for a
    field is the suffix the current parse adds over the previous parse.  If
    the current parse is not an extension of the previous one (the parse
    shifted), the full current value is emitted instead.
    """
    prev_reasoning, prev_content, _ = parse_chat_output(list(previous_token_ids))
    cur_reasoning, cur_content, _ = parse_chat_output(list(current_token_ids))

    def _suffix_delta(prev: str | None, cur: str | None) -> str | None:
        if cur is None:
            return None
        base = prev or ""
        if not cur.startswith(base):
            # Parse shifted; re-emit the whole current value.
            return cur
        return cur[len(base):] or None

    reasoning_delta = _suffix_delta(prev_reasoning, cur_reasoning)
    content_delta = _suffix_delta(prev_content, cur_content)

    if reasoning_delta is None and content_delta is None:
        return None
    return DeltaMessage(reasoning=reasoning_delta, content=content_delta)
def extract_reasoning(
    self,
    model_output: str,
    request: ChatCompletionRequest,
) -> tuple[str | None, str | None]:
    """Unsupported for gpt-oss; non-streaming parsing happens elsewhere."""
    message = "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used."  # noqa: E501
    raise NotImplementedError(message)
# This function prepares the structural tag to format reasoning output
def prepare_structured_tag(
    self, original_tag: str | None, tool_server: ToolServer | None
) -> str | None:
    """Build (or pass through) the structural tag used to format reasoning.

    A caller-supplied tag is returned unchanged; otherwise a tag is built
    from the no-function template, extended with whichever builtin tools
    the tool server exposes.
    """
    if original_tag is not None:
        # There is potential risk for appending the tag to the original tag
        return original_tag
    if tool_server is None:
        return json.dumps(no_func_reaonsing_tag)
    # Probe the server for the known builtin tools, preserving order.
    builtin_tool_list: list[str] = [
        tool_name
        for tool_name in ("browser", "python", "container")
        if tool_server.has_tool(tool_name)
    ]
    if builtin_tool_list:
        logger.info("Builtin_tool_list: %s", builtin_tool_list)
        return json.dumps(
            tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list)
        )
    logger.info("Builtin_tool_list is empty")
    return json.dumps(no_func_reaonsing_tag)

View File

@@ -0,0 +1,366 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import regex as re
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
logger = init_logger(__name__)
class GraniteReasoningParser(ReasoningParser):
    """
    Reasoning parser for IBM Granite.

    IBM granite models currently use "Here is my thought process:"
    and "Here is my response:" to separate its thinking / response outputs.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # NOTE: There have been some observed occurrences of quantized
        # instances of the current models using "Here's" instead of "Here is",
        # so to be safe, we match on both.
        self.think_start_expr = r"(?:Here's|Here is) my thought process:"
        self.response_start_expr = r"(?:Here's|Here is) my response:"

        # Non-streaming regex: group 1 is the reasoning between the two
        # markers, group 2 is everything after the response marker.
        self.reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
        )

        self.valid_think_starts = [
            "Here's my thought process:",
            "Here is my thought process:",
        ]
        self.valid_response_starts = ["Here's my response:", "Here is my response:"]

        # Substrings to match for sequence boundaries on raw text
        self.seq_boundary_end = ":"
        self.seq_boundary_start = "Here"

        # The longest any thinking / start of response message can be
        self.longest_think_start = max(
            len(think_start) for think_start in self.valid_think_starts
        )

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest): Request being processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """
        re_match = self.reasoning_regex.findall(model_output)
        if not re_match:
            # Markers never matched: treat everything as plain content.
            return None, model_output
        reasoning, response_content = re_match[0]
        if not response_content:
            return reasoning, None
        return reasoning, response_content

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract the reasoning content / content emitted by granite models;
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        NOTE: Granite models do not use a special token to start their reasoning
        and response sections; instead they have token sequences, e.g.,

            Here is my thought process: Foo Here is my response: Bar

        This increases the complexity of correctly handling streams, since we
        need to watch for specific sequences and correctly parse them without
        dropping content that is potentially overlapping & spanning multiple
        delta messages.

        Args:
            previous_text (str): Previous text outside of this delta message.
            current_text (str): Previous text + delta text.
            delta_text (str): Text to consider and parse content from.
            previous_token_ids (Sequence[int]): Token IDs of previous_text.
            current_token_ids (Sequence[int]): Token IDs of current_text.
            delta_token_ids (Sequence[int]): Token IDs of delta_text.

        Returns:
            Union[DeltaMessage, None]
                DeltaMessage with either reasoning content or content, or None.
        """
        reasoning, resp_seq_len, content = self._get_content_sections(current_text)
        # Either we haven't finished the start of the reasoning sequence,
        # or the model is generating something unexpected.
        if not reasoning:
            delta_message = self._get_delta_message_with_no_reasoning_bounds(
                current_text, delta_text
            )
        # We have a start of reasoning message, but have not yet finished
        # the start of response sequence.
        elif not content:
            delta_message = self._get_delta_message_with_no_response_bounds(
                current_text, reasoning, delta_text
            )
        # We've finished both the start of reasoning and start of response seq.
        else:
            # This should never happen since we matched on the response
            assert resp_seq_len is not None
            delta_message = self._get_delta_message_with_both_bounds(
                delta_text, reasoning, content, current_text, resp_seq_len
            )
        if not delta_message.content and not delta_message.reasoning:
            return None
        return delta_message

    #### Implementation details of stream parsing for granite models
    def _is_reasoning_start_substr(self, text: str) -> bool:
        """Check if a text matches one of the possible start reasoning seqs.

        Args:
            text (str): Text to check for leading substr.

        Returns:
            bool: True if any of the possible reasoning start seqs match.
        """
        return any(
            think_start.startswith(text) for think_start in self.valid_think_starts
        )

    def _is_response_start_substr(self, text: str) -> bool:
        """Check if a text matches one of the possible start response seqs.

        Args:
            text (str): Text to check for leading substr.

        Returns:
            bool: True if any of the possible response start seqs match.
        """
        return any(
            response_start.startswith(text)
            for response_start in self.valid_response_starts
        )

    def _get_delta_message_with_no_reasoning_bounds(
        self,
        current_text: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has not yet completed
        its start of reasoning sequence.

        Args:
            current_text (str): The full previous + delta text.
            delta_text (str): Text to consider and parse content from.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        prev_longest_length = len(current_text) - len(delta_text)
        is_substr = self._is_reasoning_start_substr(current_text)
        was_substr = self._is_reasoning_start_substr(current_text[:prev_longest_length])

        # Check if we just generated something NOT in the special token seq;
        # if so, add everything that we previously skipped with this delta
        # message and append everything to content in the future.
        if was_substr and not is_substr:
            return DeltaMessage(
                reasoning=None,
                content=current_text,
            )
        if is_substr:
            # Might still be in the special token sequence; return nothing
            return DeltaMessage(reasoning=None, content=None)
        # Otherwise the sequence has already been broken and we already
        # corrected; just return the delta text as normal content.
        return DeltaMessage(reasoning=None, content=delta_text)

    def _get_delta_message_with_no_response_bounds(
        self,
        current_text: str,
        reasoning: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has both reasoning
        content with no (response) content. NOTE that we may have overlapping
        tokens with the start of reasoning / start of response sequences on
        either side of the delta text.

        Args:
            current_text (str): The full previous + delta text.
            reasoning (str): reasoning content from current_text.
            delta_text (str): Text to consider and parse content from.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        # If we have no reasoning content or explicitly end with the start of
        # response sequence, we are in transition to the response; need to be
        # careful here, since the final token (:) will match the reasoning
        # content and fully parse it out; we should not pass the : back.
        ends_with_start_response_seq = any(
            current_text.endswith(response_start)
            for response_start in self.valid_response_starts
        )
        if reasoning is None or ends_with_start_response_seq:
            return DeltaMessage(reasoning=None, content=None)

        # Consider previous / current text only within context of the reasoning
        previous_text = reasoning[: -len(delta_text)]
        current_text = reasoning

        # We need to be careful about adding unfinished response sequences;
        # Find the place at which we MIGHT be starting a response sequence
        prev_idx = previous_text.rfind(self.seq_boundary_start)
        delta_idx = delta_text.rfind(self.seq_boundary_start)

        # Check the state of potential start of response substring matches.
        prev_was_substr = (
            self._is_response_start_substr(previous_text[prev_idx:])
            if prev_idx >= 0
            else False
        )
        # NOTE: checks the tail of the *current* reasoning from the previous
        # candidate index, i.e. whether the delta keeps extending that match.
        delta_continues_substr = (
            self._is_response_start_substr(current_text[prev_idx:])
            if prev_idx >= 0
            else False
        )
        delta_new_substr = (
            self._is_response_start_substr(delta_text[delta_idx:])
            if delta_idx >= 0
            else False
        )

        # Delta only contains potential continued response sequence text.
        if delta_continues_substr:
            return DeltaMessage(reasoning=None, content=None)

        if not prev_was_substr:
            # Delta may be starting a new response seq but has other text too.
            if delta_new_substr:
                return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
            # Normal case for most reasoning text (no potential special seqs).
            return DeltaMessage(reasoning=delta_text, content=None)
        # The substring that previously seemed to be a potential response
        # seq wasn't one; we need to add the content to the delta message,
        # and also slice off the potential response sequence
        elif delta_new_substr:
            reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
            return DeltaMessage(reasoning=reasoning, content=None)
        # No new substring yet, and we broke our old one; take the whole delta
        return DeltaMessage(
            reasoning=previous_text[prev_idx:] + delta_text,
            content=None,
        )

    def _get_delta_message_with_both_bounds(
        self,
        delta_text: str,
        reasoning: str,
        response_content: str,
        current_text: str,
        response_seq_len: int,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has both reasoning
        content and normal (response) content.

        Args:
            delta_text: Text to consider and parse content from.
            reasoning: reasoning content from current_text.
            response_content: response content from current_text.
            current_text: The full previous + delta text.
            response_seq_len: Len of the complete response sequence used.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        # Always have content; take length to the end
        # NOTE(review): if response_content is longer than delta_text, the
        # negative slice yields the whole delta — presumably the response
        # tail always fits inside the latest delta; confirm with callers.
        delta_content = delta_text[-len(response_content) :]
        reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)

        if reasoning_end_idx < 0:
            delta_reasoning = None
        else:
            # Get the starting offset
            start_reasoning_idx = (
                len(reasoning) + response_seq_len + len(response_content) - 1
            )
            delta_offset = len(current_text) - len(delta_text)
            start_offset = start_reasoning_idx - delta_offset
            if start_offset < 0:
                start_offset = 0
            delta_reasoning = delta_text[start_offset:reasoning_end_idx]

        return DeltaMessage(
            reasoning=delta_reasoning,
            content=delta_content,
        )

    def _get_content_sections(
        self, current_text: str
    ) -> tuple[str | None, int | None, str | None]:
        """Parse the text to extract the reasoning content / content
        if we have them.

        Args:
            current_text (str): The full previous + delta text.

        Returns:
            tuple[Optional[str], Optional[int], Optional[str]]: Tuple of len 3
            containing the reasoning content, the length of the response seq
            (if there is one) and the non-reasoning content.
        """
        current_chunk_start = 0
        start_reasoning = None
        parsed_content = False
        # Scan chunk-by-chunk, splitting on ":" (the marker terminator).
        delimiter_idxs = [
            idx
            for idx, char in enumerate(current_text)
            if char == self.seq_boundary_end
        ]

        for current_chunk_end in delimiter_idxs:
            current_chunk = current_text[current_chunk_start:current_chunk_end]
            # Check to see if the start of reasoning seq is complete
            if start_reasoning is None:
                for think_start in self.valid_think_starts:
                    if current_chunk == think_start[:-1]:
                        start_reasoning = current_chunk_end + 1
                        current_chunk_start = current_chunk_end + 1
                        break

            # Check to see if the start of response seq is complete
            elif not parsed_content:
                for response_start in self.valid_response_starts:
                    if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
                        # Mark end of reasoning and start response content
                        # after the start of response sequence.
                        end_reasoning = current_chunk_end - len(response_start)
                        reasoning = current_text[start_reasoning:end_reasoning]
                        response_content = current_text[current_chunk_end + 1 :]
                        return reasoning, len(response_start), response_content

        if start_reasoning and not parsed_content:
            # Reasoning started but the response marker never completed.
            return current_text[start_reasoning:], None, None
        return None, None, None

View File

@@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import regex as re
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
logger = init_logger(__name__)
class HunyuanA13BReasoningParser(ReasoningParser):
    """
    Reasoning parser for Hunyuan A13B Model

    HunyuanReasoningParser

    This class implements a reasoning parser specifically designed
    for the Hunyuan A13B Model. It is responsible for parsing and
    extracting structured reasoning and answer segments from model
    outputs that follow a specific pattern.

    Key Features:
    - For non-stream output, recognizes and extracts reasoning ("think")
        and answer ("answer") sections from text using regular expressions.
    - For stream processing, it requires token id sequences to change the
        reasoning state and other state, so it maintains internal state to
        manage parsing across multiple tokens.

    think start: "<think>\n": [14023, 771, 397]
    think ends: "\n</think>\n<answer>\n": [198, 524, 27963, 397, 27, 9399, 397]
    response ends: "\n</answer>": [524, 9399, 29]
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.think_start_expr = r"<think>\n"
        self.think_end_expr = r"\n</think>\n"

        self.response_start_expr = r"\n</think>\n<answer>\n"
        self.response_end_expr = r"\n</answer>"

        # Matches an (optional) think section followed by an answer section.
        self.full_match_reasoning_regex = re.compile(
            rf"(?:{self.think_start_expr}(.*?){self.response_start_expr})?(.*?){self.response_end_expr}",
            re.DOTALL,
        )
        # Fallback for output where the closing "\n</answer>" never arrived.
        self.half_match_reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
        )

        # Hard-coded token-id sequences for the markers above; the "fast"
        # variants cover alternative tokenizations of the same text.
        self.think_start_ids = [14023, 771, 397]
        self.think_start_ids_fast = [14023, 771, 1363]
        self.response_start_ids = [198, 524, 27963, 397, 27, 9399, 397]
        self.response_start_ids_fast = [524, 27963, 397, 27, 9399, 397]
        self.response_end_ids = [198, 524, 9399, 29]
        self.fast_think_ids = [14023, 771, 1363, 524, 27963, 397, 27, 9399, 397]

        # when state change, send out all the buffered text in last state
        self.buffered_text = []
        self.buffered_ids = []

        self.current_state = "reasoning"
        self.all_states = ["reasoning", "response"]

        # NOTE(review): this overwrites the "reasoning" assignment above;
        # the streaming state machine only ever uses idle/think/response.
        self.current_state = "idle"
        self.expected_sequence = self.think_start_ids
        # this sequence is only for the think start; it has two ways to start.
        self.expected_sequence_side = self.think_start_ids_fast
        self.sequence_index = 0
        self.token_buffer = []
        self.text_buffer = ""

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        # Reasoning is over once the state machine has entered "response".
        return self.current_state == "response"

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # for hunyuan streaming reason parsing, the stream parse
        # will call first, and the same token will be called in
        # is_reasoning_end and extract_content_ids
        # this id is not part of content, so just return [] here.
        return []

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest): Request being processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """
        re_match = self.full_match_reasoning_regex.findall(model_output)
        if re_match:
            reasoning, response_content = re_match[0]
            if len(reasoning) == 0:
                reasoning = None
            if len(response_content) == 0:
                response_content = None
            return reasoning, response_content

        # Full match failed; try the half match (missing "\n</answer>").
        fallback_regex = self.half_match_reasoning_regex
        fallback_match = fallback_regex.findall(model_output)
        if fallback_match:
            reasoning, response_content = fallback_match[0]
            if response_content.endswith(self.response_end_expr):
                response_content = response_content[: -len(self.response_end_expr)]
            if len(reasoning) == 0:
                reasoning = None
            if len(response_content) == 0:
                response_content = None
            return reasoning, response_content
        return None, model_output

    def _is_strict_increasing_subsequence(
        self, subsequence: Sequence[int], sequence: Sequence[int]
    ) -> bool:
        """Return True if `subsequence` appears in order within `sequence`."""
        if not subsequence:
            return False
        sub_idx = 0
        for num in sequence:
            if sub_idx < len(subsequence) and num == subsequence[sub_idx]:
                sub_idx += 1
        return sub_idx == len(subsequence)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract content using token ID sequence state machine"""
        # Define sequences
        think_start_sequence = self.think_start_ids
        response_start_sequence = self.response_start_ids
        response_end_sequence = self.response_end_ids

        # NOTE(review): assumes exactly one token per streamed delta;
        # confirm this holds for all decoding paths that reach here.
        assert len(delta_token_ids) == 1

        # Process each token in the delta
        token = delta_token_ids[0]

        def check_token_with_sequence(token):
            # Does this token extend the marker sequence we are matching?
            # In idle/think the "side" (alternative tokenization) sequence
            # is also accepted.
            if self.current_state == "idle" or self.current_state == "think":
                return (
                    token == self.expected_sequence[self.sequence_index]
                    or token == self.expected_sequence_side[self.sequence_index]
                )
            else:
                return token == self.expected_sequence[self.sequence_index]

        def check_last_token(token):
            # Has the full marker sequence just been matched?
            if self.current_state == "idle" or self.current_state == "think":
                # only return true if it's judged using a side sequence.
                if (
                    self.sequence_index - 1 < len(self.expected_sequence_side)
                    and token == self.expected_sequence_side[self.sequence_index - 1]
                ):
                    return self.sequence_index == len(self.expected_sequence_side)
                else:
                    return self.sequence_index == len(self.expected_sequence)
            else:
                return self.sequence_index == len(self.expected_sequence)

        # Check if token matches expected sequence
        token_in_state_seq = check_token_with_sequence(token)

        if token_in_state_seq:
            # Store matching token
            self.token_buffer.append(token)
            self.text_buffer += delta_text
            self.sequence_index += 1

            ## state change from idle->think->response->idle
            # Check if sequence fully matched
            if check_last_token(token):
                # State transition
                if self.current_state == "idle":
                    self.current_state = "think"
                    self.expected_sequence = response_start_sequence
                    self.expected_sequence_side = self.response_start_ids_fast
                elif self.current_state == "think":
                    self.current_state = "response"
                    self.expected_sequence = response_end_sequence
                elif self.current_state == "response":
                    self.current_state = "idle"
                    self.expected_sequence = think_start_sequence
                    self.expected_sequence_side = self.think_start_ids_fast

                # Reset matching state
                self.sequence_index = 0
                self.token_buffer = []
                self.text_buffer = ""
                # Do not send content for state transition texts.
        else:
            # Sequence broken - handle buffered content
            if self.token_buffer and len(self.token_buffer) > 0:
                # Send buffered tokens: the partial marker was regular text.
                buffered_content = self.text_buffer + delta_text
                # Reset matching state
                self.sequence_index = 0
                self.token_buffer = []
                self.text_buffer = ""

                # Return content based on current state
                if self.current_state == "think":
                    return DeltaMessage(reasoning=buffered_content, content=None)
                else:
                    return DeltaMessage(reasoning=None, content=buffered_content)
            else:
                # No buffered content, send normally
                if self.current_state == "think":
                    return DeltaMessage(reasoning=delta_text, content=None)
                else:
                    return DeltaMessage(reasoning=None, content=delta_text)

        # If no content to send in this delta
        return None

View File

@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Sequence
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
logger = init_logger(__name__)
class IdentityReasoningParser(ReasoningParser):
    """
    Identity reasoning parser.

    This parser never recognizes a reasoning section: every generated token
    is treated as plain content and reasoning is always considered over.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        # A tokenizer is still required even though nothing is ever parsed.
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Reasoning is never treated specially, so it is always "ended"."""
        return True

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        """Streaming variant: reasoning is always considered ended."""
        return True

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """All generated tokens are content."""
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Forward the delta as plain content; emit nothing for empty deltas."""
        return DeltaMessage(content=delta_text) if delta_text else None

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Return no reasoning and the untouched model output as content."""
        return None, model_output

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for MiniMax M2 model.

    MiniMax M2 models don't generate <think> start token, only </think> end
    token. All content before </think> is reasoning, content after is the
    actual response.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Stream-split a delta into reasoning vs. content.

        There is no <think> start token, so everything counts as reasoning
        until the </think> end token has been seen.
        """
        end_id = self.end_token_id

        # A delta that is exactly the end token has no visible text.
        if len(delta_token_ids) == 1 and delta_token_ids[0] == end_id:
            return None

        # End token already seen earlier: everything now is content.
        if end_id in previous_token_ids:
            return DeltaMessage(content=delta_text)

        # End token not reached yet: everything is still reasoning.
        if end_id not in delta_token_ids:
            return DeltaMessage(reasoning=delta_text)

        # The end token lands inside this delta: split around its text.
        cut = delta_text.find(self.end_token)
        before = delta_text[:cut]
        after = delta_text[cut + len(self.end_token) :]
        return DeltaMessage(
            reasoning=before if before else None,
            content=after if after else None,
        )
class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    """
    Reasoning parser for MiniMax M2 model.

    Instead of splitting reasoning from content, this variant emits
    everything as content with the (never generated) "<think>" tag
    prepended at the start of the stream / output.
    """

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.end_token_id = self.vocab.get("</think>")

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """True once the </think> token id appears anywhere in input_ids."""
        return self.end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """All generated tokens are treated as content."""
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        # Prepend the synthetic think-start tag on the very first chunk.
        prefix = "<think>" if not previous_token_ids else ""
        return DeltaMessage(content=prefix + delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        return None, "<think>" + model_output

View File

@@ -0,0 +1,157 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from functools import cached_property
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers.mistral import MistralTokenizer
logger = init_logger(__name__)
class MistralReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for Mistral models.

    The Mistral models uses `[THINK]`...`[/THINK]` tokens to denote reasoning
    text. This parser extracts the reasoning content from the model output.

    A valid reasoning trace should always start with a `[THINK]` token and end with
    a `[/THINK]` token.

    If `[THINK]` token is not generated, then this parser only returns content.
    """

    def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs):
        if not isinstance(tokenizer, MistralTokenizer):
            raise ValueError("The tokenizer must be an instance of MistralTokenizer.")

        # Deliberately bypasses BaseThinkingReasoningParser.__init__; the
        # think token ids are resolved below via mistral_common specials.
        ReasoningParser.__init__(self, tokenizer, *args, **kwargs)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        self.start_token_id = tokenizer.tokenizer.get_special_token(self.start_token)
        self.end_token_id = tokenizer.tokenizer.get_special_token(self.end_token)

        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                "Mistral reasoning parser could not locate think start/end "
                "tokens in the tokenizer!"
            )

    @cached_property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        # Imported lazily to keep mistral_common an optional dependency.
        from mistral_common.tokens.tokenizers.base import SpecialTokens

        return SpecialTokens.begin_think

    @cached_property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        from mistral_common.tokens.tokenizers.base import SpecialTokens

        return SpecialTokens.end_think

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """True iff, scanning backwards, an EOT is seen before any BOT."""
        has_eot_token = False
        for id in reversed(input_ids):
            if id == self.start_token_id:
                # Reasoning ends only if a BOT token is found before a EOT token.
                return has_eot_token
            elif id == self.end_token_id:
                has_eot_token = True
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content
        """
        has_bot_token = False
        has_eot_token = False

        bot_token_index = -1
        eot_token_index = -1

        # One for loop instead of multiple lookups
        for i, token_id in enumerate(input_ids):
            # We filter that we have multiple BOT tokens which should not
            # happen for a well prompted trained model
            if token_id == self.start_token_id and not has_bot_token:
                has_bot_token = True
                bot_token_index = i
            elif token_id == self.end_token_id:
                has_eot_token = True
                eot_token_index = i
                break

        # 1. Only BOT has been outputted
        if has_bot_token and not has_eot_token:
            # Should be = [] if model is well prompted and trained.
            return input_ids[:bot_token_index]
        # 2. Neither BOT or EOT have been outputted
        elif not has_bot_token and not has_eot_token:
            return input_ids
        # 3. Both BOT and EOT have been outputted.
        elif has_bot_token and has_eot_token:
            # Content = everything outside the [THINK]...[/THINK] span.
            return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :]
        # 4. Only EOT has been outputted => this should not have occurred for a model
        # well prompted and trained.
        else:
            return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :]

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.

        Returns:
            (reasoning, content) — either element may be None.
        """
        if not model_output:
            return (None, "")

        # Check if the start token is present in the model output, remove it
        # if it is present.
        prev_bot_token, bot_token, post_bot_token = model_output.partition(
            self.start_token
        )

        has_bot_token = bool(bot_token)
        # Valid EOT tokens should follow BOT token
        has_valid_eot_token = has_bot_token and self.end_token in post_bot_token

        # 1. If there is BOT token followed by EOT token
        if has_bot_token and has_valid_eot_token:
            prev_eot_token, _, post_eot_token = post_bot_token.partition(self.end_token)
            # If model is well prompted and trained prev_bot_token should be ""
            content = prev_bot_token + post_eot_token
            return prev_eot_token, content if content else None
        # 2. Only BOT token
        elif has_bot_token:
            # If model is well prompted and trained prev_bot_token should be ""
            return post_bot_token, prev_bot_token if prev_bot_token else None
        # 3. EOT token has been outputted without BOT or neither has been outputted
        else:
            has_non_valid_eot_token = self.end_token in prev_bot_token
            # 3.a EOT token has been outputted without BOT
            # If model is well prompted and trained `has_non_valid_eot_token` should
            # be `False` and the parser outputs all tokens as 'content'
            if has_non_valid_eot_token:
                prev_eot_token, _, post_eot_token = prev_bot_token.partition(
                    self.end_token
                )
                return None, prev_eot_token + post_eot_token
            # 3.b neither BOT or EOT have been outputted
            else:
                return None, prev_bot_token

View File

@@ -0,0 +1,305 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses as dt
import enum
from collections.abc import Sequence
from typing import TYPE_CHECKING
import regex as re
if TYPE_CHECKING:
from vllm.tokenizers import TokenizerLike
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
logger = init_logger(__name__)
class Olmo3ReasoningState(enum.Enum):
    """Streaming parse mode: inside the <think> block, or past it."""

    REASONING = enum.auto()  # accumulating reasoning text (value 1)
    CONTENT = enum.auto()  # accumulating final answer text (value 2)
@dt.dataclass(frozen=True)
class Indices:
    """Immutable half-open span [start, end) within a string."""

    start: int
    end: int

    def __len__(self) -> int:
        # Number of characters covered by the half-open interval.
        return self.end - self.start
def string_overlap(a: str, b: str) -> tuple[Indices | None, Indices | None]:
    """Locate the longest overlap between two strings.

    Checks, in order: full containment of the shorter string inside the
    longer one, a suffix of the shorter matching a prefix of the longer,
    then a prefix of the shorter matching a suffix of the longer.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A pair of Indices — the overlapping span in ``a`` and in ``b``
        respectively — or ``(None, None)`` when no overlap exists.
    """
    # Normalize so `short` is never longer than `long_`; remember the
    # orientation so results can be returned in the caller's order.
    swapped = len(a) >= len(b)
    short, long_ = (b, a) if swapped else (a, b)

    def ordered(span_short: Indices, span_long: Indices):
        # Map (short-span, long-span) back to (a-span, b-span).
        return (span_long, span_short) if swapped else (span_short, span_long)

    # Check 1: the shorter string appears whole inside the longer one.
    pos = long_.find(short)
    if pos != -1:
        return ordered(Indices(0, len(short)), Indices(pos, pos + len(short)))

    # Check 2: a suffix of `short` matches a prefix of `long_`
    # (longest match first).
    for size in range(len(short) - 1, 0, -1):
        if short[-size:] == long_[:size]:
            return ordered(
                Indices(len(short) - size, len(short)), Indices(0, size)
            )

    # Check 3: a prefix of `short` matches a suffix of `long_`.
    for size in range(len(short) - 1, 0, -1):
        if long_[-size:] == short[:size]:
            return ordered(
                Indices(0, size), Indices(len(long_) - size, len(long_))
            )

    return None, None
@dt.dataclass
class Olmo3ReasoningBuffer:
    """Accumulates streamed text and splits it into reasoning vs. content.

    Olmo 3 marks reasoning with the plain strings "<think>"/"</think>"
    (not special tokens), so a marker can arrive split across several
    streaming deltas. Incoming text is accumulated in `buffer` and only
    emitted once it cannot be the prefix of a partially received marker.
    """

    # Plain-string markers delimiting the reasoning block.
    think_start: str = "<think>"
    think_end: str = "</think>"
    # Text received but not yet emitted.
    buffer: str = ""
    # we start in reasoning state to support cases where we hardcode
    # <think> as the start of the reasoning block.
    # In those cases, the only token we will see is </think>, which
    # is when we switch to content state.
    state: Olmo3ReasoningState = Olmo3ReasoningState.REASONING

    def process_buffer(self) -> DeltaMessage | None:
        """Consume markers in `buffer` and emit the next delta, if any.

        Switches `state` when a marker is found; text preceding a marker
        is emitted under the mode that was active before the switch,
        otherwise the whole buffer is drained under the current mode.
        """
        start_think_idx = self.buffer.find(self.think_start)
        if start_think_idx >= 0:
            self.state = Olmo3ReasoningState.REASONING
            # Strip the marker; keep only the text that follows it.
            pretext, self.buffer = (
                self.buffer[:start_think_idx],
                self.buffer[start_think_idx + len(self.think_start) :],
            )
            if start_think_idx > 0:
                # this covers the case there's content before
                # the start of the reasoning block
                return DeltaMessage(content=pretext)
        # NOTE: rfind — splits at the LAST "</think>" currently buffered.
        end_think_idx = self.buffer.rfind(self.think_end)
        if end_think_idx >= 0:
            self.state = Olmo3ReasoningState.CONTENT
            pretext, self.buffer = (
                self.buffer[:end_think_idx],
                self.buffer[end_think_idx + len(self.think_end) :],
            )
            if end_think_idx > 0:
                # this covers the case there's content before
                # the end of the reasoning block
                return DeltaMessage(reasoning=pretext)
        if self.state == Olmo3ReasoningState.REASONING:
            # we are inside reasoning block, return and empty
            # the text buffer
            (
                text_buffer,
                self.buffer,
            ) = self.buffer, ""
            return DeltaMessage(reasoning=text_buffer)
        if self.state == Olmo3ReasoningState.CONTENT:
            # we are outside reasoning block, return and empty
            # the text buffer
            (
                text_buffer,
                self.buffer,
            ) = self.buffer, ""
            return DeltaMessage(content=text_buffer)
        # nothing to return unless we are in reasoning or content state
        return None

    def __len__(self):
        # is the length of the text buffer
        return len(self.buffer)

    def add_text(self, delta_text: str) -> DeltaMessage | None:
        """Append a streaming delta and emit a DeltaMessage when safe.

        Holds text back (returns None) while the tail of the delta could
        still be the beginning of a not-yet-complete marker.
        """
        # we start by adding the delta text to the buffer
        self.buffer += delta_text
        # setting this to empty before starting
        delta_message: DeltaMessage | None = None
        # we start by computing the overlap between the delta_text
        # and start/end of think tokens.
        _, overlap_think_start = string_overlap(delta_text, self.think_start)
        _, overlap_think_end = string_overlap(delta_text, self.think_end)
        # "partial" = the delta touches a marker but does not contain all
        # of it; the rest of the marker may arrive in the next delta.
        partial_overlap_start = overlap_think_start is not None and len(
            overlap_think_start
        ) < len(self.think_start)
        partial_overlap_end = overlap_think_end is not None and len(
            overlap_think_end
        ) < len(self.think_end)
        if (
            partial_overlap_start
            and self.think_start in self.buffer
            and not partial_overlap_end
        ):
            # we can only process the buffer if partial overlap
            # is the last part of think token (thus causing
            # text_buffer to contain the start of think token)
            # and there are no partial overlaps with end think
            delta_message = self.process_buffer()
        elif partial_overlap_end and self.think_end in self.buffer:
            # same as before (partial overlap only allowed)
            # if the buffer contains the end think token,
            # but we don't have to check for partial overlap
            # with start think token because they are handled
            # by the previous condition
            delta_message = self.process_buffer()
        elif partial_overlap_start or partial_overlap_end:
            # in general, if there are overlaps, we don't
            # process the buffer because we want to wait until
            # the think token is fully completed.
            return None
        else:
            # we process the buffer as normal
            delta_message = self.process_buffer()
        return delta_message
class Olmo3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for Olmo 3 model

    Olmo3ReasoningParser

    This class implements a reasoning parser specifically designed for the
    Olmo 3 family of models. Olmo 3 models do not use special tokens to
    indicate reasoning; rather, reasoning trace is wrapped in `<think>` and
    `</think>`, which are tokenized using standard vocabulary entries.
    Because of this, the parser operates in string space, accumulating the
    characters in a buffer until it sees `<think>` or `</think>` markers
    to switch modes.

    Key Features:
    - For non-stream output, recognizes and extracts reasoning (text
      bracketed by `<think>` and `</think>`) and content (everything
      after the first `</think>`).
    - For stream process, it uses a buffer to accumulate delta text,
      and output progressive delta messages as soon as thinking starts
      or ends.
    - For reliability, some Olmo 3 models may hardcode the first
      `<think>` token in the input text (similar to Deepseek R1,
      or reasoning-only Qwen models). To support such variants, the
      parser can optionally work in cases where the first `<think>`
      token is missing from generation.
    """

    def __init__(self, tokenizer: "TokenizerLike", *args, **kwargs):
        """Build the extraction regex and the streaming buffer."""
        super().__init__(tokenizer, *args, **kwargs)
        self.think_start = r"<think>"
        self.think_end = r"</think>"
        # notice that the first think is optional; this allows template to
        # work in cases when we hardcode a <think> at the beginning of the
        # reasoning template.
        reasoning_expr = (
            rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
            rf"{self.think_end}(?P<content>.*)$"
        )
        self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)
        # NOTE(review): this buffer carries state across streaming calls;
        # one parser instance is assumed per generation stream — confirm.
        self.buffer = Olmo3ReasoningBuffer(
            think_start=self.think_start, think_end=self.think_end
        )

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """True once the decoded output contains `</think>`."""
        text = self.model_tokenizer.decode(input_ids)
        return self.think_end in text

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # for Olmo 3 streaming reason parsing, the stream parse
        # will call first, and the same token will be called in
        # is_reasoning_end and extract_content_ids
        # this id is not part of content, so just return [] here.
        return []

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.

        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest | ResponsesRequest): Request being
                processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """
        re_match = self.reasoning_regex.match(model_output)
        if re_match:
            # Empty groups are normalized to None for the API response.
            reasoning = re_match.group("reasoning") or None
            content = re_match.group("content") or None
            return reasoning, content
        # no reasoning content
        return None, model_output

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract content using token ID sequence state machine"""
        delta_message = self.buffer.add_text(delta_text)
        if delta_message is None and self.buffer.think_end in self.buffer.buffer:
            # this is a bit hacky, but, because of how the buffer is
            # constructed, if the last delta_text contains characters that
            # marks the end of thinking tokens, then messages in the buffer
            # would never be processed because we get no other turn. To get
            # around that, we check if the text buffer contains the end of
            # thinking tokens, and, if so, we reprocess the buffer again.
            delta_message = self.buffer.process_buffer()
        return delta_message

View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3/Qwen3.5 model family.

    Qwen3 wraps reasoning in <think>...</think>. From Qwen3.5 onward the
    chat template places <think> in the prompt, so generation usually
    contains only </think>; older templates (up to the 2507 releases,
    e.g. Qwen/Qwen3-235B-A22B-Instruct-2507) let the model generate
    <think> itself. Both styles are handled: a generated <think> is
    stripped before extraction (non-streaming) or skipped (streaming).

    When reasoning is disabled via 'enable_thinking=False' the template
    puts <think>\\n\\n</think>\\n\\n into the prompt; the serving layer
    detects that through prompt_is_reasoning_end and routes deltas as
    plain content without calling the streaming parser.
    """

    @property
    def start_token(self) -> str:
        """Marker that opens a reasoning section."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """Marker that closes a reasoning section."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Split a complete model output into (reasoning, content).

        A leading <think>, if generated, is removed first. Without any
        </think> in the output the whole text is treated as content
        (thinking disabled, or the model produced no reasoning).

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """
        # Skip a generated <think> marker, when present.
        before, marker, after = model_output.partition(self.start_token)
        text = after if marker else before
        if self.end_token not in text:
            # No end marker: everything is plain content.
            return None, text
        # Reasoning precedes the first </think>; content follows it.
        reasoning, _, content = text.partition(self.end_token)
        return reasoning, content if content else None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Route a streaming delta to reasoning or content.

        Everything generated before </think> is reasoning and everything
        after it is content. A <think> appearing in the delta (old chat
        template / edge case) is skipped.

        NOTE: When thinking is disabled, no think tokens appear in the
        generated output; the serving layer detects this via
        prompt_is_reasoning_end and never calls this method.
        """
        # Skip a generated <think> marker inside this delta.
        if self.start_token_id in delta_token_ids:
            marker_at = delta_text.find(self.start_token)
            if marker_at != -1:
                delta_text = delta_text[marker_at + len(self.start_token) :]

        if self.end_token_id in delta_token_ids:
            split_at = delta_text.find(self.end_token)
            if split_at == -1:
                # End token id present but its text was already stripped.
                return None
            reasoning_part = delta_text[:split_at]
            content_part = delta_text[split_at + len(self.end_token) :]
            if not reasoning_part and not content_part:
                return None
            return DeltaMessage(
                reasoning=reasoning_part if reasoning_part else None,
                content=content_part if content_part else None,
            )

        if not delta_text:
            # Nothing remains after stripping the start marker.
            return None
        if self.end_token_id in previous_token_ids:
            # Reasoning ended in an earlier delta: pure content now.
            return DeltaMessage(content=delta_text)
        # Still inside the reasoning phase.
        return DeltaMessage(reasoning=delta_text)

View File

@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
class SeedOSSReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the SeedOSS model.

    SeedOSS wraps its reasoning trace in <seed:think>...</seed:think>;
    this parser extracts the text between those markers. Similar to
    DeepSeek R1, it also supports outputs where the model never emits
    the start marker.
    """

    @property
    def start_token(self) -> str:
        """Marker that opens a reasoning section."""
        return "<seed:think>"

    @property
    def end_token(self) -> str:
        """Marker that closes a reasoning section."""
        return "</seed:think>"

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Sequence
from itertools import islice
import regex as re
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser
logger = init_logger(__name__)
class Step3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Step3 model.

    Step3 terminates its reasoning with a bare </think> token: everything
    before the first </think> is reasoning, everything after is content.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Resolve the </think> token id from the tokenizer vocabulary.

        Raises:
            ValueError: no tokenizer was supplied.
            RuntimeError: the tokenizer has no </think> entry.
        """
        super().__init__(tokenizer, *args, **kwargs)
        self.think_end_token = "</think>"
        self.reasoning_regex = re.compile(
            rf"(.*?){self.think_end_token}", re.DOTALL
        )
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )
        self.think_end_token_id = self.vocab.get(self.think_end_token)
        if self.think_end_token_id is None:
            raise RuntimeError(
                "Step3 reasoning parser could not locate think end "
                "token in the tokenizer!"
            )

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Route a streaming delta (previous + delta = current).

        For text "abc</think>xyz": 'abc' goes to reasoning and 'xyz'
        goes to content. Token ids are used for the fast-path checks.
        """
        end_id = self.think_end_token_id
        # A delta that is exactly the </think> token carries no text.
        if len(delta_token_ids) == 1 and delta_token_ids[0] == end_id:
            return None
        if end_id in delta_token_ids:
            # The terminator arrives inside this delta: split around it.
            split_at = delta_text.find(self.think_end_token)
            trailing = delta_text[split_at + len(self.think_end_token) :]
            return DeltaMessage(
                reasoning=delta_text[:split_at],
                content=trailing if trailing else None,
            )
        if end_id in previous_token_ids:
            # Terminator already seen earlier: pure content from here on.
            return DeltaMessage(content=delta_text)
        # No terminator yet: still reasoning.
        return DeltaMessage(reasoning=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Split a complete output at the first </think> marker."""
        if self.think_end_token not in model_output:
            # Never terminated: the whole output is reasoning.
            return model_output, None
        # partition() splits at the first occurrence, matching find().
        reasoning, _, content = model_output.partition(self.think_end_token)
        return reasoning, content if content else None

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """True once </think> appears anywhere in the generated ids."""
        return self.think_end_token_id in input_ids

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        """True when the newly generated ids contain </think>."""
        return self.think_end_token_id in delta_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Return the ids after </think>.

        The terminator must appear strictly before the final position;
        a trailing </think> yields no content yet.
        """
        end_id = self.think_end_token_id
        if end_id in islice(input_ids, max(0, len(input_ids) - 1)):
            return input_ids[input_ids.index(end_id) + 1 :]
        return []

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Sequence
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import TokenizerLike
class Step3p5ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for Step3p5 model.

    Step3p5 uses the <think>...</think> format, but it tends to emit an extra
    newline immediately before and/or after the </think> token. This parser trims:
    - the newline right before </think>
    - the newline right after </think>
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        # Used to hold a trailing "\n" from reasoning content so we can decide
        # whether it is immediately before </think>.
        self._pending_reasoning_newline = False
        # Tracks whether we've seen </think> but are still waiting for one more
        # token to confirm the end.
        self._end_token_pending = False

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """True when reasoning has ended, per the last-marker state machine."""
        return self._is_reasoning_end_from_ids(input_ids)

    def is_reasoning_end_streaming(
        self, input_ids: Sequence[int], delta_ids: Iterable[int]
    ) -> bool:
        # Only examine newly generated tokens; they may contain multiple ids.
        # NOTE(review): delta_ids is typed Iterable, but the helper below
        # uses len() and indexing — callers appear to pass sequences; confirm.
        return self._is_reasoning_end_from_ids(delta_ids)

    def _is_reasoning_end_from_ids(self, input_ids: Sequence[int]) -> bool:
        """Stateful check: reasoning ends one token AFTER </think> appears.

        Mutates self._end_token_pending across calls to remember that
        </think> was the final id of a previous delta.
        """
        # Scan backwards to find the last special token, <think> or </think>.
        last_special = None
        last_idx = -1
        for i in range(len(input_ids) - 1, -1, -1):
            token_id = input_ids[i]
            if token_id == self.start_token_id:
                last_special = "start"
                last_idx = i
                break
            if token_id == self.end_token_id:
                last_special = "end"
                last_idx = i
                break
        if last_special == "start":
            # If we're already waiting for one token after </think>, do not
            # clear the pending state just because the prompt contains <think>.
            # Streaming deltas should not include <think> for this model.
            if self._end_token_pending:
                return False
            # A start token after any end token means reasoning is ongoing.
            self._end_token_pending = False
            return False
        if last_special == "end":
            # Require at least one token after </think> before ending.
            if last_idx < len(input_ids) - 1:
                self._end_token_pending = False
                return True
            self._end_token_pending = True
            return False
        # No special tokens in this input. If we were waiting for one token
        # after </think>, any new token completes the end.
        if self._end_token_pending and input_ids:
            self._end_token_pending = False
            return True
        return False

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Non-streaming split, then trim the single newline adjacent to
        </think> on each side (see class docstring)."""
        reasoning, content = super().extract_reasoning(model_output, request)
        if reasoning is not None:
            reasoning = reasoning.removesuffix("\n")
        if content is not None:
            content = content.removeprefix("\n")
        return reasoning or None, content or None

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Streaming split with newline trimming around </think>.

        A trailing "\n" of reasoning is held back (pending) until the next
        delta reveals whether </think> follows it; the "\n" right after
        </think> is dropped.
        """
        # Drop the immediate newline that models often emit after </think>.
        if previous_text.endswith(self.end_token) and delta_text:
            if delta_text == "\n":
                return None
            elif delta_text.startswith("\n"):
                remaining = delta_text.removeprefix("\n")
                return DeltaMessage(content=remaining) if remaining else None
        ret = super().extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
        if ret is None:
            return None
        # Compatibility path for models that don't generate the start token:
        # treat everything before </think> as reasoning and everything after
        # as content.
        if (
            self.start_token_id not in previous_token_ids
            and self.start_token_id not in delta_token_ids
        ):
            if self.end_token_id in delta_token_ids:
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                ret = DeltaMessage(reasoning=reasoning, content=content or None)
            elif self.end_token_id in previous_token_ids:
                ret = DeltaMessage(content=delta_text)
            else:
                ret = DeltaMessage(reasoning=delta_text)
        reasoning_to_output = ret.reasoning
        content_to_output = ret.content
        # Reasoning: handle the newline immediately before </think>.
        if reasoning_to_output is not None:
            if self._pending_reasoning_newline:
                # Previous delta's held newline was NOT before </think>:
                # restore it ahead of the new reasoning text.
                reasoning_to_output = "\n" + reasoning_to_output
                self._pending_reasoning_newline = False
            if reasoning_to_output.endswith("\n"):
                reasoning_to_output = reasoning_to_output.removesuffix("\n")
                if self.end_token in delta_text:
                    # Trailing "\n" is right before </think>, drop it.
                    self._pending_reasoning_newline = False
                else:
                    # Hold the trailing "\n" until we know whether </think> follows.
                    self._pending_reasoning_newline = True
        # Content: handle the newline immediately after </think>.
        if content_to_output is not None:
            # If we have content, reasoning must have ended.
            self._pending_reasoning_newline = False
            if self.end_token in delta_text and content_to_output.startswith("\n"):
                content_to_output = content_to_output.removeprefix("\n")
        reasoning_to_output = reasoning_to_output or None
        content_to_output = content_to_output or None
        if reasoning_to_output is None and content_to_output is None:
            return None
        return DeltaMessage(reasoning=reasoning_to_output, content=content_to_output)