v1.0
This commit is contained in:
92
reasoning/__init__.py
Normal file
92
reasoning/__init__.py
Normal file
@@ -0,0 +1,92 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.reasoning.abs_reasoning_parsers import ReasoningParser, ReasoningParserManager

__all__ = [
    "ReasoningParser",
    "ReasoningParserManager",
]

# NOTE: Parsers listed below are registered *lazily*: only the (module, class)
# mapping is stored at import time, and the parser module is imported on first
# lookup via ReasoningParserManager.get_reasoning_parser(name).
#
# Equivalent manual registration:
#     ReasoningParserManager.register_lazy_module(
#         name="qwen3",
#         module_path="vllm.reasoning.qwen3_reasoning_parser",
#         class_name="Qwen3ReasoningParser",
#     )

# name -> (filename under vllm.reasoning, class name)
_REASONING_PARSERS_TO_REGISTER = {
    "deepseek_r1": (
        "deepseek_r1_reasoning_parser",
        "DeepSeekR1ReasoningParser",
    ),
    "deepseek_v3": (
        "deepseek_v3_reasoning_parser",
        "DeepSeekV3ReasoningParser",
    ),
    "ernie45": (
        "ernie45_reasoning_parser",
        "Ernie45ReasoningParser",
    ),
    "glm45": (
        "glm4_moe_reasoning_parser",
        "Glm4MoeModelReasoningParser",
    ),
    "openai_gptoss": (
        "gptoss_reasoning_parser",
        "GptOssReasoningParser",
    ),
    "granite": (
        "granite_reasoning_parser",
        "GraniteReasoningParser",
    ),
    "hunyuan_a13b": (
        "hunyuan_a13b_reasoning_parser",
        "HunyuanA13BReasoningParser",
    ),
    # kimi_k2 reuses the DeepSeek-R1 parser implementation.
    "kimi_k2": (
        "deepseek_r1_reasoning_parser",
        "DeepSeekR1ReasoningParser",
    ),
    "minimax_m2": (
        "minimax_m2_reasoning_parser",
        "MiniMaxM2ReasoningParser",
    ),
    "minimax_m2_append_think": (
        "minimax_m2_reasoning_parser",
        "MiniMaxM2AppendThinkReasoningParser",
    ),
    "mistral": (
        "mistral_reasoning_parser",
        "MistralReasoningParser",
    ),
    "olmo3": (
        "olmo3_reasoning_parser",
        "Olmo3ReasoningParser",
    ),
    "qwen3": (
        "qwen3_reasoning_parser",
        "Qwen3ReasoningParser",
    ),
    "seed_oss": (
        "seedoss_reasoning_parser",
        "SeedOSSReasoningParser",
    ),
    "step3": (
        "step3_reasoning_parser",
        "Step3ReasoningParser",
    ),
}


def register_lazy_reasoning_parsers() -> None:
    """Register every built-in reasoning parser lazily.

    Walks ``_REASONING_PARSERS_TO_REGISTER`` and records each
    name -> (module path, class name) pair with the manager; no parser
    module is actually imported until it is first requested.
    """
    for name, (file_name, class_name) in _REASONING_PARSERS_TO_REGISTER.items():
        module_path = f"vllm.reasoning.{file_name}"
        ReasoningParserManager.register_lazy_module(name, module_path, class_name)


# Populate the registry as a module-import side effect so lookups work
# immediately after `import vllm.reasoning`.
register_lazy_reasoning_parsers()
||||
BIN
reasoning/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/abs_reasoning_parsers.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/abs_reasoning_parsers.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/basic_parsers.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/basic_parsers.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
reasoning/__pycache__/ernie45_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/ernie45_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/glm4_moe_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/glm4_moe_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/gptoss_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/gptoss_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/granite_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/granite_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
reasoning/__pycache__/identity_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/identity_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
reasoning/__pycache__/mistral_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/mistral_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/olmo3_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/olmo3_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/qwen3_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/qwen3_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/seedoss_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/seedoss_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
BIN
reasoning/__pycache__/step3_reasoning_parser.cpython-312.pyc
Normal file
BIN
reasoning/__pycache__/step3_reasoning_parser.cpython-312.pyc
Normal file
Binary file not shown.
290
reasoning/abs_reasoning_parsers.py
Normal file
290
reasoning/abs_reasoning_parsers.py
Normal file
@@ -0,0 +1,290 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
import os
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Callable, Sequence
|
||||
from functools import cached_property
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.import_utils import import_from_path
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaMessage,
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
else:
|
||||
ChatCompletionRequest = Any
|
||||
DeltaMessage = Any
|
||||
ResponsesRequest = Any
|
||||
AnyTokenizer = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ReasoningParser:
    """
    Abstract reasoning parser class that should not be used directly.
    Provided methods should be used in derived classes.

    It is used to extract reasoning content from the model output.
    """

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        # Stored for subclasses; extra *args/**kwargs are accepted (and
        # ignored here) so subclasses can forward construction arguments.
        self.model_tokenizer = tokenizer

    @cached_property
    def vocab(self) -> dict[str, int]:
        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
        # whereas all tokenizers have .get_vocab()
        return self.model_tokenizer.get_vocab()

    @abstractmethod
    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """
        Check if the reasoning content ends in the input_ids.

        It is used in structured engines like `xgrammar` to check if the
        reasoning content ends in the model output.

        Parameters:
        input_ids: list[int]
            The input_ids of the model output.

        Returns:
        bool
            True if the reasoning content ends in the input_ids.
        """

    @abstractmethod
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract content token ids from the input_ids.

        Parameters:
        input_ids: list[int]
            The input_ids of the model output.

        Returns:
        list[int]
            The extracted content from the input_ids.
        """

    @abstractmethod
    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from a complete model-generated string.

        Used for non-streaming responses where we have the entire model response
        available before sending to the client.

        Parameters:
        model_output: str
            The model-generated string to extract reasoning content from.

        request: ChatCompletionRequest
            The request object that was used to generate the model_output.

        Returns:
        tuple[Optional[str], Optional[str]]
            A tuple containing the reasoning content and the content.
        """

    @abstractmethod
    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and
        streaming. Has to be an instance method because it requires state -
        the current tokens/diffs, but also the information about what has
        previously been parsed and extracted (see constructor)
        """

    def prepare_structured_tag(
        self,
        original_tag: str | None,
        tool_server: ToolServer | None,
    ) -> str | None:
        """
        Prepare the structured tag for structured-output engines.

        The base implementation has nothing to prepare and always returns
        ``None``; subclasses may override it to return an actual tag.
        """
        # FIX: was annotated `-> str` but the default implementation
        # unconditionally returns None, so the annotation is `str | None`.
        return None
||||
|
||||
|
||||
class ReasoningParserManager:
    """
    Central registry for ReasoningParser implementations.

    Supports two registration modes:
    - Eager registration via `register_module`
    - Lazy registration via `register_lazy_module`

    Each reasoning parser must inherit from `ReasoningParser`.
    """

    # Eagerly registered (or already-imported) parser classes, keyed by name.
    reasoning_parsers: dict[str, type[ReasoningParser]] = {}
    # Lazily registered parsers; imported on first lookup.
    lazy_parsers: dict[str, tuple[str, str]] = {}  # name -> (module_path, class_name)

    @classmethod
    def get_reasoning_parser(cls, name: str) -> type[ReasoningParser]:
        """
        Retrieve a registered or lazily registered ReasoningParser class.

        If the parser is lazily registered, it will be imported and cached
        on first access.

        Raises:
            KeyError: if no parser is found under the given name.
        """
        # Eager entries win over lazy ones with the same name.
        if name in cls.reasoning_parsers:
            return cls.reasoning_parsers[name]

        if name in cls.lazy_parsers:
            return cls._load_lazy_parser(name)

        raise KeyError(f"Reasoning parser '{name}' not found.")

    @classmethod
    def list_registered(cls) -> list[str]:
        """Return names of all eagerly and lazily registered reasoning parsers."""
        return sorted(set(cls.reasoning_parsers.keys()) | set(cls.lazy_parsers.keys()))

    @classmethod
    def _load_lazy_parser(cls, name: str) -> type[ReasoningParser]:
        """Import and register a lazily loaded reasoning parser."""
        module_path, class_name = cls.lazy_parsers[name]
        try:
            mod = importlib.import_module(module_path)
            parser_cls = getattr(mod, class_name)
            # Validate before caching so bad entries are never served.
            if not issubclass(parser_cls, ReasoningParser):
                raise TypeError(
                    f"{class_name} in {module_path} is not a ReasoningParser subclass."
                )

            # Cache in the eager dict so subsequent lookups skip the import.
            cls.reasoning_parsers[name] = parser_cls  # cache
            return parser_cls
        except Exception as e:
            # Log with traceback, then re-raise so callers see the failure.
            logger.exception(
                "Failed to import lazy reasoning parser '%s' from %s: %s",
                name,
                module_path,
                e,
            )
            raise

    @classmethod
    def _register_module(
        cls,
        module: type[ReasoningParser],
        module_name: str | list[str] | None = None,
        force: bool = True,
    ) -> None:
        """Register a ReasoningParser class immediately.

        ``module_name`` may be a single name, a list of aliases, or None
        (in which case the class's __name__ is used). With ``force`` False,
        registering an existing name raises KeyError instead of overwriting.
        """
        if not issubclass(module, ReasoningParser):
            raise TypeError(
                f"module must be subclass of ReasoningParser, but got {type(module)}"
            )

        # Normalize module_name to a list of names.
        if module_name is None:
            module_names = [module.__name__]
        elif isinstance(module_name, str):
            module_names = [module_name]
        elif is_list_of(module_name, str):
            module_names = module_name
        else:
            raise TypeError("module_name must be str, list[str], or None.")

        for name in module_names:
            if not force and name in cls.reasoning_parsers:
                existed = cls.reasoning_parsers[name]
                raise KeyError(f"{name} is already registered at {existed.__module__}")
            cls.reasoning_parsers[name] = module

    @classmethod
    def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None:
        """
        Register a lazy module mapping for delayed import.

        Example:
            ReasoningParserManager.register_lazy_module(
                name="qwen3",
                module_path="vllm.reasoning.parsers.qwen3_reasoning_parser",
                class_name="Qwen3ReasoningParser",
            )
        """
        cls.lazy_parsers[name] = (module_path, class_name)

    @classmethod
    def register_module(
        cls,
        name: str | list[str] | None = None,
        force: bool = True,
        module: type[ReasoningParser] | None = None,
    ) -> (
        type[ReasoningParser] | Callable[[type[ReasoningParser]], type[ReasoningParser]]
    ):
        """
        Register module with the given name or name list. it can be used as a
        decoder(with module as None) or normal function(with module as not
        None).
        """
        if not isinstance(force, bool):
            raise TypeError(f"force must be a boolean, but got {type(force)}")

        # Immediate registration (explicit call)
        if module is not None:
            cls._register_module(module=module, module_name=name, force=force)
            return module

        # Decorator usage
        # NOTE(review): the decorator path registers *lazily* (records the
        # class's module/name in lazy_parsers) and ignores `force` — confirm
        # that overwrite protection is intentionally eager-only.
        def _decorator(obj: type[ReasoningParser]) -> type[ReasoningParser]:
            module_path = obj.__module__
            class_name = obj.__name__

            # Normalize name to a list; default to the class name itself.
            if isinstance(name, str):
                names = [name]
            elif is_list_of(name, str):
                names = name
            else:
                names = [class_name]

            for n in names:
                cls.lazy_parsers[n] = (module_path, class_name)

            return obj

        return _decorator

    @classmethod
    def import_reasoning_parser(cls, plugin_path: str) -> None:
        """
        Import a user-defined reasoning parser by the path
        of the reasoning parser define file.
        """
        # Module name is the plugin's file stem (e.g. /x/my_parser.py -> my_parser).
        module_name = os.path.splitext(os.path.basename(plugin_path))[0]

        try:
            import_from_path(module_name, plugin_path)
        except Exception:
            # Best-effort: failures are logged (with traceback) and swallowed
            # so a broken plugin does not abort startup.
            logger.exception(
                "Failed to load module '%s' from %s.", module_name, plugin_path
            )
            return
||||
162
reasoning/basic_parsers.py
Normal file
162
reasoning/basic_parsers.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Sequence
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from vllm.entrypoints.openai.protocol import DeltaMessage
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ResponsesRequest,
|
||||
)
|
||||
else:
|
||||
ChatCompletionRequest = Any
|
||||
ResponsesRequest = Any
|
||||
|
||||
|
||||
class BaseThinkingReasoningParser(ReasoningParser):
    """
    Base class for reasoning parsers that use thinking tokens.

    This class provides common functionality for parsers that use start and end
    tokens to delimit reasoning content (
    e.g., <think>...</think>, <seed:think>...</seed:think>).

    Subclasses must implement the start and end tokens via abstract
    properties.
    """

    @property
    @abstractmethod
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        raise NotImplementedError

    @property
    @abstractmethod
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        raise NotImplementedError

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        if not self.start_token or not self.end_token:
            raise ValueError("start_token and end_token must be defined in subclasses")

        # Both delimiters must exist as single tokens in the vocabulary —
        # all the token-id based fast paths below rely on that.
        self.start_token_id = self.vocab.get(self.start_token)
        self.end_token_id = self.vocab.get(self.end_token)
        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                f"{self.__class__.__name__} reasoning parser could not locate "
                "think start/end tokens in the tokenizer!"
            )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Scan from the end since the end token, if present, is usually
        # near the tail of the generated ids.
        end_token_id = self.end_token_id
        return any(input_id == end_token_id for input_id in reversed(input_ids))

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content after the end tokens
        """
        # Only treat reasoning as finished when the end token appears
        # *before* the final position; a trailing end token means no
        # content has been generated yet.
        if self.end_token_id not in input_ids[:-1]:
            return []
        else:
            return input_ids[input_ids.index(self.end_token_id) + 1 :]

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        """
        # Skip single special tokens
        if len(delta_token_ids) == 1 and (
            delta_token_ids[0] in [self.start_token_id, self.end_token_id]
        ):
            return None

        # Check if start token is present in previous or delta.
        # Keep compatibility with models that don't generate start tokens.
        if self.start_token_id in previous_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in previous, end token in delta,
                # extract reasoning content
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
                    reasoning=reasoning, content=content if content else None
                )
            elif self.end_token_id in previous_token_ids:
                # start token in previous, end token in previous,
                # reasoning content continues
                return DeltaMessage(content=delta_text)
            else:
                # start token in previous, no end token in previous or delta,
                # reasoning content continues
                return DeltaMessage(reasoning=delta_text)
        elif self.start_token_id in delta_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in delta, end token in delta,
                # extract reasoning content
                start_index = delta_text.find(self.start_token)
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[start_index + len(self.start_token) : end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
                    reasoning=reasoning, content=content if content else None
                )
            else:
                # start token in delta, no end token in delta,
                # reasoning content continues
                return DeltaMessage(reasoning=delta_text)
        else:
            # not find thinking start token
            return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.

        This is the base implementation that works for most models.
        Subclasses can override this method for specific behavior.
        """
        # Check if the start token is present in the model output, remove it
        # if it is present.
        model_output_parts = model_output.partition(self.start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )

        # For models that may not generate start token,
        # assume the reasoning content is always at the start.
        if self.end_token not in model_output:
            return model_output, None
        else:
            reasoning, _, content = model_output.partition(self.end_token)
            # If generation stops right after end-of-think, return null content
            final_content = content or None
            return reasoning, final_content
||||
67
reasoning/deepseek_r1_reasoning_parser.py
Normal file
67
reasoning/deepseek_r1_reasoning_parser.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from vllm.entrypoints.openai.protocol import DeltaMessage
|
||||
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
|
||||
|
||||
|
||||
class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for DeepSeek R1 model.

    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning
    text. This parser extracts the reasoning content from the model output.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Stream-extract reasoning, tolerating a missing <think> token.

        R1 may begin reasoning without emitting <think>; the base class
        would treat such text as plain content, so the no-start-token case
        is overridden below to treat it as reasoning instead.
        """
        ret = super().extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
        # Only rewrite the base result when no <think> has been seen at all
        # (neither in previous nor in delta tokens).
        if (
            ret is not None
            and self.start_token_id not in previous_token_ids
            and self.start_token_id not in delta_token_ids
        ):
            if self.end_token_id in delta_token_ids:
                # end token in delta with more tokens,
                # extract reasoning content and content
                end_index = delta_text.find(self.end_token)
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token) :]
                return DeltaMessage(
                    reasoning=reasoning,
                    content=content if content else None,
                )
            elif self.end_token_id in previous_token_ids:
                # end token in previous, thinking content ends
                return DeltaMessage(content=delta_text)
            else:
                # no end token in previous or delta, reasoning content continues
                return DeltaMessage(reasoning=delta_text)

        return ret
||||
62
reasoning/deepseek_v3_reasoning_parser.py
Normal file
62
reasoning/deepseek_v3_reasoning_parser.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
|
||||
|
||||
from .identity_reasoning_parser import IdentityReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class DeepSeekV3ReasoningParser(ReasoningParser):
    """
    V3 parser that delegates to either DeepSeekR1ReasoningParser or
    IdentityReasoningParser based on `thinking` and `separate_reasoning`.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # `chat_template_kwargs` is removed from kwargs here so it is NOT
        # forwarded to the delegate parser constructed below.
        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
        thinking = bool(chat_kwargs.pop("thinking", False))

        # thinking=True -> parse <think>...</think> via the R1 parser;
        # otherwise pass output through unchanged.
        if thinking:
            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
        else:
            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)

    # All methods below simply delegate to the selected parser.

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        return self._parser.is_reasoning_end(input_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return self._parser.extract_content_ids(input_ids)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        return self._parser.extract_reasoning(model_output, request)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        return self._parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
||||
165
reasoning/ernie45_reasoning_parser.py
Normal file
165
reasoning/ernie45_reasoning_parser.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Ernie45ReasoningParser(BaseThinkingReasoningParser):
|
||||
"""
|
||||
Reasoning parser for Ernie45 thinking model.
|
||||
The Ernie45 thinking model ouput format is
|
||||
abc\n</think>\n\n<response>\ndef\n</response>\n
|
||||
or abc\n</think>\ndef
|
||||
"""
|
||||
|
||||
response_start_token: str = "<response>"
|
||||
response_end_token: str = "</response>"
|
||||
newline_token: str = "<0x0A>"
|
||||
|
||||
@property
|
||||
def start_token(self) -> str:
|
||||
"""The token that starts reasoning content."""
|
||||
return "<think>"
|
||||
|
||||
@property
|
||||
def end_token(self) -> str:
|
||||
"""The token that ends reasoning content."""
|
||||
return "</think>"
|
||||
|
||||
def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
|
||||
super().__init__(tokenizer, *args, **kwargs)
|
||||
|
||||
if not self.model_tokenizer:
|
||||
raise ValueError(
|
||||
"The model tokenizer must be passed to the ReasoningParser "
|
||||
"constructor during construction."
|
||||
)
|
||||
|
||||
self.start_token_id = self.vocab.get(self.start_token)
|
||||
self.end_token_id = self.vocab.get(self.end_token)
|
||||
self.response_start_token_id = self.vocab.get(self.response_start_token)
|
||||
self.response_end_token_id = self.vocab.get(self.response_end_token)
|
||||
self.newline_token_id = self.vocab.get(self.newline_token)
|
||||
|
||||
self.parser_token_ids = [self.end_token_id, self.response_end_token_id]
|
||||
|
||||
if self.start_token_id is None or self.end_token_id is None:
|
||||
raise RuntimeError(
|
||||
"Ernie45 reasoning parser could not locate think start/end "
|
||||
"tokens in the tokenizer!"
|
||||
)
|
||||
|
||||
def extract_reasoning_streaming(
|
||||
self,
|
||||
previous_text: str,
|
||||
current_text: str,
|
||||
delta_text: str,
|
||||
previous_token_ids: Sequence[int],
|
||||
current_token_ids: Sequence[int],
|
||||
delta_token_ids: Sequence[int],
|
||||
) -> DeltaMessage | None:
|
||||
"""
|
||||
Extract reasoning content from a delta message.
|
||||
Handles streaming output where previous + delta = current.
|
||||
Uses token IDs for faster processing.
|
||||
The Ernie45 thinking model ouput format is
|
||||
abc\n</think>\n\n<response>\ndef\n</response>\n
|
||||
or abc\n</think>\ndef
|
||||
- 'abc' goes to reasoning
|
||||
- 'def' goes to content
|
||||
"""
|
||||
# Skip single special tokens
|
||||
if len(delta_token_ids) == 1 and (
|
||||
delta_token_ids[0]
|
||||
in [
|
||||
self.start_token_id,
|
||||
self.end_token_id,
|
||||
self.response_start_token_id,
|
||||
self.response_end_token_id,
|
||||
]
|
||||
):
|
||||
return None
|
||||
|
||||
# No <think> in previous or delta, also need to check for </think>.
|
||||
# Because the model may have generated </think> without <think>
|
||||
if self.end_token_id in delta_token_ids:
|
||||
# </think> in delta with more tokens,
|
||||
# extract reasoning content and content
|
||||
think_end_index = delta_text.find(self.end_token)
|
||||
reasoning = delta_text[:think_end_index]
|
||||
content = delta_text[think_end_index + len(self.end_token) :]
|
||||
content = content.lstrip("\n")
|
||||
response_start_idx = content.find(self.response_start_token)
|
||||
response_end_idx = content.rfind(self.response_end_token)
|
||||
if response_start_idx != -1:
|
||||
content = content[response_start_idx + len(self.response_start_token) :]
|
||||
if response_end_idx != -1:
|
||||
content = content[:response_end_idx]
|
||||
return DeltaMessage(
|
||||
reasoning=reasoning,
|
||||
content=content if content else None,
|
||||
)
|
||||
elif self.end_token_id in previous_token_ids:
|
||||
# </think> in previous, thinking content ends
|
||||
content = delta_text
|
||||
if self.response_start_token_id in delta_token_ids:
|
||||
content = content.lstrip("\n")
|
||||
response_start_idx = content.find(self.response_start_token)
|
||||
content = content[response_start_idx + len(self.response_start_token) :]
|
||||
# if have </response>, remove it
|
||||
response_end_idx = content.rfind(self.response_end_token)
|
||||
if response_end_idx != -1:
|
||||
content = content[:response_end_idx]
|
||||
elif self.response_end_token_id in delta_token_ids:
|
||||
response_end_idx = content.rfind(self.response_end_token)
|
||||
content = content[:response_end_idx]
|
||||
# remove \n after </think> or </response>
|
||||
if previous_token_ids[-1] in self.parser_token_ids and (
|
||||
len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
|
||||
):
|
||||
content = content.lstrip("\n")
|
||||
# remove \n after </think>\n
|
||||
if (
|
||||
len(previous_token_ids) > 1
|
||||
and previous_token_ids[-2] == self.end_token_id
|
||||
) and (
|
||||
len(delta_token_ids) > 0 and delta_token_ids[0] == self.newline_token_id
|
||||
):
|
||||
content = content.lstrip("\n")
|
||||
|
||||
return DeltaMessage(content=content if content else None)
|
||||
else:
|
||||
# no </think> in previous or delta, reasoning content continues
|
||||
return DeltaMessage(reasoning=delta_text)
|
||||
|
||||
def extract_reasoning(
    self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
    """Extract reasoning and response content from a complete model output.

    The Ernie45 thinking model output looks like either
    ``abc\n</think>\n\n\n<response>\ndef\n</response>\n`` or
    ``abc\n</think>\ndef``:
    - 'abc' goes to reasoning
    - 'def' goes to content

    Returns:
        tuple[Optional[str], Optional[str]]: reasoning content and content
    """
    # The base parser splits on the </think> marker; this override only
    # strips an optional <response>...</response> wrapper from the content.
    reasoning, content = super().extract_reasoning(model_output, request)
    if content:
        open_idx = content.find(self.response_start_token)
        close_idx = content.rfind(self.response_end_token)
        # Unwrap only when both markers exist and are correctly ordered.
        if -1 < open_idx < close_idx:
            body_start = open_idx + len(self.response_start_token)
            content = content[body_start:close_idx]
    return reasoning, content or None
||||
171
reasoning/glm4_moe_reasoning_parser.py
Normal file
171
reasoning/glm4_moe_reasoning_parser.py
Normal file
@@ -0,0 +1,171 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Glm4MoeModelReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Glm4MoeModel model.

    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning
    text within its output. The model provides a strict switch to disable
    reasoning output via the 'enable_thinking=False' parameter. This parser
    extracts the reasoning content enclosed by <think> and </think> tokens
    from the model's output.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        # Marker strings used by the GLM-4-MoE chat template.
        self.think_start_token = "<think>"
        self.think_end_token = "</think>"
        self.assistant_token = "<|assistant|>"

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        # Resolve marker token ids once up front; all streaming checks below
        # are id-based for speed.
        self.think_start_token_id = self.vocab.get(self.think_start_token)
        self.think_end_token_id = self.vocab.get(self.think_end_token)
        self.assistant_token_id = self.vocab.get(self.assistant_token)
        if (
            self.think_start_token_id is None
            or self.think_end_token_id is None
            or self.assistant_token_id is None
        ):
            raise RuntimeError(
                "Glm4MoeModel reasoning parser could not locate "
                "think start/end or assistant tokens in the tokenizer!"
            )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """
        GLM's chat template has <think></think> tokens after every
        <|assistant|> token. Thus, we need to check if </think> is
        after the most recent <|assistant|> token (if present).
        """
        # Scan backwards; the first marker encountered decides the answer.
        for token_id in input_ids[::-1]:
            if token_id == self.think_end_token_id:
                return True
            elif token_id == self.assistant_token_id:
                return False
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content after the end tokens
        """
        # A </think> in the very last position is ignored: no content can
        # follow it yet.
        if self.think_end_token_id not in input_ids[:-1]:
            return []
        else:
            return input_ids[input_ids.index(self.think_end_token_id) + 1 :]

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        For text <think>abc</think>xyz:
        - 'abc' goes to reasoning
        - 'xyz' goes to content
        """
        # Skip single special tokens
        if len(delta_token_ids) == 1 and (
            delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]
        ):
            return None

        if self.think_start_token_id in previous_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in previous, </think> in delta,
                # extract reasoning content
                end_index = delta_text.find(self.think_end_token)
                reasoning = delta_text[:end_index]
                content = delta_text[end_index + len(self.think_end_token) :]
                return DeltaMessage(
                    reasoning=reasoning,
                    content=content if content else None,
                )
            elif self.think_end_token_id in previous_token_ids:
                # <think> in previous, </think> in previous:
                # reasoning already finished, delta is response content
                return DeltaMessage(content=delta_text)
            else:
                # <think> in previous, no </think> in previous or delta,
                # reasoning content continues
                return DeltaMessage(reasoning=delta_text)
        elif self.think_start_token_id in delta_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in delta, </think> in delta, extract reasoning content
                start_index = delta_text.find(self.think_start_token)
                end_index = delta_text.find(self.think_end_token)
                reasoning = delta_text[
                    start_index + len(self.think_start_token) : end_index
                ]
                content = delta_text[end_index + len(self.think_end_token) :]
                return DeltaMessage(
                    reasoning=reasoning,
                    content=content if content else None,
                )
            else:
                # <think> in delta, no </think> in delta,
                # reasoning content continues
                return DeltaMessage(reasoning=delta_text)
        else:
            # thinking is disabled, just content
            return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from the model output.

        For text <think>abc</think>xyz:
        - 'abc' goes to reasoning
        - 'xyz' goes to content

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """

        # Check if the model output contains the <think> and </think> tokens.
        if (
            self.think_start_token not in model_output
            or self.think_end_token not in model_output
        ):
            return None, model_output
        # Check if the <think> is present in the model output, remove it
        # if it is present.
        model_output_parts = model_output.partition(self.think_start_token)
        model_output = (
            model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
        )
        # Check if the model output contains the </think> tokens.
        # If the end token is not found, return the model output as is.
        if self.think_end_token not in model_output:
            return None, model_output

        # Extract reasoning content from the model output.
        reasoning, _, content = model_output.partition(self.think_end_token)

        final_content = content or None
        return reasoning, final_content
|
||||
173
reasoning/gptoss_reasoning_parser.py
Normal file
173
reasoning/gptoss_reasoning_parser.py
Normal file
@@ -0,0 +1,173 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
from collections.abc import Sequence
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.harmony_utils import parse_chat_output
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Base structural tag used to constrain reasoning output when no builtin
# tools are available: text between "<|channel|>analysis<|message|>" and
# "<|end|>" is treated as a free-form analysis (reasoning) segment, and the
# tag may fire repeatedly (stop_after_first=False).
# NOTE(review): "reaonsing" is a typo for "reasoning"; the name is kept as-is
# because it is referenced by other code in this module.
no_func_reaonsing_tag = {
    "type": "structural_tag",
    "format": {
        "type": "triggered_tags",
        "tags": [
            {
                "begin": "<|channel|>analysis<|message|>",
                "content": {"type": "any_text"},
                "end": "<|end|>",
            }
        ],
        "triggers": ["<|channel|>analysis"],
        "stop_after_first": False,
    },
}
|
||||
|
||||
|
||||
def from_builtin_tool_to_tag(tool: str) -> list[dict]:
    """Build the structural-tag entries for one builtin tool.

    Both the "commentary" and the "analysis" channels may address a builtin
    tool, so one tag dict is produced per channel (commentary first).
    """
    return [
        {
            "begin": f"<|channel|>{channel} to={tool}",
            "content": {"type": "any_text"},
            "end": "<|end|>",
        }
        for channel in ("commentary", "analysis")
    ]
|
||||
|
||||
|
||||
def tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list: list[str]) -> dict:
    """Return a deep copy of the base structural tag extended with builtin tools.

    The commentary channel is added as a trigger, and a pair of tag entries
    (commentary + analysis) is appended for every tool in *builtin_tool_list*.
    The input mapping is never mutated.
    """
    import copy

    extended_tag = copy.deepcopy(no_func_reaonsing_tag)
    fmt = extended_tag["format"]
    fmt["triggers"].append("<|channel|>commentary to=")
    for tool_name in builtin_tool_list:
        fmt["tags"].extend(from_builtin_tool_to_tag(tool_name))
    return extended_tag
|
||||
|
||||
|
||||
class GptOssReasoningParser(ReasoningParser):
    """
    Reasoning parser for GptOss model.

    The GptOss model uses harmony to extract reasoning content and this parser
    is only used for detecting the end of the reasoning content.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        # The model can output some special tokens between "final" and "<|message|>"
        # So we need to look for both sequences to determine the end of reasoning.
        self.reasoning_end_token_ids_prefix = self.model_tokenizer.encode(
            "<|channel|>final"
        )
        self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>")
        # Upper bound on how many tokens may separate the prefix from the
        # suffix before the inner search gives up.
        self.reasoning_max_num_between_tokens = 20

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """Return True once "<|channel|>final ... <|message|>" appears in
        input_ids, allowing up to reasoning_max_num_between_tokens tokens
        between the two marker sequences."""
        end_token_ids_prefix = self.reasoning_end_token_ids_prefix
        end_token_ids_suffix = self.reasoning_end_token_ids_suffix
        assert len(end_token_ids_prefix) > 0, "reasoning_end_token_ids_prefix is empty"
        assert len(end_token_ids_suffix) > 0, "reasoning_end_token_ids_suffix is empty"
        # Check if the end sequence is present in the input_ids.
        # We search from the end of input_ids to find the last match.
        for i in range(len(input_ids) - len(end_token_ids_prefix), -1, -1):
            if input_ids[i : i + len(end_token_ids_prefix)] == end_token_ids_prefix:
                # We have found the prefix, now we look for the suffix after the prefix.
                suffix_start = i + len(end_token_ids_prefix)
                for j in range(
                    suffix_start, len(input_ids) - len(end_token_ids_suffix) + 1
                ):
                    if j - suffix_start >= self.reasoning_max_num_between_tokens:
                        break
                    if (
                        input_ids[j : j + len(end_token_ids_suffix)]
                        == end_token_ids_suffix
                    ):
                        return True
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Re-tokenize and return the final-channel content parsed out of
        input_ids by the harmony parser, or [] when no content exists yet."""
        _, content, _ = parse_chat_output(input_ids)
        if content is None:
            return []
        return self.model_tokenizer.encode(content)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Compute reasoning/content deltas by re-parsing both the previous
        and the current token streams with the harmony parser and diffing
        the two results (prefix subtraction when possible)."""
        prev_reasoning, prev_content, _ = parse_chat_output(list(previous_token_ids))
        cur_reasoning, cur_content, _ = parse_chat_output(list(current_token_ids))
        reasoning_delta = None
        content_delta = None
        if cur_reasoning is not None:
            prev_r = prev_reasoning or ""
            if cur_reasoning.startswith(prev_r):
                reasoning_delta = cur_reasoning[len(prev_r) :] or None
            else:
                # Parsed text diverged from the previous parse; emit it whole.
                reasoning_delta = cur_reasoning
        if cur_content is not None:
            prev_c = prev_content or ""
            if cur_content.startswith(prev_c):
                content_delta = cur_content[len(prev_c) :] or None
            else:
                content_delta = cur_content
        if reasoning_delta is None and content_delta is None:
            return None
        return DeltaMessage(reasoning=reasoning_delta, content=content_delta)

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> tuple[str | None, str | None]:
        raise NotImplementedError(
            "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used."  # noqa: E501
        )

    # This function prepares the structural tag to format reasoning output
    def prepare_structured_tag(
        self, original_tag: str | None, tool_server: ToolServer | None
    ) -> str:
        """Return a JSON structural tag for reasoning output: the base tag
        when no tool server is present, the base tag extended with builtin
        tools otherwise, or the caller-supplied tag unchanged."""
        if original_tag is None:
            if tool_server is None:
                return json.dumps(no_func_reaonsing_tag)
            else:
                builtin_tool_list: list[str] = []
                if tool_server.has_tool("browser"):
                    builtin_tool_list.append("browser")
                if tool_server.has_tool("python"):
                    builtin_tool_list.append("python")
                if tool_server.has_tool("container"):
                    builtin_tool_list.append("container")

                if len(builtin_tool_list) > 0:
                    logger.info("Builtin_tool_list: %s", builtin_tool_list)
                    func_tag = json.dumps(
                        tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list)
                    )
                else:
                    logger.info("Builtin_tool_list is empty")
                    func_tag = json.dumps(no_func_reaonsing_tag)

                return func_tag
        else:
            # There is potential risk for appending the tag to the original tag
            return original_tag
|
||||
363
reasoning/granite_reasoning_parser.py
Normal file
363
reasoning/granite_reasoning_parser.py
Normal file
@@ -0,0 +1,363 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import regex as re
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class GraniteReasoningParser(ReasoningParser):
    """
    Reasoning parser for IBM Granite.

    IBM granite models currently use "Here is my thought process:"
    and "Here is my response:" to separate its thinking / response outputs.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # NOTE: There have been some observed occurrences of quantized
        # instances of the current models using "Here's" instead of "Here is",
        # so to be safe, we match on both.
        self.think_start_expr = r"(?:Here's|Here is) my thought process:"
        self.response_start_expr = r"(?:Here's|Here is) my response:"

        self.reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
        )

        self.valid_think_starts = [
            "Here's my thought process:",
            "Here is my thought process:",
        ]
        self.valid_response_starts = ["Here's my response:", "Here is my response:"]

        # Substrings to match for sequence boundaries on raw text
        self.seq_boundary_end = ":"
        self.seq_boundary_start = "Here"

        # The longest any thinking / start of response message can be
        self.longest_think_start = max(
            len(think_start) for think_start in self.valid_think_starts
        )

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest): Request being processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """
        re_match = self.reasoning_regex.findall(model_output)
        if not re_match:
            return None, model_output
        reasoning, response_content = re_match[0]
        if not response_content:
            return reasoning, None
        return reasoning, response_content

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract the reasoning content / content emitted by granite models;
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        NOTE: Granite models do not use a special token to start their reasoning
        and response sections; instead they have token sequences, e.g.,

        Here is my thought process: Foo Here is my response: Bar

        This increases the complexity of correctly handling streams, since we
        need to watch for specific sequences and correctly parse them without
        dropping content that is potentially overlapping & spanning multiple
        delta messages.

        Args:
            previous_text (str): Previous text outside of this delta message.
            current_text (str): Previous text + delta text.
            delta_text (str): Text to consider and parse content from.
            previous_token_ids (Sequence[int]): Token IDs of previous_text.
            current_token_ids (Sequence[int]): Token IDs of current_text.
            delta_token_ids (Sequence[int]): Token IDs of delta_text.

        Returns:
            Union[DeltaMessage, None]
                DeltaMessage with either reasoning content or content, or None.
        """
        reasoning, resp_seq_len, content = self._get_content_sections(current_text)
        # Either we haven't finished the start of the reasoning sequence,
        # or the model is generating something unexpected.
        if not reasoning:
            delta_message = self._get_delta_message_with_no_reasoning_bounds(
                current_text, delta_text
            )
        # We have a start of reasoning message, but have not yet finished
        # the start of response sequence.
        elif not content:
            delta_message = self._get_delta_message_with_no_response_bounds(
                current_text, reasoning, delta_text
            )
        # We've finished both the start of reasoning and start of response seq.
        else:
            # This should never happen since we matched on the response
            assert resp_seq_len is not None
            delta_message = self._get_delta_message_with_both_bounds(
                delta_text, reasoning, content, current_text, resp_seq_len
            )
        if not delta_message.content and not delta_message.reasoning:
            return None
        return delta_message

    #### Implementation details of stream parsing for granite models
    def _is_reasoning_start_substr(self, text: str) -> bool:
        """Check if a text matches one of the possible start reasoning seqs.

        Args:
            text (str): Text to check for leading substr.

        Returns:
            bool: True if any of the possible reasoning start seqs match.
        """
        return any(
            think_start.startswith(text) for think_start in self.valid_think_starts
        )

    def _is_response_start_substr(self, text: str) -> bool:
        """Check if a text matches one of the possible start response seqs.

        Args:
            text (str): Text to check for leading substr.

        Returns:
            bool: True if any of the possible response start seqs match.
        """
        return any(
            response_start.startswith(text)
            for response_start in self.valid_response_starts
        )

    def _get_delta_message_with_no_reasoning_bounds(
        self,
        current_text: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has not yet completed
        its start of reasoning sequence.

        Args:
            current_text (str): The full previous + delta text.
            delta_text (str): Text to consider and parse content from.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        prev_longest_length = len(current_text) - len(delta_text)
        is_substr = self._is_reasoning_start_substr(current_text)
        was_substr = self._is_reasoning_start_substr(current_text[:prev_longest_length])

        # Check if we just generated something NOT in the special token seq;
        # if so, add everything that we previously skipped with this delta
        # message and append everything to content in the future.
        if was_substr and not is_substr:
            return DeltaMessage(
                reasoning=None,
                content=current_text,
            )
        if is_substr:
            # Might still be in the special token sequence; return nothing
            return DeltaMessage(reasoning=None, content=None)
        # Otherwise the sequence has already been broken and we already
        # corrected; just return the delta text as normal content.
        return DeltaMessage(reasoning=None, content=delta_text)

    def _get_delta_message_with_no_response_bounds(
        self,
        current_text: str,
        reasoning: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has both reasoning
        content with no (response) content. NOTE that we may have overlapping
        tokens with the start of reasoning / start of response sequences on
        either side of the delta text.

        Args:
            current_text (str): The full previous + delta text.
            reasoning (str): reasoning content from current_text.
            delta_text (str): Text to consider and parse content from.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        # If we have no reasoning content or explicitly end with the start of
        # response sequence, we are in transition to the response; need to be
        # careful here, since the final token (:) will match the reasoning
        # content and fully parse it out; we should not pass the : back.
        ends_with_start_response_seq = any(
            current_text.endswith(response_start)
            for response_start in self.valid_response_starts
        )
        if reasoning is None or ends_with_start_response_seq:
            return DeltaMessage(reasoning=None, content=None)

        # Consider previous / current text only within context of the reasoning
        previous_text = reasoning[: -len(delta_text)]
        current_text = reasoning

        # We need to be careful about adding unfinished response sequences;
        # Find the place at which we MIGHT be starting a response sequence
        prev_idx = previous_text.rfind(self.seq_boundary_start)
        delta_idx = delta_text.rfind(self.seq_boundary_start)

        # Check the state of potential start of response substring matches.
        prev_was_substr = (
            self._is_response_start_substr(previous_text[prev_idx:])
            if prev_idx >= 0
            else False
        )
        delta_continues_substr = (
            self._is_response_start_substr(current_text[prev_idx:])
            if prev_idx >= 0
            else False
        )
        delta_new_substr = (
            self._is_response_start_substr(delta_text[delta_idx:])
            if delta_idx >= 0
            else False
        )

        # Delta only contains potential continued response sequence text.
        if delta_continues_substr:
            return DeltaMessage(reasoning=None, content=None)

        if not prev_was_substr:
            # Delta may be starting a new response seq but has other text too.
            if delta_new_substr:
                return DeltaMessage(reasoning=delta_text[:delta_idx], content=None)
            # Normal case for most reasoning text (no potential special seqs).
            return DeltaMessage(reasoning=delta_text, content=None)
        # The substring that previously seemed to be a potential response
        # seq wasn't one; we need to add the content to the delta message,
        # and also slice off the potential response sequence
        elif delta_new_substr:
            reasoning = previous_text[prev_idx:] + delta_text[:delta_idx]
            return DeltaMessage(reasoning=reasoning, content=None)
        # No new substring yet, and we broke our old one; take the whole delta
        return DeltaMessage(
            reasoning=previous_text[prev_idx:] + delta_text,
            content=None,
        )

    def _get_delta_message_with_both_bounds(
        self,
        delta_text: str,
        reasoning: str,
        response_content: str,
        current_text: str,
        response_seq_len: int,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has both reasoning
        content and normal (response) content.

        Args:
            delta_text: Text to consider and parse content from.
            reasoning: reasoning content from current_text.
            response_content: response content from current_text.
            current_text: The full previous + delta text.
            response_seq_len: Len of the complete response sequence used.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        # Always have content; take length to the end
        delta_content = delta_text[-len(response_content) :]
        reasoning_end_idx = len(delta_text) - (len(response_content) + response_seq_len)

        if reasoning_end_idx < 0:
            delta_reasoning = None
        else:
            # Get the starting offset
            start_reasoning_idx = (
                len(reasoning) + response_seq_len + len(response_content) - 1
            )
            delta_offset = len(current_text) - len(delta_text)
            start_offset = start_reasoning_idx - delta_offset
            if start_offset < 0:
                start_offset = 0
            delta_reasoning = delta_text[start_offset:reasoning_end_idx]

        return DeltaMessage(
            reasoning=delta_reasoning,
            content=delta_content,
        )

    def _get_content_sections(
        self, current_text: str
    ) -> tuple[str | None, int | None, str | None]:
        """Parse the text to extract the reasoning content / content
        if we have them.

        Args:
            current_text (str): The full previous + delta text.

        Returns:
            tuple[Optional[str], Optional[int], Optional[str]]: Tuple of len 3
            containing the reasoning content, the length of the response seq
            (if there is one) and the non-reasoning content.
        """
        current_chunk_start = 0
        start_reasoning = None
        parsed_content = False
        # Every ":" is a candidate end of a marker sentence; scan chunk by
        # chunk between consecutive candidates.
        delimiter_idxs = [
            idx
            for idx, char in enumerate(current_text)
            if char == self.seq_boundary_end
        ]

        for current_chunk_end in delimiter_idxs:
            current_chunk = current_text[current_chunk_start:current_chunk_end]
            # Check to see if the start of reasoning seq if complete
            if start_reasoning is None:
                for think_start in self.valid_think_starts:
                    if current_chunk == think_start[:-1]:
                        start_reasoning = current_chunk_end + 1
                        current_chunk_start = current_chunk_end + 1
                        break

            # Check to see if the start of response seq if complete
            elif not parsed_content:
                for response_start in self.valid_response_starts:
                    if current_chunk[-len(response_start) + 1 :] == response_start[:-1]:
                        # Mark end of reasoning and start response content
                        # after the start of response sequence.
                        end_reasoning = current_chunk_end - len(response_start)
                        reasoning = current_text[start_reasoning:end_reasoning]
                        response_content = current_text[current_chunk_end + 1 :]
                        return reasoning, len(response_start), response_content

        if start_reasoning and not parsed_content:
            return current_text[start_reasoning:], None, None
        return None, None, None
|
||||
237
reasoning/hunyuan_a13b_reasoning_parser.py
Normal file
237
reasoning/hunyuan_a13b_reasoning_parser.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import regex as re
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class HunyuanA13BReasoningParser(ReasoningParser):
|
||||
"""
|
||||
Reasoning parser for Hunyuan A13B Model
|
||||
|
||||
HunyuanReasoningParser
|
||||
|
||||
This class implements a reasoning parser specifically designed
|
||||
for the Hunyuan A13B Model. It is responsible for parsing and
|
||||
extracting structured reasoning and answer segments from model
|
||||
outputs that follow a specific pattern.
|
||||
|
||||
Key Features:
|
||||
- For non-stream output , Recognizes and extracts reasoning ("think")
|
||||
and answer ("answer") sections from text using regular expressions.
|
||||
- For stream process, it requires a token id sequences to change the
|
||||
reasoning state and other state so it maintains internal state to
|
||||
manage parsing across multiple token.
|
||||
|
||||
|
||||
think start: "<think>\n": [14023, 771, 397]
|
||||
think ends: "\n</think>\n<answer>\n": [198, 524, 27963, 397, 27, 9399, 397]
|
||||
response ends: "\n</answer>": [524, 9399, 29]
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
    """Initialize regexes, marker token-id sequences, and stream state.

    Args:
        tokenizer: Tokenizer of the served model; forwarded to the
            ReasoningParser base constructor.
    """
    super().__init__(tokenizer, *args, **kwargs)
    # Raw marker expressions; response_end_expr is also used verbatim for
    # suffix trimming in extract_reasoning, so it must stay exact.
    self.think_start_expr = r"<think>\n"
    self.think_end_expr = r"\n</think>\n"

    self.response_start_expr = r"\n</think>\n<answer>\n"
    self.response_end_expr = r"\n</answer>"

    # Full match: an optional think block followed by a terminated answer.
    self.full_match_reasoning_regex = re.compile(
        rf"(?:{self.think_start_expr}(.*?){self.response_start_expr})?(.*?){self.response_end_expr}",
        re.DOTALL,
    )

    # Fallback: a complete think block with an unterminated answer tail.
    self.half_match_reasoning_regex = re.compile(
        rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)", re.DOTALL
    )

    # Hard-coded token-id sequences for the marker strings above.
    # NOTE(review): these ids are tokenizer-specific — confirm they match
    # the served Hunyuan A13B tokenizer.
    self.think_start_ids = [14023, 771, 397]
    self.think_start_ids_fast = [14023, 771, 1363]
    self.response_start_ids = [198, 524, 27963, 397, 27, 9399, 397]
    self.response_start_ids_fast = [524, 27963, 397, 27, 9399, 397]
    self.response_end_ids = [198, 524, 9399, 29]
    self.fast_think_ids = [14023, 771, 1363, 524, 27963, 397, 27, 9399, 397]

    # when state change, send out all the buffered text in last state
    self.buffered_text = []
    self.buffered_ids = []

    self.all_states = ["reasoning", "response"]

    # Start idle until the think-start sequence is observed.
    # (Fix: the original assigned current_state = "reasoning" and then
    # immediately overwrote it with "idle" — the dead store is removed.)
    self.current_state = "idle"
    self.expected_sequence = self.think_start_ids
    # this sequence only for the think start, it has two way to start.
    self.expected_sequence_side = self.think_start_ids_fast
    self.sequence_index = 0
    self.token_buffer = []
    self.text_buffer = ""
|
||||
|
||||
def is_reasoning_end(self, input_ids: list[int]) -> bool:
    """Report whether the parser has left the reasoning phase."""
    # Reasoning is considered finished once the state machine has
    # transitioned into the "response" phase; the token ids themselves
    # are not inspected here.
    in_response_phase = self.current_state == "response"
    return in_response_phase
||||
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    """Return the content token ids contained in *input_ids*.

    For the Hunyuan streaming flow, ``extract_reasoning_streaming`` runs
    first and the very same token is then passed to ``is_reasoning_end``
    and this method. That token is never part of the content, so this
    always yields an empty list.
    """
    return []
||||
def extract_reasoning(
    self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
    """Extract the reasoning content & content sections, respectively.

    If the sequence doesn't match what we expect, i.e., the model
    generates something else, all content is considered non-reasoning
    content.

    Args:
        model_output (str): Output of the model to be parsed.
        request (ChatCompletionRequest): Request being processed.

    Returns:
        tuple[Optional[str], Optional[str]]: Tuple pair containing the
        reasoning content and non-reasoning content.
    """

    def _empty_to_none(text: str) -> str | None:
        # The API contract uses None (not "") for an absent section.
        return text if text else None

    # Preferred path: the output contains a complete, well-terminated
    # "<think>…</think><answer>…</answer>" structure.
    full_matches = self.full_match_reasoning_regex.findall(model_output)
    if full_matches:
        reasoning_text, answer_text = full_matches[0]
        return _empty_to_none(reasoning_text), _empty_to_none(answer_text)

    # Fallback: reasoning plus an answer whose closing tag may be
    # missing (e.g. truncated generation).
    partial_matches = self.half_match_reasoning_regex.findall(model_output)
    if partial_matches:
        reasoning_text, answer_text = partial_matches[0]
        # Strip a trailing end marker if the model did emit one.
        if answer_text.endswith(self.response_end_expr):
            answer_text = answer_text[: -len(self.response_end_expr)]
        return _empty_to_none(reasoning_text), _empty_to_none(answer_text)

    # Nothing matched: treat the whole output as plain content.
    return None, model_output
||||
def _is_strict_increasing_subsequence(
|
||||
self, subsequence: Sequence[int], sequence: Sequence[int]
|
||||
) -> bool:
|
||||
if not subsequence:
|
||||
return False
|
||||
|
||||
sub_idx = 0
|
||||
for num in sequence:
|
||||
if sub_idx < len(subsequence) and num == subsequence[sub_idx]:
|
||||
sub_idx += 1
|
||||
return sub_idx == len(subsequence)
|
||||
|
||||
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """Extract content using token ID sequence state machine"""
    # Define sequences
    think_start_sequence = self.think_start_ids
    response_start_sequence = self.response_start_ids
    response_end_sequence = self.response_end_ids

    # The streaming path feeds exactly one token per call; the state
    # machine below relies on that invariant.
    assert len(delta_token_ids) == 1
    # Process each token in the delta
    token = delta_token_ids[0]

    def check_token_with_sequence(token):
        # Does `token` continue the marker sequence we are currently
        # trying to match?  In "idle"/"think" the marker has two valid
        # tokenizations (normal and "fast"), so both are checked.
        if self.current_state == "idle" or self.current_state == "think":
            return (
                token == self.expected_sequence[self.sequence_index]
                or token == self.expected_sequence_side[self.sequence_index]
            )
        else:
            return token == self.expected_sequence[self.sequence_index]

    def check_last_token(token):
        # Has the marker sequence been matched to completion?  The side
        # (fast) tokenization may be shorter, so when the last consumed
        # token came from it, its length is the one that counts.
        # NOTE: callers must invoke this AFTER sequence_index was bumped.
        if self.current_state == "idle" or self.current_state == "think":
            # only return true if it's judge using a side sequence.
            if (
                self.sequence_index - 1 < len(self.expected_sequence_side)
                and token == self.expected_sequence_side[self.sequence_index - 1]
            ):
                return self.sequence_index == len(self.expected_sequence_side)
            else:
                return self.sequence_index == len(self.expected_sequence)
        else:
            return self.sequence_index == len(self.expected_sequence)

    # Check if token matches expected sequence
    token_in_state_seq = check_token_with_sequence(token)

    if token_in_state_seq:
        # Store matching token: hold it back until we know whether the
        # full marker materializes or the match breaks.
        self.token_buffer.append(token)
        self.text_buffer += delta_text
        self.sequence_index += 1
        ## state change from idle->think->response->idle

        # Check if sequence fully matched
        if check_last_token(token):
            # State transition
            if self.current_state == "idle":
                self.current_state = "think"
                self.expected_sequence = response_start_sequence
                self.expected_sequence_side = self.response_start_ids_fast
            elif self.current_state == "think":
                self.current_state = "response"
                self.expected_sequence = response_end_sequence
            elif self.current_state == "response":
                self.current_state = "idle"
                self.expected_sequence = think_start_sequence
                self.expected_sequence_side = self.think_start_ids_fast

            # Reset matching state
            self.sequence_index = 0
            self.token_buffer = []
            self.text_buffer = ""
            # Do not send content for state transition texts.
    else:
        # Sequence broken - handle buffered content
        if self.token_buffer and len(self.token_buffer) > 0:
            # Send buffered tokens: the partial marker turned out to be
            # ordinary text, so flush it together with this delta.
            buffered_content = self.text_buffer + delta_text
            # Reset matching state
            self.sequence_index = 0
            self.token_buffer = []
            self.text_buffer = ""

            # Return content based on current state
            if self.current_state == "think":
                return DeltaMessage(reasoning=buffered_content, content=None)
            else:
                return DeltaMessage(reasoning=None, content=buffered_content)
        else:
            # No buffered content, send normally
            if self.current_state == "think":
                return DeltaMessage(reasoning=delta_text, content=None)
            else:
                return DeltaMessage(reasoning=None, content=delta_text)

    # If no content to send in this delta
    return None
58
reasoning/identity_reasoning_parser.py
Normal file
58
reasoning/identity_reasoning_parser.py
Normal file
@@ -0,0 +1,58 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class IdentityReasoningParser(ReasoningParser):
    """No-op reasoning parser.

    Performs no reasoning extraction at all: the entire model output is
    surfaced as content and reasoning markers are never interpreted.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # There is no reasoning phase to wait for, so it is always "over".
        return True

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # Every generated token counts as content.
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        # Forward the delta verbatim as content; empty deltas yield nothing.
        if not delta_text:
            return None
        return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        # Never any reasoning: the full output is returned as content.
        return None, model_output
||||
67
reasoning/minimax_m2_reasoning_parser.py
Normal file
67
reasoning/minimax_m2_reasoning_parser.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaMessage,
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
|
||||
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """Reasoning parser for the MiniMax M2 model.

    All parsing logic lives in :class:`BaseThinkingReasoningParser`; this
    subclass only pins the literal think-block delimiters used by M2.
    """

    @property
    def start_token(self) -> str:
        """Marker that opens a reasoning block."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """Marker that closes a reasoning block."""
        return "</think>"
|
||||
|
||||
class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    """MiniMax M2 parser variant that restores the "<think>" prefix.

    Everything the model emits is treated as content; the only extra work
    is prepending the literal "<think>" marker (which the prompt template
    presumably hardcodes — the model itself never generates it) to the
    first streamed chunk and to non-streaming output.
    """

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        # Token id of the closing think marker; None when absent from vocab.
        self.end_token_id = self.vocab.get("</think>")

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Reasoning is over once the closing marker appears anywhere in
        # the generated ids.
        return self.end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # All tokens are surfaced as content.
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        # Only the very first chunk gets the opening marker restored.
        prefix = "<think>" if len(previous_token_ids) == 0 else ""
        return DeltaMessage(content=prefix + delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        # Non-streaming path: no reasoning section, marker restored inline.
        return None, "<think>" + model_output
||||
55
reasoning/mistral_reasoning_parser.py
Normal file
55
reasoning/mistral_reasoning_parser.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from functools import cached_property
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
|
||||
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MistralReasoningParser(DeepSeekR1ReasoningParser):
    """
    Reasoning parser for Mistral models.

    The Mistral models uses [THINK]...[/THINK] tokens to denote reasoning
    text. This parser extracts the reasoning content from the model output.

    Parsing behaviour is inherited from DeepSeekR1ReasoningParser; this
    subclass only swaps in Mistral's control tokens and resolves their ids
    through the MistralTokenizer.
    """

    def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs):
        # Fail fast: the id lookup below needs MistralTokenizer's
        # `tokenizer.get_control_token` API.
        if not isinstance(tokenizer, MistralTokenizer):
            raise ValueError("The tokenizer must be an instance of MistralTokenizer.")

        # Deliberately skips DeepSeekR1ReasoningParser.__init__ and calls
        # the base ReasoningParser directly — presumably the parent's init
        # resolves token ids in a way that doesn't fit MistralTokenizer.
        # TODO(review): confirm nothing else in the parent init is needed.
        ReasoningParser.__init__(self, tokenizer, *args, **kwargs)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        # self.start_token / self.end_token are the cached_property
        # accessors defined below; accessing them here (before the class
        # body "finishes") is fine because properties resolve at lookup
        # time on the instance.
        self.start_token_id = tokenizer.tokenizer.get_control_token(self.start_token)
        self.end_token_id = tokenizer.tokenizer.get_control_token(self.end_token)

        # NOTE(review): this guard assumes get_control_token returns None
        # for unknown tokens rather than raising — verify against
        # mistral_common's actual contract.
        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                "Mistral reasoning parser could not locate think start/end "
                "tokens in the tokenizer!"
            )

    @cached_property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        # Imported lazily so mistral_common is only required when this
        # parser is actually used.
        from mistral_common.tokens.tokenizers.base import SpecialTokens

        return SpecialTokens.begin_think

    @cached_property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        from mistral_common.tokens.tokenizers.base import SpecialTokens

        return SpecialTokens.end_think
302
reasoning/olmo3_reasoning_parser.py
Normal file
302
reasoning/olmo3_reasoning_parser.py
Normal file
@@ -0,0 +1,302 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses as dt
|
||||
import enum
|
||||
from collections.abc import Sequence
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import regex as re
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaMessage,
|
||||
ResponsesRequest,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Olmo3ReasoningState(enum.Enum):
    """Phase of the Olmo 3 streaming parser."""

    # auto() numbers members 1, 2 — identical to the original explicit values.
    REASONING = enum.auto()  # inside a <think> ... </think> block
    CONTENT = enum.auto()  # past the closing </think>
|
||||
|
||||
@dt.dataclass(frozen=True)
|
||||
class Indices:
|
||||
start: int
|
||||
end: int
|
||||
|
||||
def __len__(self):
|
||||
return self.end - self.start
|
||||
|
||||
|
||||
def string_overlap(a: str, b: str) -> tuple[Indices | None, Indices | None]:
|
||||
"""
|
||||
Find the longest overlap where the end of string a matches the start
|
||||
of string b.
|
||||
|
||||
Args:
|
||||
a: First string
|
||||
b: Second string
|
||||
|
||||
Returns:
|
||||
Tuple of IndicesTuples representing the overlapping portions in each
|
||||
string, or a tuple of None if no overlap exists
|
||||
"""
|
||||
|
||||
# swap so a is always the shorter string
|
||||
a, b, swap = (a, b, False) if len(a) < len(b) else (b, a, True)
|
||||
|
||||
# first check: is a fully contained in b?
|
||||
if a in b:
|
||||
ind_a = Indices(0, len(a))
|
||||
ind_b = Indices(b.index(a), b.index(a) + len(a))
|
||||
return (ind_b, ind_a) if swap else (ind_a, ind_b)
|
||||
|
||||
# second check: does the end of a overlap with the
|
||||
# beginning of b?
|
||||
for i in range(len(a) - 1, 0, -1):
|
||||
if a[-i:] == b[:i]:
|
||||
ind_a = Indices(len(a) - i, len(a))
|
||||
ind_b = Indices(0, i)
|
||||
return (ind_b, ind_a) if swap else (ind_a, ind_b)
|
||||
|
||||
# third check: does the beginning of a overlap with
|
||||
# the end of b?
|
||||
for i in range(len(a) - 1, 0, -1):
|
||||
if b[-i:] == a[:i]:
|
||||
ind_a = Indices(0, i)
|
||||
ind_b = Indices(len(b) - i, len(b))
|
||||
return (ind_b, ind_a) if swap else (ind_a, ind_b)
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
@dt.dataclass
class Olmo3ReasoningBuffer:
    """Accumulates streamed text and splits it into reasoning vs. content
    around the <think> / </think> markers, holding back text that might be
    the beginning of a not-yet-complete marker."""

    # Literal marker strings (plain text, not special tokens).
    think_start: str = "<think>"
    think_end: str = "</think>"
    # Text received so far that has not yet been emitted.
    buffer: str = ""

    # we start in reasoning state to support cases where we hardcode
    # <think> as the start of the reasoning block.
    # In those cases, the only token we will see is </think>, which
    # is when we switch to content state.
    state: Olmo3ReasoningState = Olmo3ReasoningState.REASONING

    def process_buffer(self) -> DeltaMessage | None:
        """Consume the buffer, switching state on any marker found, and
        return at most one DeltaMessage for the text that was consumed."""
        start_think_idx = self.buffer.find(self.think_start)

        if start_think_idx >= 0:
            self.state = Olmo3ReasoningState.REASONING
            # Split off the marker: text before it stays in `pretext`,
            # text after it becomes the new buffer.
            pretext, self.buffer = (
                self.buffer[:start_think_idx],
                self.buffer[start_think_idx + len(self.think_start) :],
            )
            if start_think_idx > 0:
                # this covers the case there's content before
                # the start of the reasoning block
                return DeltaMessage(content=pretext)

        # rfind: if several end markers are present, the last one wins.
        end_think_idx = self.buffer.rfind(self.think_end)

        if end_think_idx >= 0:
            self.state = Olmo3ReasoningState.CONTENT
            pretext, self.buffer = (
                self.buffer[:end_think_idx],
                self.buffer[end_think_idx + len(self.think_end) :],
            )
            if end_think_idx > 0:
                # this covers the case there's content before
                # the end of the reasoning block
                return DeltaMessage(reasoning=pretext)

        if self.state == Olmo3ReasoningState.REASONING:
            # we are inside reasoning block, return and empty
            # the text buffer
            (
                text_buffer,
                self.buffer,
            ) = self.buffer, ""
            return DeltaMessage(reasoning=text_buffer)

        if self.state == Olmo3ReasoningState.CONTENT:
            # we are outside reasoning block, return and empty
            # the text buffer
            (
                text_buffer,
                self.buffer,
            ) = self.buffer, ""
            return DeltaMessage(content=text_buffer)

        # nothing to return unless we are in reasoning or content state
        return None

    def __len__(self):
        # is the length of the text buffer
        return len(self.buffer)

    def add_text(self, delta_text: str) -> DeltaMessage | None:
        """Append *delta_text* to the buffer and emit a DeltaMessage when
        it is safe to do so (i.e. the buffer cannot still be in the middle
        of a marker)."""
        # we start by adding the delta text to the buffer
        self.buffer += delta_text

        # setting this to empty before starting
        delta_message: DeltaMessage | None = None

        # we start by computing the overlap between the delta_text
        # and start/end of think tokens.
        _, overlap_think_start = string_overlap(delta_text, self.think_start)
        _, overlap_think_end = string_overlap(delta_text, self.think_end)

        # "partial" means the delta touches only part of a marker, so the
        # marker might complete in a later delta.
        partial_overlap_start = overlap_think_start is not None and len(
            overlap_think_start
        ) < len(self.think_start)
        partial_overlap_end = overlap_think_end is not None and len(
            overlap_think_end
        ) < len(self.think_end)

        if (
            partial_overlap_start
            and self.think_start in self.buffer
            and not partial_overlap_end
        ):
            # we can only process the buffer if partial overlap
            # is the last part of think token (thus causing
            # text_buffer to contain the start of think token)
            # and there are no partial overlaps with end think
            delta_message = self.process_buffer()

        elif partial_overlap_end and self.think_end in self.buffer:
            # same as before (partial overlap only allowed)
            # if the buffer contains the end think token,
            # but we don't have to check for partial overlap
            # with start think token because they are handled
            # by the previous condition
            delta_message = self.process_buffer()

        elif partial_overlap_start or partial_overlap_end:
            # in general, if there are overlaps, we don't
            # process the buffer because we want to wait until
            # the think token is fully completed.
            return None
        else:
            # we process the buffer as normal
            delta_message = self.process_buffer()

        return delta_message
||||
|
||||
|
||||
class Olmo3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for Olmo 3 model

    Olmo3ReasoningParser

    This class implements a reasoning parser specifically designed for the
    Olmo 3 family of models. Olmo 3 models do not use special tokens to
    indicate reasoning; rather, reasoning trace is wrapped in `<think>` and
    `</think>`, which are tokenized using standard vocabulary entries.
    Because of this, the parser operates in string space, accumulating the
    characters in a buffer until it sees `<think>` or `</think>`. tokens
    to switch modes.

    Key Features:
    - For non-stream output, Recognizes and extracts reasoning (text
      bracketed by `<think>` and `</think>`) and content (everything
      after the first `</think>`).
    - For stream process, it uses a buffer to accumulate delta text,
      and output progressive delta messages as soon as thinking starts
      or ends.
    - For reliability, some Olmo 3 models may hardcode the first
      `<think>` token is the input text (similar to Deepseek R1,
      or reasoning-only Qwen models). To support such variants, the
      parser can optionally work in cases where the first `<think>`
      token is missing from generation.
    """

    def __init__(self, tokenizer: "AnyTokenizer", *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # Plain-text markers; raw strings so they can double as regex
        # fragments below (they contain no regex metacharacters anyway).
        self.think_start = r"<think>"
        self.think_end = r"</think>"

        # notice that the first think is optional; this allows template to
        # work in cases when we hardcode a <think> at the beginning of the
        # reasoning template.
        reasoning_expr = (
            rf"^(?:{self.think_start})?(?P<reasoning>.*?)"
            + rf"{self.think_end}(?P<content>.*)$"
        )
        self.reasoning_regex = re.compile(reasoning_expr, re.DOTALL)

        # Stateful string buffer used only by the streaming path.
        self.buffer = Olmo3ReasoningBuffer(
            think_start=self.think_start, think_end=self.think_end
        )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Markers are ordinary text, not special tokens, so the ids must
        # be decoded before the end marker can be searched for.
        text = self.model_tokenizer.decode(input_ids)
        return self.think_end in text

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # for Olmo 3 streaming reason parsing, the stream parse
        # will call first, and the same token will be called in
        # is_reasoning_end and extract_content_ids
        # this id is not part of content, so just return [] here.
        return []

    def extract_reasoning(
        self,
        model_output: str,
        request: ChatCompletionRequest | ResponsesRequest,
    ) -> tuple[str | None, str | None]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest | ResponsesRequest): Request being
                processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """

        re_match = self.reasoning_regex.match(model_output)
        if re_match:
            # Empty captures are normalized to None per the API contract.
            reasoning = re_match.group("reasoning") or None
            content = re_match.group("content") or None
            return reasoning, content

        # no reasoning content
        return None, model_output

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Extract content using token ID sequence state machine"""

        delta_message = self.buffer.add_text(delta_text)
        if delta_message is None and self.buffer.think_end in self.buffer.buffer:
            # this is a bit hacky, but, because of how the buffer is
            # constructed, if the last delta_text contains characters that
            # marks the end of thinking tokens, then messages in the buffer
            # would never be processed because we get no other turn. To get
            # around that, we check if the text buffer contains the end of
            # thinking tokens, and, if so, we reprocess the buffer again.
            delta_message = self.buffer.process_buffer()

        return delta_message
||||
67
reasoning/qwen3_reasoning_parser.py
Normal file
67
reasoning/qwen3_reasoning_parser.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest
|
||||
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
|
||||
|
||||
|
||||
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """Reasoning parser for the Qwen3 model.

    Qwen3 wraps its reasoning trace in <think>...</think> markers and can
    disable reasoning entirely via the `enable_thinking=False` switch.
    Unlike parsers that accept a lone end marker, this one only treats
    text as reasoning when BOTH markers appear in the output.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        """Split *model_output* into (reasoning, content).

        For the text `<think>abc</think>xyz`:
        - 'abc' goes to reasoning
        - 'xyz' goes to content

        If either marker is missing, the whole output is returned as
        content and reasoning is None.
        """
        # Qwen3 is strict: both markers must be present before anything
        # is classified as reasoning.
        if self.start_token not in model_output or self.end_token not in model_output:
            return None, model_output

        # Drop everything up to and including the first start marker.
        before, sep, after = model_output.partition(self.start_token)
        remainder = after if sep else before

        # The end marker has to appear after the start marker; if its only
        # occurrence preceded it, everything remaining is plain content.
        if self.end_token not in remainder:
            return None, remainder

        reasoning, _, content = remainder.partition(self.end_token)
        return reasoning, content or None
||||
27
reasoning/seedoss_reasoning_parser.py
Normal file
27
reasoning/seedoss_reasoning_parser.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
|
||||
|
||||
|
||||
class SeedOSSReasoningParser(BaseThinkingReasoningParser):
    """Reasoning parser for the SeedOSS model.

    SeedOSS delimits its reasoning trace with <seed:think>...</seed:think>
    markers; all extraction logic is inherited from
    BaseThinkingReasoningParser. As with DeepSeek R1, outputs where the
    opening marker was never generated are still handled.
    """

    @property
    def start_token(self) -> str:
        """Marker that opens the reasoning section."""
        return "<seed:think>"

    @property
    def end_token(self) -> str:
        """Marker that closes the reasoning section."""
        return "</seed:think>"
||||
107
reasoning/step3_reasoning_parser.py
Normal file
107
reasoning/step3_reasoning_parser.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import regex as re
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Step3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for Step3 model.

    The Step3 model uses </think> token to denote the end of reasoning
    text. This parser extracts all content before </think> as reasoning content.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        # Step3 has no explicit start marker; only the end marker matters.
        self.think_end_token = "</think>"

        # NOTE(review): this compiled regex is not used by any method in
        # this class — confirm whether it can be removed or is consumed
        # elsewhere.
        self.reasoning_regex = re.compile(rf"(.*?){self.think_end_token}", re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction."
            )

        # The end marker must exist as a single vocab entry so the
        # streaming path can compare token ids directly.
        self.think_end_token_id = self.vocab.get(self.think_end_token)
        if self.think_end_token_id is None:
            raise RuntimeError(
                "Step3 reasoning parser could not locate think end "
                "token in the tokenizer!"
            )

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        For text "abc</think>xyz":
        - 'abc' goes to reasoning
        - 'xyz' goes to content
        """
        # Skip single special token: a delta that is exactly the end
        # marker carries no user-visible text.
        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id:
            return None

        if self.think_end_token_id in delta_token_ids:
            # </think> in delta, extract reasoning content and remaining content.
            # NOTE(review): this assumes the marker's text form appears in
            # delta_text whenever its token id is present; if find() missed,
            # end_index would be -1 and the split would be wrong — confirm
            # the tokenizer guarantees this.
            end_index = delta_text.find(self.think_end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.think_end_token) :]
            return DeltaMessage(
                reasoning=reasoning,
                content=content if content else None,
            )
        elif self.think_end_token_id in previous_token_ids:
            # </think> already seen in previous text, everything is content
            return DeltaMessage(content=delta_text)
        else:
            # No </think> seen yet, everything is reasoning
            return DeltaMessage(reasoning=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Split a complete output at the first </think> marker.

        Without the marker, the entire output is reasoning (content None).
        """
        # Check if the model output contains the </think> token
        if self.think_end_token not in model_output:
            # If no </think> token, everything is reasoning content
            return model_output, None
        else:
            # Find the first occurrence of </think>
            end_index = model_output.find(self.think_end_token)
            reasoning = model_output[:end_index]

            # Content after </think> token
            content = model_output[end_index + len(self.think_end_token) :]

            # Normalize empty content to None per the API contract.
            if len(content) == 0:
                content = None

            return reasoning, content

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Reasoning has ended once the end-marker token id was generated.
        return self.think_end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # Only ids strictly after the end marker are content; the marker
        # must not be the very last token (that case is handled on the
        # next call, once content tokens actually exist).
        if self.think_end_token_id not in input_ids[:-1]:
            return []
        else:
            return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
||||
Reference in New Issue
Block a user