init
vllm/reasoning/__init__.py (Normal file, 29 lines)
@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
from .basic_parsers import BaseThinkingReasoningParser
from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
from .gptoss_reasoning_parser import GptOssReasoningParser
from .granite_reasoning_parser import GraniteReasoningParser
from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
from .mistral_reasoning_parser import MistralReasoningParser
from .qwen3_reasoning_parser import Qwen3ReasoningParser
from .seedoss_reasoning_parser import SeedOSSReasoningParser
from .step3_reasoning_parser import Step3ReasoningParser

__all__ = [
    "ReasoningParser",
    "BaseThinkingReasoningParser",
    "ReasoningParserManager",
    "DeepSeekR1ReasoningParser",
    "GraniteReasoningParser",
    "HunyuanA13BReasoningParser",
    "Qwen3ReasoningParser",
    "Glm4MoeModelReasoningParser",
    "MistralReasoningParser",
    "Step3ReasoningParser",
    "GptOssReasoningParser",
    "SeedOSSReasoningParser",
]
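
The package re-exports the registry and every built-in parser. A minimal, illustrative lookup sketch (not part of the commit); the "qwen3" name comes from the register_module calls in the files below, and get_reasoning_parser returns the class, not an instance:

from vllm.reasoning import ReasoningParserManager

parser_cls = ReasoningParserManager.get_reasoning_parser("qwen3")
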
BIN vllm/reasoning/__pycache__/__init__.cpython-312.pyc (Normal file, binary file not shown)
BIN vllm/reasoning/__pycache__/abs_reasoning_parsers.cpython-312.pyc (Normal file, binary file not shown)
BIN vllm/reasoning/__pycache__/basic_parsers.cpython-312.pyc (Normal file, binary file not shown)
vllm/reasoning/abs_reasoning_parsers.py (Normal file, 202 lines)
@@ -0,0 +1,202 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from __future__ import annotations

import os
from abc import abstractmethod
from collections.abc import Sequence
from functools import cached_property
from typing import TYPE_CHECKING, Any, Callable, Union

from vllm.logger import init_logger
from vllm.utils import import_from_path, is_list_of

if TYPE_CHECKING:
    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                                  DeltaMessage,
                                                  ResponsesRequest)
    from vllm.transformers_utils.tokenizer import AnyTokenizer
else:
    ChatCompletionRequest = Any
    DeltaMessage = Any
    ResponsesRequest = Any
    AnyTokenizer = Any

logger = init_logger(__name__)


class ReasoningParser:
    """
    Abstract reasoning parser class that should not be used directly.
    The provided methods should be used in derived classes.

    It is used to extract reasoning content from the model output.
    """

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        self.model_tokenizer = tokenizer

    @cached_property
    def vocab(self) -> dict[str, int]:
        # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab,
        # whereas all tokenizers have .get_vocab()
        return self.model_tokenizer.get_vocab()

    @abstractmethod
    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        """
        Check if the reasoning content ends in the input_ids.

        It is used in structured engines like `xgrammar` to check if the
        reasoning content ends in the model output.

        Parameters:
        input_ids: list[int]
            The input_ids of the model output.

        Returns:
        bool
            True if the reasoning content ends in the input_ids.
        """

    @abstractmethod
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract content token ids from the input_ids.

        Parameters:
        input_ids: list[int]
            The input_ids of the model output.

        Returns:
        list[int]
            The extracted content token ids from the input_ids.
        """

    @abstractmethod
    def extract_reasoning_content(
        self,
        model_output: str,
        request: Union[ChatCompletionRequest, ResponsesRequest],
    ) -> tuple[str | None, str | None]:
        """
        Extract reasoning content from a complete model-generated string.

        Used for non-streaming responses where we have the entire model
        response available before sending to the client.

        Parameters:
        model_output: str
            The model-generated string to extract reasoning content from.

        request: ChatCompletionRequest or ResponsesRequest
            The request object that was used to generate the model_output.

        Returns:
        tuple[str | None, str | None]
            A tuple containing the reasoning content and the content.
        """

    @abstractmethod
    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """
        Instance method that should be implemented for extracting reasoning
        from an incomplete response; for use when handling reasoning calls and
        streaming. Has to be an instance method because it requires state -
        the current tokens/diffs, but also the information about what has
        previously been parsed and extracted (see constructor).
        """


class ReasoningParserManager:
    reasoning_parsers: dict[str, type] = {}

    @classmethod
    def get_reasoning_parser(cls, name: str | None) -> type[ReasoningParser]:
        """
        Get a reasoning parser by the name under which it was registered
        via `register_module`.

        Raises a KeyError if the name is not registered.
        """
        if name in cls.reasoning_parsers:
            return cls.reasoning_parsers[name]

        raise KeyError(
            f"reasoning helper: '{name}' not found in reasoning_parsers")

    @classmethod
    def _register_module(
        cls,
        module: type,
        module_name: Union[str, list[str]] | None = None,
        force: bool = True,
    ) -> None:
        if not issubclass(module, ReasoningParser):
            raise TypeError("module must be subclass of ReasoningParser, "
                            f"but got {type(module)}")
        if module_name is None:
            module_name = module.__name__
        if isinstance(module_name, str):
            module_name = [module_name]
        for name in module_name:
            if not force and name in cls.reasoning_parsers:
                existed_module = cls.reasoning_parsers[name]
                raise KeyError(f"{name} is already registered "
                               f"at {existed_module.__module__}")
            cls.reasoning_parsers[name] = module

    @classmethod
    def register_module(
        cls,
        name: Union[str, list[str]] | None = None,
        force: bool = True,
        module: Union[type, None] = None,
    ) -> Union[type, Callable]:
        """
        Register a module with the given name or list of names. It can be
        used as a decorator (with `module` as None) or as a normal function
        (with `module` not None).
        """
        if not isinstance(force, bool):
            raise TypeError(f"force must be a boolean, but got {type(force)}")

        # raise the error ahead of time
        if not (name is None or isinstance(name, str)
                or is_list_of(name, str)):
            raise TypeError(
                "name must be None, an instance of str, or a sequence of str, "
                f"but got {type(name)}")

        # use it as a normal method: x.register_module(module=SomeClass)
        if module is not None:
            cls._register_module(module=module, module_name=name, force=force)
            return module

        # use it as a decorator: @x.register_module()
        def _register(module):
            cls._register_module(module=module, module_name=name, force=force)
            return module

        return _register

    @classmethod
    def import_reasoning_parser(cls, plugin_path: str) -> None:
        """
        Import a user-defined reasoning parser from the path of the file
        that defines it.
        """
        module_name = os.path.splitext(os.path.basename(plugin_path))[0]

        try:
            import_from_path(module_name, plugin_path)
        except Exception:
            logger.exception("Failed to load module '%s' from %s.",
                             module_name, plugin_path)
            return
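
A minimal usage sketch of the registry above. `MyParser` and the name "my_parser" are hypothetical; the decorator form mirrors how the built-in parsers in this commit register themselves:

from vllm.reasoning import ReasoningParser, ReasoningParserManager

@ReasoningParserManager.register_module("my_parser")
class MyParser(ReasoningParser):
    # Omitted: the abstract methods would be implemented here.
    ...

# The function form is equivalent:
#   ReasoningParserManager.register_module("my_parser", module=MyParser)
assert ReasoningParserManager.get_reasoning_parser("my_parser") is MyParser
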
vllm/reasoning/basic_parsers.py (Normal file, 156 lines)
@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from abc import abstractmethod
from collections.abc import Sequence
from typing import Optional, Union

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage, ResponsesRequest)
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.transformers_utils.tokenizer import AnyTokenizer


class BaseThinkingReasoningParser(ReasoningParser):
    """
    Base class for reasoning parsers that use thinking tokens.

    This class provides common functionality for parsers that use start and
    end tokens to delimit reasoning content
    (e.g., <think>...</think>, <seed:think>...</seed:think>).

    Subclasses must implement the start and end tokens via abstract
    properties.
    """

    @property
    @abstractmethod
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        raise NotImplementedError

    @property
    @abstractmethod
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        raise NotImplementedError

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction.")

        if not self.start_token or not self.end_token:
            raise ValueError(
                "start_token and end_token must be defined in subclasses")

        self.start_token_id = self.vocab.get(self.start_token)
        self.end_token_id = self.vocab.get(self.end_token)
        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                f"{self.__class__.__name__} reasoning parser could not locate "
                "think start/end tokens in the tokenizer!")

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content token ids after the end token.
        """
        if self.end_token_id not in input_ids[:-1]:
            return []
        else:
            return input_ids[input_ids.index(self.end_token_id) + 1:]

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        """
        # Skip single special tokens
        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
                self.start_token_id, self.end_token_id
        ]):
            return None

        # Check if start token is present in previous or delta.
        # Keep compatibility with models that don't generate start tokens.
        if self.start_token_id in previous_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in previous, end token in delta,
                # extract reasoning content
                end_index = delta_text.find(self.end_token)
                reasoning_content = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token):]
                return DeltaMessage(
                    reasoning_content=reasoning_content,
                    content=content if content else None,
                )
            elif self.end_token_id in previous_token_ids:
                # start token and end token both in previous,
                # response content continues
                return DeltaMessage(content=delta_text)
            else:
                # start token in previous, no end token in previous or delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        elif self.start_token_id in delta_token_ids:
            if self.end_token_id in delta_token_ids:
                # start token in delta, end token in delta,
                # extract reasoning content
                start_index = delta_text.find(self.start_token)
                end_index = delta_text.find(self.end_token)
                reasoning_content = delta_text[start_index +
                                               len(self.start_token):end_index]
                content = delta_text[end_index + len(self.end_token):]
                return DeltaMessage(
                    reasoning_content=reasoning_content,
                    content=content if content else None,
                )
            else:
                # start token in delta, no end token in delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        else:
            # no thinking start token found
            return DeltaMessage(content=delta_text)

    def extract_reasoning_content(
        self, model_output: str, request: Union[ChatCompletionRequest,
                                                ResponsesRequest]
    ) -> tuple[Optional[str], Optional[str]]:
        """
        Extract reasoning content from the model output.

        This is the base implementation that works for most models.
        Subclasses can override this method for model-specific behavior.
        """
        # Check if the start token is present in the model output, remove it
        # if it is present.
        model_output_parts = model_output.partition(self.start_token)
        model_output = model_output_parts[2] if model_output_parts[
            1] else model_output_parts[0]

        # For models that may not generate the start token,
        # assume the reasoning content is always at the start.
        if self.end_token not in model_output:
            return model_output, None
        else:
            reasoning_content, _, content = model_output.partition(
                self.end_token)
            # If generation stops right after end-of-think, return null content
            final_content = content or None
            return reasoning_content, final_content
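
A self-contained sketch, on plain strings with a hypothetical helper, of the non-streaming policy extract_reasoning_content implements above: a missing start token is tolerated, and reasoning is assumed to lead the output:

def base_policy(text: str) -> tuple[str | None, str | None]:
    # Drop a leading start token if present.
    head, sep, tail = text.partition("<think>")
    text = tail if sep else head
    # Models may omit <think>; reasoning is assumed to come first.
    if "</think>" not in text:
        return text, None
    reasoning, _, content = text.partition("</think>")
    return reasoning, content or None

assert base_policy("abc</think>xyz") == ("abc", "xyz")
assert base_policy("still thinking") == ("still thinking", None)
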
vllm/reasoning/deepseek_r1_reasoning_parser.py (Normal file, 67 lines)
@@ -0,0 +1,67 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Union

from vllm.entrypoints.openai.protocol import DeltaMessage
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser


@ReasoningParserManager.register_module("deepseek_r1")
class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the DeepSeek R1 model.

    The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning
    text. This parser extracts the reasoning content from the model output.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        ret = super().extract_reasoning_content_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
        if (ret is not None and self.start_token_id not in previous_token_ids
                and self.start_token_id not in delta_token_ids):
            if self.end_token_id in delta_token_ids:
                # end token in delta with more tokens,
                # extract reasoning content and content
                end_index = delta_text.find(self.end_token)
                reasoning_content = delta_text[:end_index]
                content = delta_text[end_index + len(self.end_token):]
                return DeltaMessage(
                    reasoning_content=reasoning_content,
                    content=content if content else None,
                )
            elif self.end_token_id in previous_token_ids:
                # end token in previous, thinking content ends
                return DeltaMessage(content=delta_text)
            else:
                # no end token in previous or delta, reasoning continues
                return DeltaMessage(reasoning_content=delta_text)

        return ret
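
A hedged end-to-end sketch; the model name is an assumption, and any tokenizer whose vocab contains <think>/</think> would do:

from transformers import AutoTokenizer
from vllm.reasoning import ReasoningParserManager

tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")  # assumption
parser = ReasoningParserManager.get_reasoning_parser("deepseek_r1")(tok)
# R1 may omit <think>, so reasoning is assumed to lead the output.
print(parser.extract_reasoning_content("2 + 2 ... </think>It is 4.", None))
# -> ('2 + 2 ... ', 'It is 4.')
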
vllm/reasoning/glm4_moe_reasoning_parser.py (Normal file, 151 lines)
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("glm45")
class Glm4MoeModelReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Glm4MoeModel model.

    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning
    text within its output. The model provides a strict switch to disable
    reasoning output via the 'enable_thinking=False' parameter. This parser
    extracts the reasoning content enclosed by <think> and </think> tokens
    from the model's output.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.think_start_token = "<think>"
        self.think_end_token = "</think>"

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction.")

        self.think_start_token_id = self.vocab.get(self.think_start_token)
        self.think_end_token_id = self.vocab.get(self.think_end_token)
        if (self.think_start_token_id is None
                or self.think_end_token_id is None):
            raise RuntimeError(
                "Glm4MoeModel reasoning parser could not locate "
                "think start/end tokens in the tokenizer!")

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.think_end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
        Extract the content token ids after the end token.
        """
        if self.think_end_token_id not in input_ids[:-1]:
            return []
        else:
            return input_ids[input_ids.index(self.think_end_token_id) + 1:]

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        For text <think>abc</think>xyz:
        - 'abc' goes to reasoning_content
        - 'xyz' goes to content
        """
        # Skip single special tokens
        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
                self.think_start_token_id, self.think_end_token_id
        ]):
            return None

        if self.think_start_token_id in previous_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in previous, </think> in delta,
                # extract reasoning content
                end_index = delta_text.find(self.think_end_token)
                reasoning_content = delta_text[:end_index]
                content = delta_text[end_index + len(self.think_end_token):]
                return DeltaMessage(reasoning_content=reasoning_content,
                                    content=content if content else None)
            elif self.think_end_token_id in previous_token_ids:
                # <think> in previous, </think> in previous,
                # response content continues
                return DeltaMessage(content=delta_text)
            else:
                # <think> in previous, no </think> in previous or delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        elif self.think_start_token_id in delta_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in delta, </think> in delta, extract reasoning content
                start_index = delta_text.find(self.think_start_token)
                end_index = delta_text.find(self.think_end_token)
                reasoning_content = delta_text[start_index +
                                               len(self.think_start_token
                                                   ):end_index]
                content = delta_text[end_index + len(self.think_end_token):]
                return DeltaMessage(reasoning_content=reasoning_content,
                                    content=content if content else None)
            else:
                # <think> in delta, no </think> in delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        else:
            # thinking is disabled, just content
            return DeltaMessage(content=delta_text)

    def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[Optional[str], Optional[str]]:
        """
        Extract reasoning content from the model output.

        For text <think>abc</think>xyz:
        - 'abc' goes to reasoning_content
        - 'xyz' goes to content

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """

        # Check if the model output contains the <think> and </think> tokens.
        if (self.think_start_token not in model_output
                or self.think_end_token not in model_output):
            return None, model_output
        # Check if the <think> is present in the model output, remove it
        # if it is present.
        model_output_parts = model_output.partition(self.think_start_token)
        model_output = model_output_parts[2] if model_output_parts[
            1] else model_output_parts[0]
        # Check if the model output contains the </think> token.
        # If the end token is not found, return the model output as is.
        if self.think_end_token not in model_output:
            return None, model_output

        # Extract reasoning content from the model output.
        reasoning_content, _, content = model_output.partition(
            self.think_end_token)

        final_content = content or None
        return reasoning_content, final_content
vllm/reasoning/gptoss_reasoning_parser.py (Normal file, 87 lines)
@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.harmony_utils import parse_chat_output
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("openai_gptoss")
class GptOssReasoningParser(ReasoningParser):
    """
    Reasoning parser for the GptOss model.

    The GptOss model uses harmony to extract reasoning content, and this
    parser is only used for detecting the end of the reasoning content.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.reasoning_end_token_ids = self.model_tokenizer.encode(
            "<|start|>assistant<|channel|>final<|message|>")

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        end_token_ids = self.reasoning_end_token_ids
        assert len(end_token_ids) > 0, "reasoning_end_token_ids is empty"
        # Check if the end sequence is present in the input_ids.
        # We search from the end of input_ids to find the last match.
        for i in range(len(input_ids) - len(end_token_ids), -1, -1):
            if input_ids[i:i + len(end_token_ids)] == end_token_ids:
                return True
        return False

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        _, content, _ = parse_chat_output(input_ids)
        if content is None:
            return []
        return self.model_tokenizer.encode(content)

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        prev_reasoning, prev_content, _ = parse_chat_output(
            list(previous_token_ids))
        cur_reasoning, cur_content, _ = parse_chat_output(
            list(current_token_ids))
        reasoning_delta = None
        content_delta = None
        if cur_reasoning is not None:
            prev_r = prev_reasoning or ""
            if cur_reasoning.startswith(prev_r):
                reasoning_delta = cur_reasoning[len(prev_r):] or None
            else:
                reasoning_delta = cur_reasoning
        if cur_content is not None:
            prev_c = prev_content or ""
            if cur_content.startswith(prev_c):
                content_delta = cur_content[len(prev_c):] or None
            else:
                content_delta = cur_content
        if reasoning_delta is None and content_delta is None:
            return None
        return DeltaMessage(reasoning_content=reasoning_delta,
                            content=content_delta)

    def extract_reasoning_content(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> tuple[Optional[str], Optional[str]]:
        raise NotImplementedError(
            "gpt-oss has a special branch for parsing reasoning in "
            "non-streaming mode. This method shouldn't be used.")
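
The is_reasoning_end scan above is a plain subsequence search over token ids; a standalone, self-contained equivalent for illustration:

def ends_reasoning(input_ids: list[int], end_seq: list[int]) -> bool:
    # Scan backwards for any occurrence of end_seq in input_ids.
    for i in range(len(input_ids) - len(end_seq), -1, -1):
        if input_ids[i:i + len(end_seq)] == end_seq:
            return True
    return False

assert ends_reasoning([5, 1, 2, 3, 9], [2, 3])
assert not ends_reasoning([5, 1, 2], [2, 3])
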
vllm/reasoning/granite_reasoning_parser.py (Normal file, 363 lines)
@@ -0,0 +1,363 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

import regex as re
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("granite")
class GraniteReasoningParser(ReasoningParser):
    """
    Reasoning parser for IBM Granite.

    IBM Granite models currently use "Here is my thought process:"
    and "Here is my response:" to separate their thinking / response outputs.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        # NOTE: There have been some observed occurrences of quantized
        # instances of the current models using "Here's" instead of "Here is",
        # so to be safe, we match on both.
        self.think_start_expr = r"(?:Here's|Here is) my thought process:"
        self.response_start_expr = r"(?:Here's|Here is) my response:"

        self.reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)",
            re.DOTALL)

        self.valid_think_starts = [
            "Here's my thought process:", "Here is my thought process:"
        ]
        self.valid_response_starts = [
            "Here's my response:", "Here is my response:"
        ]

        # Substrings to match for sequence boundaries on raw text
        self.seq_boundary_end = ":"
        self.seq_boundary_start = "Here"

        # The longest any thinking / start of response message can be
        self.longest_think_start = max(
            len(think_start) for think_start in self.valid_think_starts)

    def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[Optional[str], Optional[str]]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest): Request being processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """
        re_match = self.reasoning_regex.findall(model_output)
        if not re_match:
            return None, model_output
        reasoning_content, response_content = re_match[0]
        if not response_content:
            return reasoning_content, None
        return reasoning_content, response_content

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """Extract the reasoning content / content emitted by granite models;
        if the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        NOTE: Granite models do not use a special token to start their
        reasoning and response sections; instead they have token sequences,
        e.g.,

            Here is my thought process: Foo Here is my response: Bar

        This increases the complexity of correctly handling streams, since we
        need to watch for specific sequences and correctly parse them without
        dropping content that is potentially overlapping & spanning multiple
        delta messages.

        Args:
            previous_text (str): Previous text outside of this delta message.
            current_text (str): Previous text + delta text.
            delta_text (str): Text to consider and parse content from.
            previous_token_ids (Sequence[int]): Token IDs of previous_text.
            current_token_ids (Sequence[int]): Token IDs of current_text.
            delta_token_ids (Sequence[int]): Token IDs of delta_text.

        Returns:
            Union[DeltaMessage, None]: DeltaMessage with either reasoning
            content or content, or None.
        """
        reasoning_content, resp_seq_len, content = self._get_content_sections(
            current_text)
        # Either we haven't finished the start of the reasoning sequence,
        # or the model is generating something unexpected.
        if not reasoning_content:
            delta_message = self._get_delta_message_with_no_reasoning_bounds(
                current_text, delta_text)
        # We have a start of reasoning message, but have not yet finished
        # the start of response sequence.
        elif not content:
            delta_message = self._get_delta_message_with_no_response_bounds(
                current_text, reasoning_content, delta_text)
        # We've finished both the start of reasoning and start of response seq.
        else:
            # This should never happen since we matched on the response
            assert resp_seq_len is not None
            delta_message = self._get_delta_message_with_both_bounds(
                delta_text, reasoning_content, content, current_text,
                resp_seq_len)
        if not delta_message.content and not delta_message.reasoning_content:
            return None
        return delta_message

    #### Implementation details of stream parsing for granite models
    def _is_reasoning_start_substr(self, text: str) -> bool:
        """Check if a text matches one of the possible start reasoning seqs.

        Args:
            text (str): Text to check for leading substr.

        Returns:
            bool: True if any of the possible reasoning start seqs match.
        """
        return any(
            think_start.startswith(text)
            for think_start in self.valid_think_starts)

    def _is_response_start_substr(self, text: str) -> bool:
        """Check if a text matches one of the possible start response seqs.

        Args:
            text (str): Text to check for leading substr.

        Returns:
            bool: True if any of the possible response start seqs match.
        """
        return any(
            response_start.startswith(text)
            for response_start in self.valid_response_starts)

    def _get_delta_message_with_no_reasoning_bounds(
        self,
        current_text: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has not yet completed
        its start of reasoning sequence.

        Args:
            current_text (str): The full previous + delta text.
            delta_text (str): Text to consider and parse content from.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        prev_longest_length = len(current_text) - len(delta_text)
        is_substr = self._is_reasoning_start_substr(current_text)
        was_substr = self._is_reasoning_start_substr(
            current_text[:prev_longest_length])

        # Check if we just generated something NOT in the special token seq;
        # if so, add everything that we previously skipped with this delta
        # message and append everything to content in the future.
        if was_substr and not is_substr:
            return DeltaMessage(
                reasoning_content=None,
                content=current_text,
            )
        if is_substr:
            # Might still be in the special token sequence; return nothing
            return DeltaMessage(reasoning_content=None, content=None)
        # Otherwise the sequence has already been broken and we already
        # corrected; just return the delta text as normal content.
        return DeltaMessage(reasoning_content=None, content=delta_text)

    def _get_delta_message_with_no_response_bounds(
        self,
        current_text: str,
        reasoning_content: str,
        delta_text: str,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has reasoning
        content but no (response) content. NOTE that we may have overlapping
        tokens with the start of reasoning / start of response sequences on
        either side of the delta text.

        Args:
            current_text (str): The full previous + delta text.
            reasoning_content (str): reasoning content from current_text.
            delta_text (str): Text to consider and parse content from.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        # If we have no reasoning content or explicitly end with the start of
        # response sequence, we are in transition to the response; need to be
        # careful here, since the final token (:) will match the reasoning
        # content and fully parse it out; we should not pass the : back.
        ends_with_start_response_seq = any(
            current_text.endswith(response_start)
            for response_start in self.valid_response_starts)
        if reasoning_content is None or ends_with_start_response_seq:
            return DeltaMessage(reasoning_content=None, content=None)

        # Consider previous / current text only within context of the reasoning
        previous_text = reasoning_content[:-len(delta_text)]
        current_text = reasoning_content

        # We need to be careful about adding unfinished response sequences;
        # find the place at which we MIGHT be starting a response sequence
        prev_idx = previous_text.rfind(self.seq_boundary_start)
        delta_idx = delta_text.rfind(self.seq_boundary_start)

        # Check the state of potential start of response substring matches.
        prev_was_substr = self._is_response_start_substr(
            previous_text[prev_idx:]) if prev_idx >= 0 else False
        delta_continues_substr = self._is_response_start_substr(
            current_text[prev_idx:]) if prev_idx >= 0 else False
        delta_new_substr = self._is_response_start_substr(
            delta_text[delta_idx:]) if delta_idx >= 0 else False

        # Delta only contains potential continued response sequence text.
        if delta_continues_substr:
            return DeltaMessage(reasoning_content=None, content=None)

        if not prev_was_substr:
            # Delta may be starting a new response seq but has other text too.
            if delta_new_substr:
                return DeltaMessage(reasoning_content=delta_text[:delta_idx],
                                    content=None)
            # Normal case for most reasoning text (no potential special seqs).
            return DeltaMessage(reasoning_content=delta_text, content=None)
        # The substring that previously seemed to be a potential response
        # seq wasn't one; we need to add the content to the delta message,
        # and also slice off the potential response sequence
        elif delta_new_substr:
            reasoning_content = previous_text[
                prev_idx:] + delta_text[:delta_idx]
            return DeltaMessage(reasoning_content=reasoning_content,
                                content=None)
        # No new substring yet, and we broke our old one; take the whole delta
        return DeltaMessage(
            reasoning_content=previous_text[prev_idx:] + delta_text,
            content=None,
        )

    def _get_delta_message_with_both_bounds(
        self,
        delta_text: str,
        reasoning_content: str,
        response_content: str,
        current_text: str,
        response_seq_len: int,
    ) -> DeltaMessage:
        """Parse the delta message when the current text has both reasoning
        content and normal (response) content.

        Args:
            delta_text: Text to consider and parse content from.
            reasoning_content: reasoning content from current_text.
            response_content: response content from current_text.
            current_text: The full previous + delta text.
            response_seq_len: Len of the complete response sequence used.

        Returns:
            DeltaMessage: Message containing the parsed content.
        """
        # Always have content; take length to the end
        delta_content = delta_text[-len(response_content):]
        reasoning_end_idx = len(delta_text) - (len(response_content) +
                                               response_seq_len)

        if reasoning_end_idx < 0:
            delta_reasoning_content = None
        else:
            # Get the starting offset
            start_reasoning_content_idx = len(
                reasoning_content) + response_seq_len + len(
                    response_content) - 1
            delta_offset = len(current_text) - len(delta_text)
            start_offset = start_reasoning_content_idx - delta_offset
            if start_offset < 0:
                start_offset = 0
            delta_reasoning_content = delta_text[
                start_offset:reasoning_end_idx]

        return DeltaMessage(
            reasoning_content=delta_reasoning_content,
            content=delta_content,
        )

    def _get_content_sections(
        self, current_text: str
    ) -> tuple[Optional[str], Optional[int], Optional[str]]:
        """Parse the text to extract the reasoning content / content
        if we have them.

        Args:
            current_text (str): The full previous + delta text.

        Returns:
            tuple[Optional[str], Optional[int], Optional[str]]: Tuple of len 3
            containing the reasoning content, the length of the response seq
            (if there is one) and the non-reasoning content.
        """
        current_chunk_start = 0
        start_reasoning_content = None
        parsed_content = False
        delimiter_idxs = [
            idx for idx, char in enumerate(current_text)
            if char == self.seq_boundary_end
        ]

        for current_chunk_end in delimiter_idxs:
            current_chunk = current_text[current_chunk_start:current_chunk_end]
            # Check to see if the start of reasoning seq is complete
            if start_reasoning_content is None:
                for think_start in self.valid_think_starts:
                    if current_chunk == think_start[:-1]:
                        start_reasoning_content = current_chunk_end + 1
                        current_chunk_start = current_chunk_end + 1
                        break

            # Check to see if the start of response seq is complete
            elif not parsed_content:
                for response_start in self.valid_response_starts:
                    if current_chunk[-len(response_start) +
                                     1:] == response_start[:-1]:
                        # Mark end of reasoning and start response content
                        # after the start of response sequence.
                        end_reasoning_content = current_chunk_end - len(
                            response_start)
                        reasoning_content = current_text[
                            start_reasoning_content:end_reasoning_content]
                        response_content = current_text[current_chunk_end + 1:]
                        return reasoning_content, len(
                            response_start), response_content

        if start_reasoning_content and not parsed_content:
            return current_text[start_reasoning_content:], None, None
        return None, None, None
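
For reference, a self-contained check of the reasoning regex built in the constructor above, on toy text:

import regex as re

pattern = re.compile(
    r"(?:Here's|Here is) my thought process:(.*?)"
    r"(?:Here's|Here is) my response:(.*)", re.DOTALL)
m = pattern.findall(
    "Here is my thought process: Foo Here is my response: Bar")
# m == [(' Foo ', ' Bar')]: group 1 is the reasoning, group 2 the response.
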
vllm/reasoning/hunyuan_a13b_reasoning_parser.py (Normal file, 245 lines)
@@ -0,0 +1,245 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

import regex as re
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("hunyuan_a13b")
class HunyuanA13BReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Hunyuan A13B model.

    This class implements a reasoning parser specifically designed for the
    Hunyuan A13B model. It is responsible for parsing and extracting
    structured reasoning and answer segments from model outputs that follow
    a specific pattern.

    Key features:
    - For non-streaming output, it recognizes and extracts reasoning
      ("think") and answer ("answer") sections from text using regular
      expressions.
    - For streaming, it watches for specific token id sequences to change
      the reasoning state, maintaining internal state to manage parsing
      across multiple tokens.

    think start: "<think>\n": [14023, 771, 397]
    think ends: "\n</think>\n<answer>\n": [198, 524, 27963, 397, 27, 9399, 397]
    response ends: "\n</answer>": [524, 9399, 29]
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.think_start_expr = r"<think>\n"
        self.think_end_expr = r"\n</think>\n"

        self.response_start_expr = r"\n</think>\n<answer>\n"
        self.response_end_expr = r"\n</answer>"

        self.full_match_reasoning_regex = re.compile(
            rf"(?:{self.think_start_expr}(.*?){self.response_start_expr})?"
            rf"(.*?){self.response_end_expr}", re.DOTALL)

        self.half_match_reasoning_regex = re.compile(
            rf"{self.think_start_expr}(.*?){self.response_start_expr}(.*)",
            re.DOTALL)

        self.think_start_ids = [14023, 771, 397]
        self.think_start_ids_fast = [14023, 771, 1363]
        self.response_start_ids = [198, 524, 27963, 397, 27, 9399, 397]
        self.response_start_ids_fast = [524, 27963, 397, 27, 9399, 397]
        self.response_end_ids = [198, 524, 9399, 29]
        self.fast_think_ids = [
            14023, 771, 1363, 524, 27963, 397, 27, 9399, 397
        ]

        # when the state changes, send out all the text buffered in the
        # previous state
        self.buffered_text = []
        self.buffered_ids = []

        self.current_state = "reasoning"
        self.all_states = ["reasoning", "response"]

        self.current_state = "idle"
        self.expected_sequence = self.think_start_ids
        # this sequence is only for the think start; it has two ways to start.
        self.expected_sequence_side = self.think_start_ids_fast
        self.sequence_index = 0
        self.token_buffer = []
        self.text_buffer = ""

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.current_state == "response"

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # For hunyuan streaming reasoning parsing, the stream parser runs
        # first, and the same token is then passed to both is_reasoning_end
        # and extract_content_ids. That token is not part of the content,
        # so just return [] here.
        return []

    def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[Optional[str], Optional[str]]:
        """Extract the reasoning content & content sections, respectively.
        If the sequence doesn't match what we expect, i.e., the model generates
        something else, all content is considered non-reasoning content.

        Args:
            model_output (str): Output of the model to be parsed.
            request (ChatCompletionRequest): Request being processed.

        Returns:
            tuple[Optional[str], Optional[str]]: Tuple pair containing the
            reasoning content and non-reasoning content.
        """

        re_match = self.full_match_reasoning_regex.findall(model_output)
        if re_match:
            reasoning_content, response_content = re_match[0]
            if len(reasoning_content) == 0:
                reasoning_content = None
            if len(response_content) == 0:
                response_content = None
            return reasoning_content, response_content

        fallback_regex = self.half_match_reasoning_regex
        fallback_match = fallback_regex.findall(model_output)
        if fallback_match:
            reasoning_content, response_content = fallback_match[0]

            if response_content.endswith(self.response_end_expr):
                response_content = response_content[:-len(self.
                                                          response_end_expr)]

            if len(reasoning_content) == 0:
                reasoning_content = None
            if len(response_content) == 0:
                response_content = None

            return reasoning_content, response_content

        return None, model_output

    def _is_strict_increasing_subsequence(self, subsequence: Sequence[int],
                                          sequence: Sequence[int]) -> bool:
        if not subsequence:
            return False

        sub_idx = 0
        for num in sequence:
            if sub_idx < len(subsequence) and num == subsequence[sub_idx]:
                sub_idx += 1
        return sub_idx == len(subsequence)

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """Extract content using a token-id-sequence state machine."""
        # Define sequences
        think_start_sequence = self.think_start_ids
        response_start_sequence = self.response_start_ids
        response_end_sequence = self.response_end_ids

        assert (len(delta_token_ids) == 1)
        # Process each token in the delta
        token = delta_token_ids[0]

        def check_token_with_sequence(token):
            if self.current_state == "idle" or self.current_state == "think":
                return (token == self.expected_sequence[self.sequence_index]
                        or token
                        == self.expected_sequence_side[self.sequence_index])
            else:
                return token == self.expected_sequence[self.sequence_index]

        def check_last_token(token):
            if self.current_state == "idle" or self.current_state == "think":
                # only return true if it was matched using a side sequence.
                if (self.sequence_index - 1 < len(self.expected_sequence_side)
                        and token
                        == self.expected_sequence_side[self.sequence_index -
                                                       1]):
                    return self.sequence_index == len(
                        self.expected_sequence_side)
                else:
                    return self.sequence_index == len(self.expected_sequence)
            else:
                return self.sequence_index == len(self.expected_sequence)

        # Check if token matches expected sequence
        token_in_state_seq = check_token_with_sequence(token)

        if token_in_state_seq:
            # Store matching token
            self.token_buffer.append(token)
            self.text_buffer += delta_text
            self.sequence_index += 1
            ## state change from idle->think->response->idle

            # Check if sequence fully matched
            if check_last_token(token):
                # State transition
                if self.current_state == "idle":
                    self.current_state = "think"
                    self.expected_sequence = response_start_sequence
                    self.expected_sequence_side = self.response_start_ids_fast
                elif self.current_state == "think":
                    self.current_state = "response"
                    self.expected_sequence = response_end_sequence
                elif self.current_state == "response":
                    self.current_state = "idle"
                    self.expected_sequence = think_start_sequence
                    self.expected_sequence_side = self.think_start_ids_fast

                # Reset matching state
                self.sequence_index = 0
                self.token_buffer = []
                self.text_buffer = ""
                # Do not send content for state transition texts.
        else:
            # Sequence broken - handle buffered content
            if self.token_buffer and len(self.token_buffer) > 0:
                # Send buffered tokens
                buffered_content = self.text_buffer + delta_text
                # Reset matching state
                self.sequence_index = 0
                self.token_buffer = []
                self.text_buffer = ""

                # Return content based on current state
                if self.current_state == "think":
                    return DeltaMessage(reasoning_content=buffered_content,
                                        content=None)
                else:
                    return DeltaMessage(reasoning_content=None,
                                        content=buffered_content)
            else:
                # No buffered content, send normally
                if self.current_state == "think":
                    return DeltaMessage(reasoning_content=delta_text,
                                        content=None)
                else:
                    return DeltaMessage(reasoning_content=None,
                                        content=delta_text)

        # If no content to send in this delta
        return None
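
A self-contained check of the full-match regex above on a toy completion (literal tags rather than the token ids):

import regex as re

think_start = r"<think>\n"
resp_start = r"\n</think>\n<answer>\n"
resp_end = r"\n</answer>"
full = re.compile(rf"(?:{think_start}(.*?){resp_start})?(.*?){resp_end}",
                  re.DOTALL)
text = "<think>\nsome reasoning\n</think>\n<answer>\nthe answer\n</answer>"
print(full.findall(text))  # -> [('some reasoning', 'the answer')]
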
vllm/reasoning/mistral_reasoning_parser.py (Normal file, 56 lines)
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from functools import cached_property

from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.reasoning.deepseek_r1_reasoning_parser import (
    DeepSeekR1ReasoningParser)
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer

logger = init_logger(__name__)


@ReasoningParserManager.register_module("mistral")
class MistralReasoningParser(DeepSeekR1ReasoningParser):
    """
    Reasoning parser for Mistral models.

    Mistral models use [THINK]...[/THINK] tokens to denote reasoning text.
    This parser extracts the reasoning content from the model output.
    """

    def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs):
        if not isinstance(tokenizer, MistralTokenizer):
            raise ValueError(
                "The tokenizer must be an instance of MistralTokenizer.")

        ReasoningParser.__init__(self, tokenizer, *args, **kwargs)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction.")

        self.start_token_id = tokenizer.tokenizer.get_control_token(
            self.start_token)
        self.end_token_id = tokenizer.tokenizer.get_control_token(
            self.end_token)

        if self.start_token_id is None or self.end_token_id is None:
            raise RuntimeError(
                "Mistral reasoning parser could not locate think start/end "
                "tokens in the tokenizer!")

    @cached_property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        from mistral_common.tokens.tokenizers.base import SpecialTokens
        return SpecialTokens.begin_think

    @cached_property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        from mistral_common.tokens.tokenizers.base import SpecialTokens
        return SpecialTokens.end_think
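
The control-token strings come from mistral_common; a quick check, with the printed values per the docstring above stated as an assumption:

from mistral_common.tokens.tokenizers.base import SpecialTokens

# Expected to print the [THINK] / [/THINK] markers described above.
print(SpecialTokens.begin_think, SpecialTokens.end_think)
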
vllm/reasoning/qwen3_reasoning_parser.py (Normal file, 72 lines)
@@ -0,0 +1,72 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional, Union

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              ResponsesRequest)
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser


@ReasoningParserManager.register_module("qwen3")
class Qwen3ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the Qwen3 model.

    The Qwen3 model uses <think>...</think> tokens to denote reasoning text
    within its output. The model provides a strict switch to disable reasoning
    output via the 'enable_thinking=False' parameter. This parser extracts the
    reasoning content enclosed by <think> and </think> tokens from the model's
    output.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_content(
        self, model_output: str, request: Union[ChatCompletionRequest,
                                                ResponsesRequest]
    ) -> tuple[Optional[str], Optional[str]]:
        """
        Extract reasoning content from the model output.

        Qwen3 has stricter requirements than most parsers: both the start
        and end tokens must be present, unlike other models that work with
        just the end token.

        For the text <think>abc</think>xyz:
        - 'abc' goes to reasoning_content
        - 'xyz' goes to content

        Returns:
            tuple[Optional[str], Optional[str]]: reasoning content and content
        """

        # Check that the model output contains both <think> and </think>.
        if (self.start_token not in model_output
                or self.end_token not in model_output):
            return None, model_output

        # Drop the <think> token, and anything before it, if present.
        model_output_parts = model_output.partition(self.start_token)
        model_output = model_output_parts[2] if model_output_parts[
            1] else model_output_parts[0]

        # If no </think> token remains after the <think> token,
        # return the model output as is.
        if self.end_token not in model_output:
            return None, model_output

        # Extract reasoning content from the model output.
        reasoning_content, _, content = model_output.partition(self.end_token)

        final_content = content or None
        return reasoning_content, final_content
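The both-tokens rule is easy to mis-remember, so here is a runnable, dependency-free mimic of the method above; mirror_qwen3_split is a hypothetical stand-in for Qwen3ReasoningParser.extract_reasoning_content, not part of vLLM:

from typing import Optional


def mirror_qwen3_split(text: str) -> tuple[Optional[str], Optional[str]]:
    """Hypothetical mimic of Qwen3's extract_reasoning_content."""
    start, end = "<think>", "</think>"
    # Qwen3 requires BOTH tokens; otherwise everything is plain content.
    if start not in text or end not in text:
        return None, text
    _, sep, rest = text.partition(start)
    text = rest if sep else text     # drop everything up to <think>
    if end not in text:
        return None, text
    reasoning, _, content = text.partition(end)
    return reasoning, content or None


assert mirror_qwen3_split("<think>abc</think>xyz") == ("abc", "xyz")
assert mirror_qwen3_split("abc</think>xyz") == (None, "abc</think>xyz")
assert mirror_qwen3_split("<think>abc</think>") == ("abc", None)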
28
vllm/reasoning/seedoss_reasoning_parser.py
Normal file
@@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser


@ReasoningParserManager.register_module("seed_oss")
class SeedOSSReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for the SeedOSS model.

    The SeedOSS model uses <seed:think>...</seed:think> tokens to denote
    reasoning text. This parser extracts the reasoning content from the
    model output. Like DeepSeek R1, it supports cases where the model
    doesn't generate the start token.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<seed:think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</seed:think>"
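As the docstring notes, the inherited base-class logic accepts output where only the end token appears. A quick illustration of that claim using plain string handling; the real logic lives in BaseThinkingReasoningParser:

# If the model skips <seed:think> but emits </seed:think>, everything
# before the end token is still treated as reasoning content.
text = "weigh the options</seed:think>final answer"
reasoning, _, content = text.partition("</seed:think>")
print((reasoning, content))  # ('weigh the options', 'final answer')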
109
vllm/reasoning/step3_reasoning_parser.py
Normal file
@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

import regex as re
from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("step3")
class Step3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Step3 model.

    The Step3 model uses the </think> token to denote the end of reasoning
    text. This parser extracts all content before </think> as reasoning
    content.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.think_end_token = "</think>"

        self.reasoning_regex = re.compile(rf"(.*?){self.think_end_token}",
                                          re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor during construction.")

        self.think_end_token_id = self.vocab.get(self.think_end_token)
        if self.think_end_token_id is None:
            raise RuntimeError(
                "Step3 reasoning parser could not locate think end "
                "token in the tokenizer!")

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        For the text "abc</think>xyz":
        - 'abc' goes to reasoning_content
        - 'xyz' goes to content
        """
        # Skip a delta that is just the single </think> special token.
        if (len(delta_token_ids) == 1
                and delta_token_ids[0] == self.think_end_token_id):
            return None

        if self.think_end_token_id in delta_token_ids:
            # </think> is in this delta: split it into reasoning content
            # and remaining content.
            end_index = delta_text.find(self.think_end_token)
            reasoning_content = delta_text[:end_index]
            content = delta_text[end_index + len(self.think_end_token):]
            return DeltaMessage(reasoning_content=reasoning_content,
                                content=content if content else None)
        elif self.think_end_token_id in previous_token_ids:
            # </think> already seen in previous text, everything is content.
            return DeltaMessage(content=delta_text)
        else:
            # No </think> seen yet, everything is reasoning.
            return DeltaMessage(reasoning_content=delta_text)
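A runnable sketch of the three streaming branches above, using plain strings in place of token IDs; classify_delta is illustrative only, not part of vLLM:

from typing import Optional


def classify_delta(previous_text: str,
                   delta_text: str) -> Optional[dict]:
    """Hypothetical mimic of the branch logic in the streaming method."""
    end = "</think>"
    if delta_text == end:
        return None  # a lone end token produces no delta message
    if end in delta_text:
        reasoning, _, content = delta_text.partition(end)
        return {"reasoning_content": reasoning, "content": content or None}
    if end in previous_text:
        return {"content": delta_text}  # reasoning already closed earlier
    return {"reasoning_content": delta_text}  # still inside reasoning


assert classify_delta("abc", "def</think>xyz") == \
    {"reasoning_content": "def", "content": "xyz"}
assert classify_delta("abc</think>", "more") == {"content": "more"}
assert classify_delta("abc", "more") == {"reasoning_content": "more"}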
    def extract_reasoning_content(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[Optional[str], Optional[str]]:
        """Split a complete model output into reasoning and content."""

        # Check if the model output contains the </think> token.
        if self.think_end_token not in model_output:
            # If there is no </think> token, everything is reasoning content.
            return model_output, None
        else:
            # Find the first occurrence of </think>.
            end_index = model_output.find(self.think_end_token)
            reasoning_content = model_output[:end_index]

            # Content after the </think> token.
            content = model_output[end_index + len(self.think_end_token):]

            if len(content) == 0:
                content = None

            return reasoning_content, content

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        return self.think_end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        if self.think_end_token_id not in input_ids[:-1]:
            return []
        else:
            return input_ids[input_ids.index(self.think_end_token_id) + 1:]
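For contrast with Qwen3, the batch path above treats output with no </think> token as pure reasoning rather than pure content. A runnable mimic of that rule; mirror_step3_split is illustrative, not a vLLM API:

from typing import Optional


def mirror_step3_split(text: str) -> tuple[Optional[str], Optional[str]]:
    """Hypothetical mimic of Step3's extract_reasoning_content."""
    end = "</think>"
    if end not in text:
        # Unlike Qwen3, Step3 treats token-less output as pure reasoning.
        return text, None
    reasoning, _, content = text.partition(end)  # split on first </think>
    return reasoning, content or None


assert mirror_step3_split("abc</think>xyz") == ("abc", "xyz")
assert mirror_step3_split("no end token here") == ("no end token here", None)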