Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

@@ -0,0 +1,20 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .protocol import TokenizerLike
from .registry import (
TokenizerRegistry,
cached_get_tokenizer,
cached_tokenizer_from_config,
get_tokenizer,
init_tokenizer_from_config,
)
# Public API of the tokenizer package.
# NOTE(review): ordering differs slightly from the import list above
# ("get_tokenizer" before "cached_tokenizer_from_config"); harmless, but
# matching the import order would make the two lists easier to diff.
__all__ = [
    "TokenizerLike",
    "TokenizerRegistry",
    "cached_get_tokenizer",
    "get_tokenizer",
    "cached_tokenizer_from_config",
    "init_tokenizer_from_config",
]

View File

@@ -0,0 +1,175 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import Any
from transformers import BatchEncoding
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from .deepseek_v32_encoding import encode_messages
from .hf import CachedHfTokenizer
from .protocol import TokenizerLike
class DeepseekV32Tokenizer(CachedHfTokenizer):
    """Tokenizer wrapper for DeepSeek-V3.2 models.

    Overrides chat-template handling to use the model's custom DSML-based
    prompt encoding (`encode_messages`) instead of an HF Jinja chat template,
    while delegating all token-level operations to the wrapped HF tokenizer.
    """

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "TokenizerLike":
        """Load the underlying HF tokenizer and wrap it in this class."""
        tokenizer = super().from_pretrained(
            path_or_repo_id,
            *args,
            trust_remote_code=trust_remote_code,
            revision=revision,
            download_dir=download_dir,
            **kwargs,
        )
        # Fix: was `DeepseekV32Tokenizer(tokenizer)`, which silently broke
        # subclassing of this tokenizer; `cls` honors the actual class.
        return cls(tokenizer)

    def __init__(self, tokenizer: TokenizerLike) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.name_or_path = getattr(tokenizer, "name_or_path", "")
        # Cache the added vocab once; it is fixed for a loaded tokenizer.
        self._added_vocab = self.tokenizer.get_added_vocab()
        self._added_vocab_size = len(self._added_vocab)

    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> str | list[int]:
        """Render `messages` via the DeepSeek-V3.2 DSML chat encoding.

        Recognized kwargs: `thinking` (bool), `conversation` (overrides
        `messages`), `tokenize` (default True), `truncation`, `max_length`.
        Returns token ids when `tokenize` is true, else the prompt string.
        """
        thinking = kwargs.get("thinking", False)
        thinking_mode = "thinking" if thinking else "chat"
        conversation = kwargs.get("conversation", messages)
        # Copy so that the caller's message list is not mutated below.
        messages = conversation.copy()
        if tools:
            # Tools are injected via a synthetic leading system message.
            messages.insert(0, {"role": "system"})
            messages[0]["tools"] = tools  # type: ignore[typeddict-unknown-key]
        # Historical reasoning content is dropped when a new user message is
        # introduced.
        drop_thinking = messages[-1]["role"] == "user"
        prompt_str = encode_messages(  # type: ignore
            messages,
            thinking_mode=thinking_mode,
            drop_thinking=drop_thinking,
        )
        if kwargs.get("tokenize", True):
            tokenizer_kwargs = {
                k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
            }
            return self.encode(
                prompt_str,
                add_special_tokens=False,
                **tokenizer_kwargs,
            )
        return prompt_str

    def num_special_tokens_to_add(self) -> int:
        # Number of special tokens the tokenizer adds to an empty encode.
        return len(self.encode(""))

    @property
    def all_special_tokens(self) -> list[str]:
        return self.tokenizer.all_special_tokens

    @property
    def all_special_ids(self) -> list[int]:
        return self.tokenizer.all_special_ids

    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self) -> int:
        return self.tokenizer.pad_token_id

    @property
    def is_fast(self) -> bool:
        return self.tokenizer.is_fast

    @property
    def vocab_size(self) -> int:
        return self.tokenizer.vocab_size

    @property
    def max_token_id(self) -> int:
        return self.tokenizer.max_token_id

    @property
    def truncation_side(self) -> str:
        return self.tokenizer.truncation_side

    def __hash__(self) -> int:
        # Identity-based hash: each wrapper instance is distinct.
        return hash(id(self))

    def __len__(self) -> int:
        # </think> is an added token in DeepseekV32 tokenizer
        return self.vocab_size + self._added_vocab_size

    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        return self.tokenizer(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
        )

    def get_vocab(self) -> dict[str, int]:
        return self.tokenizer.get_vocab()

    def get_added_vocab(self) -> dict[str, int]:
        # Return a copy so callers cannot mutate the cached mapping.
        return self._added_vocab.copy()

    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        return self.tokenizer.encode(
            text,
            truncation=truncation,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
        )

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return self.tokenizer.convert_tokens_to_string(tokens)

    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)

    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        return self.tokenizer.convert_ids_to_tokens(
            ids, skip_special_tokens=skip_special_tokens
        )

View File

@@ -0,0 +1,459 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
import copy
import json
from typing import Any
import regex as re
# flake8: noqa: E501
# System-prompt section describing the DSML tool-calling syntax; filled in by
# `render_tools` with the JSON schemas of the available tools.
TOOLS_SYSTEM_TEMPLATE = """## Tools
You have access to a set of tools you can use to answer the user's question.
You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
<{dsml_token}function_calls>
<{dsml_token}invoke name="$FUNCTION_NAME">
<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
...
</{dsml_token}invoke>
<{dsml_token}invoke name="$FUNCTION_NAME2">
...
</{dsml_token}invoke>
</{dsml_token}function_calls>
String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
<{dsml_token}function_calls>
...
</{dsml_token}function_calls>
<function_results>
...
</function_results>
{thinking_start_token}...thinking about results{thinking_end_token}
Here are the functions available in JSONSchema format:
<functions>
{tool_schemas}
</functions>
"""
# Special tokens used by the DeepSeek-V3.2 encoding.
# NOTE(review): DeepSeek tokenizers commonly spell sentence sentinels with
# fullwidth bars (e.g. "<｜begin▁of▁sentence｜>"); confirm these literals match
# the actual tokenizer vocabulary before relying on them.
bos_token: str = "<begin▁of▁sentence>"
eos_token: str = "<end▁of▁sentence>"
thinking_start_token: str = "<think>"
thinking_end_token: str = "</think>"
# Prefix that namespaces all tool-calling markup tags (e.g. <DSMLinvoke>).
dsml_token: str = "DSML"
# Per-role rendering templates used by `render_message`.
system_msg_template: str = "{content}"
user_msg_template: str = "<User>{content}<Assistant>"
assistant_msg_template: str = "{reasoning}{content}{tool_calls}<end▁of▁sentence>"
thinking_template = "{reasoning_content}"
response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
tool_call_template: str = (
    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
)
tool_calls_template = (
    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
)
tool_output_template: str = "\n<result>{content}</result>"
def to_json(value: Any) -> str:
    """Serialize *value* to JSON, preferring unescaped (non-ASCII) output."""
    try:
        return json.dumps(value, ensure_ascii=False)
    except Exception:
        pass
    # Fall back to ASCII-escaped output if the unescaped dump fails.
    return json.dumps(value, ensure_ascii=True)
def tools_from_openai_format(tools):
    """Strip the OpenAI wrapper, keeping only each tool's `function` payload."""
    return [entry["function"] for entry in tools]
def tool_calls_from_openai_format(tool_calls):
    """Convert OpenAI-style tool calls into flat {name, arguments} dicts."""
    flattened = []
    for call in tool_calls:
        fn = call["function"]
        flattened.append({"name": fn["name"], "arguments": fn["arguments"]})
    return flattened
def tool_calls_to_openai_format(tool_calls):
    """Wrap flat {name, arguments} tool calls back into OpenAI format."""

    def _wrap(call):
        return {
            "type": "function",
            "function": {"name": call["name"], "arguments": call["arguments"]},
        }

    return [_wrap(call) for call in tool_calls]
def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
    """Render a tool call's arguments as DSML <parameter> lines."""
    args = tool_call["arguments"]
    # Arguments may arrive as a JSON string or as an already-parsed mapping.
    if isinstance(args, str):
        args = json.loads(args)
    rendered_params = []
    for key, value in args.items():
        if isinstance(value, str):
            is_str, rendered = "true", value
        else:
            is_str, rendered = "false", to_json(value)
        rendered_params.append(
            f'<{dsml_token}parameter name="{key}" string="{is_str}">'
            f"{rendered}</{dsml_token}parameter>"
        )
    return "\n".join(rendered_params)
def decode_dsml_to_arguments(
    tool_name: str, tool_args: dict[str, tuple[str, str]]
) -> dict[str, str]:
    """Rebuild a {name, arguments} tool call from parsed DSML parameters.

    Each entry of *tool_args* maps a parameter name to ``(raw_value,
    is_string)`` where ``is_string`` is the literal "true"/"false" attribute.
    """

    def _dumps(value):
        # Same behavior as the module-level `to_json` helper (inlined here).
        try:
            return json.dumps(value, ensure_ascii=False)
        except Exception:
            return json.dumps(value, ensure_ascii=True)

    pairs = []
    for key, (raw, is_string) in tool_args.items():
        # String-typed values get JSON-quoted; other types are kept verbatim
        # (they were already serialized as JSON when the call was rendered).
        rendered = _dumps(raw) if is_string == "true" else raw
        pairs.append(f"{_dumps(key)}: {rendered}")
    return dict(name=tool_name, arguments="{" + ", ".join(pairs) + "}")
def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
    """Render the tools system-prompt section from the tool JSON schemas."""
    schemas = "\n".join(to_json(tool) for tool in tools)
    return TOOLS_SYSTEM_TEMPLATE.format(
        tool_schemas=schemas,
        dsml_token=dsml_token,
        thinking_start_token=thinking_start_token,
        thinking_end_token=thinking_end_token,
    )
def find_last_user_index(messages: list[dict[str, Any]]) -> int:
    """Return the index of the last user/developer message, or -1 if none."""
    for pos in reversed(range(len(messages))):
        if messages[pos].get("role") in ("user", "developer"):
            return pos
    return -1
def render_message(
    index: int, messages: list[dict[str, Any]], thinking_mode: str
) -> str:
    """Render ``messages[index]`` into its prompt-string fragment.

    Dispatches on the message role (system / developer / user / tool /
    assistant) and applies the module-level templates. In "thinking" mode the
    thinking-start token is emitted after the last user turn; earlier turns
    get the thinking-end token instead.

    Raises:
        NotImplementedError: for an unknown role.
        AssertionError: on malformed message sequences.
    """
    assert 0 <= index < len(messages)
    assert thinking_mode in ["chat", "thinking"], (
        f"Invalid thinking_mode `{thinking_mode}`"
    )
    prompt = ""
    msg = messages[index]
    last_user_idx = find_last_user_index(messages)
    role = msg.get("role")
    content = msg.get("content")
    tools = msg.get("tools")
    response_format = msg.get("response_format")
    tool_calls = msg.get("tool_calls")
    # Accept either key; "reasoning" takes precedence when both are present.
    reasoning_content = msg.get("reasoning") or msg.get("reasoning_content")
    if tools:
        tools = tools_from_openai_format(tools)
    if tool_calls:
        tool_calls = tool_calls_from_openai_format(tool_calls)
    if role == "system":
        prompt += system_msg_template.format(content=content or "")
        if tools:
            prompt += "\n\n" + render_tools(tools)
        if response_format:
            prompt += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
    elif role == "developer":
        # Developer messages are rendered as user turns whose body embeds the
        # tools / response-format sections plus the original content.
        assert content, f"Invalid message for role `{role}`: {msg}"
        content_developer = ""
        if tools:
            content_developer += "\n\n" + render_tools(tools)
        if response_format:
            content_developer += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
        content_developer += "\n\n# The user's message is: {}".format(content)
        prompt += user_msg_template.format(content=content_developer)
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "user":
        prompt += user_msg_template.format(content=content)
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "tool":
        # Walk back to the assistant message that issued this tool call;
        # consecutive tool messages all belong to the same assistant turn.
        prev_assistant_idx = index - 1
        assistant_msg = messages[prev_assistant_idx]
        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
            prev_assistant_idx -= 1
            assistant_msg = messages[prev_assistant_idx]
        assert (
            index == 0
            or prev_assistant_idx >= 0
            and assistant_msg.get("role") == "assistant"
        ), f"Invalid messages at {index}:\n{assistant_msg}"
        # 1-based position of this result within the assistant's tool calls.
        tool_call_order = index - prev_assistant_idx
        assistant_tool_calls = assistant_msg.get("tool_calls")
        assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, (
            "No tool calls but found tool output"
        )
        # Open the results block before the first result, close after the last.
        if tool_call_order == 1:
            prompt += "\n\n<function_results>"
        prompt += tool_output_template.format(content=content)
        if tool_call_order == len(assistant_tool_calls):
            prompt += "\n</function_results>"
            if index >= last_user_idx and thinking_mode == "thinking":
                prompt += "\n\n" + thinking_start_token
            else:
                prompt += "\n\n" + thinking_end_token
    elif role == "assistant":
        prev_assistant_idx = index
        thinking_part = ""
        tool_calls_content = ""
        if tool_calls:
            tool_calls = [
                tool_call_template.format(
                    dsml_token=dsml_token,
                    name=tool_call.get("name"),
                    arguments=encode_arguments_to_dsml(tool_call),
                )
                for tool_call in tool_calls
            ]
            tool_calls_content += "\n\n" + tool_calls_template.format(
                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
            )
        summary_content = content or ""
        # Reasoning is only rendered for assistant turns after the last user
        # message (earlier turns have their reasoning dropped upstream).
        if thinking_mode == "thinking" and index > last_user_idx:
            assert reasoning_content or tool_calls, (
                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
            )
            thinking_part = (
                thinking_template.format(reasoning_content=reasoning_content or "")
                + thinking_end_token
            )
        prompt += assistant_msg_template.format(
            reasoning=thinking_part,
            content=summary_content,
            tool_calls=tool_calls_content,
        )
    else:
        raise NotImplementedError(f"Unknown role: {role}")
    return prompt
def drop_thinking_messages(
messages: list[dict[str, Any]], last_user_idx: int | None = None
) -> list[dict[str, Any]]:
messages_wo_thinking: list[dict[str, Any]] = []
last_user_idx = (
find_last_user_index(messages) if last_user_idx is None else last_user_idx
)
for idx, msg in enumerate(messages):
role = msg.get("role")
if role in ["user", "system", "tool"] or idx >= last_user_idx:
messages_wo_thinking.append(msg)
continue
elif role == "assistant":
msg_wo_thinking = copy.copy(msg)
msg_wo_thinking.pop("reasoning_content", None)
msg_wo_thinking.pop("reasoning", None)
messages_wo_thinking.append(msg_wo_thinking)
return messages_wo_thinking
def encode_messages(
    messages: list[dict[str, Any]],
    thinking_mode: str,
    context: list[dict[str, Any]] | None = None,
    drop_thinking: bool = True,
    add_default_bos_token: bool = True,
) -> str:
    """Render *messages* (appended after *context*) into a prompt string.

    Only the *messages* entries are rendered; *context* entries contribute
    solely to positional decisions (e.g. locating the last user turn).
    """
    context = context if context else []
    full_messages = context + messages
    # BOS is prepended only when rendering from the very start of the prompt.
    prompt = bos_token if add_default_bos_token and len(context) == 0 else ""
    if thinking_mode == "thinking" and drop_thinking:
        # NOTE: the loop below indexes `full_messages` by original position,
        # so `drop_thinking_messages` must preserve the message count for the
        # indices to stay aligned.
        full_messages = drop_thinking_messages(full_messages)
    for idx in range(len(messages)):
        prompt += render_message(
            idx + len(context), full_messages, thinking_mode=thinking_mode
        )
    return prompt
def _read_until_stop(
index: int, text: str, stop: list[str]
) -> tuple[int, str, None | str]:
min_pos = len(text)
matched_stop = None
for s in stop:
pos = text.find(s, index)
if pos != -1 and pos < min_pos:
min_pos = pos
matched_stop = s
if matched_stop:
content = text[index:min_pos]
return min_pos + len(matched_stop), content, matched_stop
else:
content = text[index:]
return len(text), content, None
def parse_tool_calls(index: int, text: str):
    """Parse a DSML <function_calls> block starting at *index* in *text*.

    *index* must point just past the "<{dsml}function_calls" opener (before
    its closing ">\\n"). Returns ``(index, stop_token, tool_calls)`` where
    *index* points past the closing "</{dsml}function_calls>" tag.

    Raises AssertionError on any format deviation; no error recovery is
    attempted.
    """
    tool_calls: list[dict[str, Any]] = []
    stop_token = None
    tool_calls_end_token = f"</{dsml_token}function_calls>"
    while index < len(text):
        index, _, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
        )
        # Only the ">\n" closing the previous tag may precede the next
        # invoke / end marker.
        assert _ == ">\n", "Tool call format error"
        if stop_token == tool_calls_end_token:
            break
        assert stop_token is not None, "Missing special token"
        index, tool_name_content, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
        )
        # The invoke tag remainder must be exactly ` name="..."` plus ">\n".
        p_tool_name = re.findall(
            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
        )
        assert len(p_tool_name) == 1, "Tool name format error"
        tool_name = p_tool_name[0]
        tool_args: dict[str, tuple[str, str]] = {}
        # Collect <parameter name="..." string="true|false">value</parameter>
        # elements until the invoke close tag is reached.
        while stop_token == f"<{dsml_token}parameter":
            index, param_content, stop_token = _read_until_stop(
                index, text, [f"/{dsml_token}parameter"]
            )
            param_kv = re.findall(
                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
                param_content,
                flags=re.DOTALL,
            )
            assert len(param_kv) == 1, "Parameter format error"
            param_name, string, param_value = param_kv[0]
            assert param_name not in tool_args, "Duplicate parameter name"
            # Keep the raw value plus its "string" attribute for decoding.
            tool_args[param_name] = (param_value, string)
            index, content, stop_token = _read_until_stop(
                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
            )
            assert content == ">\n", "Parameter format error"
        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
        tool_calls.append(tool_call)
    return index, stop_token, tool_calls
# NOTE: This function is designed to parse only correctly
# formatted string and will not attempt to correct malformed output
# that may be generated by the model.
def parse_message_from_completion_text(text: str, thinking_mode: str):
    """Parse a raw completion into an assistant message dict.

    Expected layout: ``[reasoning</think>] summary [\\n\\n<DSML tool calls>]
    <eos>``. Returns a dict with `content`, `reasoning_content`/`reasoning`,
    and OpenAI-format `tool_calls`. Raises AssertionError on malformed input.
    """
    summary_content, reasoning_content, tool_calls = "", "", []
    index, stop_token = 0, None
    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
    is_thinking, is_tool_calling = thinking_mode == "thinking", False
    if is_thinking:
        # In thinking mode the completion must open with the reasoning block.
        index, content_delta, stop_token = _read_until_stop(
            index, text, [thinking_end_token, tool_calls_start_token]
        )
        reasoning_content = content_delta
        assert stop_token == thinking_end_token, "Invalid thinking format"
    index, content_delta, stop_token = _read_until_stop(
        index, text, [eos_token, tool_calls_start_token]
    )
    summary_content = content_delta
    if stop_token == tool_calls_start_token:
        is_tool_calling = True
    else:
        assert stop_token == eos_token, "Invalid summary format"
    if is_tool_calling:
        index, stop_token, tool_calls = parse_tool_calls(index, text)
        # Nothing but EOS may follow the function_calls block.
        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
        assert not tool_ends_text, "Unexpected content after tool calls"
    assert len(text) == index and stop_token in [eos_token, None], (
        "Unexpected content at end"
    )
    # Special tokens must never leak into the parsed content fields.
    for sp_token in [
        bos_token,
        eos_token,
        thinking_start_token,
        thinking_end_token,
        dsml_token,
    ]:
        assert sp_token not in summary_content and sp_token not in reasoning_content, (
            "Unexpected special token in content"
        )
    return {
        "role": "assistant",
        "content": summary_content,
        "reasoning_content": reasoning_content,
        "reasoning": reasoning_content,
        "tool_calls": tool_calls_to_openai_format(tool_calls),
    }

View File

@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.tokenizers import TokenizerLike
def _replace_none_with_empty(tokens: list[str | None]):
for i, token in enumerate(tokens):
if token is None:
tokens[i] = ""
def _convert_tokens_to_string_with_added_encoders(
    tokenizer: TokenizerLike,
    output_tokens: list[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    """Slow-path detokenization for tokenizers with added vocabulary.

    Added-vocab tokens must not be passed through `convert_tokens_to_string`
    together with regular tokens, so the stream is split into runs of regular
    tokens (converted in bulk) interleaved with added tokens (emitted as-is).

    Adapted from
    https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    """
    # Localize hot lookups once; the per-token loop below is the hot path.
    to_string = tokenizer.convert_tokens_to_string
    added_vocab_set = set(tokenizer.get_added_vocab())
    # Empty tuple when not skipping, so membership tests never match.
    special_set = set(tokenizer.all_special_tokens) if skip_special_tokens else ()
    pieces: list[str] = []
    run: list[str] = []
    for token in output_tokens:
        if token in special_set:
            continue
        if token not in added_vocab_set:
            run.append(token)
            continue
        # Flush the pending run of regular tokens, then emit the added token.
        if run:
            pieces.append(to_string(run))
            run = []
        pieces.append(token)
    if run:
        pieces.append(to_string(run))
    separator = " " if spaces_between_special_tokens else ""
    return separator.join(pieces)
# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
# Number of trailing prompt tokens kept as string context when starting
# incremental detokenization (see `convert_prompt_ids_to_tokens`).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
def convert_prompt_ids_to_tokens(
    tokenizer: TokenizerLike,
    prompt_ids: list[int],
    skip_special_tokens: bool = False,
) -> tuple[list[str], int, int]:
    """Convert the tail of *prompt_ids* to tokens for incremental
    detokenization.

    Only the last few ids are converted: incremental detokenization never
    needs string context further back than the offset window. Returns
    ``(tokens, prefix_offset, read_offset)``.
    """
    # Two extra tokens beyond the window guard against special tokens.
    tail = prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2 :]
    tokens = tokenizer.convert_ids_to_tokens(
        tail, skip_special_tokens=skip_special_tokens
    )
    read_offset = len(tokens)
    prefix_offset = max(read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # Out-of-vocab prompt ids can decode to None; normalize them to "".
    _replace_none_with_empty(tokens)  # type: ignore[arg-type]
    return tokens, prefix_offset, read_offset
def convert_ids_list_to_tokens(
    tokenizer: TokenizerLike,
    token_ids: list[int],
) -> list[str]:
    """Detokenize each input id individually.

    Args:
        tokenizer: tokenizer used by model under test
        token_ids: convert these tokens (Python list form)

    Returns:
        Python list of token string representations
    """
    decode = tokenizer.decode
    pieces: list[str] = []
    for token_id in token_ids:
        # Decode one id at a time (default skip_special_tokens), mapping a
        # None result to "".
        piece = decode([token_id])
        pieces.append("" if piece is None else piece)
    return pieces
# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: TokenizerLike,
    all_input_ids: list[int],
    prev_tokens: list[str] | None,
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> tuple[list[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset, read_offset) = convert_prompt_ids_to_tokens(
            tokenizer, all_input_ids[:-1], skip_special_tokens=skip_special_tokens
        )
    assert prev_tokens is not None
    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens
        )
        # Some tokenizers return a bare string for a single id; normalize.
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens
    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens
    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        # Fast path: the tokenizer handles added vocab itself.
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset]
        )
        new_text = tokenizer.convert_tokens_to_string(output_tokens[prefix_offset:])
    else:
        # Slow path: split around added-vocab tokens manually.
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
    if len(new_text) <= len(prefix_text) or new_text.endswith("<EFBFBD>"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        return new_tokens, "", prefix_offset, read_offset
    # Emit only the part of the decoded text beyond the prefix window, and
    # advance the offsets for the next call.
    new_text = new_text[len(prefix_text) :]
    return new_tokens, new_text, read_offset, len(output_tokens)

119
vllm/tokenizers/hf.py Normal file
View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import copy
from pathlib import Path
from typing import TypeAlias
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
from .protocol import TokenizerLike
HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown.
    This proxy caches these properties for faster access.

    Implementation note: a dynamic subclass of the tokenizer's own class
    shadows the expensive properties with closures over values snapshotted
    here, and the (shallow) copy of the tokenizer is re-classed to it.
    """
    cached_tokenizer = copy.copy(tokenizer)
    # Snapshot the expensive-to-compute attributes once.
    tokenizer_all_special_ids = tokenizer.all_special_ids
    tokenizer_all_special_tokens = tokenizer.all_special_tokens
    tokenizer_vocab = tokenizer.get_vocab()
    tokenizer_len = len(tokenizer)
    max_token_id = max(tokenizer_vocab.values())
    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
    # are added and included in the implementation of the vocab_size
    # property, but not in get_vocab(); if there is an implementation
    # of vocab size, we should take the greater value.
    if hasattr(tokenizer, "vocab_size"):
        with contextlib.suppress(NotImplementedError):
            max_token_id = max(max_token_id, tokenizer.vocab_size)

    class CachedTokenizer(tokenizer.__class__):  # type: ignore
        @property
        def all_special_ids(self) -> list[int]:
            return tokenizer_all_special_ids

        @property
        def all_special_tokens(self) -> list[str]:
            return tokenizer_all_special_tokens

        @property
        def max_token_id(self) -> int:
            return max_token_id

        def get_vocab(self) -> dict[str, int]:
            return tokenizer_vocab

        def __len__(self) -> int:
            return tokenizer_len

        def __reduce__(self):
            # Pickle the original tokenizer; the cache is rebuilt on load
            # (the dynamic class itself is not picklable).
            return get_cached_tokenizer, (tokenizer,)

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
    cached_tokenizer.__class__ = CachedTokenizer
    return cached_tokenizer
class CachedHfTokenizer(TokenizerLike):
    """Loader for HuggingFace tokenizers wrapped with property caching.

    NOTE(review): `from_pretrained` returns the cached HF tokenizer object
    itself (via `get_cached_tokenizer`), not an instance of this class —
    confirm callers rely only on the `TokenizerLike` interface.
    """

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> HfTokenizer:
        """Load a tokenizer via AutoTokenizer and wrap it with caching.

        Raises:
            RuntimeError: when loading fails in a way that suggests
                `trust_remote_code=True` is required.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                path_or_repo_id,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                cache_dir=download_dir,
                **kwargs,
            )
        except ValueError as e:
            # If the error pertains to the tokenizer class not existing or not
            # currently being imported,
            # suggest using the --trust-remote-code flag.
            if not trust_remote_code and (
                "does not exist or is not currently imported." in str(e)
                or "requires you to execute the tokenizer file" in str(e)
            ):
                err_msg = (
                    "Failed to load the tokenizer. If the tokenizer "
                    "is a custom tokenizer not yet available in the "
                    "HuggingFace transformers library, consider "
                    "setting `trust_remote_code=True` in LLM or using "
                    "the `--trust-remote-code` flag in the CLI."
                )
                raise RuntimeError(err_msg) from e
            else:
                raise e
        # The special_tokens in tokenizer should also be
        # controlled by do_lower_case in encoder_config
        encoder_config = get_sentence_transformer_tokenizer_config(
            path_or_repo_id, revision
        )
        if isinstance(encoder_config, dict) and encoder_config.get(
            "do_lower_case", False
        ):
            special_tokens_map = {
                k: v.lower() for k, v in tokenizer.special_tokens_map.items()
            }
            tokenizer.add_special_tokens(special_tokens_map)
        return get_cached_tokenizer(tokenizer)

567
vllm/tokenizers/mistral.py Normal file
View File

@@ -0,0 +1,567 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.logger import init_logger
from .protocol import TokenizerLike
if TYPE_CHECKING:
from mistral_common.protocol.instruct.request import (
ChatCompletionRequest as MistralChatCompletionRequest,
)
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from transformers import BatchEncoding
try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
except ImportError:
# Transformers v4
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as MistralCommonBackend,
)
logger = init_logger(__name__)
def maybe_serialize_tool_calls(request: "MistralChatCompletionRequest"):
    """Materialize lazy assistant `tool_calls` iterators in *request*.

    SEE: https://github.com/vllm-project/vllm/pull/9951 (credits: @gcalmettes)

    Pydantic (pre v2.11) leaves attributes declared as iterables as
    pydantic-core ValidatorIterator instances; in particular, tool_calls on
    ChatCompletionAssistantMessageParam are never deserialized unless the
    iterator is consumed. Since the MistralTokenizer applies no chat template,
    nothing else consumes it, so we do it here: exhausting the iterator
    materializes the tool calls into a plain list.

    See https://github.com/pydantic/pydantic/issues/9467 and
    https://github.com/pydantic/pydantic/issues/9541.
    TODO: remove when pydantic v2.11 is released.
    """
    for idx, message in enumerate(request.messages):
        if message.get("role") != "assistant":
            continue
        # Default to an exhausted iterator so absent tool_calls become [].
        calls_iter = message.get("tool_calls", ().__iter__())
        materialized = list(calls_iter)  # type: ignore
        request.messages[idx]["tool_calls"] = materialized
def truncate_tool_call_ids(request: "MistralChatCompletionRequest"):
    """Truncates tool call IDs for Mistral's ID requirements.

    Ids longer than 9 characters are replaced by their last 9 characters
    (with a warning), both on assistant `tool_calls` and on tool-result
    `tool_call_id` fields.
    """
    for idx, message in enumerate(request.messages):
        role = message.get("role")
        if role == "assistant":
            calls = message.get("tool_calls", [])
            for call in calls:
                full_id = call["id"]
                if len(full_id) > 9:
                    logger.warning(
                        "Truncating tool call ID: %s to %s",
                        full_id,
                        full_id[-9:],
                    )
                    call["id"] = full_id[-9:]
            request.messages[idx]["tool_calls"] = calls
        elif role in {"tool_results", "tool"}:
            if "tool_call_id" in message:
                call_id = message["tool_call_id"]
                if len(call_id) > 9:
                    logger.warning(
                        "Truncating tool_call_id: %s to %s",
                        call_id,
                        call_id[-9:],
                    )
                    call_id = call_id[-9:]
                request.messages[idx]["tool_call_id"] = call_id
def _prepare_apply_chat_template_tools_and_messages(
    messages: list["ChatCompletionMessageParam"],
    tools: list[dict[str, Any]] | None = None,
    continue_final_message: bool = False,
    add_generation_prompt: bool = False,
) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
    """Validate and normalize messages/tools before the Mistral chat template.

    Mutates ``messages`` and ``tools`` in place (and returns them): strips
    fields mistral-common does not support, fills required-but-possibly-empty
    tool fields, and checks the prompt-continuation flags against the role of
    the final message.

    Fix: the warning messages previously misspelled "popped" as "poped".

    Args:
        messages: OpenAI-style chat messages; must be non-empty.
        tools: Optional OpenAI-style tool definitions; only function tools
            are supported.
        continue_final_message: Whether generation continues the last
            (assistant) message.
        add_generation_prompt: Whether a generation prompt is appended;
            mutually exclusive with ``continue_final_message``.

    Returns:
        The (mutated) ``(messages, tools)`` pair.

    Raises:
        ValueError: on conflicting flags, a flag incompatible with the final
            message's role, or a non-function tool.
    """
    from mistral_common.protocol.instruct.tool_calls import Function, Tool

    if add_generation_prompt and continue_final_message:
        raise ValueError(
            "Cannot set both `add_generation_prompt` and "
            "`continue_final_message` to True."
        )

    last_message = cast(dict[str, Any], messages[-1])
    # add_generation_prompt is directly handled by the tokenizer but we
    # check if the user is trying to use it with a final assistant message
    # which is probably not what they want.
    # If add_generation_prompt is False, we don't need to check anything.
    if add_generation_prompt and last_message["role"] == "assistant":
        raise ValueError(
            "Cannot set `add_generation_prompt` to True when "
            "the last message is from the assistant. Consider "
            "using `continue_final_message` instead."
        )
    if continue_final_message and last_message["role"] != "assistant":
        raise ValueError(
            "Cannot set `continue_final_message` to True when "
            "the last message is not from the assistant."
        )

    # mistral-common requires AssistantMessage content to be string [1].
    #
    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
    for message in messages:
        # Remove reasoning as unsupported by Mistral
        _ = message.pop("reasoning", None)  # type: ignore

    # The Mistral client, in comparison to the OpenAI client, requires the
    # "parameters" dict and the "description" string to be present
    # even if they are empty.
    if tools:
        for function in [
            tool["function"] for tool in tools if tool["type"] == "function"
        ]:
            if function.get("parameters") is None:
                function["parameters"] = {}
            if function.get("description") is None:
                function["description"] = ""

        # We filter not supported arguments to avoid throwing an error.
        # TODO(juliendenize): remove this once OpenAI API is better supported by
        # `mistral-common`.
        tools_fields = set(Tool.model_fields.keys())
        function_fields = set(Function.model_fields.keys())

        for tool in tools:
            tool_keys = list(tool.keys())
            for tool_key in tool_keys:
                if tool_key not in tools_fields:
                    tool.pop(tool_key)
                    logger.warning_once(
                        f"'{tool_key}' is not supported by mistral-common for tools. "
                        "It has been popped from the tool definition."
                    )
            if tool["type"] == "function":
                function_keys = list(tool["function"].keys())
                for function_key in function_keys:
                    if function_key not in function_fields:
                        tool["function"].pop(function_key)
                        logger.warning_once(
                            f"'{function_key}' is not supported by mistral-common "
                            "for function tools. It has been popped from the "
                            "function definition."
                        )
            else:
                raise ValueError("mistral-common only supports function tools.")

    return messages, tools
def validate_request_params(request: "ChatCompletionRequest"):
    """Reject request options that Mistral tokenizers cannot honor."""
    has_template = request.chat_template is not None
    has_template_kwargs = request.chat_template_kwargs is not None
    if has_template or has_template_kwargs:
        raise ValueError("chat_template is not supported for Mistral tokenizers.")
def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
    """Map a Tekken token (text or raw bytes) back to its vocabulary id.

    Regular tokens are looked up in the byte-level vocab (shifted past the
    special-token range); special tokens are resolved via the reverse
    special vocab; anything unknown falls back to the <unk> id.
    """
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer

    assert isinstance(tokenizer, Tekkenizer), type(tokenizer)

    token_bytes = t if isinstance(t, bytes) else t.encode("utf-8")
    shift = tokenizer.num_special_tokens

    regular_id = tokenizer._tekken_token2id_nospecial.get(token_bytes)
    if regular_id is not None:
        return shift + regular_id

    token_str = token_bytes.decode("utf-8")
    special_id = tokenizer._special_tokens_reverse_vocab.get(token_str)
    if special_id is not None:
        return special_id

    logger.warning(
        "Failed to convert token %s to id, replacing with <unk>", token_bytes
    )
    return tokenizer.unk_id
class MistralTokenizer(TokenizerLike):
    """Adapter exposing a mistral-common tokenizer through vLLM's
    ``TokenizerLike`` interface.

    Wraps the Transformers Mistral-common backend and supports both Tekken
    (byte-level) and SentencePiece underlying tokenizers.
    """

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "MistralTokenizer":
        """Load a Mistral tokenizer from a local path or remote repo id.

        NOTE: ``trust_remote_code`` is accepted for signature parity with
        other tokenizers but is not forwarded to the backend.
        """
        from mistral_common.protocol.instruct.validator import ValidationMode

        try:
            # Transformers v5
            from transformers.tokenization_mistral_common import MistralCommonBackend
        except ImportError:
            # Transformers v4
            from transformers.tokenization_mistral_common import (
                MistralCommonTokenizer as MistralCommonBackend,
            )

        tokenizer = MistralCommonBackend.from_pretrained(
            path_or_repo_id,
            *args,
            mode=ValidationMode.test,
            cache_dir=download_dir,
            revision="main" if revision is None else revision,
            **kwargs,
        )
        return cls(tokenizer)

    def __init__(self, tokenizer: "MistralCommonBackend") -> None:
        """Wrap an already-instantiated Mistral-common backend tokenizer.

        Raises:
            ValueError: if the backend is not in ``ValidationMode.test``.
            TypeError: if the underlying tokenizer is neither Tekken nor
                SentencePiece.
        """
        super().__init__()
        from mistral_common.protocol.instruct.validator import ValidationMode
        from mistral_common.tokens.tokenizers.sentencepiece import (
            SentencePieceTokenizer,
        )
        from mistral_common.tokens.tokenizers.tekken import Tekkenizer

        # Layered handles onto the backend: transformers wrapper ->
        # mistral-common tokenizer -> instruct tokenizer -> raw tokenizer.
        self.transformers_tokenizer = tokenizer
        self.mistral = tokenizer.tokenizer
        self.instruct = self.mistral.instruct_tokenizer
        self.tokenizer = self.instruct.tokenizer

        mode = self.mistral._chat_completion_request_validator._mode
        if mode != ValidationMode.test:
            raise ValueError(
                "Mistral tokenizer must be in test mode. Make sure to "
                "set `mode='ValidationMode.test'` when creating the "
                "Mistral tokenizer."
            )

        # Version value looks like "v<N>"; keep the numeric part only.
        _mistral_version_str = str(self.tokenizer.version.value)
        self.version: int = int(_mistral_version_str.split("v")[-1])

        self.is_tekken = isinstance(self.tokenizer, Tekkenizer)
        self.is_spm = isinstance(self.tokenizer, SentencePieceTokenizer)
        if not (self.is_tekken or self.is_spm):
            raise TypeError(f"Unsupported tokenizer: {type(self.tokenizer)}")

        # Reverse order to ensure that the lowest token id is kept.
        self._vocab_dict = {
            self.convert_ids_to_tokens([i], skip_special_tokens=False)[0]: i
            for i in range(self.vocab_size - 1, -1, -1)
        }
        # Sort the dict for convenience
        self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1]))

        # Cache special tokens for faster access.
        self._special_token_ids = self._get_special_token_ids()
        self._special_token_ids_set = set(self._special_token_ids)
        self._special_tokens = self._get_special_tokens(self._special_token_ids)
        self._special_tokens_set = set(self._special_tokens)

        # Vocab sorted by token id.
        self._vocab = self.tokenizer._vocab
        self._max_token_id = self.vocab_size - 1

    def _get_special_token_ids(self) -> list[int]:
        """Return all special-token ids of the raw tokenizer, sorted."""
        from mistral_common.tokens.tokenizers.sentencepiece import (
            SentencePieceTokenizer,
        )
        from mistral_common.tokens.tokenizers.tekken import Tekkenizer

        if self.is_tekken:
            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
            special_ids = {t["rank"] for t in self.tokenizer._all_special_tokens}
        elif self.is_spm:
            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
                self.tokenizer
            )
            special_ids = self.tokenizer._control_tokens
        else:
            raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}")
        return sorted(special_ids)

    def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
        """Decode each special-token id to its string form (specials kept)."""
        from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

        return [
            self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
            for i in all_special_ids
        ]

    def num_special_tokens_to_add(self) -> int:
        # Count of tokens produced when encoding the empty string, i.e. the
        # special tokens (e.g. BOS) added around any input.
        return len(self.encode(""))

    # the following attributes are set to fit vLLM's design and are used
    # by the structured output backends.
    @property
    def all_special_tokens(self) -> list[str]:
        return self._special_tokens

    @property
    def all_special_ids(self) -> list[int]:
        return self._special_token_ids

    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_id

    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_id

    @property
    def pad_token_id(self) -> int:
        return self.tokenizer.pad_id

    @property
    def is_fast(self) -> bool:
        # Always reported as fast so callers never emit the slow-tokenizer
        # warning for this backend.
        return True

    @property
    def vocab_size(self) -> int:
        return self.transformers_tokenizer.vocab_size

    @property
    def max_token_id(self) -> int:
        return self._max_token_id

    @property
    def truncation_side(self) -> str:
        return self.transformers_tokenizer.truncation_side

    def _is_special_token_id(self, token_id: int) -> bool:
        return token_id in self._special_token_ids_set

    def __hash__(self) -> int:
        # Identity-based hash: distinct wrapper instances never compare equal.
        return hash(id(self))

    def __len__(self) -> int:
        return self.vocab_size

    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Tokenize ``text`` via the backend; ``text_pair`` is unsupported."""
        if text_pair is not None:
            raise ValueError(
                "`text_pair` is not supported by `MistralTokenizer.__call__`."
            )
        encoded = self.transformers_tokenizer(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
        )
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, revert to only call self.transformers_tokenizer(...).
        # Hack to fix wrongly added eos token, when fix will be supported the condition
        # below will be False even before the revert is done.
        if encoded["input_ids"] and encoded["input_ids"][-1] == self.eos_token_id:
            encoded["input_ids"].pop(-1)
            # Keep the attention mask in sync with the popped token.
            if attention_mask := encoded.get("attention_mask"):
                attention_mask.pop(-1)
        return encoded

    @property
    def vocab(self) -> list[str]:
        return self._vocab

    def get_vocab(self) -> dict[str, int]:
        return self._vocab_dict

    def get_added_vocab(self) -> dict[str, int]:
        # Mistral tokenizers have no added vocabulary
        return {}

    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Encode ``text`` to token ids; BOS is added iff
        ``add_special_tokens`` and EOS is never added."""
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, directly call self.transformers_tokenizer.encode(...).
        encoded = self.tokenizer.encode(text, bos=add_special_tokens, eos=False)

        # NOTE: truncation=None (the default) still truncates when
        # max_length is given; only truncation=False disables it.
        if truncation is not False and max_length is not None:
            return encoded[:max_length]
        else:
            return encoded

    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> list[int]:
        """Render (and by default tokenize) a conversation with the Mistral
        chat template after normalizing messages/tools for mistral-common."""
        add_generation_prompt = kwargs.pop("add_generation_prompt", False)
        continue_final_message = kwargs.get("continue_final_message", False)
        tokenize = kwargs.get("tokenize", True)
        padding = kwargs.get("padding", False)
        truncation = kwargs.get("truncation", False)
        max_length = kwargs.get("max_length")

        messages, tools = _prepare_apply_chat_template_tools_and_messages(
            messages, tools, continue_final_message, add_generation_prompt
        )

        return self.transformers_tokenizer.apply_chat_template(
            conversation=messages,
            tools=tools,
            continue_final_message=continue_final_message,
            tokenize=tokenize,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=None,
            return_dict=False,
        )

    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        """Decode token id(s) back to text."""
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, directly call self.transformers_tokenizer.decode(...).
        if isinstance(ids, int):
            ids = [ids]
        return self.transformers_tokenizer.decode(
            ids, skip_special_tokens=skip_special_tokens
        )

    def batch_decode(
        self, ids: list[list[int]] | list[int], skip_special_tokens: bool = False
    ) -> list[str]:
        # NOTE(review): return annotation corrected from `str`; the backend's
        # batch_decode returns one string per input sequence.
        return self.transformers_tokenizer.batch_decode(
            ids, skip_special_tokens=skip_special_tokens
        )

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join token strings into text, keeping only whitelisted special
        tokens (the tool-call marker) and dropping the rest."""
        from mistral_common.tokens.tokenizers.base import (
            SpecialTokenPolicy,
            SpecialTokens,
        )
        from mistral_common.tokens.tokenizers.sentencepiece import (
            SentencePieceTokenizer,
        )
        from mistral_common.tokens.tokenizers.tekken import Tekkenizer

        # Special tokens that must survive decoding.
        to_decode_special_tokens = {SpecialTokens.tool_calls}
        if self.is_tekken:
            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
            tokens = [
                t
                for t in tokens
                if (t in to_decode_special_tokens or t not in self._special_tokens_set)
            ]

            if any(isinstance(t, bytes) for t in tokens):
                # we need to encode and decode all tokens again
                ids = [_tekken_token_to_id(self.tokenizer, t) for t in tokens]
                # We filtered unwanted special tokens before
                # so we can decode the rest.
                decoded = self.tokenizer.decode(ids, SpecialTokenPolicy.KEEP)
            else:
                decoded = "".join(tokens)
        else:
            # make sure certain special tokens like Tool calls are
            # not decoded
            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
                self.tokenizer
            )
            regular_tokens: list[str] = []
            decoded_list: list[str] = []
            decoded = ""

            # Flush runs of regular tokens through the SPM decoder while
            # passing whitelisted special tokens through verbatim.
            for token in tokens:
                if token in to_decode_special_tokens:
                    if regular_tokens:
                        decoded_list.append(
                            self.tokenizer.decode(
                                regular_tokens, SpecialTokenPolicy.IGNORE
                            )
                        )
                        regular_tokens = []
                    decoded_list.append(token)
                else:
                    regular_tokens.append(token)

            if regular_tokens:
                decoded_list.append(
                    self.tokenizer.decode(regular_tokens, SpecialTokenPolicy.IGNORE)
                )

            decoded = "".join(decoded_list)
        return decoded

    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        """Map token ids to token strings.

        With ``skip_special_tokens=True``, all special tokens are dropped
        except the tool-call marker and (for v13+ instruct tokenizers) the
        think markers.
        """
        from mistral_common.tokens.tokenizers.base import (
            SpecialTokenPolicy,
            SpecialTokens,
        )
        from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13

        if not skip_special_tokens:
            return [self.tokenizer.id_to_piece(token_id) for token_id in ids]

        # Special ids kept even when skipping special tokens.
        non_skip_special_tokens_ids = {
            self.tokenizer.get_control_token(SpecialTokens.tool_calls),
        }
        if isinstance(self.instruct, InstructTokenizerV13):
            if self.instruct.BEGIN_THINK:
                non_skip_special_tokens_ids.add(self.instruct.BEGIN_THINK)
            if self.instruct.END_THINK:
                non_skip_special_tokens_ids.add(self.instruct.END_THINK)

        ids_kept = [
            i
            for i in ids
            if i in non_skip_special_tokens_ids or not self._is_special_token_id(i)
        ]

        # We filtered unwanted special tokens so we can decode the rest.
        tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids_kept]

        if any("�" in t for t in tokens) and self.is_tekken:
            # if a decoded token contains the replacement character, then the
            # token has an incomplete UTF-8 character so we must use bytes
            # See: https://github.com/vllm-project/vllm/pull/8640
            # https://github.com/vllm-project/vllm/pull/9625
            # if underlying tokenizer is sentencepiece, we just add "�".
            # We filtered unwanted special tokens so we can decode the rest.
            tokens = [
                self.tokenizer.id_to_byte_piece(token_id, SpecialTokenPolicy.KEEP)
                if token_id not in self._special_token_ids_set
                else self.tokenizer.decode([token_id], SpecialTokenPolicy.KEEP)
                for token_id in ids_kept
            ]

        return tokens

114
vllm/tokenizers/protocol.py Normal file
View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol
if TYPE_CHECKING:
from transformers import BatchEncoding
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
class TokenizerLike(Protocol):
    """Structural (duck-typed) interface for tokenizers used by vLLM.

    Implementations must provide construction via ``from_pretrained``,
    encode/decode round trips, vocabulary introspection, and chat-template
    rendering. ``__hash__`` and ``__len__`` have shared default behavior.
    """

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "TokenizerLike":
        """Construct a tokenizer from a local path or remote repo id."""
        raise NotImplementedError

    def num_special_tokens_to_add(self) -> int:
        """Number of special tokens added around an encoded input."""
        raise NotImplementedError

    @property
    def all_special_tokens(self) -> list[str]:
        raise NotImplementedError

    @property
    def all_special_ids(self) -> list[int]:
        raise NotImplementedError

    @property
    def bos_token_id(self) -> int:
        raise NotImplementedError

    @property
    def eos_token_id(self) -> int:
        raise NotImplementedError

    @property
    def pad_token_id(self) -> int:
        raise NotImplementedError

    @property
    def is_fast(self) -> bool:
        raise NotImplementedError

    @property
    def vocab_size(self) -> int:
        raise NotImplementedError

    @property
    def max_token_id(self) -> int:
        raise NotImplementedError

    @property
    def truncation_side(self) -> str:
        raise NotImplementedError

    def __hash__(self) -> int:
        # Default identity-based hash shared by implementations.
        return hash(id(self))

    def __len__(self) -> int:
        # A tokenizer's length is defined as its vocabulary size.
        return self.vocab_size

    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Tokenize ``text`` (optionally with ``text_pair``)."""
        raise NotImplementedError

    def get_vocab(self) -> dict[str, int]:
        """Full token-string -> id mapping."""
        raise NotImplementedError

    def get_added_vocab(self) -> dict[str, int]:
        """Tokens added on top of the base vocabulary."""
        raise NotImplementedError

    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Encode ``text`` into token ids."""
        raise NotImplementedError

    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> str | list[int]:
        """Render a conversation with the model's chat template."""
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join token strings back into text."""
        raise NotImplementedError

    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        """Decode token id(s) into text."""
        raise NotImplementedError

    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        """Map token ids to their token-string forms."""
        raise NotImplementedError

233
vllm/tokenizers/registry.py Normal file
View File

@@ -0,0 +1,233 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING
import huggingface_hub
from typing_extensions import TypeVar, assert_never, deprecated
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.transformers_utils.gguf_utils import (
check_gguf_file,
get_gguf_file_path_from_hf,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
from vllm.transformers_utils.repo_utils import list_filtered_repo_files
from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
if TYPE_CHECKING:
from vllm.config.model import ModelConfig, RunnerType
logger = init_logger(__name__)
# Built-in tokenizer modes shipped with vLLM:
# tokenizer_mode -> (module name under ``vllm.tokenizers``, class name).
_VLLM_TOKENIZERS = {
    "deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
    "hf": ("hf", "CachedHfTokenizer"),
    "mistral": ("mistral", "MistralTokenizer"),
}
@dataclass
class _TokenizerRegistry:
    """Mutable mapping from tokenizer mode to a lazily-imported class."""

    # Tokenizer mode -> (tokenizer module, tokenizer class)
    tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict)

    def register(self, tokenizer_mode: str, module: str, class_name: str) -> None:
        """Register (or overwrite) the tokenizer class for ``tokenizer_mode``."""
        if tokenizer_mode in self.tokenizers:
            logger.warning(
                "%s.%s is already registered for tokenizer_mode=%r. "
                "It is overwritten by the new one.",
                module,
                class_name,
                tokenizer_mode,
            )
        self.tokenizers[tokenizer_mode] = (module, class_name)

    def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]:
        """Import and return the tokenizer class registered for the mode."""
        entry = self.tokenizers.get(tokenizer_mode)
        if entry is None:
            raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
        module, class_name = entry
        logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
        return resolve_obj_by_qualname(f"{module}.{class_name}")

    def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike:
        """Instantiate a tokenizer for the mode via ``from_pretrained``."""
        cls = self.load_tokenizer_cls(tokenizer_mode)
        return cls.from_pretrained(*args, **kwargs)
# Global registry, pre-populated with vLLM's built-in tokenizer modes.
TokenizerRegistry = _TokenizerRegistry(
    {
        mode: (f"vllm.tokenizers.{mod_relname}", cls_name)
        for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items()
    }
)
def resolve_tokenizer_args(
    tokenizer_name: str | Path,
    *args,
    runner_type: "RunnerType" = "generate",
    tokenizer_mode: str = "auto",
    **kwargs,
):
    """Resolve the effective tokenizer mode, name, and extra arguments.

    Handles (in order): optional ModelScope download, GGUF path/repo
    rewriting, the default truncation side per runner type, "slow" mode
    mapping, and auto-detection of Mistral (mistral-common) tokenizer files.

    Returns:
        Tuple ``(tokenizer_mode, tokenizer_name, args, kwargs)`` ready to be
        passed to the registered tokenizer class's ``from_pretrained``.
    """
    revision: str | None = kwargs.get("revision")
    download_dir: str | None = kwargs.get("download_dir")

    if envs.VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        from modelscope.hub.snapshot_download import snapshot_download

        # avoid circular import
        from vllm.model_executor.model_loader.weight_utils import get_lock

        # Only set the tokenizer here, model will be downloaded on the workers.
        if not Path(tokenizer_name).exists():
            # Use file lock to prevent multiple processes from
            # downloading the same file at the same time.
            with get_lock(tokenizer_name, download_dir):
                tokenizer_path = snapshot_download(
                    model_id=str(tokenizer_name),
                    cache_dir=download_dir,
                    revision=revision,
                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                    # Ignore weights - we only need the tokenizer.
                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
                )
                tokenizer_name = tokenizer_path

    # Separate model folder from file path for GGUF models
    if is_gguf(tokenizer_name):
        if check_gguf_file(tokenizer_name):
            # Local GGUF file: pass the file name and point at its folder.
            kwargs["gguf_file"] = Path(tokenizer_name).name
            tokenizer_name = Path(tokenizer_name).parent
        elif is_remote_gguf(tokenizer_name):
            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
            # Get the HuggingFace Hub path for the GGUF file
            gguf_file = get_gguf_file_path_from_hf(
                tokenizer_name,
                quant_type,
                revision=revision,
            )
            kwargs["gguf_file"] = gguf_file

    # Generation truncates from the left (keep the prompt tail); pooling
    # truncates from the right.
    if "truncation_side" not in kwargs:
        if runner_type == "generate" or runner_type == "draft":
            kwargs["truncation_side"] = "left"
        elif runner_type == "pooling":
            kwargs["truncation_side"] = "right"
        else:
            assert_never(runner_type)

    # "slow" is an HF-specific mode: force use_fast=False on the HF backend.
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
        tokenizer_mode = "hf"
        kwargs["use_fast"] = False

    # Try to use official Mistral tokenizer if possible
    if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"):
        allow_patterns = ["tekken.json", "tokenizer.model.v*"]
        files_list = list_filtered_repo_files(
            model_name_or_path=str(tokenizer_name),
            allow_patterns=allow_patterns,
            revision=revision,
        )
        if len(files_list) > 0:
            tokenizer_mode = "mistral"

    # Fallback to HF tokenizer
    if tokenizer_mode == "auto":
        tokenizer_mode = "hf"

    return tokenizer_mode, tokenizer_name, args, kwargs
# Cached variant: resolution for a given argument set is stable within a
# process, so repeated lookups skip filesystem/hub checks.
cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args)
def tokenizer_args_from_config(config: "ModelConfig", **kwargs):
    """Resolve tokenizer args (mode, name, extras) from a vLLM model config.

    Thin cached wrapper around ``resolve_tokenizer_args`` that pulls the
    relevant fields out of ``config``; extra ``kwargs`` are forwarded.
    """
    return cached_resolve_tokenizer_args(
        config.tokenizer,
        runner_type=config.runner_type,
        tokenizer_mode=config.tokenizer_mode,
        revision=config.tokenizer_revision,
        trust_remote_code=config.trust_remote_code,
        **kwargs,
    )
# Type variable so `get_tokenizer` returns the concrete class requested
# via `tokenizer_cls` (defaults to the `TokenizerLike` protocol).
_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)
def get_tokenizer(
    tokenizer_name: str | Path,
    *args,
    tokenizer_cls: type[_T] = TokenizerLike,  # type: ignore[assignment]
    trust_remote_code: bool = False,
    revision: str | None = None,
    download_dir: str | None = None,
    **kwargs,
) -> _T:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.

    Args:
        tokenizer_name: Tokenizer name, local path, or repo id.
        tokenizer_cls: Concrete tokenizer class to instantiate. When left as
            the ``TokenizerLike`` sentinel, the class is looked up in the
            registry from the resolved tokenizer mode.
        trust_remote_code: Forwarded to the backend's ``from_pretrained``.
        revision: Model revision (branch, tag, or commit).
        download_dir: Cache directory for downloads.

    Returns:
        The instantiated tokenizer.
    """
    tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args(
        tokenizer_name,
        *args,
        trust_remote_code=trust_remote_code,
        revision=revision,
        download_dir=download_dir,
        **kwargs,
    )

    # `TokenizerLike` acts as a sentinel meaning "pick from the registry".
    # Use an identity check since we compare against that exact class object.
    if tokenizer_cls is TokenizerLike:
        tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
    else:
        tokenizer_cls_ = tokenizer_cls

    tokenizer = tokenizer_cls_.from_pretrained(tokenizer_name, *args, **kwargs)
    if not tokenizer.is_fast:
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead."
        )
    return tokenizer  # type: ignore
# Process-wide cache of `get_tokenizer`: identical argument sets reuse the
# same tokenizer instance.
cached_get_tokenizer = lru_cache(get_tokenizer)
def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
    """Return the (cached) tokenizer for ``model_config``.

    Returns None when tokenizer initialization is disabled on the config.
    """
    if model_config.skip_tokenizer_init:
        return None

    cfg = model_config
    return cached_get_tokenizer(
        cfg.tokenizer,
        runner_type=cfg.runner_type,
        tokenizer_mode=cfg.tokenizer_mode,
        revision=cfg.tokenizer_revision,
        trust_remote_code=cfg.trust_remote_code,
        **kwargs,
    )
@deprecated(
    "Renamed to `cached_tokenizer_from_config`. The old name will be removed in v0.14."
)
def init_tokenizer_from_config(model_config: "ModelConfig"):
    """Deprecated alias of ``cached_tokenizer_from_config``."""
    return cached_tokenizer_from_config(model_config)