This commit is contained in:
root
2026-04-09 11:23:47 +08:00
parent 8082d5f4b2
commit 72387e4fa8
1885 changed files with 611521 additions and 1 deletions

View File

@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .protocol import TokenizerLike
from .registry import (
TokenizerRegistry,
cached_get_tokenizer,
cached_tokenizer_from_config,
get_tokenizer,
)
# Public API of the tokenizers package.
# Kept sorted to match the import block above (consistency fix: the list
# previously placed "cached_tokenizer_from_config" out of order).
__all__ = [
    "TokenizerLike",
    "TokenizerRegistry",
    "cached_get_tokenizer",
    "cached_tokenizer_from_config",
    "get_tokenizer",
]

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from typing import Any
from transformers import AutoTokenizer
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from . import TokenizerLike
from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, get_cached_tokenizer
def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    Wraps a tokenizer to use the custom DeepSeek V3.2 chat template encoding.

    The wrapper is a shallow copy of ``tokenizer`` whose class is swapped to a
    dynamically created subclass that overrides ``apply_chat_template`` (and a
    few vocab-related methods) to use :func:`encode_messages` instead of the
    tokenizer's Jinja chat template.
    """
    # Shallow-copy so the caller's tokenizer instance is left untouched;
    # only the copy's __class__ is replaced below.
    dsv32_tokenizer = copy.copy(tokenizer)
    # Captured by the closure-defined methods of the dynamic subclass.
    added_vocab = tokenizer.get_added_vocab()
    added_vocab_size = len(added_vocab)
    tokenizer_vocab_size = tokenizer.vocab_size

    class _DeepseekV32Tokenizer(tokenizer.__class__):  # type: ignore
        def apply_chat_template(
            self,
            messages: list["ChatCompletionMessageParam"],
            tools: list[dict[str, Any]] | None = None,
            **kwargs,
        ) -> str | list[int]:
            # Honor both the `thinking` and HF-style `enable_thinking` flags.
            thinking = kwargs.get("thinking", False)
            enable_thinking = kwargs.get("enable_thinking", False)
            thinking = thinking or enable_thinking
            thinking_mode = "thinking"
            if not thinking:
                thinking_mode = "chat"
            # Prefer an explicit `conversation` kwarg over `messages`.
            conversation = kwargs.get("conversation", messages)
            messages = conversation.copy()
            if tools is not None and len(tools) > 0:
                # Tools are rendered inside a synthetic leading system message.
                messages.insert(0, {"role": "system"})
                messages[0]["tools"] = tools  # type: ignore[typeddict-unknown-key]
            # Historical reasoning content is dropped when a new user message
            # is introduced
            drop_thinking = messages[-1]["role"] == "user"
            encode_config = dict(
                thinking_mode=thinking_mode, drop_thinking=drop_thinking
            )
            prompt_str = encode_messages(messages, **encode_config)  # type: ignore
            if kwargs.get("tokenize", True):
                # Forward only the kwargs that `encode` understands.
                tokenizer_kwargs = {
                    k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
                }
                return self.encode(
                    prompt_str,
                    add_special_tokens=False,
                    **tokenizer_kwargs,
                )
            return prompt_str

        def num_special_tokens_to_add(self) -> int:
            # The custom template supplies special tokens itself, so this is
            # whatever encoding an empty string yields.
            return len(self.encode(""))

        def __len__(self) -> int:
            # </think> is an added token in DeepseekV32 tokenizer
            return tokenizer_vocab_size + added_vocab_size

        def get_added_vocab(self) -> dict[str, int]:
            # Copy so callers cannot mutate the captured mapping.
            return added_vocab.copy()

        def __reduce__(self):
            # Pickle support: rebuild by re-wrapping the base tokenizer.
            return get_deepseek_v32_tokenizer, (tokenizer,)

    # Give the dynamic class a readable name and swap it onto the copy.
    _DeepseekV32Tokenizer.__name__ = f"DSV32{tokenizer.__class__.__name__}"
    dsv32_tokenizer.__class__ = _DeepseekV32Tokenizer
    return dsv32_tokenizer
class DeepseekV32Tokenizer(TokenizerLike):
    """Factory entry point for the DeepSeek V3.2 chat-template tokenizer."""

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
        """Load the base HF tokenizer, wrap it with the V3.2 encoding,
        and return the cached wrapper."""
        base_tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
        wrapped = get_deepseek_v32_tokenizer(base_tokenizer)
        return get_cached_tokenizer(wrapped)

View File

@@ -0,0 +1,471 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
import copy
import json
from typing import Any
import regex as re
# flake8: noqa: E501

# System-prompt section injected when tools are available. Placeholders are
# filled by `render_tools` using the module-level special tokens below.
TOOLS_SYSTEM_TEMPLATE = """## Tools
You have access to a set of tools you can use to answer the user's question.
You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
<{dsml_token}function_calls>
<{dsml_token}invoke name="$FUNCTION_NAME">
<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
...
</{dsml_token}invoke>
<{dsml_token}invoke name="$FUNCTION_NAME2">
...
</{dsml_token}invoke>
</{dsml_token}function_calls>
String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
<{dsml_token}function_calls>
...
</{dsml_token}function_calls>
<function_results>
...
</function_results>
{thinking_start_token}...thinking about results{thinking_end_token}
Here are the functions available in JSONSchema format:
<functions>
{tool_schemas}
</functions>
"""

# Special tokens of the DeepSeek V3.2 chat encoding.
bos_token: str = "<begin▁of▁sentence>"
eos_token: str = "<end▁of▁sentence>"
thinking_start_token: str = "<think>"
thinking_end_token: str = "</think>"
# Prefix marking DSML structural tags (function calls / parameters).
dsml_token: str = "DSML"

# Per-role message templates consumed by `render_message`.
system_msg_template: str = "{content}"
user_msg_template: str = "<User>{content}<Assistant>"
assistant_msg_template: str = "{reasoning}{content}{tool_calls}<end▁of▁sentence>"
thinking_template = "{reasoning}"
response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
tool_call_template: str = (
    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
)
tool_calls_template = (
    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
)
tool_output_template: str = "\n<result>{content}</result>"
def to_json(value: Any) -> str:
    """Serialize ``value`` as JSON, keeping non-ASCII characters when possible.

    Falls back to ASCII-escaped output if the unicode-preserving dump fails.
    """
    try:
        return json.dumps(value, ensure_ascii=False)
    except Exception:
        # Retry with ASCII escaping as a best-effort fallback.
        return json.dumps(value, ensure_ascii=True)
def tools_from_openai_format(tools):
    """Strip the OpenAI envelope and return the bare function schemas."""
    return [entry["function"] for entry in tools]
def tool_calls_from_openai_format(tool_calls):
    """Flatten OpenAI-style tool calls to ``{name, arguments}`` dicts."""
    converted = []
    for call in tool_calls:
        fn = call["function"]
        converted.append({"name": fn["name"], "arguments": fn["arguments"]})
    return converted
def tool_calls_to_openai_format(tool_calls):
    """Wrap flat ``{name, arguments}`` dicts in the OpenAI tool-call envelope."""
    wrapped = []
    for call in tool_calls:
        wrapped.append(
            {
                "type": "function",
                "function": {
                    "name": call["name"],
                    "arguments": call["arguments"],
                },
            }
        )
    return wrapped
def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
    """Render a tool call's arguments as newline-joined DSML parameter tags."""
    param_template = (
        '<{dsml_token}parameter name="{key}" string="{is_str}">'
        "{value}</{dsml_token}parameter>"
    )
    raw_args = tool_call["arguments"]
    # Arguments may arrive pre-serialized as a JSON string.
    arguments = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
    rendered = []
    for key, value in arguments.items():
        is_string = isinstance(value, str)
        rendered.append(
            param_template.format(
                dsml_token=dsml_token,
                key=key,
                is_str="true" if is_string else "false",
                value=value if is_string else to_json(value),
            )
        )
    return "\n".join(rendered)
def decode_dsml_to_arguments(
    tool_name: str, tool_args: dict[str, tuple[str, str]]
) -> dict[str, str]:
    """Re-assemble parsed DSML parameters into a JSON ``arguments`` string.

    ``tool_args`` maps parameter name to ``(raw value, "true"/"false")`` where
    the flag says whether the value is string-typed (and so needs quoting).
    """
    entries = []
    for key, (value, is_str) in tool_args.items():
        # String-typed parameters still need JSON quoting/escaping.
        encoded = to_json(value) if is_str == "true" else value
        entries.append(f"{to_json(key)}: {encoded}")
    return dict(name=tool_name, arguments="{" + ", ".join(entries) + "}")
def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
    """Format the tools system-prompt section from the given tool schemas."""
    schemas = "\n".join(to_json(tool) for tool in tools)
    return TOOLS_SYSTEM_TEMPLATE.format(
        tool_schemas=schemas,
        dsml_token=dsml_token,
        thinking_start_token=thinking_start_token,
        thinking_end_token=thinking_end_token,
    )
def find_last_user_index(messages: list[dict[str, Any]]) -> int:
    """Return the index of the last user/developer message, or -1 if none."""
    for back, msg in enumerate(reversed(messages)):
        if msg.get("role") in ("user", "developer"):
            return len(messages) - 1 - back
    return -1
def render_message(
    index: int, messages: list[dict[str, Any]], thinking_mode: str
) -> str:
    """Render ``messages[index]`` into its DeepSeek V3.2 prompt fragment.

    Args:
        index: Position of the message to render (validated against bounds).
        messages: The full conversation; neighbors are inspected for tool
            message bookkeeping and the last-user position.
        thinking_mode: Either "chat" or "thinking".

    Returns:
        The prompt-string fragment for this single message.

    Raises:
        ValueError: On an out-of-range index, invalid thinking_mode, or a
            structurally invalid message sequence.
        NotImplementedError: On an unknown role.
    """
    if not (0 <= index < len(messages)):
        raise ValueError(
            f"Index {index} out of range for messages list of length {len(messages)}"
        )
    if thinking_mode not in ["chat", "thinking"]:
        raise ValueError(f"Invalid thinking_mode `{thinking_mode}`")
    prompt = ""
    msg = messages[index]
    last_user_idx = find_last_user_index(messages)
    role = msg.get("role")
    content = msg.get("content")
    tools = msg.get("tools")
    response_format = msg.get("response_format")
    tool_calls = msg.get("tool_calls")
    reasoning = msg.get("reasoning")
    # `prefix` marks an assistant message used as a generation prefix.
    is_prefix = msg.get("prefix", False)
    if tools:
        tools = tools_from_openai_format(tools)
    if tool_calls:
        tool_calls = tool_calls_from_openai_format(tool_calls)
    if role == "system":
        prompt += system_msg_template.format(content=content or "")
        if tools:
            prompt += "\n\n" + render_tools(tools)
        if response_format:
            prompt += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
    elif role == "developer":
        if not content:
            raise ValueError(f"Invalid message for role `{role}`: {msg}")
        # Developer messages are rendered as a user turn carrying the tool
        # and response-format sections.
        content_developer = ""
        if tools:
            content_developer += "\n\n" + render_tools(tools)
        if response_format:
            content_developer += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
        content_developer += "\n\n# The user's message is: {}".format(content)
        prompt += user_msg_template.format(content=content_developer)
        # Only the final user/developer turn opens a thinking block.
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "user":
        prompt += user_msg_template.format(content=content)
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "tool":
        # Walk back past consecutive tool messages to the assistant turn
        # whose tool_calls produced this result.
        prev_assistant_idx = index - 1
        assistant_msg = messages[prev_assistant_idx]
        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
            prev_assistant_idx -= 1
            assistant_msg = messages[prev_assistant_idx]
        if not (
            index == 0
            or prev_assistant_idx >= 0
            and assistant_msg.get("role") == "assistant"
        ):
            raise ValueError(f"Invalid messages at {index}:\n{assistant_msg}")
        # 1-based position of this result within the assistant's tool calls.
        tool_call_order = index - prev_assistant_idx
        assistant_tool_calls = assistant_msg.get("tool_calls")
        if not (assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order):
            raise ValueError("No tool calls but found tool output")
        # Open the results block on the first result, close it on the last.
        if tool_call_order == 1:
            prompt += "\n\n<function_results>"
        prompt += tool_output_template.format(content=content)
        if tool_call_order == len(assistant_tool_calls):
            prompt += "\n</function_results>"
            if index >= last_user_idx and thinking_mode == "thinking":
                prompt += "\n\n" + thinking_start_token
            else:
                prompt += "\n\n" + thinking_end_token
    elif role == "assistant":
        # NOTE(review): this assignment appears unused in this branch.
        prev_assistant_idx = index
        thinking_part = ""
        tool_calls_content = ""
        if tool_calls:
            tool_calls = [
                tool_call_template.format(
                    dsml_token=dsml_token,
                    name=tool_call.get("name"),
                    arguments=encode_arguments_to_dsml(tool_call),
                )
                for tool_call in tool_calls
            ]
            tool_calls_content += "\n\n" + tool_calls_template.format(
                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
            )
        summary_content = content or ""
        # Reasoning is only emitted for assistant turns after the last user
        # message when thinking mode is enabled.
        if thinking_mode == "thinking" and index > last_user_idx:
            if not (reasoning or tool_calls):
                raise ValueError(
                    f"ThinkingMode: {thinking_mode}, invalid message without reasoning/tool_calls `{msg}` after last user message"
                )
            thinking_part = (
                thinking_template.format(reasoning=reasoning or "") + thinking_end_token
            )
        if not tool_calls and is_prefix:
            # A prefix message is emitted verbatim without the EOS wrapper.
            prompt += summary_content
        else:
            prompt += assistant_msg_template.format(
                reasoning=thinking_part,
                content=summary_content,
                tool_calls=tool_calls_content,
            )
    else:
        raise NotImplementedError(f"Unknown role: {role}")
    return prompt
def drop_thinking_messages(
messages: list[dict[str, Any]], last_user_idx: int | None = None
) -> list[dict[str, Any]]:
messages_wo_thinking: list[dict[str, Any]] = []
last_user_idx = (
find_last_user_index(messages) if last_user_idx is None else last_user_idx
)
for idx, msg in enumerate(messages):
role = msg.get("role")
if role in ["user", "system", "tool"] or idx >= last_user_idx:
messages_wo_thinking.append(msg)
continue
elif role == "assistant":
msg_wo_thinking = copy.copy(msg)
msg_wo_thinking.pop("reasoning", None)
messages_wo_thinking.append(msg_wo_thinking)
return messages_wo_thinking
def encode_messages(
    messages: list[dict[str, Any]],
    thinking_mode: str,
    context: list[dict[str, Any]] | None = None,
    drop_thinking: bool = True,
    add_default_bos_token: bool = True,
) -> str:
    """Render a conversation into the DeepSeek V3.2 prompt string.

    ``context`` holds already-rendered history: its messages participate in
    rendering decisions but only ``messages`` are emitted. The BOS token is
    prepended only when there is no prior context.
    """
    context = context or []
    full_messages = context + messages
    if thinking_mode == "thinking" and drop_thinking:
        full_messages = drop_thinking_messages(full_messages)
    pieces: list[str] = []
    if add_default_bos_token and not context:
        pieces.append(bos_token)
    offset = len(context)
    for idx in range(len(messages)):
        pieces.append(
            render_message(offset + idx, full_messages, thinking_mode=thinking_mode)
        )
    return "".join(pieces)
def _read_until_stop(
index: int, text: str, stop: list[str]
) -> tuple[int, str, None | str]:
min_pos = len(text)
matched_stop = None
for s in stop:
pos = text.find(s, index)
if pos != -1 and pos < min_pos:
min_pos = pos
matched_stop = s
if matched_stop:
content = text[index:min_pos]
return min_pos + len(matched_stop), content, matched_stop
else:
content = text[index:]
return len(text), content, None
def parse_tool_calls(index: int, text: str):
    """Parse a DSML function-calls block starting at ``index``.

    ``index`` must point just past the ``<{dsml_token}function_calls``
    opening tag. Returns ``(index, stop_token, tool_calls)`` where ``index``
    points past the block's closing tag and each tool call is a
    ``{"name": ..., "arguments": <json string>}`` dict.

    Raises:
        RuntimeError: If the text deviates from the expected DSML format.
    """
    tool_calls: list[dict[str, Any]] = []
    stop_token = None
    tool_calls_end_token = f"</{dsml_token}function_calls>"
    while index < len(text):
        # Advance to the next invoke or the block end; the consumed text must
        # be exactly the ">\n" that closed the preceding tag.
        index, _, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
        )
        if _ != ">\n":
            raise RuntimeError("Tool call format error")
        if stop_token == tool_calls_end_token:
            break
        if stop_token is None:
            raise RuntimeError("Missing special token")
        # Read the invoke header, e.g. ` name="tool">\n`.
        index, tool_name_content, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
        )
        p_tool_name = re.findall(
            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
        )
        if len(p_tool_name) != 1:
            raise RuntimeError("Tool name format error")
        tool_name = p_tool_name[0]
        tool_args: dict[str, tuple[str, str]] = {}
        # Collect parameter tags until the invoke closes. Each value is kept
        # with its string="true|false" attribute for later JSON re-encoding.
        while stop_token == f"<{dsml_token}parameter":
            index, param_content, stop_token = _read_until_stop(
                index, text, [f"/{dsml_token}parameter"]
            )
            param_kv = re.findall(
                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
                param_content,
                flags=re.DOTALL,
            )
            if len(param_kv) != 1:
                raise RuntimeError("Parameter format error")
            param_name, string, param_value = param_kv[0]
            if param_name in tool_args:
                raise RuntimeError("Duplicate parameter name")
            tool_args[param_name] = (param_value, string)
            index, content, stop_token = _read_until_stop(
                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
            )
            if content != ">\n":
                raise RuntimeError("Parameter format error")
        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
        tool_calls.append(tool_call)
    return index, stop_token, tool_calls
# NOTE: This function is designed to parse only correctly
# formatted string and will not attempt to correct malformed output
# that may be generated by the model.
def parse_message_from_completion_text(text: str, thinking_mode: str):
    """Parse a raw completion into an OpenAI-style assistant message.

    Splits ``text`` into reasoning (only when ``thinking_mode == "thinking"``),
    summary content, and optional DSML tool calls.

    Raises:
        RuntimeError: If the completion violates the expected format or leaks
            special tokens into the parsed content.
    """
    summary_content, reasoning, tool_calls = "", "", []
    index, stop_token = 0, None
    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
    is_thinking, is_tool_calling = thinking_mode == "thinking", False
    if is_thinking:
        # The reasoning block must be terminated by the thinking end token
        # before anything else (tool calls may not interrupt it).
        index, content_delta, stop_token = _read_until_stop(
            index, text, [thinking_end_token, tool_calls_start_token]
        )
        reasoning = content_delta
        if stop_token != thinking_end_token:
            raise RuntimeError("Invalid thinking format")
    # Summary runs until EOS or the start of a tool-calls block.
    index, content_delta, stop_token = _read_until_stop(
        index, text, [eos_token, tool_calls_start_token]
    )
    summary_content = content_delta
    if stop_token == tool_calls_start_token:
        is_tool_calling = True
    else:
        if stop_token != eos_token:
            raise RuntimeError("Invalid summary format")
    if is_tool_calling:
        index, stop_token, tool_calls = parse_tool_calls(index, text)
        # Nothing but the EOS token may follow the tool-call block.
        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
        if tool_ends_text:
            raise RuntimeError("Unexpected content after tool calls")
    if not (len(text) == index and stop_token in [eos_token, None]):
        raise RuntimeError("Unexpected content at end")
    # Special tokens must never leak into the parsed content.
    for sp_token in [
        bos_token,
        eos_token,
        thinking_start_token,
        thinking_end_token,
        dsml_token,
    ]:
        if sp_token in summary_content or sp_token in reasoning:
            raise RuntimeError("Unexpected special token in content")
    return {
        "role": "assistant",
        "content": summary_content,
        "reasoning": reasoning,
        "tool_calls": tool_calls_to_openai_format(tool_calls),
    }

View File

@@ -0,0 +1,202 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.tokenizers import TokenizerLike
def _replace_none_with_empty(tokens: list[str | None]):
for i, token in enumerate(tokens):
if token is None:
tokens[i] = ""
def _convert_tokens_to_string_with_added_encoders(
    tokenizer: TokenizerLike,
    output_tokens: list[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    """Join tokens to text, splicing added-vocabulary tokens in verbatim.

    Added tokens cannot go through ``convert_tokens_to_string`` together with
    regular tokens, so runs of regular tokens are converted separately and
    the added tokens are inserted between them unchanged.

    Adapted from
    https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    """
    # Localize lookups: this loop runs once per generated token.
    to_string = tokenizer.convert_tokens_to_string
    added_tokens = set(tokenizer.get_added_vocab())
    specials = set(tokenizer.all_special_tokens) if skip_special_tokens else ()
    pieces: list[str] = []
    pending: list[str] = []
    for token in output_tokens:
        if token in specials:
            continue
        if token in added_tokens:
            # Flush the pending run of regular tokens, then emit the added
            # token as-is.
            if pending:
                pieces.append(to_string(pending))
                pending = []
            pieces.append(token)
        else:
            pending.append(token)
    if pending:
        pieces.append(to_string(pending))
    separator = " " if spaces_between_special_tokens else ""
    return separator.join(pieces)
# Number of tail tokens re-converted on each incremental detokenization step.
# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
def convert_prompt_ids_to_tokens(
    tokenizer: TokenizerLike,
    prompt_ids: list[int],
    skip_special_tokens: bool = False,
) -> tuple[list[str], int, int]:
    """Convert the tail of the prompt ids for incremental detokenization.

    Only the last few ids are converted to token strings — earlier tokens are
    never needed to detokenize newly generated output. Returns
    ``(tokens, prefix_offset, read_offset)``.
    """
    # Take a couple of extra ids beyond the offset in case the tail contains
    # special tokens that get skipped.
    tail = prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2 :]
    new_tokens = tokenizer.convert_ids_to_tokens(
        tail, skip_special_tokens=skip_special_tokens
    )
    read_offset = len(new_tokens)
    prefix_offset = max(read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # Out-of-vocab prompt ids may map to None; normalize them to "".
    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    return new_tokens, prefix_offset, read_offset
def convert_ids_list_to_tokens(
    tokenizer: TokenizerLike,
    token_ids: list[int],
) -> list[str]:
    """Detokenize the input ids individually.

    Args:
        tokenizer: tokenizer used by model under test
        token_ids: convert these tokens (Python list form)

    Returns:
        Python list of token string representations
    """
    # Decode one id at a time (default skip_special_tokens behavior), and
    # guard against a tokenizer returning None for an unknown id.
    decoded = (tokenizer.decode([token_id]) for token_id in token_ids)
    return [text if text is not None else "" for text in decoded]
# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: TokenizerLike,
    all_input_ids: list[int],
    prev_tokens: list[str] | None,
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> tuple[list[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset, read_offset) = convert_prompt_ids_to_tokens(
            tokenizer, all_input_ids[:-1], skip_special_tokens=skip_special_tokens
        )
    assert prev_tokens is not None
    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens
        )
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
        else:
            # This is required to guard against out-of-vocab prompt token ids
            # (for example when using dummy weights)
            _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens
    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens
    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset]
        )
        new_text = tokenizer.convert_tokens_to_string(output_tokens[prefix_offset:])
    else:
        # Slow tokenizers with added vocab need the split-and-join path.
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
    # NOTE(review): "<EFBFBD>" looks like a mangled U+FFFD replacement character
    # (UTF-8 bytes EF BF BD) from the upstream source — confirm against the
    # original file before relying on this literal.
    if len(new_text) <= len(prefix_text) or new_text.endswith("<EFBFBD>"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        return new_tokens, "", prefix_offset, read_offset
    new_text = new_text[len(prefix_text) :]
    return new_tokens, new_text, read_offset, len(output_tokens)

450
vllm/tokenizers/grok2.py Normal file
View File

@@ -0,0 +1,450 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tokenizer for Grok-2 .tok.json format."""
import functools
import json
from collections.abc import Collection, Set
from pathlib import Path
from typing import Any, Literal, overload
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
RepositoryNotFoundError,
RevisionNotFoundError,
)
from transformers import BatchEncoding
from transformers.utils import chat_template_utils as hf_chat_utils
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.logger import init_logger
from .protocol import TokenizerLike
logger = init_logger(__name__)

# Canonical special tokens of the Grok-2 vocabulary.
PAD = "<|pad|>"
EOS = "<|eos|>"
SEP = "<|separator|>"
# Reserved / control token texts that must always be encodable as specials.
RESERVED_TOKEN_TEXTS = [f"<|reserved_{i}|>" for i in range(3, 128)]
CONTROL_TOKEN_TEXTS = [f"<|control{i}|>" for i in range(1, 705)]
DEFAULT_SPECIAL_TOKENS = [PAD, SEP, EOS]
DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": SEP, "eos": EOS}
# Fallback Jinja chat template used when the repo ships none.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ 'Human: ' + message['content'].strip() + '<|separator|>\\n\\n' }}"
    "{% elif message['role'] == 'system' %}"
    "{{ 'System: ' + message['content'].strip() + '<|separator|>\\n\\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ 'Assistant: ' + message['content'] + '<|separator|>\\n\\n' }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ 'Assistant:' }}"
    "{% endif %}"
)
# Word-split regex for the tiktoken encoding ("V1" word_split).
# Default + separate each single digit.
PAT_STR_B = (
    r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}|"""
    r""" ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
)
def _maybe_load_tokenizer_config(
    model_path: Path,
    *,
    repo_id: str | None,
    revision: str | None,
    download_dir: str | None,
) -> dict[str, Any]:
    """Best-effort load of tokenizer_config.json from disk or the HF Hub.

    Tries ``model_path`` first; otherwise downloads from ``repo_id`` when
    given. Every failure mode (missing file/repo/revision, HTTP error, bad
    JSON, unreadable file) degrades to an empty dict so the caller falls back
    to the default chat template.
    """
    config_path = model_path / "tokenizer_config.json"
    if config_path.is_file():
        with config_path.open("r", encoding="utf-8") as f:
            return json.load(f)
    if repo_id is None:
        # Local-only tokenizer without a config file alongside it.
        return {}
    try:
        config_file = hf_hub_download(
            repo_id=repo_id,
            filename="tokenizer_config.json",
            revision=revision,
            cache_dir=download_dir,
        )
    except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError):
        # If the repo, revision, or file does not exist, fall back silently.
        return {}
    except HfHubHTTPError as exc:
        logger.warning(
            "Failed to download tokenizer_config.json from %s. "
            "This may be due to a network or authentication issue. "
            "The default chat template will be used. Error: %s",
            repo_id,
            exc,
        )
        return {}
    try:
        with Path(config_file).open("r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as exc:
        logger.warning(
            "Failed to parse tokenizer_config.json. "
            "The default chat template will be used. Error: %s",
            exc,
        )
        return {}
    except OSError as exc:
        logger.warning(
            "Failed to open tokenizer_config.json. "
            "The default chat template will be used. Error: %s",
            exc,
        )
        return {}
def _load_tiktoken_encoding(
    vocab_file: Path,
) -> tuple[Any, dict[str, int]]:
    """Build a tiktoken ``Encoding`` from a Grok-2 .tok.json vocabulary.

    Returns the encoding (with a patched ``encode``) and its special-token
    mapping. The patch makes the default allowed special tokens always
    permitted and disables the "disallowed special" errors.
    """
    try:
        import tiktoken
    except ImportError as exc:
        raise ImportError("Grok-2 tokenizer requires the `tiktoken` package.") from exc
    with vocab_file.open("rb") as f:
        xtok_dict = json.load(f)
    # Regular tokens are stored as byte lists; special tokens additionally
    # get decoded to text (they are looked up by string).
    mergeable_ranks = {
        bytes(item["bytes"]): item["token"]
        for item in xtok_dict.get("regular_tokens", [])
    }
    special_tokens = {
        bytes(item["bytes"]).decode("utf-8", errors="replace"): item["token"]
        for item in xtok_dict.get("special_tokens", [])
    }
    if xtok_dict.get("word_split") == "V1":
        pat_str = PAT_STR_B
    else:
        raise ValueError(f"Unknown word_split: {xtok_dict.get('word_split')!r}")
    # An explicit pattern in the vocab file overrides the default.
    pat_str = xtok_dict.get("pat_str", pat_str)
    kwargs = {
        "name": str(vocab_file),
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
    if "vocab_size" in xtok_dict:
        kwargs["explicit_n_vocab"] = xtok_dict["vocab_size"]
    tokenizer = tiktoken.Encoding(**kwargs)
    default_allowed_special: set[str] | None = None
    if "default_allowed_special" in xtok_dict:
        default_allowed_special = {
            bytes(bytes_list).decode("utf-8", errors="replace")
            for bytes_list in xtok_dict["default_allowed_special"]
        }
    # Stash extra state on the Encoding instance for the patched encode below.
    tokenizer._default_allowed_special = default_allowed_special or set()
    tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS

    def encode_patched(
        self,
        text: str,
        *,
        allowed_special: Literal["all"] | Set[str] = set(),
        disallowed_special: Literal["all"] | Collection[str] = "all",
    ) -> list[int]:
        # Always permit the default special tokens and never raise on other
        # special text (disallowed_special is deliberately ignored).
        del disallowed_special
        if isinstance(allowed_special, set):
            allowed_special |= self._default_allowed_special
        return tiktoken.Encoding.encode(
            self,
            text,
            allowed_special=allowed_special,
            disallowed_special=(),
        )

    # Bind the patched encode to this specific Encoding instance.
    tokenizer.encode = functools.partial(encode_patched, tokenizer)
    tokenizer._default_allowed_special |= set(DEFAULT_CONTROL_TOKENS.values())
    tokenizer._default_allowed_special |= set(
        CONTROL_TOKEN_TEXTS + RESERVED_TOKEN_TEXTS
    )
    return tokenizer, special_tokens
class Grok2Tokenizer(TokenizerLike):
@classmethod
def from_pretrained(
    cls,
    path_or_repo_id: str | Path,
    *args,
    trust_remote_code: bool = False,
    revision: str | None = None,
    download_dir: str | None = None,
    **kwargs,
) -> "Grok2Tokenizer":
    """Load a Grok-2 tokenizer from a local file/dir or a Hugging Face repo.

    Resolution order: an explicit file path, a directory containing
    tokenizer.tok.json, then a hub download using ``path_or_repo_id`` as the
    repo id. ``trust_remote_code`` is accepted for interface parity only.
    """
    if args:
        logger.debug_once("Ignoring extra positional args for Grok2Tokenizer.")
    path = Path(path_or_repo_id)
    if path.is_file():
        vocab_file = path
        model_path = path.parent
        repo_id = None
    elif path.is_dir():
        vocab_file = path / "tokenizer.tok.json"
        model_path = path
        repo_id = None
    else:
        # Not on disk: treat the argument as a hub repo id and download.
        vocab_file = Path(
            hf_hub_download(
                repo_id=str(path_or_repo_id),
                filename="tokenizer.tok.json",
                revision=revision,
                cache_dir=download_dir,
            )
        )
        model_path = vocab_file.parent
        repo_id = str(path_or_repo_id)
    if not vocab_file.is_file():
        raise FileNotFoundError(f"tokenizer.tok.json not found at {vocab_file}.")
    # Optional tokenizer_config.json provides the chat template, if any.
    config = _maybe_load_tokenizer_config(
        model_path,
        repo_id=repo_id,
        revision=revision,
        download_dir=download_dir,
    )
    return cls(
        vocab_file=vocab_file,
        name_or_path=str(path_or_repo_id),
        truncation_side=kwargs.get("truncation_side", "left"),
        chat_template=config.get("chat_template"),
        init_kwargs=config,
    )
def __init__(
    self,
    *,
    vocab_file: Path,
    name_or_path: str,
    truncation_side: str,
    chat_template: str | None,
    init_kwargs: dict[str, Any] | None = None,
) -> None:
    """Build the tokenizer from a Grok-2 ``.tok.json`` vocab file.

    Args:
        vocab_file: Path to the tokenizer.tok.json file.
        name_or_path: Identifier reported back to callers.
        truncation_side: "left" or "right" truncation behavior.
        chat_template: Jinja template; falls back to DEFAULT_CHAT_TEMPLATE.
        init_kwargs: Raw tokenizer_config.json contents, if any.
    """
    super().__init__()
    self.name_or_path = name_or_path
    self._truncation_side = truncation_side
    self.init_kwargs = init_kwargs or {}
    self._chat_template = chat_template or DEFAULT_CHAT_TEMPLATE
    self._tokenizer, self._special_tokens = _load_tiktoken_encoding(vocab_file)
    # Bidirectional token<->id maps covering regular and special tokens.
    self._token_to_id: dict[str, int] = {}
    self._id_to_token: dict[int, str] = {}
    for token, token_id in self._tokenizer._mergeable_ranks.items():
        token_str = token.decode("utf-8", errors="replace")
        self._token_to_id[token_str] = token_id
        self._id_to_token[token_id] = token_str
    for token, token_id in self._special_tokens.items():
        self._token_to_id[token] = token_id
        self._id_to_token[token_id] = token
    # BOS preference order: SEP, then PAD, then EOS, then id 0.
    bos_token_id = self._special_tokens.get(SEP)
    if bos_token_id is None:
        bos_token_id = self._special_tokens.get(PAD)
    if bos_token_id is None:
        bos_token_id = self._special_tokens.get(EOS)
    if bos_token_id is None:
        bos_token_id = 0
    self._bos_token_id = bos_token_id
    self._eos_token_id = self._special_tokens.get(EOS, self._bos_token_id)
    self._pad_token_id = self._special_tokens.get(PAD, self._eos_token_id)
    # PAD doubles as the unknown-token id for lookups that miss.
    self._unk_token_id = self._pad_token_id
    self._max_chars_per_token = max(len(tok) for tok in self._token_to_id)
def num_special_tokens_to_add(self) -> int:
    """Grok-2 encoding never prepends or appends special tokens."""
    return 0
@property
def all_special_tokens(self) -> list[str]:
    """Text of every special token in the vocabulary."""
    return list(self._special_tokens.keys())
@property
def all_special_ids(self) -> list[int]:
    """Id of every special token in the vocabulary."""
    return list(self._special_tokens.values())
@property
def bos_token_id(self) -> int:
    """Beginning-of-sequence token id (resolved in ``__init__``)."""
    return self._bos_token_id
@property
def eos_token_id(self) -> int:
    """End-of-sequence token id."""
    return self._eos_token_id
@property
def pad_token_id(self) -> int:
    """Padding token id."""
    return self._pad_token_id
@property
def is_fast(self) -> bool:
    """Not a HF "fast" (Rust-backed) tokenizer."""
    return False
@property
def vocab_size(self) -> int:
    """Total vocabulary size reported by the tiktoken encoding."""
    return self._tokenizer.n_vocab
@property
def max_token_id(self) -> int:
    """Largest valid token id."""
    return self._tokenizer.n_vocab - 1
@property
def max_chars_per_token(self) -> int:
    """Length (in characters) of the longest token string."""
    return self._max_chars_per_token
@property
def truncation_side(self) -> str:
    """Which side ("left"/"right") is trimmed when truncating."""
    return self._truncation_side
def get_vocab(self) -> dict[str, int]:
    """Full token->id mapping (copy; includes special tokens)."""
    return dict(self._token_to_id)
def get_added_vocab(self) -> dict[str, int]:
    """Special tokens exposed as the "added" vocabulary (copy)."""
    return dict(self._special_tokens)
def _maybe_truncate(self, tokens: list[int], max_length: int | None) -> list[int]:
    """Trim ``tokens`` to ``max_length``, honoring the truncation side."""
    if max_length is None or len(tokens) <= max_length:
        return tokens
    # "left" keeps the tail of the sequence; anything else keeps the head.
    if self.truncation_side == "left":
        return tokens[-max_length:]
    return tokens[:max_length]
def encode(
    self,
    text: str,
    truncation: bool | None = None,
    max_length: int | None = None,
    add_special_tokens: bool = True,
) -> list[int]:
    """Tokenize ``text`` into ids, optionally truncating to ``max_length``."""
    del add_special_tokens  # accepted for interface parity; unused here
    token_ids = self._tokenizer.encode(text)
    if not truncation:
        return token_ids
    return self._maybe_truncate(token_ids, max_length)
def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
if isinstance(ids, int):
ids = [ids]
if skip_special_tokens:
ids = [
token_id
for token_id in ids
if token_id not in self._special_tokens.values()
]
return self._tokenizer.decode(ids)
@overload
def convert_tokens_to_ids(self, tokens: str) -> int: ...
@overload
def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
if isinstance(tokens, str):
return self._token_to_id.get(tokens, self._unk_token_id)
return [self._token_to_id.get(token, self._unk_token_id) for token in tokens]
def convert_ids_to_tokens(
self, ids: list[int], skip_special_tokens: bool = False
) -> list[str]:
tokens = []
for token_id in ids:
if skip_special_tokens and token_id in self._special_tokens.values():
continue
tokens.append(self._id_to_token.get(token_id, "<|unk|>"))
return tokens
def convert_tokens_to_string(self, tokens: list[str]) -> str:
token_ids = self.convert_tokens_to_ids(tokens)
return self.decode(token_ids, skip_special_tokens=False)
def __call__(
self,
text: str | list[str],
text_pair: str | None = None,
add_special_tokens: bool = True,
truncation: bool = False,
max_length: int | None = None,
) -> BatchEncoding:
if text_pair is not None:
raise NotImplementedError("text_pair is not supported for Grok2Tokenizer.")
if isinstance(text, list):
input_ids_batch: list[list[int]] = [
self.encode(
item,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
)
for item in text
]
attention_mask_batch = [[1] * len(ids) for ids in input_ids_batch]
return BatchEncoding(
{"input_ids": input_ids_batch, "attention_mask": attention_mask_batch}
)
input_ids = self.encode(
text,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
)
attention_mask = [1] * len(input_ids)
return BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask})
def get_chat_template(
self, chat_template: str | None, tools: list[dict[str, Any]] | None = None
) -> str | None:
del tools
return chat_template or self._chat_template
def apply_chat_template(
self,
messages: list[ChatCompletionMessageParam],
tools: list[dict[str, Any]] | None = None,
chat_template: str | None = None,
tokenize: bool = False,
**kwargs,
) -> str | list[int]:
template = self.get_chat_template(chat_template, tools=tools)
if template is None:
raise ValueError(
"No chat template available. Provide `chat_template` explicitly."
)
kwargs["return_dict"] = False
prompt = hf_chat_utils.apply_chat_template(
conversation=messages,
chat_template=template,
tools=tools,
**kwargs,
)
if tokenize:
return self.encode(prompt, add_special_tokens=False)
return prompt

125
vllm/tokenizers/hf.py Normal file
View File

@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import copy
from pathlib import Path
from typing import TypeAlias
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
from .protocol import TokenizerLike
HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown.
    This proxy caches these properties for faster access.
    """
    # Shallow-copy so the original tokenizer object is left untouched; the
    # copy's class is swapped to the caching subclass below.
    cached_tokenizer = copy.copy(tokenizer)
    # Snapshot the expensive-to-recompute values once, up front.
    tokenizer_all_special_ids = tokenizer.all_special_ids
    tokenizer_all_special_tokens = tokenizer.all_special_tokens
    tokenizer_vocab = tokenizer.get_vocab()
    tokenizer_len = len(tokenizer)
    max_token_id = max(tokenizer_vocab.values())
    max_chars_per_token = max(len(tok) for tok in tokenizer_vocab)
    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
    # are added and included in the implementation of the vocab_size
    # property, but not in get_vocab(); if there is an implementation
    # of vocab size, we should take the greater value.
    if hasattr(tokenizer, "vocab_size"):
        with contextlib.suppress(NotImplementedError):
            max_token_id = max(max_token_id, tokenizer.vocab_size)
    # Dynamic subclass of the concrete tokenizer class: overrides the cached
    # properties with the snapshots captured above via closure.
    class CachedTokenizer(tokenizer.__class__):  # type: ignore
        @property
        def all_special_ids(self) -> list[int]:
            return tokenizer_all_special_ids
        @property
        def all_special_tokens(self) -> list[str]:
            return tokenizer_all_special_tokens
        @property
        def max_token_id(self) -> int:
            return max_token_id
        @property
        def max_chars_per_token(self) -> int:
            return max_chars_per_token
        def get_vocab(self) -> dict[str, int]:
            return tokenizer_vocab
        def __len__(self) -> int:
            return tokenizer_len
        def __reduce__(self):
            # Pickle the *original* tokenizer and re-wrap on unpickle, since
            # the dynamic class itself is not importable.
            return get_cached_tokenizer, (tokenizer,)
    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
    cached_tokenizer.__class__ = CachedTokenizer
    return cached_tokenizer
class CachedHfTokenizer(TokenizerLike):
    """Loader that wraps HF ``AutoTokenizer`` results in the caching proxy."""

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> HfTokenizer:
        """Load a tokenizer via ``AutoTokenizer`` and return a cached proxy.

        Raises:
            RuntimeError: if loading fails in a way that suggests
                ``trust_remote_code=True`` is required.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                path_or_repo_id,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                cache_dir=download_dir,
                **kwargs,
            )
        except ValueError as e:
            # If the error pertains to the tokenizer class not existing or not
            # currently being imported,
            # suggest using the --trust-remote-code flag.
            if not trust_remote_code and (
                "does not exist or is not currently imported." in str(e)
                or "requires you to execute the tokenizer file" in str(e)
            ):
                err_msg = (
                    "Failed to load the tokenizer. If the tokenizer "
                    "is a custom tokenizer not yet available in the "
                    "HuggingFace transformers library, consider "
                    "setting `trust_remote_code=True` in LLM or using "
                    "the `--trust-remote-code` flag in the CLI."
                )
                raise RuntimeError(err_msg) from e
            else:
                raise e
        # The special_tokens in tokenizer should also be
        # controlled by do_lower_case in encoder_config
        encoder_config = get_sentence_transformer_tokenizer_config(
            path_or_repo_id, revision
        )
        if isinstance(encoder_config, dict) and encoder_config.get(
            "do_lower_case", False
        ):
            special_tokens_map = {
                k: v.lower() for k, v in tokenizer.special_tokens_map.items()
            }
            tokenizer.add_special_tokens(special_tokens_map)
        return get_cached_tokenizer(tokenizer)

553
vllm/tokenizers/mistral.py Normal file
View File

@@ -0,0 +1,553 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast, overload
from mistral_common.protocol.instruct.request import (
ChatCompletionRequest as MistralChatCompletionRequest,
)
from mistral_common.protocol.instruct.tool_calls import Function, Tool
from mistral_common.protocol.instruct.validator import ValidationMode
from mistral_common.tokens.tokenizers.base import (
SpecialTokenPolicy,
SpecialTokens,
)
from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer,
)
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from pydantic import ValidationError
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.logger import init_logger
from .protocol import TokenizerLike
if TYPE_CHECKING:
from transformers import BatchEncoding
try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
except ImportError:
# Transformers v4
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as MistralCommonBackend,
)
logger = init_logger(__name__)
def maybe_serialize_tool_calls(request: "MistralChatCompletionRequest"):
    """Force-deserialize assistant ``tool_calls`` in place on the request.

    Consumes pydantic's lazy validator iterators so assistant messages carry
    concrete tool-call lists; see the NOTE below for the upstream bug.
    """
    # SEE: https://github.com/vllm-project/vllm/pull/9951
    # Credits go to: @gcalmettes
    # NOTE: There is currently a bug in pydantic where attributes
    # declared as iterables are replaced in in the instances by
    # pydantic-core ValidatorIterator instance. In particular, this
    # affects tool_calls defined in ChatCompletionAssistantMessageParam
    # model:
    # see:
    #  - https://github.com/pydantic/pydantic/issues/9467
    # As a result, tool_calls from assistant messages are never
    # deserialized in the request object if the tool_calls iterator is
    # not consumed. This affect messages passed to the MistralTokenizer
    # since no chat template is applied and therefore the tools_calls
    # iterator is not directly consumed.
    # Issue is tracked on Pydantic side, with resolution planned for
    # v2.11 release. In the meantime, the official workaround is to
    # consume the iterator so the tool_calls are correctly deserialized
    # in the OpenAI ChatCompletionAssistantMessageParam object
    # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501
    # Official Pydantic Issues:
    #  - https://github.com/pydantic/pydantic/issues/9541
    # TODO: remove when pydantic v2.11 is released
    for i, message in enumerate(request.messages):
        if message.get("role") == "assistant":
            if (tool_calls_validator := message.get("tool_calls", None)) is not None:
                try:
                    # Consuming the iterator triggers (and surfaces) validation.
                    validated_tool_calls = list(tool_calls_validator)
                except ValidationError as e:
                    raise ValueError(
                        "Validating messages' `tool_calls` raised an error. "
                        "Please ensure `tool_calls` are iterable of tool calls."
                    ) from e
            else:
                validated_tool_calls = []
            request.messages[i]["tool_calls"] = validated_tool_calls
def truncate_tool_call_ids(request: "MistralChatCompletionRequest"):
    """Truncates tool call IDs for Mistral's ID requirements."""
    # Mistral accepts at most 9 characters; keep the trailing 9 so ids
    # remain distinguishable, and warn whenever truncation happens.
    def _shorten(identifier: str, label: str) -> str:
        if len(identifier) <= 9:
            return identifier
        logger.warning(
            "Truncating %s: %s to %s", label, identifier, identifier[-9:]
        )
        return identifier[-9:]

    for i, message in enumerate(request.messages):
        role = message.get("role")
        if role == "assistant":
            tool_calls = message.get("tool_calls", [])
            for tool_call in tool_calls:
                tool_call["id"] = _shorten(tool_call["id"], "tool call ID")
            request.messages[i]["tool_calls"] = tool_calls
        elif role in {"tool_results", "tool"} and "tool_call_id" in message:
            request.messages[i]["tool_call_id"] = _shorten(
                message["tool_call_id"], "tool_call_id"
            )
def _prepare_apply_chat_template_tools_and_messages(
    messages: list["ChatCompletionMessageParam"],
    tools: list[dict[str, Any]] | None = None,
    continue_final_message: bool = False,
    add_generation_prompt: bool = False,
) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
    """Validate and normalize messages/tools before a mistral-common render.

    Mutates `messages` and `tools` in place (dropping unsupported keys) and
    returns them for convenience.

    Raises:
        ValueError: on inconsistent prompt-continuation flags or
            non-function tools.
    """
    if add_generation_prompt and continue_final_message:
        raise ValueError(
            "Cannot set both `add_generation_prompt` and "
            "`continue_final_message` to True."
        )
    last_message = cast(dict[str, Any], messages[-1])
    # add_generation_prompt is directly handled by the tokenizer but we
    # check if the user is trying to use it with a final assistant message
    # which is probably not what they want.
    # If add_generation_prompt is False, we don't need to check anything.
    if add_generation_prompt and last_message["role"] == "assistant":
        raise ValueError(
            "Cannot set `add_generation_prompt` to True when "
            "the last message is from the assistant. Consider "
            "using `continue_final_message` instead."
        )
    if continue_final_message and last_message["role"] != "assistant":
        raise ValueError(
            "Cannot set `continue_final_message` to True when "
            "the last message is not from the assistant."
        )
    # mistral-common requires AssistantMessage content to be string [1].
    #
    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
    for message in messages:
        # Remove reasoning as unsupported by Mistral
        _ = message.pop("reasoning", None)  # type: ignore
    # The Mistral client, in comparison to the OpenAI client, requires the
    # "parameters" dict and the "description" string to be present
    # even if they are empty.
    if tools:
        for function in [
            tool["function"] for tool in tools if tool["type"] == "function"
        ]:
            if function.get("parameters") is None:
                function["parameters"] = {}
            if function.get("description") is None:
                function["description"] = ""
        # We filter not supported arguments to avoid throwing an error.
        # TODO(juliendenize): remove this once OpenAI API is better supported by
        # `mistral-common`.
        tools_fields = set(Tool.model_fields.keys())
        function_fields = set(Function.model_fields.keys())
        for tool in tools:
            # Iterate over a snapshot of the keys since we pop while looping.
            tool_keys = list(tool.keys())
            for tool_key in tool_keys:
                if tool_key not in tools_fields:
                    tool.pop(tool_key)
                    logger.warning_once(
                        f"'{tool_key}' is not supported by mistral-common for tools. "
                        "It has been poped from the tool definition."
                    )
            if tool["type"] == "function":
                function_keys = list(tool["function"].keys())
                for function_key in function_keys:
                    if function_key not in function_fields:
                        tool["function"].pop(function_key)
                        logger.warning_once(
                            f"'{function_key}' is not supported by mistral-common "
                            "for function tools. It has been poped from the "
                            "function definition."
                        )
            else:
                raise ValueError("mistral-common only supports function tools.")
    return messages, tools
def validate_request_params(request: "ChatCompletionRequest"):
    """Reject request options that Mistral tokenizers cannot honor.

    Raises:
        ValueError: if `chat_template` or `chat_template_kwargs` is set.
    """
    # The original message only mentioned `chat_template`, which was
    # misleading when the request failed due to `chat_template_kwargs`.
    if request.chat_template is not None or request.chat_template_kwargs is not None:
        raise ValueError(
            "chat_template and chat_template_kwargs are not supported "
            "for Mistral tokenizers."
        )
def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
    """Map one Tekken token (str or bytes) to its id.

    Regular tokens live in `_tekken_token2id_nospecial`, offset by the number
    of special tokens; special tokens are looked up by their string form.
    Unknown tokens fall back to the unk id with a warning.
    """
    assert isinstance(tokenizer, Tekkenizer), type(tokenizer)
    t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t
    # Non-special ids are shifted past the special-token block.
    shift = tokenizer.num_special_tokens
    try:
        return shift + tokenizer._tekken_token2id_nospecial[t_bytes]
    except KeyError:
        t_str = t_bytes.decode("utf-8")
        if t_str in tokenizer._special_tokens_reverse_vocab:
            return tokenizer._special_tokens_reverse_vocab[t_str]
        logger.warning(
            "Failed to convert token %s to id, replacing with <unk>", t_bytes
        )
        return tokenizer.unk_id
class MistralTokenizer(TokenizerLike):
    """TokenizerLike adapter over mistral-common via the transformers backend.

    Wraps a ``MistralCommonBackend`` (transformers) which itself wraps a
    mistral-common tokenizer (either Tekken or SentencePiece based), and
    exposes the vLLM tokenizer interface on top of it.
    """
    IS_MISTRAL_TOKENIZER = True  # used by vllm.utils.mistral
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "MistralTokenizer":
        # NOTE: `trust_remote_code` is accepted for interface compatibility
        # but not forwarded to the backend.
        try:
            # Transformers v5
            from transformers.tokenization_mistral_common import MistralCommonBackend
        except ImportError:
            # Transformers v4
            from transformers.tokenization_mistral_common import (
                MistralCommonTokenizer as MistralCommonBackend,
            )
        tokenizer = MistralCommonBackend.from_pretrained(
            path_or_repo_id,
            *args,
            mode=ValidationMode.test,
            cache_dir=download_dir,
            revision="main" if revision is None else revision,
            **kwargs,
        )
        return cls(tokenizer)
    def __init__(self, tokenizer: "MistralCommonBackend") -> None:
        """Build caches (vocab, special tokens) around the backend tokenizer."""
        super().__init__()
        # Layered handles: transformers wrapper -> mistral-common request
        # tokenizer -> instruct tokenizer -> raw (Tekken/SPM) tokenizer.
        self.transformers_tokenizer = tokenizer
        self.mistral = tokenizer.tokenizer
        self.instruct = self.mistral.instruct_tokenizer
        self.tokenizer = self.instruct.tokenizer
        mode = self.mistral._chat_completion_request_validator._mode
        if mode != ValidationMode.test:
            raise ValueError(
                "Mistral tokenizer must be in test mode. Make sure to "
                "set `mode='ValidationMode.test'` when creating the "
                "Mistral tokenizer."
            )
        _mistral_version_str = str(self.tokenizer.version.value)
        self.version: int = int(_mistral_version_str.split("v")[-1])
        self.is_tekken = isinstance(self.tokenizer, Tekkenizer)
        self.is_spm = isinstance(self.tokenizer, SentencePieceTokenizer)
        if not (self.is_tekken or self.is_spm):
            raise TypeError(f"Unsupported tokenizer: {type(self.tokenizer)}")
        # Reverse order to ensure that the lowest token id is kept.
        self._vocab_dict = {
            self.convert_ids_to_tokens([i], skip_special_tokens=False)[0]: i
            for i in range(self.vocab_size - 1, -1, -1)
        }
        # Sort the dict for convenience
        self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1]))
        # Vocab sorted by token id.
        self._vocab = self.tokenizer.vocab()
        self._max_token_id = self.vocab_size - 1
        self._max_chars_per_token = max(len(tok) for tok in self._vocab)
        # Cache special tokens for faster access.
        self._special_token_ids = self._get_special_token_ids()
        self._special_token_ids_set = set(self._special_token_ids)
        self._special_tokens = self._get_special_tokens(self._special_token_ids)
        self._special_tokens_set = set(self._special_tokens)
    def _get_special_token_ids(self) -> list[int]:
        """All ids the underlying tokenizer marks as special."""
        return [i for i in range(len(self._vocab)) if self.tokenizer.is_special(i)]
    def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
        """Decode each special id to its string form (specials kept)."""
        return [
            self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
            for i in all_special_ids
        ]
    def num_special_tokens_to_add(self) -> int:
        # The number of tokens produced for an empty string equals the
        # number of special tokens added around plain text.
        return len(self.encode(""))
    # the following attributes are set to fit vLLM's design and are used
    # by the structured output backends.
    @property
    def all_special_tokens(self) -> list[str]:
        return self._special_tokens
    @property
    def all_special_ids(self) -> list[int]:
        return self._special_token_ids
    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_id
    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_id
    @property
    def pad_token_id(self) -> int:
        return self.tokenizer.pad_id
    @property
    def is_fast(self) -> bool:
        return True
    @property
    def vocab_size(self) -> int:
        return self.transformers_tokenizer.vocab_size
    @property
    def max_token_id(self) -> int:
        return self._max_token_id
    @property
    def max_chars_per_token(self) -> int:
        return self._max_chars_per_token
    @property
    def truncation_side(self) -> str:
        return self.transformers_tokenizer.truncation_side
    def _is_special_token_id(self, token_id: int) -> bool:
        return token_id in self._special_token_ids_set
    def __hash__(self) -> int:
        return hash(id(self))
    def __len__(self) -> int:
        return self.vocab_size
    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Encode text through the transformers backend (no text pairs)."""
        if text_pair is not None:
            raise ValueError(
                "`text_pair` is not supported by `MistralTokenizer.__call__`."
            )
        encoded = self.transformers_tokenizer(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
        )
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, revert to only call self.transformers_tokenizer(...).
        # Hack to fix wrongly added eos token, when fix will be supported the condition
        # below will be False even before the revert is done.
        if encoded["input_ids"] and encoded["input_ids"][-1] == self.eos_token_id:
            encoded["input_ids"].pop(-1)
            if attention_mask := encoded.get("attention_mask"):
                attention_mask.pop(-1)
        return encoded
    @property
    def vocab(self) -> list[str]:
        # Vocab sorted by token id.
        return self._vocab
    def get_vocab(self) -> dict[str, int]:
        return self._vocab_dict
    def get_added_vocab(self) -> dict[str, int]:
        # Mistral tokenizers have no added vocabulary
        return {}
    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Encode `text`; `add_special_tokens` controls the BOS token only."""
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, directly call self.transformers_tokenizer.encode(...).
        encoded = self.tokenizer.encode(text, bos=add_special_tokens, eos=False)
        if truncation is not False and max_length is not None:
            return encoded[:max_length]
        else:
            return encoded
    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> list[int]:
        """Validate messages/tools, then render via the transformers backend."""
        add_generation_prompt = kwargs.pop("add_generation_prompt", False)
        continue_final_message = kwargs.get("continue_final_message", False)
        tokenize = kwargs.get("tokenize", True)
        padding = kwargs.get("padding", False)
        truncation = kwargs.get("truncation", False)
        max_length = kwargs.get("max_length")
        messages, tools = _prepare_apply_chat_template_tools_and_messages(
            messages, tools, continue_final_message, add_generation_prompt
        )
        return self.transformers_tokenizer.apply_chat_template(
            conversation=messages,
            tools=tools,
            continue_final_message=continue_final_message,
            tokenize=tokenize,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=None,
            return_dict=False,
        )
    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, directly call self.transformers_tokenizer.decode(...).
        if isinstance(ids, int):
            ids = [ids]
        return self.transformers_tokenizer.decode(
            ids, skip_special_tokens=skip_special_tokens
        )
    def batch_decode(
        self, ids: list[list[int]] | list[int], skip_special_tokens: bool = False
    ) -> list[str]:
        # NOTE: annotation fixed from `str` — the backend returns one decoded
        # string per sequence.
        return self.transformers_tokenizer.batch_decode(
            ids, skip_special_tokens=skip_special_tokens
        )
    @overload
    def convert_tokens_to_ids(self, tokens: str) -> int: ...
    @overload
    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
        return self.transformers_tokenizer.convert_tokens_to_ids(tokens)
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join tokens to text, keeping only the tool-calls special token."""
        to_decode_special_tokens = {SpecialTokens.tool_calls}
        if self.is_tekken:
            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
            tokens = [
                t
                for t in tokens
                if (t in to_decode_special_tokens or t not in self._special_tokens_set)
            ]
            if any(isinstance(t, bytes) for t in tokens):
                # we need to encode and decode all tokens again
                ids = [_tekken_token_to_id(self.tokenizer, t) for t in tokens]
                # We filtered unwanted special tokens before
                # so we can decode the rest.
                decoded = self.tokenizer.decode(ids, SpecialTokenPolicy.KEEP)
            else:
                decoded = "".join(tokens)
        else:
            # make sure certain special tokens like Tool calls are
            # not decoded
            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
                self.tokenizer
            )
            regular_tokens: list[str] = []
            decoded_list: list[str] = []
            decoded = ""
            for token in tokens:
                if token in to_decode_special_tokens:
                    if regular_tokens:
                        decoded_list.append(
                            self.tokenizer.decode(
                                regular_tokens, SpecialTokenPolicy.IGNORE
                            )
                        )
                        regular_tokens = []
                    decoded_list.append(token)
                else:
                    regular_tokens.append(token)
            if regular_tokens:
                decoded_list.append(
                    self.tokenizer.decode(regular_tokens, SpecialTokenPolicy.IGNORE)
                )
            decoded = "".join(decoded_list)
        return decoded
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        """Map ids to token pieces; think/tool-call specials are never skipped."""
        if not skip_special_tokens:
            return [self.tokenizer.id_to_piece(token_id) for token_id in ids]
        non_skip_special_tokens_ids = {
            self.tokenizer.get_special_token(SpecialTokens.tool_calls),
        }
        if isinstance(self.instruct, InstructTokenizerV13):
            if self.instruct.BEGIN_THINK:
                non_skip_special_tokens_ids.add(self.instruct.BEGIN_THINK)
            if self.instruct.END_THINK:
                non_skip_special_tokens_ids.add(self.instruct.END_THINK)
        ids_kept = [
            i
            for i in ids
            if i in non_skip_special_tokens_ids or not self._is_special_token_id(i)
        ]
        # We filtered unwanted special tokens so we can decode the rest.
        tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids_kept]
        if any("�" in t for t in tokens) and self.is_tekken:
            # if a decoded token contains the replacement character, then the
            # token has an incomplete UTF-8 character so we must use bytes
            # See: https://github.com/vllm-project/vllm/pull/8640
            #      https://github.com/vllm-project/vllm/pull/9625
            # if underlying tokenizer is sentencepiece, we just add "�".
            # We filtered unwanted special tokens so we can decode the rest.
            tokens = [
                self.tokenizer.id_to_byte_piece(token_id, SpecialTokenPolicy.KEEP)
                if token_id not in self._special_token_ids_set
                else self.tokenizer.decode([token_id], SpecialTokenPolicy.KEEP)
                for token_id in ids_kept
            ]
        return tokens

127
vllm/tokenizers/protocol.py Normal file
View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol, overload
if TYPE_CHECKING:
from transformers import BatchEncoding
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
class TokenizerLike(Protocol):
    """Structural interface that every vLLM tokenizer must satisfy.

    Mirrors the subset of the HF tokenizer API that vLLM relies on. Concrete
    implementations in this package include the cached HF, Mistral, Grok2
    and DeepSeek V3.2 tokenizers.
    """
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "TokenizerLike":
        """Load a tokenizer from a local path or hub repo id."""
        raise NotImplementedError
    def num_special_tokens_to_add(self) -> int:
        """Number of special tokens added around plain text by `encode`."""
        raise NotImplementedError
    @property
    def all_special_tokens(self) -> list[str]:
        raise NotImplementedError
    @property
    def all_special_ids(self) -> list[int]:
        raise NotImplementedError
    @property
    def bos_token_id(self) -> int:
        raise NotImplementedError
    @property
    def eos_token_id(self) -> int:
        raise NotImplementedError
    @property
    def pad_token_id(self) -> int:
        raise NotImplementedError
    @property
    def is_fast(self) -> bool:
        raise NotImplementedError
    @property
    def vocab_size(self) -> int:
        raise NotImplementedError
    @property
    def max_token_id(self) -> int:
        """Largest valid token id."""
        raise NotImplementedError
    @property
    def max_chars_per_token(self) -> int:
        """Length of the longest token string in the vocabulary."""
        raise NotImplementedError
    @property
    def truncation_side(self) -> str:
        raise NotImplementedError
    def __hash__(self) -> int:
        # Identity-based so tokenizers can be used as cache keys.
        return hash(id(self))
    def __len__(self) -> int:
        return self.vocab_size
    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Encode one string or a batch into ids plus attention mask."""
        raise NotImplementedError
    def get_vocab(self) -> dict[str, int]:
        raise NotImplementedError
    def get_added_vocab(self) -> dict[str, int]:
        raise NotImplementedError
    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Encode `text` into a list of token ids."""
        raise NotImplementedError
    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> str | list[int]:
        """Render chat messages into a prompt string or token ids."""
        raise NotImplementedError
    @overload
    def convert_tokens_to_ids(self, tokens: str) -> int: ...
    @overload
    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
        raise NotImplementedError
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        raise NotImplementedError
    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        raise NotImplementedError
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        raise NotImplementedError

240
vllm/tokenizers/registry.py Normal file
View File

@@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING
import huggingface_hub
from typing_extensions import TypeVar, assert_never
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.transformers_utils.gguf_utils import (
check_gguf_file,
get_gguf_file_path_from_hf,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
from vllm.transformers_utils.repo_utils import (
any_pattern_in_repo_files,
is_mistral_model_repo,
)
from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
if TYPE_CHECKING:
from vllm.config.model import ModelConfig, RunnerType
logger = init_logger(__name__)
# Built-in tokenizer modes -> (module basename under vllm.tokenizers,
# class name). Used to pre-populate TokenizerRegistry below.
_VLLM_TOKENIZERS = {
    "deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
    "grok2": ("grok2", "Grok2Tokenizer"),
    "hf": ("hf", "CachedHfTokenizer"),
    "mistral": ("mistral", "MistralTokenizer"),
}
@dataclass
class _TokenizerRegistry:
    """Maps a tokenizer mode to the (module, class) that implements it."""

    # Tokenizer mode -> (tokenizer module, tokenizer class)
    tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict)

    def register(self, tokenizer_mode: str, module: str, class_name: str) -> None:
        """Register (or overwrite, with a warning) a mode's tokenizer class."""
        if tokenizer_mode in self.tokenizers:
            logger.warning(
                "%s.%s is already registered for tokenizer_mode=%r. "
                "It is overwritten by the new one.",
                module,
                class_name,
                tokenizer_mode,
            )
        self.tokenizers[tokenizer_mode] = (module, class_name)

    def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]:
        """Resolve and import the tokenizer class registered for a mode."""
        entry = self.tokenizers.get(tokenizer_mode)
        if entry is None:
            raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
        module, class_name = entry
        logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
        return resolve_obj_by_qualname(f"{module}.{class_name}")

    def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike:
        """Instantiate the mode's tokenizer via its `from_pretrained`."""
        return self.load_tokenizer_cls(tokenizer_mode).from_pretrained(*args, **kwargs)
# Global registry, pre-populated with the built-in vLLM tokenizer modes.
TokenizerRegistry = _TokenizerRegistry(
    {
        mode: (f"vllm.tokenizers.{mod_relname}", cls_name)
        for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items()
    }
)
def resolve_tokenizer_args(
    tokenizer_name: str | Path,
    *args,
    runner_type: "RunnerType" = "generate",
    tokenizer_mode: str = "auto",
    **kwargs,
):
    """Normalize tokenizer loading arguments and resolve the concrete mode.

    Handles ModelScope downloads, GGUF path splitting, truncation-side
    defaults per runner type, and "auto" mode detection (mistral -> grok2 ->
    hf, in that order).

    Returns:
        Tuple of (tokenizer_mode, tokenizer_name, args, kwargs) ready for
        the registry's `from_pretrained` dispatch.
    """
    revision: str | None = kwargs.get("revision")
    download_dir: str | None = kwargs.get("download_dir")
    if envs.VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        from modelscope.hub.snapshot_download import snapshot_download
        # avoid circular import
        from vllm.model_executor.model_loader.weight_utils import get_lock
        # Only set the tokenizer here, model will be downloaded on the workers.
        if not Path(tokenizer_name).exists():
            # Use file lock to prevent multiple processes from
            # downloading the same file at the same time.
            with get_lock(tokenizer_name, download_dir):
                tokenizer_path = snapshot_download(
                    model_id=str(tokenizer_name),
                    cache_dir=download_dir,
                    revision=revision,
                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                    # Ignore weights - we only need the tokenizer.
                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
                )
            tokenizer_name = tokenizer_path
    # Separate model folder from file path for GGUF models
    if is_gguf(tokenizer_name):
        if check_gguf_file(tokenizer_name):
            kwargs["gguf_file"] = Path(tokenizer_name).name
            tokenizer_name = Path(tokenizer_name).parent
        elif is_remote_gguf(tokenizer_name):
            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
            # Get the HuggingFace Hub path for the GGUF file
            gguf_file = get_gguf_file_path_from_hf(
                tokenizer_name,
                quant_type,
                revision=revision,
            )
            kwargs["gguf_file"] = gguf_file
    # Generation truncates from the left (keep the recent context);
    # pooling truncates from the right.
    if "truncation_side" not in kwargs:
        if runner_type == "generate" or runner_type == "draft":
            kwargs["truncation_side"] = "left"
        elif runner_type == "pooling":
            kwargs["truncation_side"] = "right"
        else:
            assert_never(runner_type)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
        tokenizer_mode = "hf"
        kwargs["use_fast"] = False
    # Try to use official Mistral tokenizer if possible
    if (
        tokenizer_mode == "auto"
        and is_mistral_model_repo(
            model_name_or_path=str(tokenizer_name), revision=revision
        )
        and any_pattern_in_repo_files(
            model_name_or_path=str(tokenizer_name),
            allow_patterns=["tekken.json", "tokenizer.model.v*"],
            revision=revision,
        )
    ):
        tokenizer_mode = "mistral"
    # Try to use Grok2 tiktoken tokenizer if possible
    if tokenizer_mode == "auto" and any_pattern_in_repo_files(
        model_name_or_path=str(tokenizer_name),
        allow_patterns=["tokenizer.tok.json"],
        revision=revision,
    ):
        tokenizer_mode = "grok2"
    # Fallback to HF tokenizer
    if tokenizer_mode == "auto":
        tokenizer_mode = "hf"
    return tokenizer_mode, tokenizer_name, args, kwargs
cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args)
def tokenizer_args_from_config(config: "ModelConfig", **kwargs):
    """Resolve tokenizer args from the settings stored on a ModelConfig."""
    resolved = cached_resolve_tokenizer_args(
        config.tokenizer,
        runner_type=config.runner_type,
        tokenizer_mode=config.tokenizer_mode,
        revision=config.tokenizer_revision,
        trust_remote_code=config.trust_remote_code,
        **kwargs,
    )
    return resolved
_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)
def get_tokenizer(
    tokenizer_name: str | Path,
    *args,
    tokenizer_cls: type[_T] = TokenizerLike,  # type: ignore[assignment]
    trust_remote_code: bool = False,
    revision: str | None = None,
    download_dir: str | None = None,
    **kwargs,
) -> _T:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
    tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args(
        tokenizer_name,
        *args,
        trust_remote_code=trust_remote_code,
        revision=revision,
        download_dir=download_dir,
        **kwargs,
    )
    # An explicit tokenizer_cls overrides registry dispatch on the mode.
    resolved_cls: type = (
        TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
        if tokenizer_cls == TokenizerLike
        else tokenizer_cls
    )
    tokenizer = resolved_cls.from_pretrained(tokenizer_name, *args, **kwargs)
    if not tokenizer.is_fast:
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead."
        )
    return tokenizer  # type: ignore
cached_get_tokenizer = lru_cache(get_tokenizer)
def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
    """Build (or fetch from cache) the tokenizer described by `model_config`.

    Returns None when tokenizer initialization is explicitly skipped.
    """
    if model_config.skip_tokenizer_init:
        return None
    config_kwargs = dict(
        runner_type=model_config.runner_type,
        tokenizer_mode=model_config.tokenizer_mode,
        revision=model_config.tokenizer_revision,
        trust_remote_code=model_config.trust_remote_code,
    )
    return cached_get_tokenizer(model_config.tokenizer, **config_kwargs, **kwargs)