This commit is contained in:
root
2026-04-09 11:23:47 +08:00
parent 8082d5f4b2
commit 72387e4fa8
1885 changed files with 611521 additions and 1 deletions

View File

@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .protocol import TokenizerLike
from .registry import (
TokenizerRegistry,
cached_get_tokenizer,
cached_tokenizer_from_config,
get_tokenizer,
)
# Public API of the tokenizers package.
# Kept sorted to match the import block above (consistency fix: the list
# previously placed "cached_tokenizer_from_config" out of order).
__all__ = [
    "TokenizerLike",
    "TokenizerRegistry",
    "cached_get_tokenizer",
    "cached_tokenizer_from_config",
    "get_tokenizer",
]

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from typing import Any
from transformers import AutoTokenizer
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from . import TokenizerLike
from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, get_cached_tokenizer
def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    Wraps a tokenizer to use the custom DeepSeek V3.2 chat template encoding.

    The wrapper is a shallow copy of ``tokenizer`` whose class is swapped to a
    dynamically created subclass that overrides ``apply_chat_template`` (and a
    few vocab-related methods) to use :func:`encode_messages` instead of the
    tokenizer's Jinja chat template.
    """
    # Shallow-copy so the caller's tokenizer instance is left untouched;
    # only the copy's __class__ is replaced below.
    dsv32_tokenizer = copy.copy(tokenizer)
    # Captured by the closure-defined methods of the dynamic subclass.
    added_vocab = tokenizer.get_added_vocab()
    added_vocab_size = len(added_vocab)
    tokenizer_vocab_size = tokenizer.vocab_size

    class _DeepseekV32Tokenizer(tokenizer.__class__):  # type: ignore
        def apply_chat_template(
            self,
            messages: list["ChatCompletionMessageParam"],
            tools: list[dict[str, Any]] | None = None,
            **kwargs,
        ) -> str | list[int]:
            # Honor both the `thinking` and HF-style `enable_thinking` flags.
            thinking = kwargs.get("thinking", False)
            enable_thinking = kwargs.get("enable_thinking", False)
            thinking = thinking or enable_thinking
            thinking_mode = "thinking"
            if not thinking:
                thinking_mode = "chat"
            # Prefer an explicit `conversation` kwarg over `messages`.
            conversation = kwargs.get("conversation", messages)
            messages = conversation.copy()
            if tools is not None and len(tools) > 0:
                # Tools are rendered inside a synthetic leading system message.
                messages.insert(0, {"role": "system"})
                messages[0]["tools"] = tools  # type: ignore[typeddict-unknown-key]
            # Historical reasoning content is dropped when a new user message
            # is introduced
            drop_thinking = messages[-1]["role"] == "user"
            encode_config = dict(
                thinking_mode=thinking_mode, drop_thinking=drop_thinking
            )
            prompt_str = encode_messages(messages, **encode_config)  # type: ignore
            if kwargs.get("tokenize", True):
                # Forward only the kwargs that `encode` understands.
                tokenizer_kwargs = {
                    k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
                }
                return self.encode(
                    prompt_str,
                    add_special_tokens=False,
                    **tokenizer_kwargs,
                )
            return prompt_str

        def num_special_tokens_to_add(self) -> int:
            # The custom template supplies special tokens itself, so this is
            # whatever encoding an empty string yields.
            return len(self.encode(""))

        def __len__(self) -> int:
            # </think> is an added token in DeepseekV32 tokenizer
            return tokenizer_vocab_size + added_vocab_size

        def get_added_vocab(self) -> dict[str, int]:
            # Copy so callers cannot mutate the captured mapping.
            return added_vocab.copy()

        def __reduce__(self):
            # Pickle support: rebuild by re-wrapping the base tokenizer.
            return get_deepseek_v32_tokenizer, (tokenizer,)

    # Give the dynamic class a readable name and swap it onto the copy.
    _DeepseekV32Tokenizer.__name__ = f"DSV32{tokenizer.__class__.__name__}"
    dsv32_tokenizer.__class__ = _DeepseekV32Tokenizer
    return dsv32_tokenizer
class DeepseekV32Tokenizer(TokenizerLike):
    """Factory entry point for the DeepSeek V3.2 chat-template tokenizer."""

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
        """Load the base HF tokenizer, wrap it with the V3.2 encoding,
        and return the cached wrapper."""
        base_tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
        wrapped = get_deepseek_v32_tokenizer(base_tokenizer)
        return get_cached_tokenizer(wrapped)

View File

@@ -0,0 +1,471 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
import copy
import json
from typing import Any
import regex as re
# flake8: noqa: E501

# System-prompt section injected when tools are available. Placeholders are
# filled by `render_tools` using the module-level special tokens below.
TOOLS_SYSTEM_TEMPLATE = """## Tools
You have access to a set of tools you can use to answer the user's question.
You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
<{dsml_token}function_calls>
<{dsml_token}invoke name="$FUNCTION_NAME">
<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
...
</{dsml_token}invoke>
<{dsml_token}invoke name="$FUNCTION_NAME2">
...
</{dsml_token}invoke>
</{dsml_token}function_calls>
String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
<{dsml_token}function_calls>
...
</{dsml_token}function_calls>
<function_results>
...
</function_results>
{thinking_start_token}...thinking about results{thinking_end_token}
Here are the functions available in JSONSchema format:
<functions>
{tool_schemas}
</functions>
"""

# Special tokens of the DeepSeek V3.2 chat encoding.
bos_token: str = "<begin▁of▁sentence>"
eos_token: str = "<end▁of▁sentence>"
thinking_start_token: str = "<think>"
thinking_end_token: str = "</think>"
# Prefix marking DSML structural tags (function calls / parameters).
dsml_token: str = "DSML"

# Per-role message templates consumed by `render_message`.
system_msg_template: str = "{content}"
user_msg_template: str = "<User>{content}<Assistant>"
assistant_msg_template: str = "{reasoning}{content}{tool_calls}<end▁of▁sentence>"
thinking_template = "{reasoning}"
response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
tool_call_template: str = (
    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
)
tool_calls_template = (
    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
)
tool_output_template: str = "\n<result>{content}</result>"
def to_json(value: Any) -> str:
    """Serialize ``value`` as JSON, keeping non-ASCII characters when possible.

    Falls back to ASCII-escaped output if the unicode-preserving dump fails.
    """
    try:
        return json.dumps(value, ensure_ascii=False)
    except Exception:
        # Retry with ASCII escaping as a best-effort fallback.
        return json.dumps(value, ensure_ascii=True)
def tools_from_openai_format(tools):
    """Strip the OpenAI envelope and return the bare function schemas."""
    return [entry["function"] for entry in tools]
def tool_calls_from_openai_format(tool_calls):
    """Flatten OpenAI-style tool calls to ``{name, arguments}`` dicts."""
    converted = []
    for call in tool_calls:
        fn = call["function"]
        converted.append({"name": fn["name"], "arguments": fn["arguments"]})
    return converted
def tool_calls_to_openai_format(tool_calls):
    """Wrap flat ``{name, arguments}`` dicts in the OpenAI tool-call envelope."""
    wrapped = []
    for call in tool_calls:
        wrapped.append(
            {
                "type": "function",
                "function": {
                    "name": call["name"],
                    "arguments": call["arguments"],
                },
            }
        )
    return wrapped
def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
    """Render a tool call's arguments as newline-joined DSML parameter tags."""
    param_template = (
        '<{dsml_token}parameter name="{key}" string="{is_str}">'
        "{value}</{dsml_token}parameter>"
    )
    raw_args = tool_call["arguments"]
    # Arguments may arrive pre-serialized as a JSON string.
    arguments = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
    rendered = []
    for key, value in arguments.items():
        is_string = isinstance(value, str)
        rendered.append(
            param_template.format(
                dsml_token=dsml_token,
                key=key,
                is_str="true" if is_string else "false",
                value=value if is_string else to_json(value),
            )
        )
    return "\n".join(rendered)
def decode_dsml_to_arguments(
    tool_name: str, tool_args: dict[str, tuple[str, str]]
) -> dict[str, str]:
    """Re-assemble parsed DSML parameters into a JSON ``arguments`` string.

    ``tool_args`` maps parameter name to ``(raw value, "true"/"false")`` where
    the flag says whether the value is string-typed (and so needs quoting).
    """
    entries = []
    for key, (value, is_str) in tool_args.items():
        # String-typed parameters still need JSON quoting/escaping.
        encoded = to_json(value) if is_str == "true" else value
        entries.append(f"{to_json(key)}: {encoded}")
    return dict(name=tool_name, arguments="{" + ", ".join(entries) + "}")
def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
    """Format the tools system-prompt section from the given tool schemas."""
    schemas = "\n".join(to_json(tool) for tool in tools)
    return TOOLS_SYSTEM_TEMPLATE.format(
        tool_schemas=schemas,
        dsml_token=dsml_token,
        thinking_start_token=thinking_start_token,
        thinking_end_token=thinking_end_token,
    )
def find_last_user_index(messages: list[dict[str, Any]]) -> int:
    """Return the index of the last user/developer message, or -1 if none."""
    for back, msg in enumerate(reversed(messages)):
        if msg.get("role") in ("user", "developer"):
            return len(messages) - 1 - back
    return -1
def render_message(
    index: int, messages: list[dict[str, Any]], thinking_mode: str
) -> str:
    """Render ``messages[index]`` into its DeepSeek V3.2 prompt fragment.

    Args:
        index: Position of the message to render (validated against bounds).
        messages: The full conversation; neighbors are inspected for tool
            message bookkeeping and the last-user position.
        thinking_mode: Either "chat" or "thinking".

    Returns:
        The prompt-string fragment for this single message.

    Raises:
        ValueError: On an out-of-range index, invalid thinking_mode, or a
            structurally invalid message sequence.
        NotImplementedError: On an unknown role.
    """
    if not (0 <= index < len(messages)):
        raise ValueError(
            f"Index {index} out of range for messages list of length {len(messages)}"
        )
    if thinking_mode not in ["chat", "thinking"]:
        raise ValueError(f"Invalid thinking_mode `{thinking_mode}`")
    prompt = ""
    msg = messages[index]
    last_user_idx = find_last_user_index(messages)
    role = msg.get("role")
    content = msg.get("content")
    tools = msg.get("tools")
    response_format = msg.get("response_format")
    tool_calls = msg.get("tool_calls")
    reasoning = msg.get("reasoning")
    # `prefix` marks an assistant message used as a generation prefix.
    is_prefix = msg.get("prefix", False)
    if tools:
        tools = tools_from_openai_format(tools)
    if tool_calls:
        tool_calls = tool_calls_from_openai_format(tool_calls)
    if role == "system":
        prompt += system_msg_template.format(content=content or "")
        if tools:
            prompt += "\n\n" + render_tools(tools)
        if response_format:
            prompt += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
    elif role == "developer":
        if not content:
            raise ValueError(f"Invalid message for role `{role}`: {msg}")
        # Developer messages are rendered as a user turn carrying the tool
        # and response-format sections.
        content_developer = ""
        if tools:
            content_developer += "\n\n" + render_tools(tools)
        if response_format:
            content_developer += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
        content_developer += "\n\n# The user's message is: {}".format(content)
        prompt += user_msg_template.format(content=content_developer)
        # Only the final user/developer turn opens a thinking block.
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "user":
        prompt += user_msg_template.format(content=content)
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "tool":
        # Walk back past consecutive tool messages to the assistant turn
        # whose tool_calls produced this result.
        prev_assistant_idx = index - 1
        assistant_msg = messages[prev_assistant_idx]
        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
            prev_assistant_idx -= 1
            assistant_msg = messages[prev_assistant_idx]
        if not (
            index == 0
            or prev_assistant_idx >= 0
            and assistant_msg.get("role") == "assistant"
        ):
            raise ValueError(f"Invalid messages at {index}:\n{assistant_msg}")
        # 1-based position of this result within the assistant's tool calls.
        tool_call_order = index - prev_assistant_idx
        assistant_tool_calls = assistant_msg.get("tool_calls")
        if not (assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order):
            raise ValueError("No tool calls but found tool output")
        # Open the results block on the first result, close it on the last.
        if tool_call_order == 1:
            prompt += "\n\n<function_results>"
        prompt += tool_output_template.format(content=content)
        if tool_call_order == len(assistant_tool_calls):
            prompt += "\n</function_results>"
            if index >= last_user_idx and thinking_mode == "thinking":
                prompt += "\n\n" + thinking_start_token
            else:
                prompt += "\n\n" + thinking_end_token
    elif role == "assistant":
        # NOTE(review): this assignment appears unused in this branch.
        prev_assistant_idx = index
        thinking_part = ""
        tool_calls_content = ""
        if tool_calls:
            tool_calls = [
                tool_call_template.format(
                    dsml_token=dsml_token,
                    name=tool_call.get("name"),
                    arguments=encode_arguments_to_dsml(tool_call),
                )
                for tool_call in tool_calls
            ]
            tool_calls_content += "\n\n" + tool_calls_template.format(
                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
            )
        summary_content = content or ""
        # Reasoning is only emitted for assistant turns after the last user
        # message when thinking mode is enabled.
        if thinking_mode == "thinking" and index > last_user_idx:
            if not (reasoning or tool_calls):
                raise ValueError(
                    f"ThinkingMode: {thinking_mode}, invalid message without reasoning/tool_calls `{msg}` after last user message"
                )
            thinking_part = (
                thinking_template.format(reasoning=reasoning or "") + thinking_end_token
            )
        if not tool_calls and is_prefix:
            # A prefix message is emitted verbatim without the EOS wrapper.
            prompt += summary_content
        else:
            prompt += assistant_msg_template.format(
                reasoning=thinking_part,
                content=summary_content,
                tool_calls=tool_calls_content,
            )
    else:
        raise NotImplementedError(f"Unknown role: {role}")
    return prompt
def drop_thinking_messages(
messages: list[dict[str, Any]], last_user_idx: int | None = None
) -> list[dict[str, Any]]:
messages_wo_thinking: list[dict[str, Any]] = []
last_user_idx = (
find_last_user_index(messages) if last_user_idx is None else last_user_idx
)
for idx, msg in enumerate(messages):
role = msg.get("role")
if role in ["user", "system", "tool"] or idx >= last_user_idx:
messages_wo_thinking.append(msg)
continue
elif role == "assistant":
msg_wo_thinking = copy.copy(msg)
msg_wo_thinking.pop("reasoning", None)
messages_wo_thinking.append(msg_wo_thinking)
return messages_wo_thinking
def encode_messages(
    messages: list[dict[str, Any]],
    thinking_mode: str,
    context: list[dict[str, Any]] | None = None,
    drop_thinking: bool = True,
    add_default_bos_token: bool = True,
) -> str:
    """Render a conversation into the DeepSeek V3.2 prompt string.

    ``context`` holds already-rendered history: its messages participate in
    rendering decisions but only ``messages`` are emitted. The BOS token is
    prepended only when there is no prior context.
    """
    context = context or []
    full_messages = context + messages
    if thinking_mode == "thinking" and drop_thinking:
        full_messages = drop_thinking_messages(full_messages)
    pieces: list[str] = []
    if add_default_bos_token and not context:
        pieces.append(bos_token)
    offset = len(context)
    for idx in range(len(messages)):
        pieces.append(
            render_message(offset + idx, full_messages, thinking_mode=thinking_mode)
        )
    return "".join(pieces)
def _read_until_stop(
index: int, text: str, stop: list[str]
) -> tuple[int, str, None | str]:
min_pos = len(text)
matched_stop = None
for s in stop:
pos = text.find(s, index)
if pos != -1 and pos < min_pos:
min_pos = pos
matched_stop = s
if matched_stop:
content = text[index:min_pos]
return min_pos + len(matched_stop), content, matched_stop
else:
content = text[index:]
return len(text), content, None
def parse_tool_calls(index: int, text: str):
    """Parse a DSML function-calls block starting at ``index``.

    ``index`` must point just past the ``<{dsml_token}function_calls``
    opening tag. Returns ``(index, stop_token, tool_calls)`` where ``index``
    points past the block's closing tag and each tool call is a
    ``{"name": ..., "arguments": <json string>}`` dict.

    Raises:
        RuntimeError: If the text deviates from the expected DSML format.
    """
    tool_calls: list[dict[str, Any]] = []
    stop_token = None
    tool_calls_end_token = f"</{dsml_token}function_calls>"
    while index < len(text):
        # Advance to the next invoke or the block end; the consumed text must
        # be exactly the ">\n" that closed the preceding tag.
        index, _, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
        )
        if _ != ">\n":
            raise RuntimeError("Tool call format error")
        if stop_token == tool_calls_end_token:
            break
        if stop_token is None:
            raise RuntimeError("Missing special token")
        # Read the invoke header, e.g. ` name="tool">\n`.
        index, tool_name_content, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
        )
        p_tool_name = re.findall(
            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
        )
        if len(p_tool_name) != 1:
            raise RuntimeError("Tool name format error")
        tool_name = p_tool_name[0]
        tool_args: dict[str, tuple[str, str]] = {}
        # Collect parameter tags until the invoke closes. Each value is kept
        # with its string="true|false" attribute for later JSON re-encoding.
        while stop_token == f"<{dsml_token}parameter":
            index, param_content, stop_token = _read_until_stop(
                index, text, [f"/{dsml_token}parameter"]
            )
            param_kv = re.findall(
                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
                param_content,
                flags=re.DOTALL,
            )
            if len(param_kv) != 1:
                raise RuntimeError("Parameter format error")
            param_name, string, param_value = param_kv[0]
            if param_name in tool_args:
                raise RuntimeError("Duplicate parameter name")
            tool_args[param_name] = (param_value, string)
            index, content, stop_token = _read_until_stop(
                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
            )
            if content != ">\n":
                raise RuntimeError("Parameter format error")
        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
        tool_calls.append(tool_call)
    return index, stop_token, tool_calls
# NOTE: This function is designed to parse only correctly
# formatted string and will not attempt to correct malformed output
# that may be generated by the model.
def parse_message_from_completion_text(text: str, thinking_mode: str):
    """Parse a raw completion into an OpenAI-style assistant message.

    Splits ``text`` into reasoning (only when ``thinking_mode == "thinking"``),
    summary content, and optional DSML tool calls.

    Raises:
        RuntimeError: If the completion violates the expected format or leaks
            special tokens into the parsed content.
    """
    summary_content, reasoning, tool_calls = "", "", []
    index, stop_token = 0, None
    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
    is_thinking, is_tool_calling = thinking_mode == "thinking", False
    if is_thinking:
        # The reasoning block must be terminated by the thinking end token
        # before anything else (tool calls may not interrupt it).
        index, content_delta, stop_token = _read_until_stop(
            index, text, [thinking_end_token, tool_calls_start_token]
        )
        reasoning = content_delta
        if stop_token != thinking_end_token:
            raise RuntimeError("Invalid thinking format")
    # Summary runs until EOS or the start of a tool-calls block.
    index, content_delta, stop_token = _read_until_stop(
        index, text, [eos_token, tool_calls_start_token]
    )
    summary_content = content_delta
    if stop_token == tool_calls_start_token:
        is_tool_calling = True
    else:
        if stop_token != eos_token:
            raise RuntimeError("Invalid summary format")
    if is_tool_calling:
        index, stop_token, tool_calls = parse_tool_calls(index, text)
        # Nothing but the EOS token may follow the tool-call block.
        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
        if tool_ends_text:
            raise RuntimeError("Unexpected content after tool calls")
    if not (len(text) == index and stop_token in [eos_token, None]):
        raise RuntimeError("Unexpected content at end")
    # Special tokens must never leak into the parsed content.
    for sp_token in [
        bos_token,
        eos_token,
        thinking_start_token,
        thinking_end_token,
        dsml_token,
    ]:
        if sp_token in summary_content or sp_token in reasoning:
            raise RuntimeError("Unexpected special token in content")
    return {
        "role": "assistant",
        "content": summary_content,
        "reasoning": reasoning,
        "tool_calls": tool_calls_to_openai_format(tool_calls),
    }

View File

@@ -0,0 +1,202 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.tokenizers import TokenizerLike
def _replace_none_with_empty(tokens: list[str | None]):
for i, token in enumerate(tokens):
if token is None:
tokens[i] = ""
def _convert_tokens_to_string_with_added_encoders(
    tokenizer: TokenizerLike,
    output_tokens: list[str],
    skip_special_tokens: bool,
    spaces_between_special_tokens: bool,
) -> str:
    """Join tokens to text, splicing added-vocabulary tokens in verbatim.

    Added tokens cannot go through ``convert_tokens_to_string`` together with
    regular tokens, so runs of regular tokens are converted separately and
    the added tokens are inserted between them unchanged.

    Adapted from
    https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
    """
    # Localize lookups: this loop runs once per generated token.
    to_string = tokenizer.convert_tokens_to_string
    added_tokens = set(tokenizer.get_added_vocab())
    specials = set(tokenizer.all_special_tokens) if skip_special_tokens else ()
    pieces: list[str] = []
    pending: list[str] = []
    for token in output_tokens:
        if token in specials:
            continue
        if token in added_tokens:
            # Flush the pending run of regular tokens, then emit the added
            # token as-is.
            if pending:
                pieces.append(to_string(pending))
                pending = []
            pieces.append(token)
        else:
            pending.append(token)
    if pending:
        pieces.append(to_string(pending))
    separator = " " if spaces_between_special_tokens else ""
    return separator.join(pieces)
# Number of tail tokens re-converted on each incremental detokenization step.
# 5 is an arbitrary value that should work for all
# tokenizers (bigger = more conservative).
INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
def convert_prompt_ids_to_tokens(
    tokenizer: TokenizerLike,
    prompt_ids: list[int],
    skip_special_tokens: bool = False,
) -> tuple[list[str], int, int]:
    """Convert the tail of the prompt ids for incremental detokenization.

    Only the last few ids are converted to token strings — earlier tokens are
    never needed to detokenize newly generated output. Returns
    ``(tokens, prefix_offset, read_offset)``.
    """
    # Take a couple of extra ids beyond the offset in case the tail contains
    # special tokens that get skipped.
    tail = prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2 :]
    new_tokens = tokenizer.convert_ids_to_tokens(
        tail, skip_special_tokens=skip_special_tokens
    )
    read_offset = len(new_tokens)
    prefix_offset = max(read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
    # Out-of-vocab prompt ids may map to None; normalize them to "".
    _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    return new_tokens, prefix_offset, read_offset
def convert_ids_list_to_tokens(
    tokenizer: TokenizerLike,
    token_ids: list[int],
) -> list[str]:
    """Detokenize the input ids individually.

    Args:
        tokenizer: tokenizer used by model under test
        token_ids: convert these tokens (Python list form)

    Returns:
        Python list of token string representations
    """
    # Decode one id at a time (default skip_special_tokens behavior), and
    # guard against a tokenizer returning None for an unknown id.
    decoded = (tokenizer.decode([token_id]) for token_id in token_ids)
    return [text if text is not None else "" for text in decoded]
# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: TokenizerLike,
    all_input_ids: list[int],
    prev_tokens: list[str] | None,
    prefix_offset: int,
    read_offset: int,
    skip_special_tokens: bool = False,
    spaces_between_special_tokens: bool = True,
) -> tuple[list[str], str, int, int]:
    """Detokenizes the input ids incrementally and returns the new tokens
    and the new text.

    If `prev_tokens` is None, this function will convert the input ids to
    tokens and return the tokens and the new text. Otherwise, it will return the
    new tokens and the new text.

    This function will also return the new prefix offset and the new read
    offset to be used in the next iteration.

    The offsets are necessary to defeat cleanup algorithms in the decode which
    decide to add a space or not depending on the surrounding ids.

    Args:
        tokenizer: The tokenizer to use.
        all_input_ids: The input ids. The last id is the new token id.
        prev_tokens: The previous tokens. If None, this function will convert
            the input ids to tokens and return the tokens and the new text.
        prefix_offset: The prefix offset.
        read_offset: The read offset.
        skip_special_tokens: Whether to skip special tokens.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens.
    """
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    is_first_iter = prev_tokens is None
    if is_first_iter:
        (prev_tokens, prefix_offset, read_offset) = convert_prompt_ids_to_tokens(
            tokenizer, all_input_ids[:-1], skip_special_tokens=skip_special_tokens
        )
    assert prev_tokens is not None
    # If the new token id is out of bounds, return an empty string.
    if 0 <= new_token_id < len(tokenizer):
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens
        )
        if isinstance(new_tokens, str):
            new_tokens = [new_tokens]
        else:
            # This is required to guard against out-of-vocab prompt token ids
            # (for example when using dummy weights)
            _replace_none_with_empty(new_tokens)  # type: ignore[arg-type]
    else:
        new_tokens = [""]
    output_tokens = prev_tokens + new_tokens
    # If this is the first iteration, return all tokens.
    if is_first_iter:
        new_tokens = output_tokens
    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset]
        )
        new_text = tokenizer.convert_tokens_to_string(output_tokens[prefix_offset:])
    else:
        # Slow tokenizers with added vocab need the split-and-join path.
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
    # NOTE(review): "<EFBFBD>" looks like a mangled U+FFFD replacement character
    # (UTF-8 bytes EF BF BD) from the upstream source — confirm against the
    # original file before relying on this literal.
    if len(new_text) <= len(prefix_text) or new_text.endswith("<EFBFBD>"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        return new_tokens, "", prefix_offset, read_offset
    new_text = new_text[len(prefix_text) :]
    return new_tokens, new_text, read_offset, len(output_tokens)

450
vllm/tokenizers/grok2.py Normal file
View File

@@ -0,0 +1,450 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tokenizer for Grok-2 .tok.json format."""
import functools
import json
from collections.abc import Collection, Set
from pathlib import Path
from typing import Any, Literal, overload
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
RepositoryNotFoundError,
RevisionNotFoundError,
)
from transformers import BatchEncoding
from transformers.utils import chat_template_utils as hf_chat_utils
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.logger import init_logger
from .protocol import TokenizerLike
logger = init_logger(__name__)

# Canonical special tokens of the Grok-2 vocabulary.
PAD = "<|pad|>"
EOS = "<|eos|>"
SEP = "<|separator|>"
# Reserved / control token texts that must always be encodable as specials.
RESERVED_TOKEN_TEXTS = [f"<|reserved_{i}|>" for i in range(3, 128)]
CONTROL_TOKEN_TEXTS = [f"<|control{i}|>" for i in range(1, 705)]
DEFAULT_SPECIAL_TOKENS = [PAD, SEP, EOS]
DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": SEP, "eos": EOS}
# Fallback Jinja chat template used when the repo ships none.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ 'Human: ' + message['content'].strip() + '<|separator|>\\n\\n' }}"
    "{% elif message['role'] == 'system' %}"
    "{{ 'System: ' + message['content'].strip() + '<|separator|>\\n\\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ 'Assistant: ' + message['content'] + '<|separator|>\\n\\n' }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ 'Assistant:' }}"
    "{% endif %}"
)
# Word-split regex for the tiktoken encoding ("V1" word_split).
# Default + separate each single digit.
PAT_STR_B = (
    r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}|"""
    r""" ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
)
def _maybe_load_tokenizer_config(
    model_path: Path,
    *,
    repo_id: str | None,
    revision: str | None,
    download_dir: str | None,
) -> dict[str, Any]:
    """Best-effort load of tokenizer_config.json from disk or the HF Hub.

    Tries ``model_path`` first; otherwise downloads from ``repo_id`` when
    given. Every failure mode (missing file/repo/revision, HTTP error, bad
    JSON, unreadable file) degrades to an empty dict so the caller falls back
    to the default chat template.
    """
    config_path = model_path / "tokenizer_config.json"
    if config_path.is_file():
        with config_path.open("r", encoding="utf-8") as f:
            return json.load(f)
    if repo_id is None:
        # Local-only tokenizer without a config file alongside it.
        return {}
    try:
        config_file = hf_hub_download(
            repo_id=repo_id,
            filename="tokenizer_config.json",
            revision=revision,
            cache_dir=download_dir,
        )
    except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError):
        # If the repo, revision, or file does not exist, fall back silently.
        return {}
    except HfHubHTTPError as exc:
        logger.warning(
            "Failed to download tokenizer_config.json from %s. "
            "This may be due to a network or authentication issue. "
            "The default chat template will be used. Error: %s",
            repo_id,
            exc,
        )
        return {}
    try:
        with Path(config_file).open("r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as exc:
        logger.warning(
            "Failed to parse tokenizer_config.json. "
            "The default chat template will be used. Error: %s",
            exc,
        )
        return {}
    except OSError as exc:
        logger.warning(
            "Failed to open tokenizer_config.json. "
            "The default chat template will be used. Error: %s",
            exc,
        )
        return {}
def _load_tiktoken_encoding(
    vocab_file: Path,
) -> tuple[Any, dict[str, int]]:
    """Build a tiktoken ``Encoding`` from a Grok-2 .tok.json vocabulary.

    Returns the encoding (with a patched ``encode``) and its special-token
    mapping. The patch makes the default allowed special tokens always
    permitted and disables the "disallowed special" errors.
    """
    try:
        import tiktoken
    except ImportError as exc:
        raise ImportError("Grok-2 tokenizer requires the `tiktoken` package.") from exc
    with vocab_file.open("rb") as f:
        xtok_dict = json.load(f)
    # Regular tokens are stored as byte lists; special tokens additionally
    # get decoded to text (they are looked up by string).
    mergeable_ranks = {
        bytes(item["bytes"]): item["token"]
        for item in xtok_dict.get("regular_tokens", [])
    }
    special_tokens = {
        bytes(item["bytes"]).decode("utf-8", errors="replace"): item["token"]
        for item in xtok_dict.get("special_tokens", [])
    }
    if xtok_dict.get("word_split") == "V1":
        pat_str = PAT_STR_B
    else:
        raise ValueError(f"Unknown word_split: {xtok_dict.get('word_split')!r}")
    # An explicit pattern in the vocab file overrides the default.
    pat_str = xtok_dict.get("pat_str", pat_str)
    kwargs = {
        "name": str(vocab_file),
        "pat_str": pat_str,
        "mergeable_ranks": mergeable_ranks,
        "special_tokens": special_tokens,
    }
    if "vocab_size" in xtok_dict:
        kwargs["explicit_n_vocab"] = xtok_dict["vocab_size"]
    tokenizer = tiktoken.Encoding(**kwargs)
    default_allowed_special: set[str] | None = None
    if "default_allowed_special" in xtok_dict:
        default_allowed_special = {
            bytes(bytes_list).decode("utf-8", errors="replace")
            for bytes_list in xtok_dict["default_allowed_special"]
        }
    # Stash extra state on the Encoding instance for the patched encode below.
    tokenizer._default_allowed_special = default_allowed_special or set()
    tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS

    def encode_patched(
        self,
        text: str,
        *,
        allowed_special: Literal["all"] | Set[str] = set(),
        disallowed_special: Literal["all"] | Collection[str] = "all",
    ) -> list[int]:
        # Always permit the default special tokens and never raise on other
        # special text (disallowed_special is deliberately ignored).
        del disallowed_special
        if isinstance(allowed_special, set):
            allowed_special |= self._default_allowed_special
        return tiktoken.Encoding.encode(
            self,
            text,
            allowed_special=allowed_special,
            disallowed_special=(),
        )

    # Bind the patched encode to this specific Encoding instance.
    tokenizer.encode = functools.partial(encode_patched, tokenizer)
    tokenizer._default_allowed_special |= set(DEFAULT_CONTROL_TOKENS.values())
    tokenizer._default_allowed_special |= set(
        CONTROL_TOKEN_TEXTS + RESERVED_TOKEN_TEXTS
    )
    return tokenizer, special_tokens
class Grok2Tokenizer(TokenizerLike):
@classmethod
def from_pretrained(
    cls,
    path_or_repo_id: str | Path,
    *args,
    trust_remote_code: bool = False,
    revision: str | None = None,
    download_dir: str | None = None,
    **kwargs,
) -> "Grok2Tokenizer":
    """Load a Grok-2 tokenizer from a local file/dir or a Hugging Face repo.

    Resolution order: an explicit file path, a directory containing
    tokenizer.tok.json, then a hub download using ``path_or_repo_id`` as the
    repo id. ``trust_remote_code`` is accepted for interface parity only.
    """
    if args:
        logger.debug_once("Ignoring extra positional args for Grok2Tokenizer.")
    path = Path(path_or_repo_id)
    if path.is_file():
        vocab_file = path
        model_path = path.parent
        repo_id = None
    elif path.is_dir():
        vocab_file = path / "tokenizer.tok.json"
        model_path = path
        repo_id = None
    else:
        # Not on disk: treat the argument as a hub repo id and download.
        vocab_file = Path(
            hf_hub_download(
                repo_id=str(path_or_repo_id),
                filename="tokenizer.tok.json",
                revision=revision,
                cache_dir=download_dir,
            )
        )
        model_path = vocab_file.parent
        repo_id = str(path_or_repo_id)
    if not vocab_file.is_file():
        raise FileNotFoundError(f"tokenizer.tok.json not found at {vocab_file}.")
    # Optional tokenizer_config.json provides the chat template, if any.
    config = _maybe_load_tokenizer_config(
        model_path,
        repo_id=repo_id,
        revision=revision,
        download_dir=download_dir,
    )
    return cls(
        vocab_file=vocab_file,
        name_or_path=str(path_or_repo_id),
        truncation_side=kwargs.get("truncation_side", "left"),
        chat_template=config.get("chat_template"),
        init_kwargs=config,
    )
def __init__(
    self,
    *,
    vocab_file: Path,
    name_or_path: str,
    truncation_side: str,
    chat_template: str | None,
    init_kwargs: dict[str, Any] | None = None,
) -> None:
    """Build the tokenizer from a Grok-2 ``.tok.json`` vocab file.

    Args:
        vocab_file: Path to the tokenizer.tok.json file.
        name_or_path: Identifier reported back to callers.
        truncation_side: "left" or "right" truncation behavior.
        chat_template: Jinja template; falls back to DEFAULT_CHAT_TEMPLATE.
        init_kwargs: Raw tokenizer_config.json contents, if any.
    """
    super().__init__()
    self.name_or_path = name_or_path
    self._truncation_side = truncation_side
    self.init_kwargs = init_kwargs or {}
    self._chat_template = chat_template or DEFAULT_CHAT_TEMPLATE
    self._tokenizer, self._special_tokens = _load_tiktoken_encoding(vocab_file)
    # Bidirectional token<->id maps covering regular and special tokens.
    self._token_to_id: dict[str, int] = {}
    self._id_to_token: dict[int, str] = {}
    for token, token_id in self._tokenizer._mergeable_ranks.items():
        token_str = token.decode("utf-8", errors="replace")
        self._token_to_id[token_str] = token_id
        self._id_to_token[token_id] = token_str
    for token, token_id in self._special_tokens.items():
        self._token_to_id[token] = token_id
        self._id_to_token[token_id] = token
    # BOS preference order: SEP, then PAD, then EOS, then id 0.
    bos_token_id = self._special_tokens.get(SEP)
    if bos_token_id is None:
        bos_token_id = self._special_tokens.get(PAD)
    if bos_token_id is None:
        bos_token_id = self._special_tokens.get(EOS)
    if bos_token_id is None:
        bos_token_id = 0
    self._bos_token_id = bos_token_id
    self._eos_token_id = self._special_tokens.get(EOS, self._bos_token_id)
    self._pad_token_id = self._special_tokens.get(PAD, self._eos_token_id)
    # PAD doubles as the unknown-token id for lookups that miss.
    self._unk_token_id = self._pad_token_id
    self._max_chars_per_token = max(len(tok) for tok in self._token_to_id)
def num_special_tokens_to_add(self) -> int:
    """Grok-2 encoding never prepends or appends special tokens."""
    return 0
@property
def all_special_tokens(self) -> list[str]:
    """Text of every special token in the vocabulary."""
    return list(self._special_tokens.keys())
@property
def all_special_ids(self) -> list[int]:
    """Id of every special token in the vocabulary."""
    return list(self._special_tokens.values())
@property
def bos_token_id(self) -> int:
    """Beginning-of-sequence token id (resolved in ``__init__``)."""
    return self._bos_token_id
@property
def eos_token_id(self) -> int:
    """End-of-sequence token id."""
    return self._eos_token_id
@property
def pad_token_id(self) -> int:
    """Padding token id."""
    return self._pad_token_id
@property
def is_fast(self) -> bool:
    """Not a HF "fast" (Rust-backed) tokenizer."""
    return False
@property
def vocab_size(self) -> int:
    """Total vocabulary size reported by the tiktoken encoding."""
    return self._tokenizer.n_vocab
@property
def max_token_id(self) -> int:
    """Largest valid token id."""
    return self._tokenizer.n_vocab - 1
@property
def max_chars_per_token(self) -> int:
    """Length (in characters) of the longest token string."""
    return self._max_chars_per_token
@property
def truncation_side(self) -> str:
    """Which side ("left"/"right") is trimmed when truncating."""
    return self._truncation_side
def get_vocab(self) -> dict[str, int]:
    """Full token->id mapping (copy; includes special tokens)."""
    return dict(self._token_to_id)
def get_added_vocab(self) -> dict[str, int]:
    """Special tokens exposed as the "added" vocabulary (copy)."""
    return dict(self._special_tokens)
def _maybe_truncate(self, tokens: list[int], max_length: int | None) -> list[int]:
    """Trim ``tokens`` to ``max_length``, honoring the truncation side."""
    if max_length is None or len(tokens) <= max_length:
        return tokens
    # "left" keeps the tail of the sequence; anything else keeps the head.
    if self.truncation_side == "left":
        return tokens[-max_length:]
    return tokens[:max_length]
def encode(
    self,
    text: str,
    truncation: bool | None = None,
    max_length: int | None = None,
    add_special_tokens: bool = True,
) -> list[int]:
    """Tokenize ``text`` into ids, optionally truncating to ``max_length``."""
    del add_special_tokens  # accepted for interface parity; unused here
    token_ids = self._tokenizer.encode(text)
    if not truncation:
        return token_ids
    return self._maybe_truncate(token_ids, max_length)
def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
if isinstance(ids, int):
ids = [ids]
if skip_special_tokens:
ids = [
token_id
for token_id in ids
if token_id not in self._special_tokens.values()
]
return self._tokenizer.decode(ids)
@overload
def convert_tokens_to_ids(self, tokens: str) -> int: ...
@overload
def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
if isinstance(tokens, str):
return self._token_to_id.get(tokens, self._unk_token_id)
return [self._token_to_id.get(token, self._unk_token_id) for token in tokens]
def convert_ids_to_tokens(
self, ids: list[int], skip_special_tokens: bool = False
) -> list[str]:
tokens = []
for token_id in ids:
if skip_special_tokens and token_id in self._special_tokens.values():
continue
tokens.append(self._id_to_token.get(token_id, "<|unk|>"))
return tokens
def convert_tokens_to_string(self, tokens: list[str]) -> str:
token_ids = self.convert_tokens_to_ids(tokens)
return self.decode(token_ids, skip_special_tokens=False)
def __call__(
self,
text: str | list[str],
text_pair: str | None = None,
add_special_tokens: bool = True,
truncation: bool = False,
max_length: int | None = None,
) -> BatchEncoding:
if text_pair is not None:
raise NotImplementedError("text_pair is not supported for Grok2Tokenizer.")
if isinstance(text, list):
input_ids_batch: list[list[int]] = [
self.encode(
item,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
)
for item in text
]
attention_mask_batch = [[1] * len(ids) for ids in input_ids_batch]
return BatchEncoding(
{"input_ids": input_ids_batch, "attention_mask": attention_mask_batch}
)
input_ids = self.encode(
text,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
)
attention_mask = [1] * len(input_ids)
return BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask})
def get_chat_template(
self, chat_template: str | None, tools: list[dict[str, Any]] | None = None
) -> str | None:
del tools
return chat_template or self._chat_template
def apply_chat_template(
self,
messages: list[ChatCompletionMessageParam],
tools: list[dict[str, Any]] | None = None,
chat_template: str | None = None,
tokenize: bool = False,
**kwargs,
) -> str | list[int]:
template = self.get_chat_template(chat_template, tools=tools)
if template is None:
raise ValueError(
"No chat template available. Provide `chat_template` explicitly."
)
kwargs["return_dict"] = False
prompt = hf_chat_utils.apply_chat_template(
conversation=messages,
chat_template=template,
tools=tools,
**kwargs,
)
if tokenize:
return self.encode(prompt, add_special_tokens=False)
return prompt

125
vllm/tokenizers/hf.py Normal file
View File

@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import copy
from pathlib import Path
from typing import TypeAlias
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
from .protocol import TokenizerLike
HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
    """
    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown.
    This proxy caches these properties for faster access.
    """
    # Shallow-copy so the original tokenizer object is left untouched; the
    # copy's class is swapped to the caching subclass below.
    cached_tokenizer = copy.copy(tokenizer)
    # Snapshot the expensive-to-recompute values once, up front.
    tokenizer_all_special_ids = tokenizer.all_special_ids
    tokenizer_all_special_tokens = tokenizer.all_special_tokens
    tokenizer_vocab = tokenizer.get_vocab()
    tokenizer_len = len(tokenizer)
    max_token_id = max(tokenizer_vocab.values())
    max_chars_per_token = max(len(tok) for tok in tokenizer_vocab)
    # Some tokenizers (e.g., QwenTokenizer) have special tokens that
    # are added and included in the implementation of the vocab_size
    # property, but not in get_vocab(); if there is an implementation
    # of vocab size, we should take the greater value.
    if hasattr(tokenizer, "vocab_size"):
        with contextlib.suppress(NotImplementedError):
            max_token_id = max(max_token_id, tokenizer.vocab_size)
    # Dynamic subclass of the concrete tokenizer class: overrides the cached
    # properties with the snapshots captured above via closure.
    class CachedTokenizer(tokenizer.__class__):  # type: ignore
        @property
        def all_special_ids(self) -> list[int]:
            return tokenizer_all_special_ids
        @property
        def all_special_tokens(self) -> list[str]:
            return tokenizer_all_special_tokens
        @property
        def max_token_id(self) -> int:
            return max_token_id
        @property
        def max_chars_per_token(self) -> int:
            return max_chars_per_token
        def get_vocab(self) -> dict[str, int]:
            return tokenizer_vocab
        def __len__(self) -> int:
            return tokenizer_len
        def __reduce__(self):
            # Pickle the *original* tokenizer and re-wrap on unpickle, since
            # the dynamic class itself is not importable.
            return get_cached_tokenizer, (tokenizer,)
    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
    cached_tokenizer.__class__ = CachedTokenizer
    return cached_tokenizer
class CachedHfTokenizer(TokenizerLike):
    """Loader that wraps HF ``AutoTokenizer`` results in the caching proxy."""

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> HfTokenizer:
        """Load a tokenizer via ``AutoTokenizer`` and return a cached proxy.

        Raises:
            RuntimeError: if loading fails in a way that suggests
                ``trust_remote_code=True`` is required.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                path_or_repo_id,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                cache_dir=download_dir,
                **kwargs,
            )
        except ValueError as e:
            # If the error pertains to the tokenizer class not existing or not
            # currently being imported,
            # suggest using the --trust-remote-code flag.
            if not trust_remote_code and (
                "does not exist or is not currently imported." in str(e)
                or "requires you to execute the tokenizer file" in str(e)
            ):
                err_msg = (
                    "Failed to load the tokenizer. If the tokenizer "
                    "is a custom tokenizer not yet available in the "
                    "HuggingFace transformers library, consider "
                    "setting `trust_remote_code=True` in LLM or using "
                    "the `--trust-remote-code` flag in the CLI."
                )
                raise RuntimeError(err_msg) from e
            else:
                raise e
        # The special_tokens in tokenizer should also be
        # controlled by do_lower_case in encoder_config
        encoder_config = get_sentence_transformer_tokenizer_config(
            path_or_repo_id, revision
        )
        if isinstance(encoder_config, dict) and encoder_config.get(
            "do_lower_case", False
        ):
            special_tokens_map = {
                k: v.lower() for k, v in tokenizer.special_tokens_map.items()
            }
            tokenizer.add_special_tokens(special_tokens_map)
        return get_cached_tokenizer(tokenizer)

553
vllm/tokenizers/mistral.py Normal file
View File

@@ -0,0 +1,553 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast, overload
from mistral_common.protocol.instruct.request import (
ChatCompletionRequest as MistralChatCompletionRequest,
)
from mistral_common.protocol.instruct.tool_calls import Function, Tool
from mistral_common.protocol.instruct.validator import ValidationMode
from mistral_common.tokens.tokenizers.base import (
SpecialTokenPolicy,
SpecialTokens,
)
from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer,
)
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from pydantic import ValidationError
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.logger import init_logger
from .protocol import TokenizerLike
if TYPE_CHECKING:
from transformers import BatchEncoding
try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
except ImportError:
# Transformers v4
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as MistralCommonBackend,
)
logger = init_logger(__name__)
def maybe_serialize_tool_calls(request: "MistralChatCompletionRequest"):
    """Force-deserialize assistant ``tool_calls`` in place on the request.

    Consumes pydantic's lazy validator iterators so assistant messages carry
    concrete tool-call lists; see the NOTE below for the upstream bug.
    """
    # SEE: https://github.com/vllm-project/vllm/pull/9951
    # Credits go to: @gcalmettes
    # NOTE: There is currently a bug in pydantic where attributes
    # declared as iterables are replaced in in the instances by
    # pydantic-core ValidatorIterator instance. In particular, this
    # affects tool_calls defined in ChatCompletionAssistantMessageParam
    # model:
    # see:
    #  - https://github.com/pydantic/pydantic/issues/9467
    # As a result, tool_calls from assistant messages are never
    # deserialized in the request object if the tool_calls iterator is
    # not consumed. This affect messages passed to the MistralTokenizer
    # since no chat template is applied and therefore the tools_calls
    # iterator is not directly consumed.
    # Issue is tracked on Pydantic side, with resolution planned for
    # v2.11 release. In the meantime, the official workaround is to
    # consume the iterator so the tool_calls are correctly deserialized
    # in the OpenAI ChatCompletionAssistantMessageParam object
    # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501
    # Official Pydantic Issues:
    #  - https://github.com/pydantic/pydantic/issues/9541
    # TODO: remove when pydantic v2.11 is released
    for i, message in enumerate(request.messages):
        if message.get("role") == "assistant":
            if (tool_calls_validator := message.get("tool_calls", None)) is not None:
                try:
                    # Consuming the iterator triggers (and surfaces) validation.
                    validated_tool_calls = list(tool_calls_validator)
                except ValidationError as e:
                    raise ValueError(
                        "Validating messages' `tool_calls` raised an error. "
                        "Please ensure `tool_calls` are iterable of tool calls."
                    ) from e
            else:
                validated_tool_calls = []
            request.messages[i]["tool_calls"] = validated_tool_calls
def truncate_tool_call_ids(request: "MistralChatCompletionRequest"):
    """Truncates tool call IDs for Mistral's ID requirements."""
    # Mistral accepts at most 9 characters; keep the trailing 9 so ids
    # remain distinguishable, and warn whenever truncation happens.
    def _shorten(identifier: str, label: str) -> str:
        if len(identifier) <= 9:
            return identifier
        logger.warning(
            "Truncating %s: %s to %s", label, identifier, identifier[-9:]
        )
        return identifier[-9:]

    for i, message in enumerate(request.messages):
        role = message.get("role")
        if role == "assistant":
            tool_calls = message.get("tool_calls", [])
            for tool_call in tool_calls:
                tool_call["id"] = _shorten(tool_call["id"], "tool call ID")
            request.messages[i]["tool_calls"] = tool_calls
        elif role in {"tool_results", "tool"} and "tool_call_id" in message:
            request.messages[i]["tool_call_id"] = _shorten(
                message["tool_call_id"], "tool_call_id"
            )
def _prepare_apply_chat_template_tools_and_messages(
    messages: list["ChatCompletionMessageParam"],
    tools: list[dict[str, Any]] | None = None,
    continue_final_message: bool = False,
    add_generation_prompt: bool = False,
) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
    """Validate and normalize messages/tools before a mistral-common render.

    Mutates `messages` and `tools` in place (dropping unsupported keys) and
    returns them for convenience.

    Raises:
        ValueError: on inconsistent prompt-continuation flags or
            non-function tools.
    """
    if add_generation_prompt and continue_final_message:
        raise ValueError(
            "Cannot set both `add_generation_prompt` and "
            "`continue_final_message` to True."
        )
    last_message = cast(dict[str, Any], messages[-1])
    # add_generation_prompt is directly handled by the tokenizer but we
    # check if the user is trying to use it with a final assistant message
    # which is probably not what they want.
    # If add_generation_prompt is False, we don't need to check anything.
    if add_generation_prompt and last_message["role"] == "assistant":
        raise ValueError(
            "Cannot set `add_generation_prompt` to True when "
            "the last message is from the assistant. Consider "
            "using `continue_final_message` instead."
        )
    if continue_final_message and last_message["role"] != "assistant":
        raise ValueError(
            "Cannot set `continue_final_message` to True when "
            "the last message is not from the assistant."
        )
    # mistral-common requires AssistantMessage content to be string [1].
    #
    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
    for message in messages:
        # Remove reasoning as unsupported by Mistral
        _ = message.pop("reasoning", None)  # type: ignore
    # The Mistral client, in comparison to the OpenAI client, requires the
    # "parameters" dict and the "description" string to be present
    # even if they are empty.
    if tools:
        for function in [
            tool["function"] for tool in tools if tool["type"] == "function"
        ]:
            if function.get("parameters") is None:
                function["parameters"] = {}
            if function.get("description") is None:
                function["description"] = ""
        # We filter not supported arguments to avoid throwing an error.
        # TODO(juliendenize): remove this once OpenAI API is better supported by
        # `mistral-common`.
        tools_fields = set(Tool.model_fields.keys())
        function_fields = set(Function.model_fields.keys())
        for tool in tools:
            # Iterate over a snapshot of the keys since we pop while looping.
            tool_keys = list(tool.keys())
            for tool_key in tool_keys:
                if tool_key not in tools_fields:
                    tool.pop(tool_key)
                    logger.warning_once(
                        f"'{tool_key}' is not supported by mistral-common for tools. "
                        "It has been poped from the tool definition."
                    )
            if tool["type"] == "function":
                function_keys = list(tool["function"].keys())
                for function_key in function_keys:
                    if function_key not in function_fields:
                        tool["function"].pop(function_key)
                        logger.warning_once(
                            f"'{function_key}' is not supported by mistral-common "
                            "for function tools. It has been poped from the "
                            "function definition."
                        )
            else:
                raise ValueError("mistral-common only supports function tools.")
    return messages, tools
def validate_request_params(request: "ChatCompletionRequest"):
    """Reject request options that Mistral tokenizers cannot honor.

    Raises:
        ValueError: if `chat_template` or `chat_template_kwargs` is set.
    """
    # The original message only mentioned `chat_template`, which was
    # misleading when the request failed due to `chat_template_kwargs`.
    if request.chat_template is not None or request.chat_template_kwargs is not None:
        raise ValueError(
            "chat_template and chat_template_kwargs are not supported "
            "for Mistral tokenizers."
        )
def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
    """Map one Tekken token (str or bytes) to its id.

    Regular tokens live in `_tekken_token2id_nospecial`, offset by the number
    of special tokens; special tokens are looked up by their string form.
    Unknown tokens fall back to the unk id with a warning.
    """
    assert isinstance(tokenizer, Tekkenizer), type(tokenizer)
    t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t
    # Non-special ids are shifted past the special-token block.
    shift = tokenizer.num_special_tokens
    try:
        return shift + tokenizer._tekken_token2id_nospecial[t_bytes]
    except KeyError:
        t_str = t_bytes.decode("utf-8")
        if t_str in tokenizer._special_tokens_reverse_vocab:
            return tokenizer._special_tokens_reverse_vocab[t_str]
        logger.warning(
            "Failed to convert token %s to id, replacing with <unk>", t_bytes
        )
        return tokenizer.unk_id
class MistralTokenizer(TokenizerLike):
    """TokenizerLike adapter over mistral-common via the transformers backend.

    Wraps a ``MistralCommonBackend`` (transformers) which itself wraps a
    mistral-common tokenizer (either Tekken or SentencePiece based), and
    exposes the vLLM tokenizer interface on top of it.
    """
    IS_MISTRAL_TOKENIZER = True  # used by vllm.utils.mistral
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "MistralTokenizer":
        # NOTE: `trust_remote_code` is accepted for interface compatibility
        # but not forwarded to the backend.
        try:
            # Transformers v5
            from transformers.tokenization_mistral_common import MistralCommonBackend
        except ImportError:
            # Transformers v4
            from transformers.tokenization_mistral_common import (
                MistralCommonTokenizer as MistralCommonBackend,
            )
        tokenizer = MistralCommonBackend.from_pretrained(
            path_or_repo_id,
            *args,
            mode=ValidationMode.test,
            cache_dir=download_dir,
            revision="main" if revision is None else revision,
            **kwargs,
        )
        return cls(tokenizer)
    def __init__(self, tokenizer: "MistralCommonBackend") -> None:
        """Build caches (vocab, special tokens) around the backend tokenizer."""
        super().__init__()
        # Layered handles: transformers wrapper -> mistral-common request
        # tokenizer -> instruct tokenizer -> raw (Tekken/SPM) tokenizer.
        self.transformers_tokenizer = tokenizer
        self.mistral = tokenizer.tokenizer
        self.instruct = self.mistral.instruct_tokenizer
        self.tokenizer = self.instruct.tokenizer
        mode = self.mistral._chat_completion_request_validator._mode
        if mode != ValidationMode.test:
            raise ValueError(
                "Mistral tokenizer must be in test mode. Make sure to "
                "set `mode='ValidationMode.test'` when creating the "
                "Mistral tokenizer."
            )
        _mistral_version_str = str(self.tokenizer.version.value)
        self.version: int = int(_mistral_version_str.split("v")[-1])
        self.is_tekken = isinstance(self.tokenizer, Tekkenizer)
        self.is_spm = isinstance(self.tokenizer, SentencePieceTokenizer)
        if not (self.is_tekken or self.is_spm):
            raise TypeError(f"Unsupported tokenizer: {type(self.tokenizer)}")
        # Reverse order to ensure that the lowest token id is kept.
        self._vocab_dict = {
            self.convert_ids_to_tokens([i], skip_special_tokens=False)[0]: i
            for i in range(self.vocab_size - 1, -1, -1)
        }
        # Sort the dict for convenience
        self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1]))
        # Vocab sorted by token id.
        self._vocab = self.tokenizer.vocab()
        self._max_token_id = self.vocab_size - 1
        self._max_chars_per_token = max(len(tok) for tok in self._vocab)
        # Cache special tokens for faster access.
        self._special_token_ids = self._get_special_token_ids()
        self._special_token_ids_set = set(self._special_token_ids)
        self._special_tokens = self._get_special_tokens(self._special_token_ids)
        self._special_tokens_set = set(self._special_tokens)
    def _get_special_token_ids(self) -> list[int]:
        """All ids the underlying tokenizer marks as special."""
        return [i for i in range(len(self._vocab)) if self.tokenizer.is_special(i)]
    def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]:
        """Decode each special id to its string form (specials kept)."""
        return [
            self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP)
            for i in all_special_ids
        ]
    def num_special_tokens_to_add(self) -> int:
        # The number of tokens produced for an empty string equals the
        # number of special tokens added around plain text.
        return len(self.encode(""))
    # the following attributes are set to fit vLLM's design and are used
    # by the structured output backends.
    @property
    def all_special_tokens(self) -> list[str]:
        return self._special_tokens
    @property
    def all_special_ids(self) -> list[int]:
        return self._special_token_ids
    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_id
    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_id
    @property
    def pad_token_id(self) -> int:
        return self.tokenizer.pad_id
    @property
    def is_fast(self) -> bool:
        return True
    @property
    def vocab_size(self) -> int:
        return self.transformers_tokenizer.vocab_size
    @property
    def max_token_id(self) -> int:
        return self._max_token_id
    @property
    def max_chars_per_token(self) -> int:
        return self._max_chars_per_token
    @property
    def truncation_side(self) -> str:
        return self.transformers_tokenizer.truncation_side
    def _is_special_token_id(self, token_id: int) -> bool:
        return token_id in self._special_token_ids_set
    def __hash__(self) -> int:
        return hash(id(self))
    def __len__(self) -> int:
        return self.vocab_size
    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Encode text through the transformers backend (no text pairs)."""
        if text_pair is not None:
            raise ValueError(
                "`text_pair` is not supported by `MistralTokenizer.__call__`."
            )
        encoded = self.transformers_tokenizer(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
        )
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, revert to only call self.transformers_tokenizer(...).
        # Hack to fix wrongly added eos token, when fix will be supported the condition
        # below will be False even before the revert is done.
        if encoded["input_ids"] and encoded["input_ids"][-1] == self.eos_token_id:
            encoded["input_ids"].pop(-1)
            if attention_mask := encoded.get("attention_mask"):
                attention_mask.pop(-1)
        return encoded
    @property
    def vocab(self) -> list[str]:
        # Vocab sorted by token id.
        return self._vocab
    def get_vocab(self) -> dict[str, int]:
        return self._vocab_dict
    def get_added_vocab(self) -> dict[str, int]:
        # Mistral tokenizers have no added vocabulary
        return {}
    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Encode `text`; `add_special_tokens` controls the BOS token only."""
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, directly call self.transformers_tokenizer.encode(...).
        encoded = self.tokenizer.encode(text, bos=add_special_tokens, eos=False)
        if truncation is not False and max_length is not None:
            return encoded[:max_length]
        else:
            return encoded
    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> list[int]:
        """Validate messages/tools, then render via the transformers backend."""
        add_generation_prompt = kwargs.pop("add_generation_prompt", False)
        continue_final_message = kwargs.get("continue_final_message", False)
        tokenize = kwargs.get("tokenize", True)
        padding = kwargs.get("padding", False)
        truncation = kwargs.get("truncation", False)
        max_length = kwargs.get("max_length")
        messages, tools = _prepare_apply_chat_template_tools_and_messages(
            messages, tools, continue_final_message, add_generation_prompt
        )
        return self.transformers_tokenizer.apply_chat_template(
            conversation=messages,
            tools=tools,
            continue_final_message=continue_final_message,
            tokenize=tokenize,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=None,
            return_dict=False,
        )
    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962
        # is in, directly call self.transformers_tokenizer.decode(...).
        if isinstance(ids, int):
            ids = [ids]
        return self.transformers_tokenizer.decode(
            ids, skip_special_tokens=skip_special_tokens
        )
    def batch_decode(
        self, ids: list[list[int]] | list[int], skip_special_tokens: bool = False
    ) -> list[str]:
        # NOTE: annotation fixed from `str` — the backend returns one decoded
        # string per sequence.
        return self.transformers_tokenizer.batch_decode(
            ids, skip_special_tokens=skip_special_tokens
        )
    @overload
    def convert_tokens_to_ids(self, tokens: str) -> int: ...
    @overload
    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
        return self.transformers_tokenizer.convert_tokens_to_ids(tokens)
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join tokens to text, keeping only the tool-calls special token."""
        to_decode_special_tokens = {SpecialTokens.tool_calls}
        if self.is_tekken:
            assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer)
            tokens = [
                t
                for t in tokens
                if (t in to_decode_special_tokens or t not in self._special_tokens_set)
            ]
            if any(isinstance(t, bytes) for t in tokens):
                # we need to encode and decode all tokens again
                ids = [_tekken_token_to_id(self.tokenizer, t) for t in tokens]
                # We filtered unwanted special tokens before
                # so we can decode the rest.
                decoded = self.tokenizer.decode(ids, SpecialTokenPolicy.KEEP)
            else:
                decoded = "".join(tokens)
        else:
            # make sure certain special tokens like Tool calls are
            # not decoded
            assert isinstance(self.tokenizer, SentencePieceTokenizer), type(
                self.tokenizer
            )
            regular_tokens: list[str] = []
            decoded_list: list[str] = []
            decoded = ""
            for token in tokens:
                if token in to_decode_special_tokens:
                    if regular_tokens:
                        decoded_list.append(
                            self.tokenizer.decode(
                                regular_tokens, SpecialTokenPolicy.IGNORE
                            )
                        )
                        regular_tokens = []
                    decoded_list.append(token)
                else:
                    regular_tokens.append(token)
            if regular_tokens:
                decoded_list.append(
                    self.tokenizer.decode(regular_tokens, SpecialTokenPolicy.IGNORE)
                )
            decoded = "".join(decoded_list)
        return decoded
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        """Map ids to token pieces; think/tool-call specials are never skipped."""
        if not skip_special_tokens:
            return [self.tokenizer.id_to_piece(token_id) for token_id in ids]
        non_skip_special_tokens_ids = {
            self.tokenizer.get_special_token(SpecialTokens.tool_calls),
        }
        if isinstance(self.instruct, InstructTokenizerV13):
            if self.instruct.BEGIN_THINK:
                non_skip_special_tokens_ids.add(self.instruct.BEGIN_THINK)
            if self.instruct.END_THINK:
                non_skip_special_tokens_ids.add(self.instruct.END_THINK)
        ids_kept = [
            i
            for i in ids
            if i in non_skip_special_tokens_ids or not self._is_special_token_id(i)
        ]
        # We filtered unwanted special tokens so we can decode the rest.
        tokens = [self.tokenizer.id_to_piece(token_id) for token_id in ids_kept]
        if any("�" in t for t in tokens) and self.is_tekken:
            # if a decoded token contains the replacement character, then the
            # token has an incomplete UTF-8 character so we must use bytes
            # See: https://github.com/vllm-project/vllm/pull/8640
            #      https://github.com/vllm-project/vllm/pull/9625
            # if underlying tokenizer is sentencepiece, we just add "�".
            # We filtered unwanted special tokens so we can decode the rest.
            tokens = [
                self.tokenizer.id_to_byte_piece(token_id, SpecialTokenPolicy.KEEP)
                if token_id not in self._special_token_ids_set
                else self.tokenizer.decode([token_id], SpecialTokenPolicy.KEEP)
                for token_id in ids_kept
            ]
        return tokens

127
vllm/tokenizers/protocol.py Normal file
View File

@@ -0,0 +1,127 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol, overload
if TYPE_CHECKING:
from transformers import BatchEncoding
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
class TokenizerLike(Protocol):
    """Structural interface that every vLLM tokenizer must satisfy.

    Mirrors the subset of the HF tokenizer API that vLLM relies on. Concrete
    implementations in this package include the cached HF, Mistral, Grok2
    and DeepSeek V3.2 tokenizers.
    """
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "TokenizerLike":
        """Load a tokenizer from a local path or hub repo id."""
        raise NotImplementedError
    def num_special_tokens_to_add(self) -> int:
        """Number of special tokens added around plain text by `encode`."""
        raise NotImplementedError
    @property
    def all_special_tokens(self) -> list[str]:
        raise NotImplementedError
    @property
    def all_special_ids(self) -> list[int]:
        raise NotImplementedError
    @property
    def bos_token_id(self) -> int:
        raise NotImplementedError
    @property
    def eos_token_id(self) -> int:
        raise NotImplementedError
    @property
    def pad_token_id(self) -> int:
        raise NotImplementedError
    @property
    def is_fast(self) -> bool:
        raise NotImplementedError
    @property
    def vocab_size(self) -> int:
        raise NotImplementedError
    @property
    def max_token_id(self) -> int:
        """Largest valid token id."""
        raise NotImplementedError
    @property
    def max_chars_per_token(self) -> int:
        """Length of the longest token string in the vocabulary."""
        raise NotImplementedError
    @property
    def truncation_side(self) -> str:
        raise NotImplementedError
    def __hash__(self) -> int:
        # Identity-based so tokenizers can be used as cache keys.
        return hash(id(self))
    def __len__(self) -> int:
        return self.vocab_size
    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Encode one string or a batch into ids plus attention mask."""
        raise NotImplementedError
    def get_vocab(self) -> dict[str, int]:
        raise NotImplementedError
    def get_added_vocab(self) -> dict[str, int]:
        raise NotImplementedError
    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Encode `text` into a list of token ids."""
        raise NotImplementedError
    def apply_chat_template(
        self,
        messages: list["ChatCompletionMessageParam"],
        tools: list[dict[str, Any]] | None = None,
        **kwargs,
    ) -> str | list[int]:
        """Render chat messages into a prompt string or token ids."""
        raise NotImplementedError
    @overload
    def convert_tokens_to_ids(self, tokens: str) -> int: ...
    @overload
    def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
        raise NotImplementedError
    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        raise NotImplementedError
    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        raise NotImplementedError
    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        raise NotImplementedError

240
vllm/tokenizers/registry.py Normal file
View File

@@ -0,0 +1,240 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING
import huggingface_hub
from typing_extensions import TypeVar, assert_never
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.transformers_utils.gguf_utils import (
check_gguf_file,
get_gguf_file_path_from_hf,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
from vllm.transformers_utils.repo_utils import (
any_pattern_in_repo_files,
is_mistral_model_repo,
)
from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
if TYPE_CHECKING:
from vllm.config.model import ModelConfig, RunnerType
logger = init_logger(__name__)
# Built-in tokenizer modes -> (module basename under vllm.tokenizers,
# class name). Used to pre-populate TokenizerRegistry below.
_VLLM_TOKENIZERS = {
    "deepseek_v32": ("deepseek_v32", "DeepseekV32Tokenizer"),
    "grok2": ("grok2", "Grok2Tokenizer"),
    "hf": ("hf", "CachedHfTokenizer"),
    "mistral": ("mistral", "MistralTokenizer"),
}
@dataclass
class _TokenizerRegistry:
    """Maps a tokenizer mode to the (module, class) that implements it."""

    # Tokenizer mode -> (tokenizer module, tokenizer class)
    tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict)

    def register(self, tokenizer_mode: str, module: str, class_name: str) -> None:
        """Register (or overwrite, with a warning) a mode's tokenizer class."""
        if tokenizer_mode in self.tokenizers:
            logger.warning(
                "%s.%s is already registered for tokenizer_mode=%r. "
                "It is overwritten by the new one.",
                module,
                class_name,
                tokenizer_mode,
            )
        self.tokenizers[tokenizer_mode] = (module, class_name)

    def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]:
        """Resolve and import the tokenizer class registered for a mode."""
        entry = self.tokenizers.get(tokenizer_mode)
        if entry is None:
            raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.")
        module, class_name = entry
        logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
        return resolve_obj_by_qualname(f"{module}.{class_name}")

    def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike:
        """Instantiate the mode's tokenizer via its `from_pretrained`."""
        return self.load_tokenizer_cls(tokenizer_mode).from_pretrained(*args, **kwargs)
# Global registry, pre-populated with the built-in vLLM tokenizer modes.
TokenizerRegistry = _TokenizerRegistry(
    {
        mode: (f"vllm.tokenizers.{mod_relname}", cls_name)
        for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items()
    }
)
def resolve_tokenizer_args(
    tokenizer_name: str | Path,
    *args,
    runner_type: "RunnerType" = "generate",
    tokenizer_mode: str = "auto",
    **kwargs,
):
    """Normalize tokenizer loading arguments and resolve the concrete mode.

    Handles ModelScope downloads, GGUF path splitting, truncation-side
    defaults per runner type, and "auto" mode detection (mistral -> grok2 ->
    hf, in that order).

    Returns:
        Tuple of (tokenizer_mode, tokenizer_name, args, kwargs) ready for
        the registry's `from_pretrained` dispatch.
    """
    revision: str | None = kwargs.get("revision")
    download_dir: str | None = kwargs.get("download_dir")
    if envs.VLLM_USE_MODELSCOPE:
        # download model from ModelScope hub,
        # lazy import so that modelscope is not required for normal use.
        from modelscope.hub.snapshot_download import snapshot_download
        # avoid circular import
        from vllm.model_executor.model_loader.weight_utils import get_lock
        # Only set the tokenizer here, model will be downloaded on the workers.
        if not Path(tokenizer_name).exists():
            # Use file lock to prevent multiple processes from
            # downloading the same file at the same time.
            with get_lock(tokenizer_name, download_dir):
                tokenizer_path = snapshot_download(
                    model_id=str(tokenizer_name),
                    cache_dir=download_dir,
                    revision=revision,
                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                    # Ignore weights - we only need the tokenizer.
                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
                )
            tokenizer_name = tokenizer_path
    # Separate model folder from file path for GGUF models
    if is_gguf(tokenizer_name):
        if check_gguf_file(tokenizer_name):
            kwargs["gguf_file"] = Path(tokenizer_name).name
            tokenizer_name = Path(tokenizer_name).parent
        elif is_remote_gguf(tokenizer_name):
            tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
            # Get the HuggingFace Hub path for the GGUF file
            gguf_file = get_gguf_file_path_from_hf(
                tokenizer_name,
                quant_type,
                revision=revision,
            )
            kwargs["gguf_file"] = gguf_file
    # Generation truncates from the left (keep the recent context);
    # pooling truncates from the right.
    if "truncation_side" not in kwargs:
        if runner_type == "generate" or runner_type == "draft":
            kwargs["truncation_side"] = "left"
        elif runner_type == "pooling":
            kwargs["truncation_side"] = "right"
        else:
            assert_never(runner_type)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
        tokenizer_mode = "hf"
        kwargs["use_fast"] = False
    # Try to use official Mistral tokenizer if possible
    if (
        tokenizer_mode == "auto"
        and is_mistral_model_repo(
            model_name_or_path=str(tokenizer_name), revision=revision
        )
        and any_pattern_in_repo_files(
            model_name_or_path=str(tokenizer_name),
            allow_patterns=["tekken.json", "tokenizer.model.v*"],
            revision=revision,
        )
    ):
        tokenizer_mode = "mistral"
    # Try to use Grok2 tiktoken tokenizer if possible
    if tokenizer_mode == "auto" and any_pattern_in_repo_files(
        model_name_or_path=str(tokenizer_name),
        allow_patterns=["tokenizer.tok.json"],
        revision=revision,
    ):
        tokenizer_mode = "grok2"
    # Fallback to HF tokenizer
    if tokenizer_mode == "auto":
        tokenizer_mode = "hf"
    return tokenizer_mode, tokenizer_name, args, kwargs
cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args)
def tokenizer_args_from_config(config: "ModelConfig", **kwargs):
    """Resolve tokenizer args from the settings stored on a ModelConfig."""
    resolved = cached_resolve_tokenizer_args(
        config.tokenizer,
        runner_type=config.runner_type,
        tokenizer_mode=config.tokenizer_mode,
        revision=config.tokenizer_revision,
        trust_remote_code=config.trust_remote_code,
        **kwargs,
    )
    return resolved
_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike)
def get_tokenizer(
    tokenizer_name: str | Path,
    *args,
    tokenizer_cls: type[_T] = TokenizerLike,  # type: ignore[assignment]
    trust_remote_code: bool = False,
    revision: str | None = None,
    download_dir: str | None = None,
    **kwargs,
) -> _T:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
    tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args(
        tokenizer_name,
        *args,
        trust_remote_code=trust_remote_code,
        revision=revision,
        download_dir=download_dir,
        **kwargs,
    )
    # An explicit tokenizer_cls overrides registry dispatch on the mode.
    resolved_cls: type = (
        TokenizerRegistry.load_tokenizer_cls(tokenizer_mode)
        if tokenizer_cls == TokenizerLike
        else tokenizer_cls
    )
    tokenizer = resolved_cls.from_pretrained(tokenizer_name, *args, **kwargs)
    if not tokenizer.is_fast:
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead."
        )
    return tokenizer  # type: ignore
cached_get_tokenizer = lru_cache(get_tokenizer)
def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
    """Build (or fetch from cache) the tokenizer described by `model_config`.

    Returns None when tokenizer initialization is explicitly skipped.
    """
    if model_config.skip_tokenizer_init:
        return None
    config_kwargs = dict(
        runner_type=model_config.runner_type,
        tokenizer_mode=model_config.tokenizer_mode,
        revision=model_config.tokenizer_revision,
        trust_remote_code=model_config.trust_remote_code,
    )
    return cached_get_tokenizer(model_config.tokenizer, **config_kwargs, **kwargs)