Sync from v0.13
tests/tokenizers_/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# NOTE: Since CI runs the tests from the `tests` directory, it is necessary to rename
# this module to avoid conflicting with HF's `tokenizers` package

tests/tokenizers_/test_basic.py (new file, 59 lines)
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import _get_protocol_attrs  # type: ignore

import pytest
from transformers import (
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
    PreTrainedTokenizerFast,
)

from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer


def _get_missing_attrs(obj: object, target: type):
    return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]


def _assert_tokenizer_like(tokenizer: object):
    missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
    assert not missing_attrs, f"Missing attrs: {missing_attrs}"


def test_tokenizer_like_protocol():
    tokenizer = get_tokenizer("gpt2", use_fast=False)
    assert isinstance(tokenizer, PreTrainedTokenizer)
    _assert_tokenizer_like(tokenizer)

    tokenizer = get_tokenizer("gpt2", use_fast=True)
    assert isinstance(tokenizer, PreTrainedTokenizerFast)
    _assert_tokenizer_like(tokenizer)

    tokenizer = get_tokenizer(
        "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
    )
    assert isinstance(tokenizer, MistralTokenizer)
    _assert_tokenizer_like(tokenizer)


@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
def test_tokenizer_revision(tokenizer_name: str):
    # Assume that the "main" branch always exists
    tokenizer = get_tokenizer(tokenizer_name, revision="main")
    assert isinstance(tokenizer, PreTrainedTokenizerBase)

    # Assume that the "never" branch never exists
    with pytest.raises(OSError, match="not a valid git identifier"):
        get_tokenizer(tokenizer_name, revision="never")


@pytest.mark.parametrize("tokenizer_name", ["BAAI/bge-base-en"])
@pytest.mark.parametrize("n_tokens", [510])
def test_special_tokens(tokenizer_name: str, n_tokens: int):
    tokenizer = get_tokenizer(tokenizer_name, revision="main")

    prompts = "[UNK]" * n_tokens
    prompt_token_ids = tokenizer.encode(prompts)
    assert len(prompt_token_ids) == n_tokens + 2
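
For context, `typing._get_protocol_attrs` is a private CPython helper (hence the `# type: ignore`): it returns the set of non-dunder attribute names a `Protocol` class declares, which is what lets `_get_missing_attrs` diff a concrete tokenizer against `TokenizerLike`. A minimal sketch of the same check, using a made-up `Greeter` protocol:

    # Sketch only: _get_protocol_attrs is a private CPython API and may
    # change between Python versions; Greeter is a made-up protocol.
    from typing import Protocol, _get_protocol_attrs  # type: ignore

    class Greeter(Protocol):
        name: str

        def greet(self) -> str: ...

    class BadGreeter:
        name = "g"  # declares `name` but is missing `greet`

    missing = [a for a in _get_protocol_attrs(Greeter) if not hasattr(BadGreeter(), a)]
    print(missing)  # ['greet']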

tests/tokenizers_/test_detokenize.py (new file, 241 lines)
@@ -0,0 +1,241 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Generator
from typing import Any

import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.sampling_params import SamplingParams
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import (
    FastIncrementalDetokenizer,
    IncrementalDetokenizer,
    SlowIncrementalDetokenizer,
)

SPECIAL_TOKS_TRUTH = [
    "Some text with adjacent special tokens <|padding|><|padding|><fim_prefix><fim_middle><fim_suffix>other text<fim_pad>",  # noqa
]

TRUTH = [
    "Hello here, this is a simple test",
    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa
    "我很感谢你的热情",
    # Burmese text triggers an edge case for Mistral's V3-Tekken tokenizer
    # (e.g. for mistralai/Pixtral-12B-2409) where tokens may map to bytes with
    # incomplete UTF-8 characters
    # see https://github.com/vllm-project/vllm/pull/9625
    "ပုံပြင်လေးပြောပြပါ်",
] + SPECIAL_TOKS_TRUTH

TOKENIZERS = [
    "facebook/opt-125m",
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/gpt-j-6b",
    "EleutherAI/pythia-70m",
    "bigscience/bloom-560m",
    "mosaicml/mpt-7b",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-3.2-1B-Instruct",
    "codellama/CodeLlama-7b-hf",
    "mistralai/Pixtral-12B-2409",
]


def _run_incremental_decode(
    tokenizer,
    all_input_ids,
    skip_special_tokens: bool,
    starting_index: int,
    spaces_between_special_tokens: bool = True,
    fast: bool | None = None,
):
    prompt_token_ids = all_input_ids[:starting_index]

    params = SamplingParams(
        skip_special_tokens=skip_special_tokens,
        spaces_between_special_tokens=spaces_between_special_tokens,
    )
    request = EngineCoreRequest(
        request_id="",
        prompt_token_ids=prompt_token_ids,
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )

    if fast is None:
        detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
    elif fast:
        detokenizer = FastIncrementalDetokenizer(tokenizer, request)
    else:
        detokenizer = SlowIncrementalDetokenizer(tokenizer, request)

    output_text = ""
    for i, token_id in enumerate(all_input_ids[starting_index:]):
        detokenizer.update([token_id], False)
        finished = i == len(all_input_ids) - 1
        output_text += detokenizer.get_next_output_text(finished, delta=True)

    return output_text, detokenizer.output_token_ids


@pytest.fixture
def tokenizer(tokenizer_name):
    return (
        MistralTokenizer.from_pretrained(tokenizer_name)
        if "mistral" in tokenizer_name
        else AutoTokenizer.from_pretrained(tokenizer_name)
    )


@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"])
@pytest.mark.parametrize(
    "truth",
    [
        # Burmese text triggers an edge case where tokens may map to bytes with
        # incomplete UTF-8 characters
        "ပုံပြင်လေးပြောပြပါ",
        # Using "URGENCY" since "CY" has token id 130282
        "URGENCY🌶️",
    ],
)
def test_mistral_edge_case(tokenizer, truth):
    """Test specific edge cases with the V3-Tekken MistralTokenizer.

    See https://github.com/vllm-project/vllm/pull/9625
    """
    starting_index = 0
    all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids

    decoded_text, out_ids = _run_incremental_decode(
        tokenizer,
        all_input_ids,
        skip_special_tokens=True,
        starting_index=starting_index,
    )
    assert decoded_text == truth
    assert out_ids == all_input_ids[starting_index:]


@pytest.fixture
def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
    if "mistral" in tokenizer_name:
        yield (
            True
            if request.param
            else pytest.skip("mistral doesn't support skip_special_tokens=False")
        )
    else:
        yield bool(request.param)


@pytest.mark.parametrize("truth", TRUTH)
@pytest.mark.parametrize("with_prompt", [True, False])
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True)
@pytest.mark.parametrize("spaces_between_special_tokens", (True, False))
@pytest.mark.parametrize("fast", (True, False))
def test_decode_streaming(
    tokenizer,
    truth,
    with_prompt,
    skip_special_tokens,
    spaces_between_special_tokens,
    fast,
):
    if fast and not isinstance(tokenizer, PreTrainedTokenizerFast):
        pytest.skip()

    if skip_special_tokens and not spaces_between_special_tokens:
        pytest.skip()

    if not fast and isinstance(tokenizer, PreTrainedTokenizerFast):
        # Fix up inconsistency in fast/slow tokenizer behaviour.
        tokenizer.add_special_tokens(
            {
                "additional_special_tokens": [
                    at
                    for at in tokenizer._tokenizer.get_added_tokens_decoder().values()
                    if at.special
                ]
            }
        )

    extra_decode_args = (
        {}
        if not isinstance(tokenizer, PreTrainedTokenizer)
        else {"spaces_between_special_tokens": spaces_between_special_tokens}
    )

    truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids
    if tokenizer.bos_token_id is not None:
        truth_tokens.insert(0, tokenizer.bos_token_id)
    truth_tokens.append(tokenizer.eos_token_id)

    new_truth = tokenizer.decode(
        truth_tokens, skip_special_tokens=skip_special_tokens, **extra_decode_args
    )

    if with_prompt:
        num_prompt_tokens = len(
            tokenizer(truth[: len(truth) // 2], add_special_tokens=False).input_ids
        )
        if tokenizer.bos_token_id is not None:
            num_prompt_tokens += 1

        prompt_input_ids = truth_tokens[:num_prompt_tokens]
        generated_input_ids = truth_tokens[num_prompt_tokens:]
        all_input_ids = prompt_input_ids + generated_input_ids
        starting_index = len(prompt_input_ids)
        prompt = tokenizer.decode(
            prompt_input_ids,
            skip_special_tokens=skip_special_tokens,
            **extra_decode_args,
        )

        generated = new_truth[len(prompt) :]
    else:
        generated = new_truth
        starting_index = 0
        all_input_ids = truth_tokens

    decoded_text, out_ids = _run_incremental_decode(
        tokenizer,
        all_input_ids,
        skip_special_tokens=skip_special_tokens,
        starting_index=starting_index,
        spaces_between_special_tokens=spaces_between_special_tokens,
        fast=fast,
    )

    assert decoded_text == generated
    assert out_ids == all_input_ids[starting_index:]


@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
@pytest.mark.parametrize("fast", (True, False))
def test_oov_decode(tokenizer, fast):
    if fast and not isinstance(tokenizer, PreTrainedTokenizerFast):
        pytest.skip()

    decoded_text, out_ids = _run_incremental_decode(
        tokenizer,
        [len(tokenizer)],
        skip_special_tokens=True,
        starting_index=0,
        spaces_between_special_tokens=True,
        fast=fast,
    )

    assert decoded_text == ""
    assert out_ids == [len(tokenizer)]
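
For context on why `_run_incremental_decode` feeds the detokenizer one token at a time rather than calling `decode` on each new token in isolation: byte-level BPE tokenizers can split a multi-byte UTF-8 character across tokens, so naive per-token decoding emits U+FFFD replacement characters. A small self-contained illustration, assuming network access to download the `gpt2` tokenizer from the Hub:

    # Sketch only: shows why an *incremental* detokenizer is needed.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    ids = tok("我很感谢", add_special_tokens=False).input_ids

    one_shot = tok.decode(ids)
    per_token = "".join(tok.decode([i]) for i in ids)

    # GPT-2's byte-level BPE can split a multi-byte UTF-8 character across
    # tokens, so per-token decoding yields replacement characters where the
    # one-shot decode does not.
    print(one_shot)               # 我很感谢
    print(per_token)              # typically garbled, containing '�'
    print(one_shot == per_token)  # typically False for this input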

tests/tokenizers_/test_hf.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle
from copy import deepcopy

import pytest
from transformers import AutoTokenizer

from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import get_cached_tokenizer


@pytest.mark.parametrize("model_id", ["gpt2", "zai-org/chatglm3-6b"])
def test_cached_tokenizer(model_id: str):
    reference_tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=True
    )
    reference_tokenizer.add_special_tokens({"cls_token": "<CLS>"})
    reference_tokenizer.add_special_tokens({"additional_special_tokens": ["<SEP>"]})

    cached_tokenizer = get_cached_tokenizer(deepcopy(reference_tokenizer))
    _check_consistency(cached_tokenizer, reference_tokenizer)

    pickled_tokenizer = pickle.dumps(cached_tokenizer)
    unpickled_tokenizer = pickle.loads(pickled_tokenizer)
    _check_consistency(unpickled_tokenizer, reference_tokenizer)


def _check_consistency(target: TokenizerLike, expected: TokenizerLike):
    assert isinstance(target, type(expected))

    # Cached attributes
    assert target.all_special_ids == expected.all_special_ids
    assert target.all_special_tokens == expected.all_special_tokens
    assert target.get_vocab() == expected.get_vocab()
    assert len(target) == len(expected)

    # Other attributes
    assert getattr(target, "padding_side", None) == getattr(
        expected, "padding_side", None
    )

    assert target.encode("prompt") == expected.encode("prompt")
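
The "Cached attributes" block hints at what `get_cached_tokenizer` is for: attributes like `all_special_ids`, `all_special_tokens`, and `get_vocab()` are relatively expensive properties on HF tokenizers, so caching them once up front pays off on hot paths. A rough sketch of that caching idea, using a hypothetical `make_cached_tokenizer` helper (not vLLM's actual implementation, which must also keep the object picklable, as the test above verifies):

    # Hypothetical sketch of the caching idea: precompute expensive HF
    # tokenizer properties and serve them from a dynamically created subclass,
    # so isinstance(cached, type(reference)) still holds.
    def make_cached_tokenizer(tokenizer):
        cached_vocab = tokenizer.get_vocab()
        cached_special_ids = tokenizer.all_special_ids
        cached_special_tokens = tokenizer.all_special_tokens
        cached_len = len(tokenizer)

        class CachedTokenizer(tokenizer.__class__):
            @property
            def all_special_ids(self):
                return cached_special_ids

            @property
            def all_special_tokens(self):
                return cached_special_tokens

            def get_vocab(self):
                return cached_vocab

            def __len__(self):
                return cached_len

        CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
        tokenizer.__class__ = CachedTokenizer
        return tokenizer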

tests/tokenizers_/test_mistral.py (new file, 2409 lines): diff suppressed because it is too large

tests/tokenizers_/test_registry.py (new file, 77 lines)
@@ -0,0 +1,77 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest

from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.registry import (
    TokenizerRegistry,
    get_tokenizer,
    resolve_tokenizer_args,
)


class TestTokenizer(TokenizerLike):
    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "TestTokenizer":
        return TestTokenizer(path_or_repo_id)  # type: ignore

    def __init__(self, path_or_repo_id: str | Path) -> None:
        super().__init__()

        self.path_or_repo_id = path_or_repo_id

    @property
    def bos_token_id(self) -> int:
        return 0

    @property
    def eos_token_id(self) -> int:
        return 1

    @property
    def pad_token_id(self) -> int:
        return 2

    @property
    def is_fast(self) -> bool:
        return True


@pytest.mark.parametrize("runner_type", ["generate", "pooling"])
def test_resolve_tokenizer_args_idempotent(runner_type):
    tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args(
        "facebook/opt-125m",
        runner_type=runner_type,
    )

    assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args(
        tokenizer_name, *args, **kwargs
    )


def test_customized_tokenizer():
    TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)

    tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc")
    assert isinstance(tokenizer, TestTokenizer)
    assert tokenizer.path_or_repo_id == "abc"
    assert tokenizer.bos_token_id == 0
    assert tokenizer.eos_token_id == 1
    assert tokenizer.pad_token_id == 2

    tokenizer = get_tokenizer("abc", tokenizer_mode="test_tokenizer")
    assert isinstance(tokenizer, TestTokenizer)
    assert tokenizer.path_or_repo_id == "abc"
    assert tokenizer.bos_token_id == 0
    assert tokenizer.eos_token_id == 1
    assert tokenizer.pad_token_id == 2
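
Note that `TokenizerRegistry.register` takes the module name and class name rather than the class object itself, which suggests lazy loading: the tokenizer implementation is only imported when a tokenizer of that mode is first requested. A minimal sketch of that pattern, using a hypothetical `MiniRegistry` (not the real vLLM registry):

    # Hypothetical MiniRegistry: registration by module/class name defers the
    # import of the tokenizer implementation until it is actually needed.
    import importlib


    class MiniRegistry:
        _entries: dict[str, tuple[str, str]] = {}

        @classmethod
        def register(cls, mode: str, module_name: str, class_name: str) -> None:
            cls._entries[mode] = (module_name, class_name)

        @classmethod
        def load_tokenizer(cls, mode: str, *args, **kwargs):
            module_name, class_name = cls._entries[mode]
            tokenizer_cls = getattr(importlib.import_module(module_name), class_name)
            return tokenizer_cls.from_pretrained(*args, **kwargs)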