Sync from v0.13
This commit is contained in:
0
tests/detokenizer/__init__.py
Normal file
0
tests/detokenizer/__init__.py
Normal file
32
tests/detokenizer/test_disable_detokenization.py
Normal file
32
tests/detokenizer/test_disable_detokenization.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_computed_prefix_blocks(model: str):
    """Compare generation with detokenization disabled and then enabled.

    The same prompt is generated twice with temperature 0.0. The run with
    ``detokenize=False`` must produce empty text, the run with
    ``detokenize=True`` must produce non-empty text, and both runs must
    yield the same token ids.
    """
    prompt = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?"
    )
    engine = LLM(model=model)
    params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False)

    # First pass: detokenization switched off.
    without_text = engine.generate(prompt, params)[0].outputs[0]

    # Second pass: same params object, detokenization switched on.
    params.detokenize = True
    with_text = engine.generate(prompt, params)[0].outputs[0]

    assert without_text.text == ""
    assert with_text.text != ""
    assert without_text.token_ids == with_text.token_ids
|
||||
52
tests/detokenizer/test_min_tokens.py
Normal file
52
tests/detokenizer/test_min_tokens.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.detokenizer import FastIncrementalDetokenizer
|
||||
|
||||
# Full text whose token ids are split into a prompt part and a "generated"
# part by the test in this module.
PROMPT = "Hello, my name is Lee, and I'm a student in the college of engineering"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "min_tokens,stop,truth",
    [
        (0, None, " is Lee, and I'm a student in the college of engineering"),
        (0, "e", " is L"),
        (5, "e", " is Lee, and I'm a stud"),
    ],
)
def test_min_tokens_with_stop(min_tokens: int, stop: str | None, truth: str):
    """Test for a specific min_tokens and stop.

    The first four token ids of PROMPT are used as the prompt; the remaining
    ids are fed to the detokenizer in a single ``update`` call and the
    resulting ``output_text`` is compared against ``truth``.

    See https://github.com/vllm-project/vllm/pull/22014
    """
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    all_prompt_ids = tokenizer(PROMPT, add_special_tokens=False).input_ids

    # The prompt is "Hello, my name": every expected output starts with
    # " is", so that token belongs to the generated part, not the prompt.
    prompt_token_ids = all_prompt_ids[:4]
    params = SamplingParams(
        stop=stop,
        min_tokens=min_tokens,
    )
    request = EngineCoreRequest(
        request_id="",
        prompt_token_ids=prompt_token_ids,
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )

    detokenizer = FastIncrementalDetokenizer(tokenizer, request)

    # Feed everything after the prompt as if it had been generated.
    detokenizer.update(all_prompt_ids[4:], False)
    assert detokenizer.output_text == truth
|
||||
69
tests/detokenizer/test_stop_reason.py
Normal file
69
tests/detokenizer/test_stop_reason.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test the different finish_reason="stop" situations during generation:
|
||||
1. One of the provided stop strings
|
||||
2. One of the provided stop tokens
|
||||
3. The EOS token
|
||||
|
||||
Run `pytest tests/detokenizer/test_stop_reason.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
# Model name plus the stop string, seed and token budget shared by the
# generation calls in this module.
MODEL = "distilbert/distilgpt2"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
|
||||
|
||||
|
||||
@pytest.fixture
def vllm_model(vllm_runner):
    """Yield a runner for MODEL, kept open while the test body executes."""
    with vllm_runner(MODEL) as runner:
        yield runner
|
||||
|
||||
|
||||
def test_stop_reason(vllm_model, example_prompts):
    """Check finish_reason / stop_reason for stop tokens, stop strings and EOS.

    Each scenario generates completions for ``example_prompts`` and inspects
    the first output of every request.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
    llm = vllm_model.llm

    # test stop token
    outputs = llm.generate(
        example_prompts,
        sampling_params=SamplingParams(
            ignore_eos=True,
            seed=SEED,
            max_tokens=MAX_TOKENS,
            stop_token_ids=[stop_token_id],
        ),
    )
    for request_output in outputs:
        completion = request_output.outputs[0]
        assert completion.finish_reason == "stop"
        assert completion.stop_reason == stop_token_id

    # test stop string
    # Use STOP_STR (not a repeated literal) so the request and the assertion
    # below cannot drift apart.
    outputs = llm.generate(
        example_prompts,
        sampling_params=SamplingParams(
            ignore_eos=True, seed=SEED, max_tokens=MAX_TOKENS, stop=STOP_STR
        ),
    )
    for request_output in outputs:
        completion = request_output.outputs[0]
        assert completion.finish_reason == "stop"
        assert completion.stop_reason == STOP_STR

    # test EOS token
    # No explicit stop is configured here: a request either runs out of
    # tokens or finishes with "stop" and no stop_reason.
    outputs = llm.generate(
        example_prompts,
        sampling_params=SamplingParams(seed=SEED, max_tokens=MAX_TOKENS),
    )
    for request_output in outputs:
        completion = request_output.outputs[0]
        assert completion.finish_reason == "length" or (
            completion.finish_reason == "stop" and completion.stop_reason is None
        )
|
||||
@@ -0,0 +1,102 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.detokenizer import BaseIncrementalDetokenizer
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
def include_stop_str_in_output(request):
    """Parametrized fixture: yields ``True`` and then ``False``."""
    flag = request.param
    return flag
|
||||
|
||||
|
||||
class _DummyDetokenizer(BaseIncrementalDetokenizer):
    """Detokenizer stub that decodes each token id to a single character."""

    def __init__(self, request: EngineCoreRequest):
        # No extra state of its own; construction is left to the base class.
        super().__init__(request)

    def decode_next(self, next_token_id: int) -> str:
        """Return the character whose code point is ``next_token_id``."""
        # One character per token id keeps the decoded text deterministic.
        return chr(next_token_id)
|
||||
|
||||
|
||||
def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
    """Build an EngineCoreRequest with an empty prompt and the given options.

    ``stop``, ``include_stop_str_in_output`` and ``min_tokens`` are forwarded
    to ``SamplingParams``; all other request fields are left empty or None.
    """
    sampling = SamplingParams(
        stop=stop,
        include_stop_str_in_output=include_stop_str_in_output,
        min_tokens=min_tokens,
    )
    # Everything besides the sampling params stays minimal for unit tests.
    return EngineCoreRequest(
        request_id="test",
        prompt_token_ids=[],
        mm_features=None,
        sampling_params=sampling,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
|
||||
|
||||
|
||||
def test_stop_string_while_stop_token_terminates(include_stop_str_in_output: bool):
    """
    This test verifies that the detokenizer correctly handles the case where
    the generated token sequence contains both:
    - a stop string
    - a terminating stop token (here the <eos> token)

    The detokenizer should respect the stop string and truncate the output
    accordingly.

    Imagine the following sequence:
    - "abcdeZ" is generated, where "Z" is the <eos> token.
    - "cd" is the stop string.

    If include_stop_str_in_output=False, the detokenizer should truncate the
    output to "ab" because the stop string "cd" is excluded.
    If include_stop_str_in_output=True, the detokenizer should include the stop
    string "cd" in the output, resulting in "abcd".

    This verifies the behavioral change introduced in BaseIncrementalDetokenizer
    where stop-string evaluation occurs before the early-return on
    stop_terminated.
    """

    # Build the text "abcdeZ" and map each character to a token id.
    eos_token = "Z"
    stop_string = "cd"
    generated_text = "abcde" + eos_token
    token_ids = [ord(c) for c in generated_text]

    # Create a request with the stop string and initialize the detokenizer.
    req = _make_request(
        stop=[stop_string], include_stop_str_in_output=include_stop_str_in_output
    )
    detok = _DummyDetokenizer(req)

    # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
    result = detok.update(new_token_ids=token_ids, stop_terminated=True)

    # The update should report the matched stop string even though the
    # sequence was also terminated by a stop token.
    assert result == stop_string

    # Output text should reflect stop-string handling:
    # - include_stop_str_in_output=False => exclude "cd" => "ab"
    # - include_stop_str_in_output=True => include "cd" => "abcd"
    expected_text = "abcd" if include_stop_str_in_output else "ab"
    assert detok.output_text == expected_text

    # The skipped final token should still be recorded in token_ids.
    assert detok.output_token_ids == token_ids

    # get_next_output_text should return the full text when finished=True.
    # (Buffering only applies during streaming when finished=False.)
    assert detok.get_next_output_text(finished=True, delta=False) == expected_text
|
||||
120
tests/detokenizer/test_stop_strings.py
Normal file
120
tests/detokenizer/test_stop_strings.py
Normal file
@@ -0,0 +1,120 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Model name and token budget used by the stop-string scenarios below.
MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200
|
||||
|
||||
|
||||
def _test_stopping(
    llm: LLM,
    expected_output: str,
    expected_reason: Any,
    stop: list[str] | None = None,
    stop_token_ids: list[int] | None = None,
    include_in_output: bool = False,
) -> None:
    """Generate from a fixed prompt and check the text and stop reason.

    Args:
        llm: Engine used to run the generation.
        expected_output: Text the first completion must equal.
        expected_reason: Value the completion's ``stop_reason`` must equal.
        stop: Stop strings forwarded to ``SamplingParams``.
        stop_token_ids: Stop token ids forwarded to ``SamplingParams``.
        include_in_output: Forwarded as ``include_stop_str_in_output``.
    """
    params = SamplingParams(
        temperature=0.0,
        max_tokens=MAX_TOKENS,
        stop=stop,
        stop_token_ids=stop_token_ids,
        include_stop_str_in_output=include_in_output,
    )
    request_outputs = llm.generate("A story about vLLM:\n", params)
    output = request_outputs[0].outputs[0]

    assert output is not None
    assert output.text == expected_output
    assert output.stop_reason == expected_reason
|
||||
|
||||
|
||||
def _stop_basic(llm):
    """Stop on the stop string "." with and without it in the output."""
    cases = (
        (False, "VLLM is a 100% volunteer organization"),
        (True, "VLLM is a 100% volunteer organization."),
    )
    for include_in_output, expected_output in cases:
        _test_stopping(
            llm,
            stop=["."],
            include_in_output=include_in_output,
            expected_output=expected_output,
            expected_reason=".",
        )
|
||||
|
||||
|
||||
def _stop_multi_tokens(llm):
    """Stop on "group of peo" when two stop strings are supplied."""
    cases = (
        (False, "VLLM is a 100% volunteer organization. We are a "),
        (True, "VLLM is a 100% volunteer organization. We are a group of peo"),
    )
    for include_in_output, expected_output in cases:
        _test_stopping(
            llm,
            stop=["group of peo", "short"],
            include_in_output=include_in_output,
            expected_output=expected_output,
            expected_reason="group of peo",
        )
|
||||
|
||||
|
||||
def _stop_partial_token(llm):
    """Stop on "gani", a substring of the expected word "organization"."""
    cases = (
        (False, "VLLM is a 100% volunteer or"),
        (True, "VLLM is a 100% volunteer organi"),
    )
    for include_in_output, expected_output in cases:
        _test_stopping(
            llm,
            stop=["gani"],
            include_in_output=include_in_output,
            expected_output=expected_output,
            expected_reason="gani",
        )
|
||||
|
||||
|
||||
def _stop_token_id(llm):
    """Stop on token id 13013 with and without it in the output."""
    # token id 13013 => " organization"
    cases = (
        (False, "VLLM is a 100% volunteer"),
        (True, "VLLM is a 100% volunteer organization"),
    )
    for include_in_output, expected_output in cases:
        _test_stopping(
            llm,
            stop_token_ids=[13013],
            include_in_output=include_in_output,
            expected_output=expected_output,
            expected_reason=13013,
        )
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
def test_stop_strings():
    """Run the stop-string scenarios against one shared LLM instance."""
    llm = LLM(MODEL, enforce_eager=True)

    for scenario in (_stop_basic, _stop_multi_tokens, _stop_partial_token):
        scenario(llm)

    # FIXME: this does not respect include_in_output=False
    # _stop_token_id(llm)
|
||||
Reference in New Issue
Block a user