Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions


@@ -0,0 +1,32 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams


@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_detokenize_flag(model: str):
    # This test checks that the engine generates completions both with and
    # without optional detokenization, that detokenization includes text
    # and no-detokenization doesn't, and that both completions have the same
    # token_ids.
    prompt = (
        "You are a helpful assistant. How do I build a car from cardboard and "
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?"
    )
    llm = LLM(model=model)
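    # detokenize=False asks the engine to skip text reconstruction, so this
    # request should yield token ids but an empty output string.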
    sampling_params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False)
    outputs_no_detokenization = llm.generate(prompt, sampling_params)[0].outputs[0]
    sampling_params.detokenize = True
    outputs_with_detokenization = llm.generate(prompt, sampling_params)[0].outputs[0]

    assert outputs_no_detokenization.text == ""
    assert outputs_with_detokenization.text != ""
    assert outputs_no_detokenization.token_ids == outputs_with_detokenization.token_ids


@@ -0,0 +1,52 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer

from vllm import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import FastIncrementalDetokenizer

PROMPT = "Hello, my name is Lee, and I'm a student in the college of engineering"


@pytest.mark.parametrize(
    "min_tokens,stop,truth",
    [
        (0, None, " is Lee, and I'm a student in the college of engineering"),
        (0, "e", " is L"),
        (5, "e", " is Lee, and I'm a stud"),
    ],
)
def test_min_tokens_with_stop(min_tokens: int, stop: str | None, truth: str):
    """Test a specific combination of min_tokens and stop.

    See https://github.com/vllm-project/vllm/pull/22014
    """
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    all_prompt_ids = tokenizer(PROMPT, add_special_tokens=False).input_ids

    # The prompt is "Hello, my name".
    prompt_token_ids = all_prompt_ids[:4]
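    # The remaining tokens (" is Lee, ...") are fed to the detokenizer below
    # as if the model had generated them; note that every expected truth
    # string starts with " is".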
    params = SamplingParams(
        stop=stop,
        min_tokens=min_tokens,
    )
    request = EngineCoreRequest(
        request_id="",
        prompt_token_ids=prompt_token_ids,
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )

    detokenizer = FastIncrementalDetokenizer(tokenizer, request)
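    # update(new_token_ids, stop_terminated) decodes incrementally and only
    # applies stop strings once min_tokens is satisfied; stop_terminated=False
    # indicates that no stop token ended the request.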
    detokenizer.update(all_prompt_ids[4:], False)

    assert detokenizer.output_text == truth


@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test the different finish_reason="stop" situations during generation:
1. One of the provided stop strings
2. One of the provided stop tokens
3. The EOS token

Run `pytest tests/engine/test_stop_reason.py`.
"""
import pytest
import transformers

from vllm import SamplingParams

MODEL = "distilbert/distilgpt2"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024


@pytest.fixture
def vllm_model(vllm_runner):
    with vllm_runner(MODEL) as vllm_model:
        yield vllm_model


def test_stop_reason(vllm_model, example_prompts):
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
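    # "." is a single token in the GPT-2 vocabulary that distilgpt2 uses, so
    # the same string can act as both a stop string and a stop token id below.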
    llm = vllm_model.llm

    # test stop token
    outputs = llm.generate(
        example_prompts,
        sampling_params=SamplingParams(
            ignore_eos=True,
            seed=SEED,
            max_tokens=MAX_TOKENS,
            stop_token_ids=[stop_token_id],
        ),
    )
    for output in outputs:
        output = output.outputs[0]
        assert output.finish_reason == "stop"
        assert output.stop_reason == stop_token_id

    # test stop string
    outputs = llm.generate(
        example_prompts,
        sampling_params=SamplingParams(
            ignore_eos=True, seed=SEED, max_tokens=MAX_TOKENS, stop="."
        ),
    )
    for output in outputs:
        output = output.outputs[0]
        assert output.finish_reason == "stop"
        assert output.stop_reason == STOP_STR

    # test EOS token
    outputs = llm.generate(
        example_prompts,
        sampling_params=SamplingParams(seed=SEED, max_tokens=MAX_TOKENS),
    )
    for output in outputs:
        output = output.outputs[0]
        assert output.finish_reason == "length" or (
            output.finish_reason == "stop" and output.stop_reason is None
        )


@@ -0,0 +1,102 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import BaseIncrementalDetokenizer


@pytest.fixture(params=[True, False])
def include_stop_str_in_output(request):
    return request.param


class _DummyDetokenizer(BaseIncrementalDetokenizer):
    def __init__(self, request: EngineCoreRequest):
        super().__init__(request)

    def decode_next(self, next_token_id: int) -> str:
        # Map token id to single ASCII character for deterministic testing.
        return chr(next_token_id)


def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
    params = SamplingParams(
        stop=stop,
        include_stop_str_in_output=include_stop_str_in_output,
        min_tokens=min_tokens,
    )
    # Keep other fields minimal for unit test purposes.
    req = EngineCoreRequest(
        request_id="test",
        prompt_token_ids=[],
        mm_features=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
    return req
def test_stop_string_while_stop_token_terminates(include_stop_str_in_output: bool):
    """
    This test verifies that the detokenizer correctly handles the case where
    the generated token sequence contains both:
    - a stop string ("cd" below)
    - a stop token (the <eos> token "Z" below)

    The detokenizer should respect the stop string and truncate the output
    accordingly.

    Imagine the following sequence:
    - "abcdeZ" is generated, where "Z" is the <eos> token.
    - "cd" is the stop string.

    If include_stop_str_in_output=False, the detokenizer should truncate the
    output to "ab" because the stop string "cd" is excluded.
    If include_stop_str_in_output=True, the detokenizer should include the
    stop string "cd" in the output, resulting in "abcd".

    This verifies the behavioral change in BaseIncrementalDetokenizer where
    stop-string evaluation occurs before the early return on stop_terminated.
    """
    # Generate text "abcdeZ" and tokenize it.
    generated_text = "abcde"
    eos_token = "Z"
    stop_string = "cd"
    generated_text = generated_text + eos_token
    token_ids = [ord(c) for c in generated_text]
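    # "abcdeZ" -> [97, 98, 99, 100, 101, 90]; _DummyDetokenizer maps these
    # ids straight back to the same characters.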
    # Create a request with the stop string and initialize the detokenizer.
    req = _make_request(
        stop=[stop_string], include_stop_str_in_output=include_stop_str_in_output
    )
    detok = _DummyDetokenizer(req)

    # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
    result = detok.update(new_token_ids=token_ids, stop_terminated=True)

    # The update should report the stop string that triggered truncation.
    assert result == stop_string

    # Output text should reflect stop-string handling:
    # - include_stop_str_in_output=False => exclude "cd" => "ab"
    # - include_stop_str_in_output=True  => include "cd" => "abcd"
    expected_text = "abcd" if include_stop_str_in_output else "ab"
    assert detok.output_text == expected_text

    # The skipped final token should still be recorded in token_ids.
    assert detok.output_token_ids == token_ids

    # get_next_output_text should return the full text when finished=True.
    # (Buffering only applies during streaming when finished=False.)
    assert detok.get_next_output_text(finished=True, delta=False) == expected_text


@@ -0,0 +1,120 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any

import pytest

from vllm import LLM, SamplingParams

MODEL = "meta-llama/Llama-2-7b-hf"
MAX_TOKENS = 200


def _test_stopping(
    llm: LLM,
    expected_output: str,
    expected_reason: Any,
    stop: list[str] | None = None,
    stop_token_ids: list[int] | None = None,
    include_in_output: bool = False,
) -> None:
    output = llm.generate(
        "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ),
    )[0].outputs[0]
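    # generate() returns one RequestOutput per prompt; outputs[0] above is the
    # single greedy completion produced for our one prompt.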
    assert output is not None
    assert output.text == expected_output
    assert output.stop_reason == expected_reason


def _stop_basic(llm):
    _test_stopping(
        llm,
        stop=["."],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization",
        expected_reason=".",
    )
    _test_stopping(
        llm,
        stop=["."],
        include_in_output=True,
        expected_output="VLLM is a 100% volunteer organization.",
        expected_reason=".",
    )


def _stop_multi_tokens(llm):
    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo",
    )
    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output="VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo",
    )


def _stop_partial_token(llm):
    _test_stopping(
        llm,
        stop=["gani"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer or",
        expected_reason="gani",
    )
    _test_stopping(
        llm,
        stop=["gani"],
        include_in_output=True,
        expected_output="VLLM is a 100% volunteer organi",
        expected_reason="gani",
    )


def _stop_token_id(llm):
    # token id 13013 => " organization"
    _test_stopping(
        llm,
        stop_token_ids=[13013],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer",
        expected_reason=13013,
    )
    _test_stopping(
        llm,
        stop_token_ids=[13013],
        include_in_output=True,
        expected_output="VLLM is a 100% volunteer organization",
        expected_reason=13013,
    )


@pytest.mark.skip_global_cleanup
def test_stop_strings():
    llm = LLM(MODEL, enforce_eager=True)
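    # enforce_eager=True skips CUDA graph capture, keeping test startup fast.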
    _stop_basic(llm)
    _stop_multi_tokens(llm)
    _stop_partial_token(llm)

    # FIXME: this does not respect include_in_output=False
    # _stop_token_id(llm)