Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,685 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import math
from collections.abc import Generator
from typing import get_args
import pytest
import torch
from tests.utils import large_gpu_mark
from tests.v1.sample.utils import (
BatchLogprobsComposition,
BatchLogprobsSpecType,
assert_incr_detok_str_matches_non_incr_detok_str,
compute_correct_cumulative_logprob,
get_test_batch,
)
from vllm import SamplingParams
from vllm.config.model import LogprobsMode
from vllm.distributed import cleanup_dist_env_and_memory
from ...conftest import HfRunner, VllmRunner
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
DTYPE = "half"
NONE = BatchLogprobsComposition.NONE
SAMPLE = BatchLogprobsComposition.SAMPLE
PROMPT = BatchLogprobsComposition.PROMPT
SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT
@pytest.fixture(
    scope="module",
    # Parameterize APC
    params=[False, True],
)
def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
    """Module-scoped vLLM runner, parameterized over prefix caching (APC)."""
    with vllm_runner(
        MODEL,
        dtype=DTYPE,
        max_logprobs=7,
        # Very small number of batched tokens to ensure
        # that we test chunking.
        max_num_batched_tokens=16,
        max_num_seqs=16,
        max_model_len=128,
        enable_chunked_prefill=True,
        enforce_eager=True,
        # TODO: enable this once we support it for
        # prompt logprobs.
        enable_prefix_caching=request.param,
        gpu_memory_utilization=0.4,  # up to 2 alive concurrently
    ) as runner:
        yield runner
@pytest.fixture(scope="module")
def hf_model(hf_runner) -> Generator[HfRunner, None, None]:
    """Module-scoped HuggingFace reference model runner."""
    with hf_runner(MODEL, dtype=DTYPE) as reference_model:
        yield reference_model
def _repeat_logprob_config(
    test_prompts,
    logprob_prompt_logprob_list: BatchLogprobsSpecType,
) -> BatchLogprobsSpecType:
    """Ensure each test prompt has a logprob config.

    A logprob config is a tuple of the optional (may-be-`None`) number of
    sample logprobs and the optional number of prompt logprobs.

    The provided configs are cycled and then truncated so that the result
    has exactly one config per test prompt: configs are tiled when there
    are fewer of them than prompts, truncated when there are more, and
    returned unchanged when the counts already match.

    Args:
        test_prompts: list of prompts under test
        logprob_prompt_logprob_list: list of
            (optional num sample logprob, optional num prompt logprob)
            tuples

    Returns:
        list of (optional num sample logprob, optional num prompt logprob)
        tuples whose length equals ``len(test_prompts)``
    """
    target_len = len(test_prompts)
    # Cycle the configs endlessly and keep exactly one per prompt.
    tiled_configs = list(
        itertools.islice(itertools.cycle(logprob_prompt_logprob_list), target_len)
    )
    # Sanity check: one sampling-params combo per prompt.
    assert len(tiled_configs) == target_len
    return tiled_configs
def _run_and_validate(
    vllm_model: VllmRunner,
    test_prompts: list[str],
    # NOTE: annotation corrected to match the call site, which builds one
    # SamplingParams per prompt (see the caller's list comprehension).
    vllm_sampling_params: list[SamplingParams],
    hf_logprobs: list[list[torch.Tensor]],
    hf_outputs: list[tuple[list[int], str]],
    logprob_prompt_logprob_list: BatchLogprobsSpecType,
    temperature: float,
    max_tokens: int,
    do_apc: bool,
) -> None:
    """Generate with vLLM and validate (prompt) logprobs against HuggingFace.

    For each prompt, checks (a) token-level agreement with the HF reference
    (full prompt+completion when greedy, prompt-only otherwise), (b) the
    structure and numerical values of sample logprobs, and (c) the structure
    and numerical values of prompt logprobs — each only when requested by
    that prompt's (num sample logprobs, num prompt logprobs) config.

    Args:
        vllm_model: vLLM model fixture under test
        test_prompts: list of prompts under test
        vllm_sampling_params: one SamplingParams per prompt
        hf_logprobs: HF reference logprobs, indexed per prompt
        hf_outputs: HF reference (token ids, text) per prompt
        logprob_prompt_logprob_list: per-prompt logprobs config tuples
        temperature: sampling temperature (0.0 means greedy)
        max_tokens: number of generated tokens per prompt
        do_apc: whether prefix caching is enabled
            # NOTE(review): `do_apc` is accepted but unread in this body —
            # presumably kept for signature symmetry with callers; confirm.
    """
    vllm_results = vllm_model.llm.generate(
        test_prompts, sampling_params=vllm_sampling_params
    )
    for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
        vllm_results, hf_logprobs, hf_outputs, logprob_prompt_logprob_list
    ):
        # Extract request-level (prompt)logprobs config
        num_top_logprobs, num_top_prompt_logprobs = logprob_prompt_logprob

        # Test whether sampled token output is consistent between vLLM and HF
        # vLLM prompt+completion should match HF output
        if temperature == 0.0:
            assert (
                vllm_result.prompt_token_ids + vllm_result.outputs[0].token_ids
                == hf_output[0]
            )
        else:
            # Sampled tokens won't match if not greedy; only the prompt
            # prefix of the HF token sequence is comparable.
            assert (
                vllm_result.prompt_token_ids
                == hf_output[0][: len(vllm_result.prompt_token_ids)]
            )

        # Validate sample logprobs
        if num_top_logprobs is not None:
            assert num_top_logprobs is not None
            # Confirm that the structure of the sample logprobs in the result is
            # correct
            assert vllm_result.outputs[0].logprobs is not None
            assert len(vllm_result.outputs[0].logprobs) == max_tokens
            for logprobs, token_id in zip(
                vllm_result.outputs[0].logprobs, vllm_result.outputs[0].token_ids
            ):
                assert logprobs is not None
                # Confirm that the output token appears among the logprobs
                assert token_id in logprobs
                token_in_topk = logprobs[token_id].rank <= num_top_logprobs
                # If the output token is not included in the top K
                # logprob, it can return 1 more data
                if token_in_topk and num_top_logprobs != 0:
                    assert len(logprobs) == num_top_logprobs
                else:
                    assert len(logprobs) == num_top_logprobs + 1
                if num_top_logprobs > 0:
                    # We should have an entry for each of the topk ranks
                    all_ranks = {lp.rank for lp in logprobs.values()}
                    assert all(r in all_ranks for r in range(1, num_top_logprobs + 1))
            # Reconstruct the output text from the most likely token at each
            # position; incremental detokenization must match the result text.
            output_text = vllm_result.outputs[0].text
            output_string_from_most_likely_tokens_lst: list[str] = []
            for top_logprobs in vllm_result.outputs[0].logprobs:
                top_logprob = next(iter(top_logprobs.values()))
                output_string_from_most_likely_tokens_lst.append(
                    top_logprob.decoded_token
                )
            output_string_from_most_likely_tokens = "".join(
                output_string_from_most_likely_tokens_lst
            )
            assert_incr_detok_str_matches_non_incr_detok_str(
                output_text,
                output_string_from_most_likely_tokens,
                "The output text from the top logprob for each token "
                "position should be the same as the output text in the "
                "result.",
            )
            # Compare vLLM sample logprobs to HF
            vllm_sample_logprobs = vllm_result.outputs[0].logprobs
            for i, top_logprobs in enumerate(vllm_sample_logprobs):
                for token_id, sample_logprob in top_logprobs.items():
                    # NOTE(review): numerical comparison is limited to greedy
                    # runs or to the first generated position — presumably
                    # because the HF reference was generated greedily and
                    # later positions diverge when sampling; confirm.
                    if temperature == 0.0 or i == 0:
                        logprob = sample_logprob.logprob
                        torch.testing.assert_close(
                            logprob,
                            hf_logprob[i][-1][token_id].item(),
                            atol=1e-2,
                            rtol=1e-2,
                        )
                    assert isinstance(sample_logprob.decoded_token, str), (
                        "The token should be decoded by the time it is"
                        " returned to the user."
                    )
            # At this point we know the sample logprobs are correct for this
            # request. Validate that cumulative_logprob is actually the sum.
            # For each request, assert that the returned cumulative logprob
            # matches the correct value, which is computed below.
            torch.testing.assert_close(
                vllm_result.outputs[0].cumulative_logprob,
                compute_correct_cumulative_logprob(vllm_result.outputs[0]),
                atol=1e-6,
                rtol=1e-6,
            )
        else:
            # Logprobs disabled for this request; should be None
            assert vllm_result.outputs[0].logprobs is None

        # Validate prompt logprobs
        if num_top_prompt_logprobs is not None:
            # Confirm that structure of prompt logprobs in result is correct
            assert vllm_result.prompt_logprobs is not None
            # - The first prompt logprob is always None
            assert vllm_result.prompt_logprobs[0] is None
            # - Prompt logprobs are returned for all indices in
            #   the prompt
            assert len(vllm_result.prompt_logprobs) == len(vllm_result.prompt_token_ids)
            for prompt_logprobs, prompt_token_id in zip(
                vllm_result.prompt_logprobs[1:], vllm_result.prompt_token_ids[1:]
            ):
                assert prompt_logprobs is not None
                # Confirm that the prompt token appears among the logprobs
                assert prompt_token_id in prompt_logprobs
                token_in_topk = (
                    prompt_logprobs[prompt_token_id].rank <= num_top_prompt_logprobs
                )
                # If the prompt token is not included in the top K
                # logprob, it can return 1 more data
                if token_in_topk and num_top_prompt_logprobs != 0:
                    assert len(prompt_logprobs) == num_top_prompt_logprobs
                else:
                    assert len(prompt_logprobs) == num_top_prompt_logprobs + 1
                if num_top_prompt_logprobs > 0:
                    # We should have an entry for each of the topk ranks
                    all_ranks = {lp.rank for lp in prompt_logprobs.values()}
                    assert all(
                        r in all_ranks for r in range(1, num_top_prompt_logprobs + 1)
                    )
            # Compare prompt logprobs to HF
            # The first prompt logprob is always None, so we compare it from
            # 1:.
            vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
            for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
                for token_id, logprob in vllm_prompt_logprob_dict.items():
                    torch.testing.assert_close(
                        logprob.logprob,
                        hf_logprob[0][i][token_id].item(),
                        atol=2e-2,
                        rtol=2e-2,
                    )
        else:
            assert vllm_result.prompt_logprobs is None
@pytest.mark.parametrize(
    "batch_logprobs_composition", [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]
)
@pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs(
    hf_model,
    vllm_model,
    batch_logprobs_composition: BatchLogprobsComposition,
    temperature: float,
    example_prompts: list[str],
) -> None:
    """Test V1 Engine logprobs & prompt logprobs

    Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
    settings and validate that

    * The generated logprobs and prompt logprobs are consistent with the
      configuration settings, in terms of whether or not the logprobs
      (of either type) were requested and how many were requested
    * The generated logprobs are consistent with the generated tokens
    * The generated (prompt)logprobs are consistent with HuggingFace
      (prompt)logprobs, as a reference

    batch_logprobs_composition controls the logprobs configurations for
    requests in the batch under test.

    APC tests run two test iterations so that cache hits occur.
    To save time, only test one APC-enabled scenario
    (sample & prompt logprobs enabled, temperature>0.0).

    Args:
        hf_model: HuggingFace reference model fixture
        vllm_model: vLLM model fixture
        batch_logprobs_composition: logprobs configuration for test batch
        temperature: "temperature" sampling parameter
        example_prompts: example prompt fixture
    """
    # APC state comes from the parameterized vllm_model fixture.
    do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
    if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
        # Skip some test-cases to save time.
        pytest.skip()
    test_prompts = example_prompts

    max_tokens = 5
    # HF reference is always generated greedily; _run_and_validate only
    # compares the prompt prefix when temperature > 0.
    hf_outputs = hf_model.generate_greedy(
        test_prompts,
        max_tokens=max_tokens,
    )
    hf_logprobs = hf_model.generate_greedy_logprobs(
        test_prompts,
        max_tokens=max_tokens,
    )

    # Batch has mixed sample params
    # (different logprobs/prompt logprobs combos)
    logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)

    # Ensure that each test prompt has a logprob config for testing
    logprob_prompt_logprob_list = _repeat_logprob_config(
        test_prompts, logprob_prompt_logprob_list
    )
    # Generate SamplingParams
    vllm_sampling_params = [
        SamplingParams(
            max_tokens=max_tokens,
            logprobs=num_lp,
            prompt_logprobs=num_plp,
            temperature=temperature,
            seed=1984,
        )
        for num_lp, num_plp in logprob_prompt_logprob_list
    ]

    # Run twice under APC so the second pass exercises cache hits.
    for _ in range(2 if do_apc else 1):
        _run_and_validate(
            vllm_model=vllm_model,
            test_prompts=test_prompts,
            vllm_sampling_params=vllm_sampling_params,
            hf_logprobs=hf_logprobs,
            hf_outputs=hf_outputs,
            logprob_prompt_logprob_list=logprob_prompt_logprob_list,
            temperature=temperature,
            max_tokens=max_tokens,
            do_apc=do_apc,
        )
def test_max_logprobs():
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`

    Should also fail for `prompt_logprobs > max_logprobs`

    APC should not matter as this test checks basic request validation.
    """
    # Use VllmRunner as a context manager (as the other tests in this file
    # do) so engine resources are released even if an assertion fails.
    with VllmRunner(
        "facebook/opt-125m",
        max_logprobs=1,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.15,
        max_model_len=256,
    ) as runner:
        vllm_sampling_params = SamplingParams(logprobs=1)
        # Requesting exactly max_logprobs should pass.
        runner.generate(["Hello world"], sampling_params=vllm_sampling_params)

        # Requesting more than max_logprobs must be rejected at validation.
        bad_sampling_params = SamplingParams(logprobs=2)
        with pytest.raises(ValueError):
            runner.generate(["Hello world"], sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`

    Args:
        vllm_model: vLLM model fixture
        example_prompts: list of example prompts (test fixture)
    """
    max_tokens = 5

    no_logprobs_params = SamplingParams(
        max_tokens=max_tokens,
        logprobs=None,
        prompt_logprobs=None,
        temperature=0.0,
    )
    results = vllm_model.llm.generate(
        example_prompts,
        sampling_params=no_logprobs_params,
    )

    for result in results:
        completion = result.outputs[0]
        # Neither sample logprobs nor a cumulative logprob should be present.
        assert completion.logprobs is None
        assert completion.cumulative_logprob is None
        # Prompt logprobs should likewise be absent.
        assert result.prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts):
    """Engine should return sampled token and prompt token logprobs

    Args:
        vllm_model: vLLM model fixture
        example_prompts: list of example prompts (test fixture)
    """
    max_tokens = 5

    zero_logprobs_params = SamplingParams(
        max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
    )
    results = vllm_model.llm.generate(
        example_prompts, sampling_params=zero_logprobs_params
    )

    for result in results:
        completion = result.outputs[0]
        # There should be one sample-logprob dict per sampled token.
        assert completion.logprobs is not None
        assert len(completion.token_ids) == len(completion.logprobs)
        assert completion.cumulative_logprob is not None
        # There should be one prompt-logprob entry per prompt token.
        assert result.prompt_logprobs is not None
        assert len(result.prompt_token_ids) == len(result.prompt_logprobs)
def test_all_logprobs(example_prompts):
    """Engine should return all vocabulary logprobs and prompt logprobs

    Args:
        example_prompts: list of example prompts (test fixture)
    """
    # Use VllmRunner as a context manager (as the other tests in this file
    # do) so engine resources are released even if an assertion fails.
    with VllmRunner(
        "facebook/opt-125m",
        max_logprobs=-1,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.15,
        max_model_len=256,
    ) as runner:
        # logprobs=-1 / prompt_logprobs=-1 requests the full vocabulary.
        sampling_params_logprobs_all = SamplingParams(
            max_tokens=5, logprobs=-1, prompt_logprobs=-1
        )
        results_logprobs_all = runner.llm.generate(
            example_prompts, sampling_params=sampling_params_logprobs_all
        )
        vocab_size = runner.llm.llm_engine.model_config.get_vocab_size()

        for i in range(len(results_logprobs_all)):
            logprobs = results_logprobs_all[i].outputs[0].logprobs
            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
            # Every sampled position must carry a full-vocab logprob dict.
            assert logprobs is not None
            for logprob in logprobs:
                assert len(logprob) == vocab_size
            # First prompt position has no logprobs; the rest are full-vocab.
            assert prompt_logprobs is not None
            assert prompt_logprobs[0] is None
            for prompt_logprob in prompt_logprobs[1:]:
                assert len(prompt_logprob) == vocab_size
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
def test_logprobs_mode(logprobs_mode: LogprobsMode):
    """Test with LLM engine with different logprobs_mode.

    For logprobs modes, all returned values should be non-positive.
    For logits modes, we expect at least one positive value.
    """
    from vllm import LLM

    llm = LLM(
        "facebook/opt-125m",
        max_logprobs=5,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.05,
        max_model_len=16,
        logprobs_mode=logprobs_mode,
    )
    vllm_sampling_params = SamplingParams(logprobs=1)
    results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)

    num_with_logprobs = 0
    num_positive = 0
    for completion in results[0].outputs:
        for token_logprobs in completion.logprobs:
            for entry in token_logprobs.values():
                # Logprob modes must never emit positive values.
                if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
                    assert entry.logprob <= 0
                if entry.logprob > 0:
                    num_positive += 1
                num_with_logprobs += 1
    # At least one logprob entry per completion position.
    assert num_with_logprobs >= len(results[0].outputs)
    # Logit modes are unnormalized, so positive values should appear.
    if logprobs_mode in ("raw_logits", "processed_logits"):
        assert num_positive > 0
    del llm
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
@pytest.mark.parametrize(
    "model_setup",
    [
        pytest.param(
            (
                "eagle",
                "meta-llama/Llama-3.2-1B-Instruct",
                "nm-testing/Llama3_2_1B_speculator.eagle3",
            ),
            marks=large_gpu_mark(min_gb=32),
        ),
    ],
)
@pytest.mark.parametrize("top_logprobs", [0, 3])
def test_spec_decode_logprobs(
    logprobs_mode: LogprobsMode,
    model_setup: tuple[str, str, str],
    top_logprobs: int,
):
    """Spec decode logprobs should match those of the base model.

    Runs the same greedy request once on the base model and once with
    speculative decoding enabled, then compares the per-token logprob
    entries (value, rank, decoded token) pairwise.

    Args:
        logprobs_mode: logprobs mode.
        model_setup: Spec decode method, base model name, and
            draft model name.
        top_logprobs: number of top logprobs to request per position.
    """
    from vllm import LLM

    prompt = "Hello world " * 50
    # temperature=0 keeps both runs deterministic so outputs are comparable.
    sampling_params = SamplingParams(
        temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
    )
    method, model_name, spec_model_name = model_setup
    max_model_len = 256

    # Run base LLM.
    ref_llm = LLM(
        model=model_name,
        max_logprobs=5,
        max_model_len=max_model_len,
        seed=42,
        logprobs_mode=logprobs_mode,
        gpu_memory_utilization=0.4,
    )
    ref_results = ref_llm.generate([prompt], sampling_params)
    # Collect logprobs outputs from reference LLM, flattened in
    # (position, token) order.
    ref_logprobs = []
    for output in ref_results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                ref_logprobs.append(logprobs[token_id])
    # Free GPU memory before starting the second engine.
    del ref_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    # Run spec decode LLM.
    spec_llm = LLM(
        model_name,
        speculative_config={
            "method": method,
            "model": spec_model_name,
            "num_speculative_tokens": 3,
            "max_model_len": max_model_len,
        },
        max_logprobs=5,
        max_model_len=max_model_len,
        seed=42,
        logprobs_mode=logprobs_mode,
        gpu_memory_utilization=0.4,
        # Force prefill chunking
        enable_chunked_prefill=True,
        max_num_batched_tokens=32,
    )
    spec_results = spec_llm.generate([prompt], sampling_params)
    # Collect logprobs outputs from spec decode LLM, same flattening order
    # as the reference run.
    spec_logprobs = []
    for output in spec_results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                spec_logprobs.append(logprobs[token_id])
    del spec_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    # Per-token logprobs are expected to be the same.
    assert len(ref_logprobs) == len(spec_logprobs)
    for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
        # Loose tolerances: kernels differ between the two runs.
        assert math.isclose(
            ref_logprob.logprob, spec_logprob.logprob, rel_tol=5e-2, abs_tol=1e-1
        )
        assert ref_logprob.rank == spec_logprob.rank
        assert ref_logprob.decoded_token == spec_logprob.decoded_token
def test_prompt_logprobs_with_chunking_and_preemption():
    """Test that prompt logprobs are correctly returned when using
    both chunked prefill and preemption.

    This test ensures that the num_prompt_logprobs tracking persists
    across preemptions and prefill chunks.
    """
    # Create prompts that will trigger chunking and preemption:
    # two longer prompts plus 32 short ones to oversubscribe KV blocks.
    prompts = [
        "The following numbers of the sequence "
        + ", ".join(str(i) for i in range(10))
        + " are:",
        "In one word, the capital of France is ",
    ] + [f"Tell me about the number {i}: " for i in range(32)]

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=40,
        # min_tokens keeps sequences alive long enough to be preempted.
        min_tokens=20,
        prompt_logprobs=2,  # Request prompt logprobs
    )

    with VllmRunner(
        "Qwen/Qwen3-0.6B",
        max_model_len=512,
        enable_chunked_prefill=True,
        max_num_batched_tokens=48,  # Force prefill chunking
        num_gpu_blocks_override=32,  # Force preemptions
        disable_log_stats=False,
        gpu_memory_utilization=0.25,
    ) as vllm_model:
        # Snapshot the preemption counter so we can measure the delta.
        metrics_before = vllm_model.llm.get_metrics()

        # Generate with prompt logprobs using generate_w_logprobs which
        # returns (output_ids, output_str, output_logprobs, prompt_logprobs)
        outputs = vllm_model.generate_w_logprobs(
            prompts, sampling_params=sampling_params, include_prompt_token_ids=True
        )

        # Verify that all outputs have prompt logprobs
        for i, output in enumerate(outputs):
            _, _, _, prompt_token_ids, prompt_logprobs = output
            assert prompt_logprobs is not None and len(prompt_logprobs) > 0, (
                f"Output {i} missing prompt logprobs"
            )
            assert len(prompt_logprobs) == len(prompt_token_ids), (
                "Unexpected number of prompt logprob positions"
            )

            # Each position should have the requested number of logprobs
            # (or one more when the prompt token itself fell outside top-K).
            for pos, logprobs_dict in enumerate(prompt_logprobs):
                if logprobs_dict is not None:  # First token may be None
                    assert (
                        sampling_params.prompt_logprobs
                        <= len(logprobs_dict)
                        <= sampling_params.prompt_logprobs + 1
                    ), (
                        f"Output {i} position {pos} has {len(logprobs_dict)} "
                        f"logprobs, expected {sampling_params.prompt_logprobs}"
                    )

        # Check that we actually had preemptions
        metrics_after = vllm_model.llm.get_metrics()
        preemptions_before = next(
            (m.value for m in metrics_before if m.name == "vllm:num_preemptions"), 0
        )
        preemptions_after = next(
            (m.value for m in metrics_after if m.name == "vllm:num_preemptions"), 0
        )
        preemptions = preemptions_after - preemptions_before
        assert preemptions > 0, "Test did not trigger any preemptions"
        print(f"Test passed with {preemptions} preemptions")

View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import lm_eval
from ...utils import RemoteOpenAIServer
# arc-easy uses prompt_logprobs=1, logprobs=1
TASK = "arc_easy"
FILTER = "acc_norm,none"
RTOL = 0.03
EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8" # noqa: E501
SERVER_ARGS = [
"--enforce_eager",
"--no_enable_prefix_caching",
"--gpu-memory-utilization=0.8",
]
NUM_CONCURRENT = 100
def test_prompt_logprobs_e2e():
    """Run lm-eval against the in-process vLLM model and check that the
    measured accuracy lands within tolerance of the expected value."""
    results = lm_eval.simple_evaluate(
        model="vllm", model_args=MODEL_ARGS, tasks=TASK, batch_size="auto"
    )
    measured_value = results["results"][TASK][FILTER]
    # |measured - expected| < RTOL is equivalent to the two-sided check.
    assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
        f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
    )
def test_prompt_logprobs_e2e_server():
    """Same accuracy check as test_prompt_logprobs_e2e, but going through
    the OpenAI-compatible completions endpoint of a spawned server."""
    with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"
        model_args = (
            f"model={MODEL},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
        )
        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )
        measured_value = results["results"][TASK][FILTER]
        # |measured - expected| < RTOL is equivalent to the two-sided check.
        assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
            f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
        )

View File

@@ -0,0 +1,781 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from unittest.mock import Mock
import pytest
import torch
import torch.nn.functional as F
from tests.v1.sample.utils import create_allowed_token_ids
from vllm.platforms import current_platform
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import PLACEHOLDER_TOKEN_ID, RejectionSampler
from vllm.v1.sample.sampler import Sampler, SamplerOutput
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
DEVICE = current_platform.device_type
@pytest.fixture
def rejection_sampler():
    """RejectionSampler wrapping a mocked main Sampler (raw_logprobs mode)."""
    sampler_stub = Mock(spec=Sampler)
    sampler_stub.logprobs_mode = "raw_logprobs"
    return RejectionSampler(sampler_stub)
def mock_sampler_output(
    rejection_sampler: RejectionSampler, bonus_token_ids: torch.Tensor
):
    """Make the mocked inner sampler return the given bonus token ids."""
    stub_output = SamplerOutput(
        sampled_token_ids=bonus_token_ids, logprobs_tensors=None
    )
    rejection_sampler.sampler.return_value = stub_output
def create_spec_decode_metadata(
    spec_tokens: list[list[int]], logits: torch.Tensor
) -> SpecDecodeMetadata:
    """Build dummy SpecDecodeMetadata whose target indices span `logits`."""
    meta = SpecDecodeMetadata.make_dummy(spec_tokens, device=logits.device)
    # Every row of `logits` is a target logit.
    meta.target_logits_indices = torch.arange(logits.shape[0])
    # Output bonus token ids are mocked, so the bonus logit indices should
    # be empty.
    meta.bonus_logits_indices = torch.empty(0, dtype=torch.int32)
    return meta
def create_logits_tensor(
    output_token_ids: list[list[int]],
    vocab_size: int = 100,
    token_idx_to_override: int | None = None,
) -> torch.Tensor:
    """Helper function to create logits tensor that
    will produce desired token ids on argmax.

    Args:
        output_token_ids: desired tokens per sequence; the last token of
            each sequence is the bonus token and gets no logit row.
        vocab_size: size of the vocabulary dimension.
        token_idx_to_override: optional vocab index set to 99.0 in every
            row (just below the 100.0 argmax winner).

    Returns:
        Float tensor of shape (total draft tokens, vocab_size) on DEVICE.
    """
    # Drop the trailing bonus token of each sequence.
    token_ids = [tokens[:-1] for tokens in output_token_ids]
    num_total_tokens = sum(len(tokens) for tokens in token_ids)
    logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
    start_loc = 0
    for tokens in token_ids:
        for j, token_id in enumerate(tokens):
            logits[start_loc + j, token_id] = 100.0
        start_loc += len(tokens)
    # Bug fix: compare against None explicitly. The old truthiness check
    # (`if token_idx_to_override:`) silently ignored an override of vocab
    # index 0, which is a valid token id.
    if token_idx_to_override is not None:
        logits[:, token_idx_to_override] = 99.0
    return logits
def create_sampling_metadata(
    all_greedy: bool,
    output_token_ids: list[list[int]] | None = None,
    prompt_token_ids: torch.Tensor | None = None,
    # NOTE(review): annotated as a tensor, but the default fill below is a
    # plain list — confirm the expected type against SamplingMetadata.
    spec_token_ids: torch.Tensor | None = None,
    temperature: torch.Tensor | None = None,
    top_k: torch.Tensor | None = None,
    top_p: torch.Tensor | None = None,
    generators: dict[int, Any] | None = None,
    frequency_penalties: list[float] | None = None,
    presence_penalties: list[float] | None = None,
    repetition_penalties: list[float] | None = None,
    bad_words_token_ids: dict[int, list[list[int]]] | None = None,
    allowed_token_ids_mask: torch.Tensor | None = None,
) -> SamplingMetadata:
    """Create a v1 sampling metadata object with all_greedy set
    to the given value. Either all greedy or all random sampling
    is used.

    Args:
        all_greedy: if True, temperature is forced to None (greedy);
            otherwise a per-request temperature tensor is required.
        output_token_ids: per-request generated tokens; required when any
            penalty list is supplied.
        frequency_penalties / presence_penalties / repetition_penalties:
            per-request penalty values; converted to tensors on DEVICE.
        Remaining arguments are passed through to SamplingMetadata as-is.

    Returns:
        A populated SamplingMetadata with max_num_logprobs=None and an
        empty LogitsProcessors.
    """
    generators = generators or {}
    if all_greedy:
        # Greedy sampling ignores temperature entirely.
        temperature = None
    else:
        assert temperature is not None

    if any([frequency_penalties, presence_penalties, repetition_penalties]):
        # Penalties require the output token history to be present.
        no_penalties = False
        assert output_token_ids
        assert len(output_token_ids) > 0
        # Rebind the penalty lists as tensors on the test device.
        frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE)
        presence_penalties = torch.tensor(presence_penalties, device=DEVICE)
        repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE)
    else:
        no_penalties = True
        frequency_penalties = torch.tensor([])
        presence_penalties = torch.tensor([])
        repetition_penalties = torch.tensor([])

    return SamplingMetadata(
        temperature=temperature,
        all_greedy=all_greedy,
        all_random=not all_greedy,
        top_p=top_p,
        top_k=top_k,
        generators=generators,
        max_num_logprobs=None,
        no_penalties=no_penalties,
        prompt_token_ids=prompt_token_ids,
        frequency_penalties=frequency_penalties,
        presence_penalties=presence_penalties,
        repetition_penalties=repetition_penalties,
        output_token_ids=[] if output_token_ids is None else output_token_ids,
        spec_token_ids=[] if spec_token_ids is None else spec_token_ids,
        allowed_token_ids_mask=allowed_token_ids_mask,
        bad_words_token_ids={} if bad_words_token_ids is None else bad_words_token_ids,
        logitsprocs=LogitsProcessors(),
    )
########################### Tests for Greedy Sampling ###################
def test_perfect_match(rejection_sampler):
    """Test when output tokens perfectly match speculated tokens"""
    spec_tokens = [[1, 2, 3]]
    output_tokens = [[1, 2, 3, 4]]  # 4 is the bonus token

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # All draft tokens accepted, bonus token appended.
    want = torch.tensor([[1, 2, 3, 4]], dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
def test_early_mismatch(rejection_sampler):
    """Test when there's an early mismatch in tokens"""
    spec_tokens = [[1, 2, 3]]
    output_tokens = [[1, 5, 3, 4]]  # Mismatch at position 1

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # Everything after the first rejection is a placeholder.
    want = torch.tensor(
        [[1, 5, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(result.sampled_token_ids, want)
def test_multiple_sequences(rejection_sampler):
    """Test handling multiple sequences of speculated tokens"""
    spec_tokens = [[1, 2], [3]]
    output_tokens = [[1, 2, 5], [3, 4]]  # Two sequences with bonus tokens 5 and 4

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor(
        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device
    )
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # Shorter sequence is right-padded with the placeholder id.
    want = torch.tensor(
        [[1, 2, 5], [3, 4, PLACEHOLDER_TOKEN_ID]], dtype=torch.int, device=logits.device
    )
    assert torch.equal(result.sampled_token_ids, want)
def test_single_token_sequence(rejection_sampler):
    """Test handling sequences with single token"""
    spec_tokens = [[1]]
    output_tokens = [[1, 2]]  # Single token with bonus token 2

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    want = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
def test_empty_sequence(rejection_sampler):
    """Test handling empty sequence of speculated tokens"""
    spec_tokens: list[list[int]] = [[]]
    output_tokens = [[5]]  # Just the bonus token

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # With no drafts, only the bonus token is emitted.
    want = torch.tensor([[5]], dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
def test_multiple_mismatches(rejection_sampler):
    """Test handling multiple sequences with mismatches"""
    spec_tokens = [[1, 2, 3], [4, 5, 6]]
    output_tokens = [[1, 2, 7, 6], [4, 8, 6, 9]]  # Mismatches in both sequences

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor(
        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device
    )
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # Each sequence is truncated at its first rejection and padded.
    want = torch.tensor(
        [
            [1, 2, 7, PLACEHOLDER_TOKEN_ID],
            [4, 8, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID],
        ],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(result.sampled_token_ids, want)
@pytest.mark.parametrize(
    "spec_tokens,output_tokens,expected",
    [
        ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]),  # Perfect match with bonus
        ([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]]),  # First mismatch
        (
            [[1, 2], [3, 4]],
            [[1, 5, 6], [3, 4, 7]],
            [[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]],
        ),  # Mixed matches
    ],
)
def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens, expected):
    """Parametrized test for various matching scenarios"""
    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor(
        [tokens[-1] for tokens in output_tokens], device=logits.device
    )
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    want = torch.tensor(expected, dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
########################### Tests for Random Sampling ###################
@pytest.mark.parametrize("k", [1, 3, 5])
@pytest.mark.parametrize("vocab_size", [1000])
@pytest.mark.parametrize("batch_size", [1, 4, 8])
@pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
@pytest.mark.parametrize("n_rep", [20])
def test_deterministic_when_seeded(
    rejection_sampler,
    k: int,
    vocab_size: int,
    batch_size: int,
    frac_seeded: float,
    n_rep: int,
):
    """Seeded requests must sample identically across repeated runs.

    A random subset (fraction ``frac_seeded``) of the batch gets a
    per-request generator seeded with its batch index. Sampling is
    repeated ``n_rep`` times with identical inputs, and each seeded
    request's tokens are compared against the first run.
    """
    num_tokens = batch_size * k
    draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE)
    draft_probs = F.softmax(draft_probs, dim=-1)
    target_logits = torch.rand_like(draft_probs)
    bonus_token_ids = torch.randint(
        low=0, high=vocab_size, size=(batch_size, 1), dtype=torch.int64, device=DEVICE
    )
    draft_token_ids = torch.randint(
        low=0, high=vocab_size, size=(batch_size, k), dtype=torch.int64, device=DEVICE
    )
    # Randomly choose which requests are seeded.
    seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
    results = []
    for _ in range(n_rep):
        # Fresh generators every repetition, re-seeded with the batch
        # index, so seeded requests should reproduce the same tokens.
        seeded_seqs = {
            i: torch.Generator(device=DEVICE).manual_seed(i)
            for i in range(batch_size)
            if seeded_mask[i]
        }
        temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
        sampling_metadata = create_sampling_metadata(
            all_greedy=False, temperature=temperature, generators=seeded_seqs
        )
        spec_decode_metadata = create_spec_decode_metadata(
            draft_token_ids.tolist(), target_logits
        )
        mock_sampler_output(rejection_sampler, bonus_token_ids)
        rep_result = rejection_sampler(
            spec_decode_metadata,
            draft_probs=None,
            logits=target_logits,
            sampling_metadata=sampling_metadata,
        )
        results.append(rep_result.sampled_token_ids)
    # Only seeded requests are required to be deterministic; unseeded
    # ones may legitimately differ between repetitions.
    for i in range(batch_size):
        if seeded_mask[i]:
            for j in range(1, n_rep):
                assert torch.equal(results[j][i], results[0][i])
def test_rejection_sampling_approximates_target_distribution():
    """Verify rejection sampling approximates target distribution,
    despite sampling from a potentially distinct draft distribution.
    This is done by first creating a random target probability
    distribution and a random draft probability distribution. We then
    sample token ids from the rejection sampler using these draft
    and target distributions. The samples are used to estimate
    the output probability distribution, which we expect to approximate
    the target distribution.
    A basic distance metric is used to determine similarity between
    distributions.
    We expect that as we increase the number of samples,
    the distance between the observed distribution and the target
    distribution decreases. To measure this, we compare the distance
    of the observed distribution against both the target distribution
    and a uniform random distribution. We expect the distance between
    the observed distribution and the target distribution to improve
    much more than the distance improvement between the observed
    distribution and the random distribution.
    """
    torch.set_default_device(DEVICE)
    vocab_size = 10
    k = 2
    num_reference_probs = 100
    # Prepare draft, target, and reference probability distributions
    draft_probs = F.softmax(torch.rand(vocab_size, dtype=torch.float32), dim=-1)
    target_logits = torch.rand(vocab_size, dtype=torch.float32)
    target_probs = F.softmax(target_logits, dim=-1)
    reference_probs = F.softmax(
        torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
        dim=-1,
    )
    sample_sizes = [10, 100, 1_000, 10_000, 100_000]
    distance_wrt_reference: list[float] = []
    distance_wrt_target: list[float] = []
    for num_samples in sample_sizes:
        # Sample using rejection sampling.
        rej_sample_probs = estimate_rejection_sampling_pdf(
            draft_probs, target_logits, k, vocab_size, num_samples
        )
        rej_sample_probs = rej_sample_probs.to(DEVICE)
        # Average distance from reference probs.
        reference_vs_rejsample_dist = (
            torch.dist(reference_probs, rej_sample_probs).item()
            / reference_probs.shape[0]
        )
        target_vs_rejsample_dist = torch.dist(target_probs, rej_sample_probs).item()
        distance_wrt_reference.append(reference_vs_rejsample_dist)
        distance_wrt_target.append(target_vs_rejsample_dist)
        # Recomputed on each iteration purely for progress logging; the
        # values used by the final assertion are computed after the loop.
        relative_change_in_distance_wrt_target = get_ratio_first_to_last(
            distance_wrt_target
        )
        relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
            distance_wrt_reference
        )
        print(
            f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
            f"{reference_vs_rejsample_dist=:.05f}"
        )
        print(
            f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
            f"{relative_change_in_distance_wrt_reference=:.02f}"
        )
    relative_change_in_distance_wrt_target = get_ratio_first_to_last(
        distance_wrt_target
    )
    relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
        distance_wrt_reference
    )
    # The estimated distribution must converge toward the target much
    # faster than toward an unrelated random reference distribution.
    expected_improvement_multiplier = 20
    assert (
        relative_change_in_distance_wrt_target
        > relative_change_in_distance_wrt_reference * expected_improvement_multiplier
    )
def get_ratio_first_to_last(elements: list[float]) -> float:
    """Return ``elements[0] / elements[-1]`` (improvement across a sweep)."""
    first, last = elements[0], elements[-1]
    return first / last
def estimate_rejection_sampling_pdf(
    draft_probs: torch.Tensor,
    target_logits: torch.Tensor,
    k: int,
    vocab_size: int,
    num_samples: int,
) -> torch.Tensor:
    """Estimate the probability distribution of the output tokens
    using rejection sampling.
    Args:
        draft_probs: Draft probability distribution over the vocabulary,
            shape (vocab_size,).
        target_logits: Target logits over the vocabulary, shape
            (vocab_size,).
        k: Number of speculative (draft) tokens per sample.
        vocab_size: Size of the vocabulary.
        num_samples: Number of samples to draw.
    Returns:
        Estimated probability distribution of the output tokens,
        shape (vocab_size,).
    """
    # Wrap a mocked base sampler so the rejection sampler can run
    # without a real model.
    mock_sampler = Mock(spec=Sampler)
    mock_sampler.logprobs_mode = "raw_logprobs"
    rejection_sampler = RejectionSampler(mock_sampler)
    num_tokens = num_samples * k
    # Repeat draft probs num_samples * k times.
    draft_probs = draft_probs.reshape(1, 1, vocab_size).repeat(num_samples, k, 1)
    # Repeat target probs num_tokens times.
    target_logits = target_logits.reshape(1, vocab_size).repeat(num_tokens, 1)
    # Randomly sample draft token ids from draft probs.
    draft_token_ids = torch.multinomial(
        draft_probs[:, 0, :], num_samples=k, replacement=True
    ).reshape(num_samples, k)
    draft_probs = draft_probs.view(num_tokens, vocab_size)
    # Bonus tokens not used but required.
    bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE).repeat(
        num_samples, 1
    )
    temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE)
    sampling_metadata = create_sampling_metadata(
        all_greedy=False, temperature=temperature
    )
    spec_decode_metadata = create_spec_decode_metadata(
        draft_token_ids.tolist(), target_logits
    )
    mock_sampler_output(rejection_sampler, bonus_token_ids)
    sampler_output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=draft_probs,
        logits=target_logits,
        sampling_metadata=sampling_metadata,
    )
    # Drop the bonus-token column, then histogram the remaining sampled
    # tokens to approximate the sampler's output distribution.
    output_token_ids = sampler_output.sampled_token_ids[:, :-1].flatten()
    hist = torch.histogram(
        output_token_ids.to(dtype=torch.float, device="cpu"),
        bins=vocab_size,
        range=(0, vocab_size),
        density=True,
    )
    return hist.hist
def _test_masked_logits(
    rejection_sampler,
    batch_size: int,
    num_draft_tokens: int,
    vocab_size: int,
    target_logits: torch.Tensor,
    unmasked_indices: torch.Tensor,
    sampling_metadata: SamplingMetadata,
):
    """Run rejection sampling and check every accepted token was unmasked.

    ``unmasked_indices[i]`` lists the token ids allowed to survive the
    logits mask at flattened draft position ``i``.
    """
    num_tokens = batch_size * num_draft_tokens

    # Random, normalized draft distribution over the vocabulary.
    draft_probs = F.softmax(
        torch.rand((num_tokens, vocab_size), dtype=torch.float32, device=DEVICE),
        dim=-1,
    )

    # Draw one draft token per position from that distribution.
    draft_token_ids = (
        torch.multinomial(draft_probs, num_samples=1)
        .reshape(batch_size, num_draft_tokens)
        .tolist()
    )

    # Bonus tokens are required by the API but irrelevant to this check.
    bonus_token_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=DEVICE)

    spec_decode_metadata = create_spec_decode_metadata(draft_token_ids, target_logits)
    mock_sampler_output(rejection_sampler, bonus_token_ids)
    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=draft_probs,
        logits=target_logits,
        sampling_metadata=sampling_metadata,
    )

    # Drop the bonus column; every non-placeholder token must be allowed.
    sampled = output.sampled_token_ids[:, :-1].flatten().tolist()
    for pos, token_id in enumerate(sampled):
        if token_id != PLACEHOLDER_TOKEN_ID:
            assert token_id in unmasked_indices[pos]
@pytest.mark.parametrize("top_k", [1, 5, 99])
def test_top_k(rejection_sampler, top_k):
    """Rejection sampling must respect the per-request top-k mask."""
    vocab_size = 100
    batch_size = 100
    num_draft_tokens = 3
    num_tokens = batch_size * num_draft_tokens

    # Pick a random top-k token set for every draft position.
    top_k_indices = torch.stack(
        [torch.randperm(vocab_size, device=DEVICE)[:top_k] for _ in range(num_tokens)]
    )

    # Uniform logits with the top-k entries nudged slightly higher. If the
    # mask works, non-top-k tokens must never be sampled despite the tiny
    # logit difference.
    target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE)
    for row in range(num_tokens):
        target_logits[row, top_k_indices[row]] += 0.1

    temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
    sampling_metadata = create_sampling_metadata(
        all_greedy=False,
        temperature=temperature,
        top_k=torch.tensor([top_k] * batch_size, device=DEVICE, dtype=torch.int64),
    )

    _test_masked_logits(
        rejection_sampler,
        batch_size=batch_size,
        num_draft_tokens=num_draft_tokens,
        vocab_size=vocab_size,
        target_logits=target_logits,
        unmasked_indices=top_k_indices,
        sampling_metadata=sampling_metadata,
    )
@pytest.mark.parametrize("top_p", [0.5, 0.9, 0.99])
def test_top_p(rejection_sampler, top_p):
    """Test rejection sampling with top-p sampling"""
    vocab_size = 100
    batch_size = 100
    num_draft_tokens = 3
    num_tokens = batch_size * num_draft_tokens
    # Create random logits.
    target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE)
    temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
    rescaled_logits = target_logits / temperature
    # Sort ascending so the cumulative sum runs over the low-probability
    # tail first: entries whose tail mass is still <= 1 - top_p lie
    # outside the top-p nucleus and must be masked.
    logits_sort, logits_idx = rescaled_logits.sort(dim=-1, descending=False)
    probs_sort = logits_sort.softmax(dim=-1)
    probs_sum = probs_sort.cumsum(dim=-1)
    top_p_mask = probs_sum <= 1 - top_p
    # At least one token (the highest-probability one) always survives.
    top_p_mask[:, -1] = False
    # Get the top-p indices.
    top_p_indices = []
    for i in range(num_tokens):
        top_p_indices.append(logits_idx[i][~top_p_mask[i]].tolist())
    # Create sampling metadata
    sampling_metadata = create_sampling_metadata(
        all_greedy=False,
        temperature=temperature,
        top_p=torch.tensor([top_p] * batch_size, device=DEVICE, dtype=torch.float32),
    )
    _test_masked_logits(
        rejection_sampler,
        batch_size=batch_size,
        num_draft_tokens=num_draft_tokens,
        vocab_size=vocab_size,
        target_logits=target_logits,
        unmasked_indices=top_p_indices,
        sampling_metadata=sampling_metadata,
    )
########################### Tests for Logit Processors ###################
def test_frequency_penalties(rejection_sampler):
    """Rejection sampling with frequency penalties applied."""
    spec_tokens = [[1, 1, 1], [], [1, 1, 1]]
    # The last token of each row is the bonus token (1, 7 and 1).
    output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]]
    num_requests = len(spec_tokens)

    # Rejected positions fall back to token 15.
    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
    metadata = create_sampling_metadata(
        all_greedy=True,
        output_token_ids=[[2], [3], [4]],
        spec_token_ids=spec_tokens,
        prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE),
        frequency_penalties=[1.5, 1.5, 0.7],
        presence_penalties=[0.0] * num_requests,
        repetition_penalties=[1.0] * num_requests,
    )
    bonus_token_tensor = torch.tensor(
        [seq[-1] for seq in output_tokens], device=logits.device
    )
    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
        spec_tokens, device=logits.device
    )
    mock_sampler_output(rejection_sampler, bonus_token_tensor)

    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=None,
        logits=logits,
        sampling_metadata=metadata,
    )

    expected = torch.tensor(
        [[1, 15, -1, -1], [7, -1, -1, -1], [1, 1, 15, -1]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(output.sampled_token_ids, expected)
def test_bad_words(rejection_sampler):
    """Rejection sampling with bad-words constraints applied."""
    spec_tokens = [[1, 2, 3], [1, 15, 3], [1, 2, 3]]
    output_tokens = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]

    # Rejected positions fall back to token 15.
    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
    metadata = create_sampling_metadata(
        all_greedy=True,
        output_token_ids=[[2], [3], [4]],
        spec_token_ids=spec_tokens,
        # Token 2 is banned for requests 0 and 1; request 2 has no
        # bad words at all.
        bad_words_token_ids={0: [[2]], 1: [[2]]},
    )
    bonus_token_tensor = torch.tensor(
        [seq[-1] for seq in output_tokens], device=logits.device
    )
    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
    mock_sampler_output(rejection_sampler, bonus_token_tensor)

    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=None,
        logits=logits,
        sampling_metadata=metadata,
    )

    expected = torch.tensor(
        [[1, 15, -1, -1], [1, 15, 3, 4], [1, 2, 3, 4]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(output.sampled_token_ids, expected)
def test_allowed_token_ids(rejection_sampler):
    """Rejection sampling with an allowed-token-ids mask applied."""
    spec_tokens = [[1, 2, 10], [10, 5, 3], [7, 10, 12]]
    output_tokens = [[1, 2, 10, 5], [10, 5, 10, 5], [7, 10, 12, 5]]
    # Disallowed token ranges per request:
    #   request 0: ids 0-4, request 1: ids 1-5, request 2: ids 2-6.
    num_allowed_token_ids = 5

    # Rejected positions fall back to token 15.
    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
    batch_size = len(output_tokens)
    _, vocab_size = logits.size()
    mask = create_allowed_token_ids(
        batch_size=batch_size,
        vocab_size=vocab_size,
        num_allowed_token_ids=num_allowed_token_ids,
        device=logits.device,
    )
    metadata = create_sampling_metadata(
        all_greedy=True,
        output_token_ids=[[], [], []],
        spec_token_ids=spec_tokens,
        allowed_token_ids_mask=mask,
    )
    bonus_token_tensor = torch.tensor(
        [seq[-1] for seq in output_tokens], device=logits.device
    )
    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
    mock_sampler_output(rejection_sampler, bonus_token_tensor)

    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=None,
        logits=logits,
        sampling_metadata=metadata,
    )

    expected = torch.tensor(
        [[15, -1, -1, -1], [10, 5, 10, -1], [7, 10, 12, 5]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(output.sampled_token_ids, expected)

View File

@@ -0,0 +1,449 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import pytest
import torch
from tests.v1.sample.utils import create_allowed_token_ids
from vllm.platforms import current_platform
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.sampler import Sampler
# Whether pinned host memory is usable on this platform.
PIN_MEMORY_AVAILABLE = is_pin_memory_available()
MAX_NUM_REQS = 256
VOCAB_SIZE = 1024
# Number of previously generated tokens per request in the fake metadata.
NUM_OUTPUT_TOKENS = 20
# One device when a single accelerator is visible, otherwise two.
CUDA_DEVICES = [
    f"{current_platform.device_type}:{i}"
    for i in range(1 if current_platform.device_count() == 1 else 2)
]
# Prompt lengths are sampled from [1, MAX_NUM_PROMPT_TOKENS).
MAX_NUM_PROMPT_TOKENS = 64
def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
return fake_logits
def _create_penalty_tensor(
batch_size: int, penalty_value: float, device: torch.device
) -> torch.Tensor:
return torch.full(
(batch_size,), fill_value=penalty_value, dtype=torch.float, device=device
)
def _create_prompt_tokens_tensor(
    prompt_token_ids: list[list[int]],
    vocab_size: int,
    device: torch.device,
) -> torch.Tensor:
    """Pad ragged prompt lists into a single int64 tensor.

    ``vocab_size`` is used as the padding value, so padding can never
    collide with a real token id.
    """
    return make_tensor_with_pad(
        prompt_token_ids,
        pad=vocab_size,
        device=device,
        dtype=torch.int64,
        pin_memory=False,
    )
def _create_bad_words_token_ids(
batch_size: int,
vocab_size: int,
bad_words_lengths: tuple[int, ...],
) -> dict[int, list[list[int]]]:
bad_words_token_ids = {}
for batch_idx in range(batch_size):
token_ids_single_batch = []
for bad_words_length in bad_words_lengths:
token_ids = np.random.choice(
vocab_size, size=bad_words_length, replace=True
).tolist()
token_ids_single_batch.append(token_ids)
bad_words_token_ids[batch_idx] = token_ids_single_batch
if batch_size >= 2:
# Test no bad_words for some batch
no_bad_words_batch_idx = np.random.choice(batch_size)
bad_words_token_ids.pop(no_bad_words_batch_idx, None)
return bad_words_token_ids
# Returns all last tokens of bad word sequences that share the same prefix
# as `given_prefix` (excluding the last token).
def _collect_suffixes_with_same_prefix(
given_prefix: list[int], bad_words_token_ids: list[list[int]]
) -> list[int]:
return [bwt[-1] for bwt in bad_words_token_ids if bwt[:-1] == given_prefix]
# generate a valid token id that is not in bad_words_token_ids
def _generate_valid_token_id(
bad_words_token_ids: list[list[int]], vocab_size: int
) -> int:
forbidden_start_tokens = set()
for bad_word in bad_words_token_ids:
forbidden_start_tokens.add(bad_word[0])
# Get a safe token that's not in forbidden starts
safe_token_candidates = list(set(range(vocab_size)) - forbidden_start_tokens)
# Pick a random safe token
return np.random.choice(safe_token_candidates)
def _update_output_token_ids_for_bad_words(
    metadata: SamplingMetadata, vocab_size: int
) -> dict[int, list[int]]:
    """Mutate ``metadata.output_token_ids`` to control bad-word matches.

    For each request, either plant the prefix of one multi-token bad word
    at the end of the output (so its final token must then be banned), or
    overwrite the last output token to rule out accidental prefix matches.
    Returns, per request, the token ids the sampler is expected to mask.
    """
    bad_words_last_tokens = {}
    for batch_idx, bad_words_token_ids in metadata.bad_words_token_ids.items():
        output_token_ids = metadata.output_token_ids[batch_idx]
        bad_words_last_token: list[int] = []
        for i, bad_word_token_ids in enumerate(bad_words_token_ids):
            if len(bad_word_token_ids) == 1:
                # Single token id always affects logits
                bad_words_last_token.append(bad_word_token_ids[0])
            else:
                prefix_length = len(bad_word_token_ids) - 1
                # Randomly decide whether this request's output should end
                # with the bad word's prefix.
                has_bad_words = np.random.choice([True, False])
                if has_bad_words:
                    prefix = bad_word_token_ids[:-1]
                    output_token_ids[-prefix_length:] = prefix
                    # Collect all last tokens from other bad words
                    # that share this prefix
                    bad_words_last_token.extend(
                        _collect_suffixes_with_same_prefix(prefix, bad_words_token_ids)
                    )
                    break  # Maximum one update to output_token_ids
                else:  # Make sure no accidental match to bad words
                    output_token_ids[-1] = _generate_valid_token_id(
                        bad_words_token_ids, vocab_size
                    )
        bad_words_last_tokens[batch_idx] = bad_words_last_token
    return bad_words_last_tokens
def _create_default_sampling_metadata(
    num_output_tokens: int,
    batch_size: int,
    vocab_size: int,
    device: torch.device,
) -> SamplingMetadata:
    """Build greedy SamplingMetadata with random prompts and outputs.

    Penalties start at their no-op values (0.0 / 0.0 / 1.0) with
    ``no_penalties=True``; individual tests flip fields afterwards.
    """
    output_token_ids: list[list[int]] = []
    prompt_token_ids: list[list[int]] = []
    for _ in range(batch_size):
        output_token_ids.append(
            np.random.randint(0, vocab_size, size=num_output_tokens).tolist()
        )
        # Prompt length varies per request in [1, MAX_NUM_PROMPT_TOKENS).
        prompt_token_ids.append(
            np.random.randint(
                0, vocab_size, size=np.random.randint(1, MAX_NUM_PROMPT_TOKENS)
            ).tolist()
        )
    fake_sampling_metadata = SamplingMetadata(
        temperature=torch.full((batch_size,), 0.0),
        all_greedy=True,
        all_random=False,
        top_p=None,
        top_k=None,
        generators={},
        max_num_logprobs=0,
        prompt_token_ids=_create_prompt_tokens_tensor(
            prompt_token_ids, vocab_size, device
        ),
        output_token_ids=output_token_ids,
        spec_token_ids=[[] for _ in range(batch_size)],
        frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
        presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
        repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
        no_penalties=True,
        allowed_token_ids_mask=None,
        bad_words_token_ids={},
        logitsprocs=LogitsProcessors(),
    )
    return fake_sampling_metadata
def _create_weighted_output_token_list(
batch_size: int, vocab_size: int
) -> tuple[list[list[int]], list[list[int]]]:
"""
Creates an output token list where each token occurs a distinct
number of times.
For each batch, a random subset of token IDs is selected from the
vocabulary. The selected tokens are then added to the output token
list, each with a different frequency.
Returns:
tuple[list[list[int]], list[list[int]]]:
- The first element is the output token list, where each sublist
corresponds to a batch and contains tokens with weighted
frequencies.
- The second element is a list of distinct token IDs for each
batch, ordered by their frequency in the corresponding output
list.
"""
output_token_ids: list[list[int]] = []
sorted_token_ids_in_output: list[list[int]] = []
for _ in range(batch_size):
distinct_token_ids = np.random.choice(
vocab_size, size=np.random.randint(1, 10), replace=False
).tolist()
sorted_token_ids_in_output.append(distinct_token_ids)
output_token_ids_for_batch = []
for index, token_id in enumerate(distinct_token_ids):
output_token_ids_for_batch.extend([token_id for _ in range(index + 1)])
output_token_ids.append(output_token_ids_for_batch)
return output_token_ids, sorted_token_ids_in_output
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
def test_sampler_presence_penalty(
    device: str, batch_size: int, presence_penalty: float
):
    """Presence penalty must demote tokens already present in the output
    (or promote them, for a negative penalty) relative to unseen tokens.
    """
    torch.set_default_device(device)
    # Uniform starting logits: any ordering after penalization is
    # attributable to the penalty alone.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    output_token_ids = sampling_metadata.output_token_ids
    sampling_metadata.presence_penalties = _create_penalty_tensor(
        batch_size, presence_penalty, torch.device(device)
    )
    sampling_metadata.no_penalties = False

    sampler = Sampler()
    logits = sampler.apply_penalties(
        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
    ).cpu()

    for batch_idx in range(batch_size):
        # With uniform starting logits, argmax is a non-penalized token
        # and argmin a penalized one.
        non_penalized_token_id = logits[batch_idx].argmax().item()
        penalized_token_id = logits[batch_idx].argmin().item()
        seen = output_token_ids[batch_idx]
        if presence_penalty > 0:
            # Positive penalty: previously seen tokens are pushed down.
            assert penalized_token_id in seen
            assert non_penalized_token_id not in seen
        elif presence_penalty < 0:
            # Negative penalty: previously seen tokens are boosted.
            assert non_penalized_token_id in seen
            assert penalized_token_id not in seen
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
def test_sampler_frequency_penalty(
    device: str, batch_size: int, frequency_penalty: float
):
    """Frequency penalty must scale with how often a token has occurred."""
    torch.set_default_device(device)
    # Uniform starting logits so the penalty is the only differentiator.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    sampling_metadata.frequency_penalties = _create_penalty_tensor(
        batch_size, frequency_penalty, torch.device(device)
    )
    # Each request gets distinct tokens whose occurrence counts strictly
    # increase along `sorted_token_ids_in_output`.
    output_token_ids, sorted_token_ids_in_output = _create_weighted_output_token_list(
        batch_size,
        VOCAB_SIZE,
    )
    sampling_metadata.output_token_ids = output_token_ids
    sampling_metadata.no_penalties = False

    sampler = Sampler()
    logits = sampler.apply_penalties(
        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
    ).cpu()

    for batch_idx in range(batch_size):
        non_penalized_token_id = logits[batch_idx].argmax().item()
        penalized_token_id = logits[batch_idx].argmin().item()
        distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[batch_idx]
        most_frequent_token_id = distinct_sorted_token_ids_in_output[-1]
        if frequency_penalty > 0:
            # New tokens preferred: the most frequent token is hit hardest
            # and the top token must be unseen.
            assert non_penalized_token_id not in distinct_sorted_token_ids_in_output
            assert penalized_token_id == most_frequent_token_id
        elif frequency_penalty < 0:
            # Existing tokens preferred: the most frequent token wins and
            # the bottom token must be unseen.
            assert non_penalized_token_id == most_frequent_token_id
            assert penalized_token_id not in distinct_sorted_token_ids_in_output
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
def test_sampler_repetition_penalty(
    device: str, batch_size: int, repetition_penalty: float
):
    """Repetition penalty must act on tokens seen in the prompt or output."""
    torch.set_default_device(device)
    # Uniform starting logits so the penalty is the only differentiator.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    sampling_metadata.repetition_penalties = _create_penalty_tensor(
        batch_size, repetition_penalty, torch.device(device)
    )
    sampling_metadata.no_penalties = False

    sampler = Sampler()
    logits = sampler.apply_penalties(
        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
    ).cpu()

    for batch_idx in range(batch_size):
        non_penalized_token_id = logits[batch_idx].argmax().item()
        penalized_token_id = logits[batch_idx].argmin().item()
        prompt_tokens = sampling_metadata.prompt_token_ids[batch_idx][:].tolist()
        output_tokens = sampling_metadata.output_token_ids[batch_idx]
        seen = set(prompt_tokens) | set(output_tokens)
        if repetition_penalty > 1.0:
            # Penalty > 1 discourages repetition: the winner is unseen
            # and the loser appeared in the prompt or output.
            assert non_penalized_token_id not in seen
            assert penalized_token_id in seen
        elif repetition_penalty < 1.0:
            # Penalty < 1 encourages repetition: the roles are reversed.
            assert penalized_token_id not in seen
            assert non_penalized_token_id in seen
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2])
def test_sampler_allowed_token_ids(
    device: str, batch_size: int, num_allowed_token_ids: int
):
    """
    Test to verify that the allowed-token-ids mask sets masked tokens'
    logits to -inf and leaves every other logit untouched.
    """
    torch.set_default_device(device)
    # Create fake logits where each token is assigned the same
    # logit value.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    mask = create_allowed_token_ids(
        batch_size=batch_size,
        vocab_size=VOCAB_SIZE,
        num_allowed_token_ids=num_allowed_token_ids,
        device=device,
    )
    sampling_metadata.allowed_token_ids_mask = mask
    sampler = Sampler()
    logits = sampler.apply_logits_processors(
        fake_logits, sampling_metadata, predict_bonus_token=False
    )
    logits = logits.cpu()
    for batch_idx in range(batch_size):
        logits_for_req = logits[batch_idx]
        if batch_idx % 2 == 1:
            # NOTE(review): this assumes create_allowed_token_ids leaves
            # odd-indexed requests unmasked — confirm against its
            # definition in tests/v1/sample/utils.py.
            assert torch.all(logits_for_req != -float("inf"))
            continue
        for token_id in range(VOCAB_SIZE):
            # Even-indexed request i is expected to mask token ids in
            # [i, i + num_allowed_token_ids), clamped to the vocab range.
            start = min(batch_idx, VOCAB_SIZE - 1)
            end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1)
            if token_id >= start and token_id < end:
                assert logits_for_req[token_id] == -float("inf"), (
                    f"{batch_idx}, {token_id}"
                )
            else:
                assert logits_for_req[token_id] != -float("inf")
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("bad_words_lengths", [(1,), (1, 3), (2, 2)])
def test_sampler_bad_words(
    device: str, batch_size: int, bad_words_lengths: tuple[int, ...]
):
    """
    Test to verify that when the bad words restriction is present, tokens
    are penalized based on their match with the bad words.
    """
    torch.set_default_device(device)
    # Create fake logits where each token is assigned the same
    # logit value.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    sampling_metadata.bad_words_token_ids = _create_bad_words_token_ids(
        batch_size, VOCAB_SIZE, bad_words_lengths
    )
    # Plant bad-word prefixes in the outputs and collect, per request,
    # the token ids whose logits must be masked to -inf.
    bad_words_last_tokens = _update_output_token_ids_for_bad_words(
        sampling_metadata, VOCAB_SIZE
    )
    sampler = Sampler()
    logits = sampler.apply_logits_processors(
        fake_logits, sampling_metadata, predict_bonus_token=False
    )
    logits = logits.cpu()
    for batch_idx in range(batch_size):
        logits_for_req = logits[batch_idx]
        for token_id in range(VOCAB_SIZE):
            # Only the expected continuation tokens of matched bad words
            # may be masked; everything else must stay finite.
            if (
                batch_idx in bad_words_last_tokens
                and token_id in bad_words_last_tokens[batch_idx]
            ):
                assert logits_for_req[token_id] == -float("inf")
            else:
                assert logits_for_req[token_id] != -float("inf")

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import LLM, SamplingParams
# Tiny random model keeps these smoke tests fast; output quality is
# irrelevant — only sampling-parameter plumbing is under test.
MODEL = "hmellor/tiny-random-LlamaForCausalLM"
PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def llm() -> LLM:
    # One engine shared by the whole module; eager mode skips compilation.
    return LLM(MODEL, enforce_eager=True)
def test_n_gt_1(llm):
    """Parallel sampling (n > 1) yields one completion per sample."""
    sampling_params = SamplingParams(n=3)
    outputs = llm.generate(PROMPT, sampling_params)
    completions = outputs[0].outputs
    assert len(completions) == 3
def test_penalties(llm):
    """A request combining every penalty/filtering knob must not raise."""
    sampling_params = SamplingParams(
        temperature=1.2,
        presence_penalty=1.2,
        frequency_penalty=1.2,
        repetition_penalty=1.2,
        min_p=0.5,
        top_p=0.5,
        top_k=3,
    )
    llm.generate(PROMPT, sampling_params)
def test_stop(llm):
    """Generation must halt at a stop word, with or without keeping it."""
    STOP_IDX = 5
    # Generate once unconstrained to discover a word we can stop on.
    baseline = llm.generate(PROMPT, SamplingParams(temperature=0))
    words = baseline[0].outputs[0].text.split()

    # By default the stop string itself is excluded from the output.
    stopped = llm.generate(
        PROMPT, SamplingParams(temperature=0, stop=words[STOP_IDX])
    )
    assert len(stopped[0].outputs[0].text.split()) == STOP_IDX

    # With include_stop_str_in_output the stop word is kept.
    stopped = llm.generate(
        PROMPT,
        SamplingParams(
            temperature=0, stop=words[STOP_IDX], include_stop_str_in_output=True
        ),
    )
    assert len(stopped[0].outputs[0].text.split()) == STOP_IDX + 1
def test_stop_token_ids(llm):
    """Generation must halt on whichever stop token id is produced first."""
    baseline = llm.generate(PROMPT, SamplingParams(temperature=0))
    token_ids = baseline[0].outputs[0].token_ids
    stop_token_id_0 = token_ids[5]
    stop_token_id_1 = token_ids[6]

    # Regardless of the order the stop ids are listed in, generation must
    # end at the id produced first (position 5 precedes position 6).
    for stop_token_ids in (
        [stop_token_id_1, stop_token_id_0],
        [stop_token_id_0, stop_token_id_1],
    ):
        params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
        output = llm.generate(PROMPT, params)
        assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
def test_detokenize_false(llm):
    """Check that detokenize=False suppresses text while keeping token ids."""
    plain = llm.generate(PROMPT, SamplingParams(detokenize=False))
    assert len(plain[0].outputs[0].token_ids) > 0
    assert len(plain[0].outputs[0].text) == 0

    with_logprobs = llm.generate(
        PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)
    )
    completion = with_logprobs[0].outputs[0]
    assert len(completion.token_ids) > 0
    assert len(completion.text) == 0

    prompt_lps = with_logprobs[0].prompt_logprobs
    sample_lps = completion.logprobs
    assert len(prompt_lps) > 1
    assert len(sample_lps) > 1
    # The first prompt position's entry is skipped, as in the len check above.
    for per_token in (*prompt_lps[1:], *sample_lps):
        # logprobs=3 yields the top-3 plus possibly the chosen token itself.
        assert 3 <= len(per_token) <= 4
        # detokenize=False must leave every logprob token undecoded.
        assert all(entry.decoded_token is None for entry in per_token.values())
def test_bad_words(llm):
    """Check that we respect bad words."""
    tokenizer = llm.get_tokenizer()

    def appears_in_output(text: str, token_ids: list[int], word: str) -> bool:
        """True iff `word` shows up in the text AND as a token subsequence."""
        if word not in text:
            return False
        # Try both the bare encoding and the leading-space encoding, since
        # tokenizers may emit different ids depending on preceding whitespace.
        for with_space in (False, True):
            prefix = " " if with_space else ""
            encoded = tokenizer.encode(
                prefix + word.lstrip(), add_special_tokens=False
            )
            if not encoded:
                continue
            span = len(encoded)
            if any(
                token_ids[start : start + span] == encoded
                for start in range(len(token_ids) - span + 1)
            ):
                return True
        return False

    baseline = llm.generate(PROMPT, SamplingParams(temperature=0))
    leading_words = baseline[0].outputs[0].text.split()
    banned_phrase = " ".join(leading_words[:2])

    out = llm.generate(
        PROMPT, SamplingParams(temperature=0, bad_words=[banned_phrase])
    )
    text = out[0].outputs[0].text
    tokens = out[0].outputs[0].token_ids
    assert not appears_in_output(text, tokens, banned_phrase)

    # Also ban the last word of the new output and regenerate.
    banned_word = text.split()[-1]
    out = llm.generate(
        PROMPT,
        SamplingParams(temperature=0, bad_words=[banned_phrase, banned_word]),
    )
    text = out[0].outputs[0].text
    tokens = out[0].outputs[0].token_ids
    assert not appears_in_output(text, tokens, banned_phrase)
    assert not appears_in_output(text, tokens, banned_word)
def test_logits_processor(llm):
    """Check that per-request logits processors are rejected."""

    # Sample processor that would force token i at step i (output would be
    # [0, 1, 2, ...]); the engine is expected to refuse it with ValueError.
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    with pytest.raises(ValueError):
        llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
def test_allowed_token_ids(llm):
    """Check that we can use allowed_token_ids."""
    TOKEN_ID = 10
    out = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[TOKEN_ID]))
    assert out[0].outputs[0].token_ids[-1] == TOKEN_ID

    # Empty, negative, and out-of-vocabulary id lists are all rejected.
    for invalid_ids in ([], [-1], [10000000]):
        with pytest.raises(ValueError):
            llm.generate(PROMPT, SamplingParams(allowed_token_ids=invalid_ids))
def test_seed(llm):
    """Check that seed impacts randomness."""
    first = llm.generate(PROMPT, SamplingParams(seed=42))[0].outputs[0].text
    repeat = llm.generate(PROMPT, SamplingParams(seed=42))[0].outputs[0].text
    other = llm.generate(PROMPT, SamplingParams(seed=43))[0].outputs[0].text
    # Same seed reproduces the sample; a different seed should diverge.
    assert first == repeat
    assert first != other

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from torch import Generator
from vllm.platforms import current_platform
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
DEVICE = current_platform.device_type
BATCH_SIZE = 1024
VOCAB_SIZE = 128 * 1024
@pytest.fixture(autouse=True)
def reset_default_device():
    """Restore torch's default device after each test.

    Tests in this file call torch.set_default_device, which would otherwise
    leak into subsequent tests.
    """
    saved_device = torch.get_default_device()
    yield
    torch.set_default_device(saved_device)
def test_topk_impl_equivalence():
    """Top-k-only path must match the fused top-k + no-op top-p path."""
    torch.set_default_device(DEVICE)
    rng = Generator(device=DEVICE).manual_seed(33)
    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=rng)

    # Random top-k in [1, 9], with ~half the batch disabled (k == vocab size).
    k = torch.randint(1, 10, (BATCH_SIZE,), generator=rng)
    disabled = torch.randint(0, 2, (BATCH_SIZE,), generator=rng, dtype=bool)
    k = k.masked_fill(disabled, VOCAB_SIZE)

    # p=None exercises the top-k-only implementation; p=1.0 is a no-op top-p.
    topk_only = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
    fused = apply_top_k_top_p(logits=logits.clone(), k=k, p=torch.tensor([1.0]))
    assert torch.allclose(topk_only, fused)
def test_flashinfer_sampler():
    """
    Verify that FlashInfer's top-k and top-p renormalization matches the
    Python implementation.

    NOTE: FlashInfer did not directly expose an interface for fused top-k and
    top-p prob renorm (it did provide fused sampling but we cannot compare
    sampling results due to randomness), so we compare the probabilities
    renormed consecutively by top-k and then top-p of the FlashInfer
    implementation.
    """
    try:
        from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
    except ImportError:
        pytest.skip("FlashInfer not installed or not available on this platform.")
    if not current_platform.is_cuda():
        pytest.skip("FlashInfer not installed or not available on this platform.")

    torch.set_default_device(DEVICE)
    rng = Generator(device=DEVICE).manual_seed(42)

    # Random logits plus per-request k in [1, 1000) and p in [0.5, 1.0].
    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=rng)
    k = torch.randint(1, 1000, (BATCH_SIZE,), generator=rng)
    p = torch.rand((BATCH_SIZE,), generator=rng) * 0.5 + 0.5

    # Randomly disable top-k (k == vocab size) and top-p (p == 1.0) so both
    # the filtered and the pass-through paths are exercised.
    k.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=rng, dtype=torch.bool),
        VOCAB_SIZE,
    )
    p.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=rng, dtype=torch.bool), 1.0
    )

    # Reference: Python top-k/top-p on logits, then softmax to probabilities.
    reference_probs = torch.softmax(
        apply_top_k_top_p(logits=logits.clone(), k=k, p=p), dim=-1
    )

    # FlashInfer renorms probabilities, so convert to probs first, then apply
    # its top-k renorm followed by its top-p renorm.
    fi_probs = torch.softmax(logits.clone(), dim=-1)
    fi_probs = top_k_renorm_probs(probs=fi_probs, top_k=k)
    fi_probs = top_p_renorm_probs(probs=fi_probs, top_p=p)

    assert torch.allclose(reference_probs, fi_probs, atol=2e-2), (
        "FlashInfer and Python sampling implementations do not match!"
    )

237
tests/v1/sample/utils.py Normal file
View File

@@ -0,0 +1,237 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterator
from enum import Enum
from typing import NamedTuple
import regex as re
import torch
from vllm import CompletionOutput
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor
from vllm.v1.sample.metadata import SamplingMetadata
class BatchLogprobsComposition(Enum):
    """Types of logprobs configs to include in test batch"""

    # No requests in the batch ask for sample or prompt logprobs.
    NONE = 0
    # Mix of sample-logprobs-only requests and no-logprobs requests.
    SAMPLE = 1
    # Mix of prompt-logprobs-only requests and no-logprobs requests.
    PROMPT = 2
    # Mix of requests with both, either, or neither kind of logprobs.
    SAMPLE_PROMPT = 3
# Per-request logprobs spec: (num_sample_logprobs, num_prompt_logprobs),
# where None disables that logprobs type for the request.
BatchLogprobsSpecType = list[tuple[int | None, int | None]]
def get_test_batch(
    batch_logprobs_composition: BatchLogprobsComposition,
) -> BatchLogprobsSpecType:
    """Generate logprobs configs for a batch of requests.

    Each request's config is a (num_sample_logprobs, num_prompt_logprobs)
    tuple, where None disables that logprobs type. The composition selects
    which mixtures appear in the batch:

    * NONE: no request asks for sample or prompt logprobs
    * SAMPLE: sample-logprobs-only requests mixed with no-logprobs requests
    * PROMPT: prompt-logprobs-only requests mixed with no-logprobs requests
    * SAMPLE_PROMPT: every combination (neither, sample-only, prompt-only,
      both sample and prompt logprobs)

    Args:
        batch_logprobs_composition: types of logprobs configs to include in
            the batch

    Returns:
        list of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs])
        tuples

    Raises:
        ValueError: if the composition is not a recognized member
    """
    per_composition: dict[BatchLogprobsComposition, BatchLogprobsSpecType] = {
        BatchLogprobsComposition.NONE: [
            (None, None),
        ],
        BatchLogprobsComposition.SAMPLE: [
            (None, None),
            (0, None),
            (5, None),
            (3, None),
        ],
        BatchLogprobsComposition.PROMPT: [
            (None, None),
            (None, 0),
            (None, 6),
            (None, 5),
        ],
        BatchLogprobsComposition.SAMPLE_PROMPT: [
            (None, None),
            (0, None),
            (5, None),
            (3, None),
            (0, 3),
            (6, 0),
            (6, 3),
            (None, 6),
            (None, 5),
            (None, 0),
        ],
    }
    try:
        # Return a fresh list so callers can safely mutate their copy.
        return list(per_composition[batch_logprobs_composition])
    except KeyError:
        raise ValueError("Invalid logprobs batch configuration for test.") from None
def assert_incr_detok_str_matches_non_incr_detok_str(
    incremental_detokenization_str: str,
    non_incremental_detokenization_str: str,
    msg: str,
) -> None:
    """Compare incrementally detok. text to non-incrementally detok. text.

    Fails if the two strings differ once all non-alphanumeric characters are
    stripped out.

    Rationale: incremental detokenization lets the tokenizer adapt each
    token's text to its context, while logprobs detokenization decodes each
    token in isolation and may emit placeholder characters where whitespace
    would be. Comparing only the alphanumeric content therefore validates
    logprobs detokenization without false mismatches on spacing.

    Args:
        incremental_detokenization_str: incrementally-detokenized generated
            text
        non_incremental_detokenization_str: non-incrementally-detokenized
            logprob tokens
        msg: error message if `assert` fails
    """
    strip_non_alnum = re.compile(r"[^a-zA-Z0-9]+")
    lhs = strip_non_alnum.sub("", incremental_detokenization_str)
    rhs = strip_non_alnum.sub("", non_incremental_detokenization_str)
    assert lhs == rhs, msg
def compute_correct_cumulative_logprob(completion_output: CompletionOutput) -> float:
    """Compute known-good value for evaluating cumulative logprob.

    Sums the logprob of each sampled token, looked up in that position's
    logprobs dict.

    Args:
        completion_output: completion output from engine

    Returns:
        Known-good cumulative logprob value
    """
    per_position_logprobs = completion_output.logprobs
    assert per_position_logprobs is not None
    total = 0
    for token_id, position_logprobs in zip(
        completion_output.token_ids, per_position_logprobs
    ):
        total += position_logprobs[token_id].logprob
    return total
def create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
    """Return a (batch_size, vocab_size) float32 tensor uniformly filled with 1e-2."""
    return torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
def create_penalty_tensor(
    batch_size: int, penalty_value: float, device: torch.device
) -> torch.Tensor:
    """Broadcast one penalty value into a (batch_size,) float tensor on `device`."""
    shape = (batch_size,)
    return torch.full(
        shape, fill_value=penalty_value, dtype=torch.float, device=device
    )
def create_prompt_tokens_tensor(
    prompt_token_ids: list[list[int]],
    vocab_size: int,
    device: torch.device,
) -> torch.Tensor:
    """Pad variable-length prompts into a single int64 tensor on `device`."""
    # Use vocab_size as the pad id: one past the largest valid token id.
    pad_token_id = vocab_size
    return make_tensor_with_pad(
        prompt_token_ids,
        pad=pad_token_id,
        dtype=torch.int64,
        device=device,
        pin_memory=False,
    )
class LogitsprocsTestFakes(NamedTuple):
    """Wraps fake data structures to support testing"""

    # Fake logits tensor fed through the logits processors.
    logits: torch.Tensor
    # Fake sampling metadata carrying the logits processors under test.
    sampling_metadata: SamplingMetadata

    def get_logitsprocs_by_cls(
        self,
        cls: type[LogitsProcessor],
    ) -> Iterator[LogitsProcessor]:
        """Yield only the logits processors that are instances of `cls`.

        Args:
            cls: :class:`LogitsProcessor` subclass

        Returns:
            Iterator over matching logits processors
        """
        all_procs = self.sampling_metadata.logitsprocs.all
        return (proc for proc in all_procs if isinstance(proc, cls))

    def get_logitsprocs(self) -> Iterator[LogitsProcessor]:
        """Iterator over all logits processors."""
        return self.sampling_metadata.logitsprocs.all
def fake_update_logitsprocs_state(
    test_fakes: LogitsprocsTestFakes,
    batch_update: BatchUpdate,
) -> None:
    """Imitate the engine core pushing a persistent-batch state update
    to every logits processor."""
    for processor in test_fakes.get_logitsprocs():
        processor.update_state(batch_update)
def fake_apply_logitsprocs(
    test_fakes: LogitsprocsTestFakes,
    slice_indices: list[int],
) -> torch.Tensor:
    """Imitate application of logits processors in engine core.

    Selects the rows given by `slice_indices` (cloned, so the fake logits
    stay pristine) and runs every processor over them in order.
    """
    row_index = torch.tensor(slice_indices, dtype=torch.long)
    out = test_fakes.logits[row_index].clone()
    for processor in test_fakes.get_logitsprocs():
        out = processor.apply(out)
    return out
def create_allowed_token_ids(
batch_size: int,
vocab_size: int,
num_allowed_token_ids: int,
device: torch.device,
) -> torch.Tensor | None:
mask: torch.Tensor | None = None
for i in range(batch_size):
if i % 2 == 1:
continue
if mask is None:
mask = torch.zeros(
(batch_size, vocab_size), dtype=torch.bool, device=device
)
start = min(i, vocab_size - 1)
end = min(i + num_allowed_token_ids, vocab_size - 1)
mask[i, start:end] = True
return mask