Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,685 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import math
from collections.abc import Generator
from typing import get_args
import pytest
import torch
from tests.utils import large_gpu_mark
from tests.v1.sample.utils import (
BatchLogprobsComposition,
BatchLogprobsSpecType,
assert_incr_detok_str_matches_non_incr_detok_str,
compute_correct_cumulative_logprob,
get_test_batch,
)
from vllm import SamplingParams
from vllm.config.model import LogprobsMode
from vllm.distributed import cleanup_dist_env_and_memory
from ...conftest import HfRunner, VllmRunner
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
DTYPE = "half"
NONE = BatchLogprobsComposition.NONE
SAMPLE = BatchLogprobsComposition.SAMPLE
PROMPT = BatchLogprobsComposition.PROMPT
SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT
@pytest.fixture(
    scope="module",
    # Parameterize APC
    params=[False, True],
)
def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
    """Module-scoped vLLM runner, parameterized over prefix caching (APC)."""
    with vllm_runner(
        MODEL,
        dtype=DTYPE,
        max_logprobs=7,
        # Very small number of batched tokens to ensure
        # that we test chunking.
        max_num_batched_tokens=16,
        max_num_seqs=16,
        max_model_len=128,
        enable_chunked_prefill=True,
        enforce_eager=True,
        # TODO: enable this once we support it for
        # prompt logprobs.
        enable_prefix_caching=request.param,
        gpu_memory_utilization=0.4,  # up to 2 alive concurrently
    ) as runner:
        yield runner
@pytest.fixture(scope="module")
def hf_model(hf_runner) -> Generator[HfRunner, None, None]:
    """Module-scoped HuggingFace reference model runner."""
    with hf_runner(MODEL, dtype=DTYPE) as reference_model:
        yield reference_model
def _repeat_logprob_config(
    test_prompts,
    logprob_prompt_logprob_list: BatchLogprobsSpecType,
) -> BatchLogprobsSpecType:
    """Ensure each test prompt has a logprob config.

    A logprob config is a tuple of the optional (may-be-`None`) number of
    sample logprobs and the optional number of prompt logprobs.

    The provided configs are cycled and then truncated so that the result
    has exactly one config per test prompt: configs are tiled when there
    are fewer of them than prompts, truncated when there are more, and
    returned unchanged when the counts already match.

    Args:
        test_prompts: list of prompts under test
        logprob_prompt_logprob_list: list of
            (optional num sample logprob, optional num prompt logprob)
            tuples

    Returns:
        list of (optional num sample logprob, optional num prompt logprob)
        tuples whose length equals ``len(test_prompts)``
    """
    target_len = len(test_prompts)
    # Cycle the configs endlessly and keep exactly one per prompt.
    tiled_configs = list(
        itertools.islice(itertools.cycle(logprob_prompt_logprob_list), target_len)
    )
    # Sanity check: one sampling-params combo per prompt.
    assert len(tiled_configs) == target_len
    return tiled_configs
def _run_and_validate(
    vllm_model: VllmRunner,
    test_prompts: list[str],
    # NOTE: annotation corrected to match the call site, which builds one
    # SamplingParams per prompt (see the caller's list comprehension).
    vllm_sampling_params: list[SamplingParams],
    hf_logprobs: list[list[torch.Tensor]],
    hf_outputs: list[tuple[list[int], str]],
    logprob_prompt_logprob_list: BatchLogprobsSpecType,
    temperature: float,
    max_tokens: int,
    do_apc: bool,
) -> None:
    """Generate with vLLM and validate (prompt) logprobs against HuggingFace.

    For each prompt, checks (a) token-level agreement with the HF reference
    (full prompt+completion when greedy, prompt-only otherwise), (b) the
    structure and numerical values of sample logprobs, and (c) the structure
    and numerical values of prompt logprobs — each only when requested by
    that prompt's (num sample logprobs, num prompt logprobs) config.

    Args:
        vllm_model: vLLM model fixture under test
        test_prompts: list of prompts under test
        vllm_sampling_params: one SamplingParams per prompt
        hf_logprobs: HF reference logprobs, indexed per prompt
        hf_outputs: HF reference (token ids, text) per prompt
        logprob_prompt_logprob_list: per-prompt logprobs config tuples
        temperature: sampling temperature (0.0 means greedy)
        max_tokens: number of generated tokens per prompt
        do_apc: whether prefix caching is enabled
            # NOTE(review): `do_apc` is accepted but unread in this body —
            # presumably kept for signature symmetry with callers; confirm.
    """
    vllm_results = vllm_model.llm.generate(
        test_prompts, sampling_params=vllm_sampling_params
    )
    for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip(
        vllm_results, hf_logprobs, hf_outputs, logprob_prompt_logprob_list
    ):
        # Extract request-level (prompt)logprobs config
        num_top_logprobs, num_top_prompt_logprobs = logprob_prompt_logprob

        # Test whether sampled token output is consistent between vLLM and HF
        # vLLM prompt+completion should match HF output
        if temperature == 0.0:
            assert (
                vllm_result.prompt_token_ids + vllm_result.outputs[0].token_ids
                == hf_output[0]
            )
        else:
            # Sampled tokens won't match if not greedy; only the prompt
            # prefix of the HF token sequence is comparable.
            assert (
                vllm_result.prompt_token_ids
                == hf_output[0][: len(vllm_result.prompt_token_ids)]
            )

        # Validate sample logprobs
        if num_top_logprobs is not None:
            assert num_top_logprobs is not None
            # Confirm that the structure of the sample logprobs in the result is
            # correct
            assert vllm_result.outputs[0].logprobs is not None
            assert len(vllm_result.outputs[0].logprobs) == max_tokens
            for logprobs, token_id in zip(
                vllm_result.outputs[0].logprobs, vllm_result.outputs[0].token_ids
            ):
                assert logprobs is not None
                # Confirm that the output token appears among the logprobs
                assert token_id in logprobs
                token_in_topk = logprobs[token_id].rank <= num_top_logprobs
                # If the output token is not included in the top K
                # logprob, it can return 1 more data
                if token_in_topk and num_top_logprobs != 0:
                    assert len(logprobs) == num_top_logprobs
                else:
                    assert len(logprobs) == num_top_logprobs + 1
                if num_top_logprobs > 0:
                    # We should have an entry for each of the topk ranks
                    all_ranks = {lp.rank for lp in logprobs.values()}
                    assert all(r in all_ranks for r in range(1, num_top_logprobs + 1))
            # Reconstruct the output text from the most likely token at each
            # position; incremental detokenization must match the result text.
            output_text = vllm_result.outputs[0].text
            output_string_from_most_likely_tokens_lst: list[str] = []
            for top_logprobs in vllm_result.outputs[0].logprobs:
                top_logprob = next(iter(top_logprobs.values()))
                output_string_from_most_likely_tokens_lst.append(
                    top_logprob.decoded_token
                )
            output_string_from_most_likely_tokens = "".join(
                output_string_from_most_likely_tokens_lst
            )
            assert_incr_detok_str_matches_non_incr_detok_str(
                output_text,
                output_string_from_most_likely_tokens,
                "The output text from the top logprob for each token "
                "position should be the same as the output text in the "
                "result.",
            )
            # Compare vLLM sample logprobs to HF
            vllm_sample_logprobs = vllm_result.outputs[0].logprobs
            for i, top_logprobs in enumerate(vllm_sample_logprobs):
                for token_id, sample_logprob in top_logprobs.items():
                    # NOTE(review): numerical comparison is limited to greedy
                    # runs or to the first generated position — presumably
                    # because the HF reference was generated greedily and
                    # later positions diverge when sampling; confirm.
                    if temperature == 0.0 or i == 0:
                        logprob = sample_logprob.logprob
                        torch.testing.assert_close(
                            logprob,
                            hf_logprob[i][-1][token_id].item(),
                            atol=1e-2,
                            rtol=1e-2,
                        )
                    assert isinstance(sample_logprob.decoded_token, str), (
                        "The token should be decoded by the time it is"
                        " returned to the user."
                    )
            # At this point we know the sample logprobs are correct for this
            # request. Validate that cumulative_logprob is actually the sum.
            # For each request, assert that the returned cumulative logprob
            # matches the correct value, which is computed below.
            torch.testing.assert_close(
                vllm_result.outputs[0].cumulative_logprob,
                compute_correct_cumulative_logprob(vllm_result.outputs[0]),
                atol=1e-6,
                rtol=1e-6,
            )
        else:
            # Logprobs disabled for this request; should be None
            assert vllm_result.outputs[0].logprobs is None

        # Validate prompt logprobs
        if num_top_prompt_logprobs is not None:
            # Confirm that structure of prompt logprobs in result is correct
            assert vllm_result.prompt_logprobs is not None
            # - The first prompt logprob is always None
            assert vllm_result.prompt_logprobs[0] is None
            # - Prompt logprobs are returned for all indices in
            #   the prompt
            assert len(vllm_result.prompt_logprobs) == len(vllm_result.prompt_token_ids)
            for prompt_logprobs, prompt_token_id in zip(
                vllm_result.prompt_logprobs[1:], vllm_result.prompt_token_ids[1:]
            ):
                assert prompt_logprobs is not None
                # Confirm that the prompt token appears among the logprobs
                assert prompt_token_id in prompt_logprobs
                token_in_topk = (
                    prompt_logprobs[prompt_token_id].rank <= num_top_prompt_logprobs
                )
                # If the prompt token is not included in the top K
                # logprob, it can return 1 more data
                if token_in_topk and num_top_prompt_logprobs != 0:
                    assert len(prompt_logprobs) == num_top_prompt_logprobs
                else:
                    assert len(prompt_logprobs) == num_top_prompt_logprobs + 1
                if num_top_prompt_logprobs > 0:
                    # We should have an entry for each of the topk ranks
                    all_ranks = {lp.rank for lp in prompt_logprobs.values()}
                    assert all(
                        r in all_ranks for r in range(1, num_top_prompt_logprobs + 1)
                    )
            # Compare prompt logprobs to HF
            # The first prompt logprob is always None, so we compare it from
            # 1:.
            vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
            for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
                for token_id, logprob in vllm_prompt_logprob_dict.items():
                    torch.testing.assert_close(
                        logprob.logprob,
                        hf_logprob[0][i][token_id].item(),
                        atol=2e-2,
                        rtol=2e-2,
                    )
        else:
            assert vllm_result.prompt_logprobs is None
@pytest.mark.parametrize(
    "batch_logprobs_composition", [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]
)
@pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs(
    hf_model,
    vllm_model,
    batch_logprobs_composition: BatchLogprobsComposition,
    temperature: float,
    example_prompts: list[str],
) -> None:
    """Test V1 Engine logprobs & prompt logprobs

    Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
    settings and validate that

    * The generated logprobs and prompt logprobs are consistent with the
      configuration settings, in terms of whether or not the logprobs
      (of either type) were requested and how many were requested
    * The generated logprobs are consistent with the generated tokens
    * The generated (prompt)logprobs are consistent with HuggingFace
      (prompt)logprobs, as a reference

    batch_logprobs_composition controls the logprobs configurations for
    requests in the batch under test.

    APC tests run two test iterations so that cache hits occur.
    To save time, only test one APC-enabled scenario
    (sample & prompt logprobs enabled, temperature>0.0).

    Args:
        hf_model: HuggingFace reference model fixture
        vllm_model: vLLM model fixture
        batch_logprobs_composition: logprobs configuration for test batch
        temperature: "temperature" sampling parameter
        example_prompts: example prompt fixture
    """
    # APC state comes from the parameterized vllm_model fixture.
    do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
    if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
        # Skip some test-cases to save time.
        pytest.skip()
    test_prompts = example_prompts

    max_tokens = 5
    # HF reference is always generated greedily; _run_and_validate only
    # compares the prompt prefix when temperature > 0.
    hf_outputs = hf_model.generate_greedy(
        test_prompts,
        max_tokens=max_tokens,
    )
    hf_logprobs = hf_model.generate_greedy_logprobs(
        test_prompts,
        max_tokens=max_tokens,
    )

    # Batch has mixed sample params
    # (different logprobs/prompt logprobs combos)
    logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)

    # Ensure that each test prompt has a logprob config for testing
    logprob_prompt_logprob_list = _repeat_logprob_config(
        test_prompts, logprob_prompt_logprob_list
    )
    # Generate SamplingParams
    vllm_sampling_params = [
        SamplingParams(
            max_tokens=max_tokens,
            logprobs=num_lp,
            prompt_logprobs=num_plp,
            temperature=temperature,
            seed=1984,
        )
        for num_lp, num_plp in logprob_prompt_logprob_list
    ]

    # Run twice under APC so the second pass exercises cache hits.
    for _ in range(2 if do_apc else 1):
        _run_and_validate(
            vllm_model=vllm_model,
            test_prompts=test_prompts,
            vllm_sampling_params=vllm_sampling_params,
            hf_logprobs=hf_logprobs,
            hf_outputs=hf_outputs,
            logprob_prompt_logprob_list=logprob_prompt_logprob_list,
            temperature=temperature,
            max_tokens=max_tokens,
            do_apc=do_apc,
        )
def test_max_logprobs():
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`

    Should also fail for `prompt_logprobs > max_logprobs`

    APC should not matter as this test checks basic request validation.
    """
    # Use VllmRunner as a context manager (as the other tests in this file
    # do) so engine resources are released even if an assertion fails.
    with VllmRunner(
        "facebook/opt-125m",
        max_logprobs=1,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.15,
        max_model_len=256,
    ) as runner:
        vllm_sampling_params = SamplingParams(logprobs=1)
        # Requesting exactly max_logprobs should pass.
        runner.generate(["Hello world"], sampling_params=vllm_sampling_params)

        # Requesting more than max_logprobs must be rejected at validation.
        bad_sampling_params = SamplingParams(logprobs=2)
        with pytest.raises(ValueError):
            runner.generate(["Hello world"], sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`

    Args:
        vllm_model: vLLM model fixture
        example_prompts: list of example prompts (test fixture)
    """
    max_tokens = 5

    no_logprobs_params = SamplingParams(
        max_tokens=max_tokens,
        logprobs=None,
        prompt_logprobs=None,
        temperature=0.0,
    )
    results = vllm_model.llm.generate(
        example_prompts,
        sampling_params=no_logprobs_params,
    )

    for result in results:
        completion = result.outputs[0]
        # Neither sample logprobs nor a cumulative logprob should be present.
        assert completion.logprobs is None
        assert completion.cumulative_logprob is None
        # Prompt logprobs should likewise be absent.
        assert result.prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts):
    """Engine should return sampled token and prompt token logprobs

    Args:
        vllm_model: vLLM model fixture
        example_prompts: list of example prompts (test fixture)
    """
    max_tokens = 5

    zero_logprobs_params = SamplingParams(
        max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
    )
    results = vllm_model.llm.generate(
        example_prompts, sampling_params=zero_logprobs_params
    )

    for result in results:
        completion = result.outputs[0]
        # There should be one sample-logprob dict per sampled token.
        assert completion.logprobs is not None
        assert len(completion.token_ids) == len(completion.logprobs)
        assert completion.cumulative_logprob is not None
        # There should be one prompt-logprob entry per prompt token.
        assert result.prompt_logprobs is not None
        assert len(result.prompt_token_ids) == len(result.prompt_logprobs)
def test_all_logprobs(example_prompts):
    """Engine should return all vocabulary logprobs and prompt logprobs

    Args:
        example_prompts: list of example prompts (test fixture)
    """
    # Use VllmRunner as a context manager (as the other tests in this file
    # do) so engine resources are released even if an assertion fails.
    with VllmRunner(
        "facebook/opt-125m",
        max_logprobs=-1,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.15,
        max_model_len=256,
    ) as runner:
        # logprobs=-1 / prompt_logprobs=-1 requests the full vocabulary.
        sampling_params_logprobs_all = SamplingParams(
            max_tokens=5, logprobs=-1, prompt_logprobs=-1
        )
        results_logprobs_all = runner.llm.generate(
            example_prompts, sampling_params=sampling_params_logprobs_all
        )
        vocab_size = runner.llm.llm_engine.model_config.get_vocab_size()

        for i in range(len(results_logprobs_all)):
            logprobs = results_logprobs_all[i].outputs[0].logprobs
            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
            # Every sampled position must carry a full-vocab logprob dict.
            assert logprobs is not None
            for logprob in logprobs:
                assert len(logprob) == vocab_size
            # First prompt position has no logprobs; the rest are full-vocab.
            assert prompt_logprobs is not None
            assert prompt_logprobs[0] is None
            for prompt_logprob in prompt_logprobs[1:]:
                assert len(prompt_logprob) == vocab_size
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
def test_logprobs_mode(logprobs_mode: LogprobsMode):
    """Test with LLM engine with different logprobs_mode.

    For logprobs modes, all returned values should be non-positive.
    For logits modes, we expect at least one positive value.
    """
    from vllm import LLM

    llm = LLM(
        "facebook/opt-125m",
        max_logprobs=5,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.05,
        max_model_len=16,
        logprobs_mode=logprobs_mode,
    )
    vllm_sampling_params = SamplingParams(logprobs=1)
    results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)

    num_with_logprobs = 0
    num_positive = 0
    for completion in results[0].outputs:
        for token_logprobs in completion.logprobs:
            for entry in token_logprobs.values():
                # Logprob modes must never emit positive values.
                if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
                    assert entry.logprob <= 0
                if entry.logprob > 0:
                    num_positive += 1
                num_with_logprobs += 1
    # At least one logprob entry per completion position.
    assert num_with_logprobs >= len(results[0].outputs)
    # Logit modes are unnormalized, so positive values should appear.
    if logprobs_mode in ("raw_logits", "processed_logits"):
        assert num_positive > 0
    del llm
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
@pytest.mark.parametrize(
    "model_setup",
    [
        pytest.param(
            (
                "eagle",
                "meta-llama/Llama-3.2-1B-Instruct",
                "nm-testing/Llama3_2_1B_speculator.eagle3",
            ),
            marks=large_gpu_mark(min_gb=32),
        ),
    ],
)
@pytest.mark.parametrize("top_logprobs", [0, 3])
def test_spec_decode_logprobs(
    logprobs_mode: LogprobsMode,
    model_setup: tuple[str, str, str],
    top_logprobs: int,
):
    """Spec decode logprobs should match those of the base model.

    Runs the same greedy request once on the base model and once with
    speculative decoding enabled, then compares the per-token logprob
    entries (value, rank, decoded token) pairwise.

    Args:
        logprobs_mode: logprobs mode.
        model_setup: Spec decode method, base model name, and
            draft model name.
        top_logprobs: number of top logprobs to request per position.
    """
    from vllm import LLM

    prompt = "Hello world " * 50
    # temperature=0 keeps both runs deterministic so outputs are comparable.
    sampling_params = SamplingParams(
        temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
    )
    method, model_name, spec_model_name = model_setup
    max_model_len = 256

    # Run base LLM.
    ref_llm = LLM(
        model=model_name,
        max_logprobs=5,
        max_model_len=max_model_len,
        seed=42,
        logprobs_mode=logprobs_mode,
        gpu_memory_utilization=0.4,
    )
    ref_results = ref_llm.generate([prompt], sampling_params)
    # Collect logprobs outputs from reference LLM, flattened in
    # (position, token) order.
    ref_logprobs = []
    for output in ref_results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                ref_logprobs.append(logprobs[token_id])
    # Free GPU memory before starting the second engine.
    del ref_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    # Run spec decode LLM.
    spec_llm = LLM(
        model_name,
        speculative_config={
            "method": method,
            "model": spec_model_name,
            "num_speculative_tokens": 3,
            "max_model_len": max_model_len,
        },
        max_logprobs=5,
        max_model_len=max_model_len,
        seed=42,
        logprobs_mode=logprobs_mode,
        gpu_memory_utilization=0.4,
        # Force prefill chunking
        enable_chunked_prefill=True,
        max_num_batched_tokens=32,
    )
    spec_results = spec_llm.generate([prompt], sampling_params)
    # Collect logprobs outputs from spec decode LLM, same flattening order
    # as the reference run.
    spec_logprobs = []
    for output in spec_results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                spec_logprobs.append(logprobs[token_id])
    del spec_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    # Per-token logprobs are expected to be the same.
    assert len(ref_logprobs) == len(spec_logprobs)
    for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
        # Loose tolerances: kernels differ between the two runs.
        assert math.isclose(
            ref_logprob.logprob, spec_logprob.logprob, rel_tol=5e-2, abs_tol=1e-1
        )
        assert ref_logprob.rank == spec_logprob.rank
        assert ref_logprob.decoded_token == spec_logprob.decoded_token
def test_prompt_logprobs_with_chunking_and_preemption():
    """Test that prompt logprobs are correctly returned when using
    both chunked prefill and preemption.

    This test ensures that the num_prompt_logprobs tracking persists
    across preemptions and prefill chunks.
    """
    # Create prompts that will trigger chunking and preemption:
    # two longer prompts plus 32 short ones to oversubscribe KV blocks.
    prompts = [
        "The following numbers of the sequence "
        + ", ".join(str(i) for i in range(10))
        + " are:",
        "In one word, the capital of France is ",
    ] + [f"Tell me about the number {i}: " for i in range(32)]

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=40,
        # min_tokens keeps sequences alive long enough to be preempted.
        min_tokens=20,
        prompt_logprobs=2,  # Request prompt logprobs
    )

    with VllmRunner(
        "Qwen/Qwen3-0.6B",
        max_model_len=512,
        enable_chunked_prefill=True,
        max_num_batched_tokens=48,  # Force prefill chunking
        num_gpu_blocks_override=32,  # Force preemptions
        disable_log_stats=False,
        gpu_memory_utilization=0.25,
    ) as vllm_model:
        # Snapshot the preemption counter so we can measure the delta.
        metrics_before = vllm_model.llm.get_metrics()

        # Generate with prompt logprobs using generate_w_logprobs which
        # returns (output_ids, output_str, output_logprobs, prompt_logprobs)
        outputs = vllm_model.generate_w_logprobs(
            prompts, sampling_params=sampling_params, include_prompt_token_ids=True
        )

        # Verify that all outputs have prompt logprobs
        for i, output in enumerate(outputs):
            _, _, _, prompt_token_ids, prompt_logprobs = output
            assert prompt_logprobs is not None and len(prompt_logprobs) > 0, (
                f"Output {i} missing prompt logprobs"
            )
            assert len(prompt_logprobs) == len(prompt_token_ids), (
                "Unexpected number of prompt logprob positions"
            )

            # Each position should have the requested number of logprobs
            # (or one more when the prompt token itself fell outside top-K).
            for pos, logprobs_dict in enumerate(prompt_logprobs):
                if logprobs_dict is not None:  # First token may be None
                    assert (
                        sampling_params.prompt_logprobs
                        <= len(logprobs_dict)
                        <= sampling_params.prompt_logprobs + 1
                    ), (
                        f"Output {i} position {pos} has {len(logprobs_dict)} "
                        f"logprobs, expected {sampling_params.prompt_logprobs}"
                    )

        # Check that we actually had preemptions
        metrics_after = vllm_model.llm.get_metrics()
        preemptions_before = next(
            (m.value for m in metrics_before if m.name == "vllm:num_preemptions"), 0
        )
        preemptions_after = next(
            (m.value for m in metrics_after if m.name == "vllm:num_preemptions"), 0
        )
        preemptions = preemptions_after - preemptions_before
        assert preemptions > 0, "Test did not trigger any preemptions"
        print(f"Test passed with {preemptions} preemptions")

View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import lm_eval
from ...utils import RemoteOpenAIServer
# arc-easy uses prompt_logprobs=1, logprobs=1
TASK = "arc_easy"
FILTER = "acc_norm,none"
RTOL = 0.03
EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8" # noqa: E501
SERVER_ARGS = [
"--enforce_eager",
"--no_enable_prefix_caching",
"--gpu-memory-utilization=0.8",
]
NUM_CONCURRENT = 100
def test_prompt_logprobs_e2e():
    """Run lm-eval against the in-process vLLM model and check that the
    measured accuracy lands within tolerance of the expected value."""
    results = lm_eval.simple_evaluate(
        model="vllm", model_args=MODEL_ARGS, tasks=TASK, batch_size="auto"
    )
    measured_value = results["results"][TASK][FILTER]
    # |measured - expected| < RTOL is equivalent to the two-sided check.
    assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
        f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
    )
def test_prompt_logprobs_e2e_server():
    """Same accuracy check as test_prompt_logprobs_e2e, but going through
    the OpenAI-compatible completions endpoint of a spawned server."""
    with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"
        model_args = (
            f"model={MODEL},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False"
        )
        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )
        measured_value = results["results"][TASK][FILTER]
        # |measured - expected| < RTOL is equivalent to the two-sided check.
        assert abs(measured_value - EXPECTED_VALUE) < RTOL, (
            f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
        )

View File

@@ -0,0 +1,781 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from unittest.mock import Mock
import pytest
import torch
import torch.nn.functional as F
from tests.v1.sample.utils import create_allowed_token_ids
from vllm.platforms import current_platform
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import PLACEHOLDER_TOKEN_ID, RejectionSampler
from vllm.v1.sample.sampler import Sampler, SamplerOutput
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
DEVICE = current_platform.device_type
@pytest.fixture
def rejection_sampler():
    """RejectionSampler wrapping a mocked main Sampler (raw_logprobs mode)."""
    sampler_stub = Mock(spec=Sampler)
    sampler_stub.logprobs_mode = "raw_logprobs"
    return RejectionSampler(sampler_stub)
def mock_sampler_output(
    rejection_sampler: RejectionSampler, bonus_token_ids: torch.Tensor
):
    """Make the mocked inner sampler return the given bonus token ids."""
    stub_output = SamplerOutput(
        sampled_token_ids=bonus_token_ids, logprobs_tensors=None
    )
    rejection_sampler.sampler.return_value = stub_output
def create_spec_decode_metadata(
    spec_tokens: list[list[int]], logits: torch.Tensor
) -> SpecDecodeMetadata:
    """Build dummy SpecDecodeMetadata whose target indices span `logits`."""
    meta = SpecDecodeMetadata.make_dummy(spec_tokens, device=logits.device)
    # Every row of `logits` is a target logit.
    meta.target_logits_indices = torch.arange(logits.shape[0])
    # Output bonus token ids are mocked, so the bonus logit indices should
    # be empty.
    meta.bonus_logits_indices = torch.empty(0, dtype=torch.int32)
    return meta
def create_logits_tensor(
    output_token_ids: list[list[int]],
    vocab_size: int = 100,
    token_idx_to_override: int | None = None,
) -> torch.Tensor:
    """Helper function to create logits tensor that
    will produce desired token ids on argmax.

    Args:
        output_token_ids: desired tokens per sequence; the last token of
            each sequence is the bonus token and gets no logit row.
        vocab_size: size of the vocabulary dimension.
        token_idx_to_override: optional vocab index set to 99.0 in every
            row (just below the 100.0 argmax winner).

    Returns:
        Float tensor of shape (total draft tokens, vocab_size) on DEVICE.
    """
    # Drop the trailing bonus token of each sequence.
    token_ids = [tokens[:-1] for tokens in output_token_ids]
    num_total_tokens = sum(len(tokens) for tokens in token_ids)
    logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
    start_loc = 0
    for tokens in token_ids:
        for j, token_id in enumerate(tokens):
            logits[start_loc + j, token_id] = 100.0
        start_loc += len(tokens)
    # Bug fix: compare against None explicitly. The old truthiness check
    # (`if token_idx_to_override:`) silently ignored an override of vocab
    # index 0, which is a valid token id.
    if token_idx_to_override is not None:
        logits[:, token_idx_to_override] = 99.0
    return logits
def create_sampling_metadata(
    all_greedy: bool,
    output_token_ids: list[list[int]] | None = None,
    prompt_token_ids: torch.Tensor | None = None,
    # NOTE(review): annotated as a tensor, but the default fill below is a
    # plain list — confirm the expected type against SamplingMetadata.
    spec_token_ids: torch.Tensor | None = None,
    temperature: torch.Tensor | None = None,
    top_k: torch.Tensor | None = None,
    top_p: torch.Tensor | None = None,
    generators: dict[int, Any] | None = None,
    frequency_penalties: list[float] | None = None,
    presence_penalties: list[float] | None = None,
    repetition_penalties: list[float] | None = None,
    bad_words_token_ids: dict[int, list[list[int]]] | None = None,
    allowed_token_ids_mask: torch.Tensor | None = None,
) -> SamplingMetadata:
    """Create a v1 sampling metadata object with all_greedy set
    to the given value. Either all greedy or all random sampling
    is used.

    Args:
        all_greedy: if True, temperature is forced to None (greedy);
            otherwise a per-request temperature tensor is required.
        output_token_ids: per-request generated tokens; required when any
            penalty list is supplied.
        frequency_penalties / presence_penalties / repetition_penalties:
            per-request penalty values; converted to tensors on DEVICE.
        Remaining arguments are passed through to SamplingMetadata as-is.

    Returns:
        A populated SamplingMetadata with max_num_logprobs=None and an
        empty LogitsProcessors.
    """
    generators = generators or {}
    if all_greedy:
        # Greedy sampling ignores temperature entirely.
        temperature = None
    else:
        assert temperature is not None

    if any([frequency_penalties, presence_penalties, repetition_penalties]):
        # Penalties require the output token history to be present.
        no_penalties = False
        assert output_token_ids
        assert len(output_token_ids) > 0
        # Rebind the penalty lists as tensors on the test device.
        frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE)
        presence_penalties = torch.tensor(presence_penalties, device=DEVICE)
        repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE)
    else:
        no_penalties = True
        frequency_penalties = torch.tensor([])
        presence_penalties = torch.tensor([])
        repetition_penalties = torch.tensor([])

    return SamplingMetadata(
        temperature=temperature,
        all_greedy=all_greedy,
        all_random=not all_greedy,
        top_p=top_p,
        top_k=top_k,
        generators=generators,
        max_num_logprobs=None,
        no_penalties=no_penalties,
        prompt_token_ids=prompt_token_ids,
        frequency_penalties=frequency_penalties,
        presence_penalties=presence_penalties,
        repetition_penalties=repetition_penalties,
        output_token_ids=[] if output_token_ids is None else output_token_ids,
        spec_token_ids=[] if spec_token_ids is None else spec_token_ids,
        allowed_token_ids_mask=allowed_token_ids_mask,
        bad_words_token_ids={} if bad_words_token_ids is None else bad_words_token_ids,
        logitsprocs=LogitsProcessors(),
    )
########################### Tests for Greedy Sampling ###################
def test_perfect_match(rejection_sampler):
    """Test when output tokens perfectly match speculated tokens"""
    spec_tokens = [[1, 2, 3]]
    output_tokens = [[1, 2, 3, 4]]  # 4 is the bonus token

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # All draft tokens accepted, bonus token appended.
    want = torch.tensor([[1, 2, 3, 4]], dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
def test_early_mismatch(rejection_sampler):
    """Test when there's an early mismatch in tokens"""
    spec_tokens = [[1, 2, 3]]
    output_tokens = [[1, 5, 3, 4]]  # Mismatch at position 1

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # Everything after the first rejection is a placeholder.
    want = torch.tensor(
        [[1, 5, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(result.sampled_token_ids, want)
def test_multiple_sequences(rejection_sampler):
    """Test handling multiple sequences of speculated tokens"""
    spec_tokens = [[1, 2], [3]]
    output_tokens = [[1, 2, 5], [3, 4]]  # Two sequences with bonus tokens 5 and 4

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor(
        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device
    )
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # Shorter sequence is right-padded with the placeholder id.
    want = torch.tensor(
        [[1, 2, 5], [3, 4, PLACEHOLDER_TOKEN_ID]], dtype=torch.int, device=logits.device
    )
    assert torch.equal(result.sampled_token_ids, want)
def test_single_token_sequence(rejection_sampler):
    """Test handling sequences with single token"""
    spec_tokens = [[1]]
    output_tokens = [[1, 2]]  # Single token with bonus token 2

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    want = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
def test_empty_sequence(rejection_sampler):
    """Test handling empty sequence of speculated tokens"""
    spec_tokens: list[list[int]] = [[]]
    output_tokens = [[5]]  # Just the bonus token

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor([output_tokens[0][-1]], device=logits.device)
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # With no drafts, only the bonus token is emitted.
    want = torch.tensor([[5]], dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
def test_multiple_mismatches(rejection_sampler):
    """Test handling multiple sequences with mismatches"""
    spec_tokens = [[1, 2, 3], [4, 5, 6]]
    output_tokens = [[1, 2, 7, 6], [4, 8, 6, 9]]  # Mismatches in both sequences

    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor(
        [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device
    )
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    # Each sequence is truncated at its first rejection and padded.
    want = torch.tensor(
        [
            [1, 2, 7, PLACEHOLDER_TOKEN_ID],
            [4, 8, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID],
        ],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(result.sampled_token_ids, want)
@pytest.mark.parametrize(
    "spec_tokens,output_tokens,expected",
    [
        ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]),  # Perfect match with bonus
        ([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]]),  # First mismatch
        (
            [[1, 2], [3, 4]],
            [[1, 5, 6], [3, 4, 7]],
            [[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]],
        ),  # Mixed matches
    ],
)
def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens, expected):
    """Parametrized test for various matching scenarios"""
    logits = create_logits_tensor(output_tokens)
    bonus = torch.tensor(
        [tokens[-1] for tokens in output_tokens], device=logits.device
    )
    spec_meta = create_spec_decode_metadata(spec_tokens, logits)
    sampling_meta = create_sampling_metadata(all_greedy=True)
    mock_sampler_output(rejection_sampler, bonus)

    result = rejection_sampler(
        spec_meta,
        draft_probs=None,
        logits=logits,
        sampling_metadata=sampling_meta,
    )

    want = torch.tensor(expected, dtype=torch.int, device=logits.device)
    assert torch.equal(result.sampled_token_ids, want)
########################### Tests for Random Sampling ###################
@pytest.mark.parametrize("k", [1, 3, 5])
@pytest.mark.parametrize("vocab_size", [1000])
@pytest.mark.parametrize("batch_size", [1, 4, 8])
@pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
@pytest.mark.parametrize("n_rep", [20])
def test_deterministic_when_seeded(
    rejection_sampler,
    k: int,
    vocab_size: int,
    batch_size: int,
    frac_seeded: float,
    n_rep: int,
):
    """Seeded requests must sample identically across repeated runs.

    A random subset (fraction ``frac_seeded``) of the batch gets a
    per-request generator seeded with its batch index. Sampling is
    repeated ``n_rep`` times with identical inputs, and each seeded
    request's tokens are compared against the first run.
    """
    num_tokens = batch_size * k
    draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE)
    draft_probs = F.softmax(draft_probs, dim=-1)
    target_logits = torch.rand_like(draft_probs)
    bonus_token_ids = torch.randint(
        low=0, high=vocab_size, size=(batch_size, 1), dtype=torch.int64, device=DEVICE
    )
    draft_token_ids = torch.randint(
        low=0, high=vocab_size, size=(batch_size, k), dtype=torch.int64, device=DEVICE
    )
    # Randomly choose which requests are seeded.
    seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
    results = []
    for _ in range(n_rep):
        # Fresh generators every repetition, re-seeded with the batch
        # index, so seeded requests should reproduce the same tokens.
        seeded_seqs = {
            i: torch.Generator(device=DEVICE).manual_seed(i)
            for i in range(batch_size)
            if seeded_mask[i]
        }
        temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
        sampling_metadata = create_sampling_metadata(
            all_greedy=False, temperature=temperature, generators=seeded_seqs
        )
        spec_decode_metadata = create_spec_decode_metadata(
            draft_token_ids.tolist(), target_logits
        )
        mock_sampler_output(rejection_sampler, bonus_token_ids)
        rep_result = rejection_sampler(
            spec_decode_metadata,
            draft_probs=None,
            logits=target_logits,
            sampling_metadata=sampling_metadata,
        )
        results.append(rep_result.sampled_token_ids)
    # Only seeded requests are required to be deterministic; unseeded
    # ones may legitimately differ between repetitions.
    for i in range(batch_size):
        if seeded_mask[i]:
            for j in range(1, n_rep):
                assert torch.equal(results[j][i], results[0][i])
def test_rejection_sampling_approximates_target_distribution():
    """Verify rejection sampling approximates target distribution,
    despite sampling from a potentially distinct draft distribution.
    This is done by first creating a random target probability
    distribution and a random draft probability distribution. We then
    sample token ids from the rejection sampler using these draft
    and target distributions. The samples are used to estimate
    the output probability distribution, which we expect to approximate
    the target distribution.
    A basic distance metric is used to determine similarity between
    distributions.
    We expect that as we increase the number of samples,
    the distance between the observed distribution and the target
    distribution decreases. To measure this, we compare the distance
    of the observed distribution against both the target distribution
    and a uniform random distribution. We expect the distance between
    the observed distribution and the target distribution to improve
    much more than the distance improvement between the observed
    distribution and the random distribution.
    """
    torch.set_default_device(DEVICE)
    vocab_size = 10
    k = 2
    num_reference_probs = 100
    # Prepare draft, target, and reference probability distributions
    draft_probs = F.softmax(torch.rand(vocab_size, dtype=torch.float32), dim=-1)
    target_logits = torch.rand(vocab_size, dtype=torch.float32)
    target_probs = F.softmax(target_logits, dim=-1)
    reference_probs = F.softmax(
        torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
        dim=-1,
    )
    sample_sizes = [10, 100, 1_000, 10_000, 100_000]
    distance_wrt_reference: list[float] = []
    distance_wrt_target: list[float] = []
    for num_samples in sample_sizes:
        # Sample using rejection sampling.
        rej_sample_probs = estimate_rejection_sampling_pdf(
            draft_probs, target_logits, k, vocab_size, num_samples
        )
        rej_sample_probs = rej_sample_probs.to(DEVICE)
        # Average distance from reference probs.
        reference_vs_rejsample_dist = (
            torch.dist(reference_probs, rej_sample_probs).item()
            / reference_probs.shape[0]
        )
        target_vs_rejsample_dist = torch.dist(target_probs, rej_sample_probs).item()
        distance_wrt_reference.append(reference_vs_rejsample_dist)
        distance_wrt_target.append(target_vs_rejsample_dist)
        # Recomputed on each iteration purely for progress logging; the
        # values used by the final assertion are computed after the loop.
        relative_change_in_distance_wrt_target = get_ratio_first_to_last(
            distance_wrt_target
        )
        relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
            distance_wrt_reference
        )
        print(
            f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
            f"{reference_vs_rejsample_dist=:.05f}"
        )
        print(
            f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
            f"{relative_change_in_distance_wrt_reference=:.02f}"
        )
    relative_change_in_distance_wrt_target = get_ratio_first_to_last(
        distance_wrt_target
    )
    relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
        distance_wrt_reference
    )
    # The estimated distribution must converge toward the target much
    # faster than toward an unrelated random reference distribution.
    expected_improvement_multiplier = 20
    assert (
        relative_change_in_distance_wrt_target
        > relative_change_in_distance_wrt_reference * expected_improvement_multiplier
    )
def get_ratio_first_to_last(elements: list[float]) -> float:
    """Return ``elements[0] / elements[-1]`` (improvement across a sweep)."""
    first, last = elements[0], elements[-1]
    return first / last
def estimate_rejection_sampling_pdf(
    draft_probs: torch.Tensor,
    target_logits: torch.Tensor,
    k: int,
    vocab_size: int,
    num_samples: int,
) -> torch.Tensor:
    """Estimate the probability distribution of the output tokens
    using rejection sampling.
    Args:
        draft_probs: Draft probability distribution over the vocabulary,
            shape (vocab_size,).
        target_logits: Target logits over the vocabulary, shape
            (vocab_size,).
        k: Number of speculative (draft) tokens per sample.
        vocab_size: Size of the vocabulary.
        num_samples: Number of samples to draw.
    Returns:
        Estimated probability distribution of the output tokens,
        shape (vocab_size,).
    """
    # Wrap a mocked base sampler so the rejection sampler can run
    # without a real model.
    mock_sampler = Mock(spec=Sampler)
    mock_sampler.logprobs_mode = "raw_logprobs"
    rejection_sampler = RejectionSampler(mock_sampler)
    num_tokens = num_samples * k
    # Repeat draft probs num_samples * k times.
    draft_probs = draft_probs.reshape(1, 1, vocab_size).repeat(num_samples, k, 1)
    # Repeat target probs num_tokens times.
    target_logits = target_logits.reshape(1, vocab_size).repeat(num_tokens, 1)
    # Randomly sample draft token ids from draft probs.
    draft_token_ids = torch.multinomial(
        draft_probs[:, 0, :], num_samples=k, replacement=True
    ).reshape(num_samples, k)
    draft_probs = draft_probs.view(num_tokens, vocab_size)
    # Bonus tokens not used but required.
    bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE).repeat(
        num_samples, 1
    )
    temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE)
    sampling_metadata = create_sampling_metadata(
        all_greedy=False, temperature=temperature
    )
    spec_decode_metadata = create_spec_decode_metadata(
        draft_token_ids.tolist(), target_logits
    )
    mock_sampler_output(rejection_sampler, bonus_token_ids)
    sampler_output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=draft_probs,
        logits=target_logits,
        sampling_metadata=sampling_metadata,
    )
    # Drop the bonus-token column, then histogram the remaining sampled
    # tokens to approximate the sampler's output distribution.
    output_token_ids = sampler_output.sampled_token_ids[:, :-1].flatten()
    hist = torch.histogram(
        output_token_ids.to(dtype=torch.float, device="cpu"),
        bins=vocab_size,
        range=(0, vocab_size),
        density=True,
    )
    return hist.hist
def _test_masked_logits(
    rejection_sampler,
    batch_size: int,
    num_draft_tokens: int,
    vocab_size: int,
    target_logits: torch.Tensor,
    unmasked_indices: torch.Tensor,
    sampling_metadata: SamplingMetadata,
):
    """Run rejection sampling and check every accepted token was unmasked.

    ``unmasked_indices[i]`` lists the token ids allowed to survive the
    logits mask at flattened draft position ``i``.
    """
    num_tokens = batch_size * num_draft_tokens

    # Random, normalized draft distribution over the vocabulary.
    draft_probs = F.softmax(
        torch.rand((num_tokens, vocab_size), dtype=torch.float32, device=DEVICE),
        dim=-1,
    )

    # Draw one draft token per position from that distribution.
    draft_token_ids = (
        torch.multinomial(draft_probs, num_samples=1)
        .reshape(batch_size, num_draft_tokens)
        .tolist()
    )

    # Bonus tokens are required by the API but irrelevant to this check.
    bonus_token_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=DEVICE)

    spec_decode_metadata = create_spec_decode_metadata(draft_token_ids, target_logits)
    mock_sampler_output(rejection_sampler, bonus_token_ids)
    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=draft_probs,
        logits=target_logits,
        sampling_metadata=sampling_metadata,
    )

    # Drop the bonus column; every non-placeholder token must be allowed.
    sampled = output.sampled_token_ids[:, :-1].flatten().tolist()
    for pos, token_id in enumerate(sampled):
        if token_id != PLACEHOLDER_TOKEN_ID:
            assert token_id in unmasked_indices[pos]
@pytest.mark.parametrize("top_k", [1, 5, 99])
def test_top_k(rejection_sampler, top_k):
    """Rejection sampling must respect the per-request top-k mask."""
    vocab_size = 100
    batch_size = 100
    num_draft_tokens = 3
    num_tokens = batch_size * num_draft_tokens

    # Pick a random top-k token set for every draft position.
    top_k_indices = torch.stack(
        [torch.randperm(vocab_size, device=DEVICE)[:top_k] for _ in range(num_tokens)]
    )

    # Uniform logits with the top-k entries nudged slightly higher. If the
    # mask works, non-top-k tokens must never be sampled despite the tiny
    # logit difference.
    target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE)
    for row in range(num_tokens):
        target_logits[row, top_k_indices[row]] += 0.1

    temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
    sampling_metadata = create_sampling_metadata(
        all_greedy=False,
        temperature=temperature,
        top_k=torch.tensor([top_k] * batch_size, device=DEVICE, dtype=torch.int64),
    )

    _test_masked_logits(
        rejection_sampler,
        batch_size=batch_size,
        num_draft_tokens=num_draft_tokens,
        vocab_size=vocab_size,
        target_logits=target_logits,
        unmasked_indices=top_k_indices,
        sampling_metadata=sampling_metadata,
    )
@pytest.mark.parametrize("top_p", [0.5, 0.9, 0.99])
def test_top_p(rejection_sampler, top_p):
    """Test rejection sampling with top-p sampling"""
    vocab_size = 100
    batch_size = 100
    num_draft_tokens = 3
    num_tokens = batch_size * num_draft_tokens
    # Create random logits.
    target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE)
    temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE)
    rescaled_logits = target_logits / temperature
    # Sort ascending so the cumulative sum runs over the low-probability
    # tail first: entries whose tail mass is still <= 1 - top_p lie
    # outside the top-p nucleus and must be masked.
    logits_sort, logits_idx = rescaled_logits.sort(dim=-1, descending=False)
    probs_sort = logits_sort.softmax(dim=-1)
    probs_sum = probs_sort.cumsum(dim=-1)
    top_p_mask = probs_sum <= 1 - top_p
    # At least one token (the highest-probability one) always survives.
    top_p_mask[:, -1] = False
    # Get the top-p indices.
    top_p_indices = []
    for i in range(num_tokens):
        top_p_indices.append(logits_idx[i][~top_p_mask[i]].tolist())
    # Create sampling metadata
    sampling_metadata = create_sampling_metadata(
        all_greedy=False,
        temperature=temperature,
        top_p=torch.tensor([top_p] * batch_size, device=DEVICE, dtype=torch.float32),
    )
    _test_masked_logits(
        rejection_sampler,
        batch_size=batch_size,
        num_draft_tokens=num_draft_tokens,
        vocab_size=vocab_size,
        target_logits=target_logits,
        unmasked_indices=top_p_indices,
        sampling_metadata=sampling_metadata,
    )
########################### Tests for Logit Processors ###################
def test_frequency_penalties(rejection_sampler):
    """Rejection sampling with frequency penalties applied."""
    spec_tokens = [[1, 1, 1], [], [1, 1, 1]]
    # The last token of each row is the bonus token (1, 7 and 1).
    output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]]
    num_requests = len(spec_tokens)

    # Rejected positions fall back to token 15.
    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
    metadata = create_sampling_metadata(
        all_greedy=True,
        output_token_ids=[[2], [3], [4]],
        spec_token_ids=spec_tokens,
        prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE),
        frequency_penalties=[1.5, 1.5, 0.7],
        presence_penalties=[0.0] * num_requests,
        repetition_penalties=[1.0] * num_requests,
    )
    bonus_token_tensor = torch.tensor(
        [seq[-1] for seq in output_tokens], device=logits.device
    )
    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
        spec_tokens, device=logits.device
    )
    mock_sampler_output(rejection_sampler, bonus_token_tensor)

    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=None,
        logits=logits,
        sampling_metadata=metadata,
    )

    expected = torch.tensor(
        [[1, 15, -1, -1], [7, -1, -1, -1], [1, 1, 15, -1]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(output.sampled_token_ids, expected)
def test_bad_words(rejection_sampler):
    """Rejection sampling with bad-words constraints applied."""
    spec_tokens = [[1, 2, 3], [1, 15, 3], [1, 2, 3]]
    output_tokens = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]

    # Rejected positions fall back to token 15.
    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
    metadata = create_sampling_metadata(
        all_greedy=True,
        output_token_ids=[[2], [3], [4]],
        spec_token_ids=spec_tokens,
        # Token 2 is banned for requests 0 and 1; request 2 has no
        # bad words at all.
        bad_words_token_ids={0: [[2]], 1: [[2]]},
    )
    bonus_token_tensor = torch.tensor(
        [seq[-1] for seq in output_tokens], device=logits.device
    )
    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
    mock_sampler_output(rejection_sampler, bonus_token_tensor)

    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=None,
        logits=logits,
        sampling_metadata=metadata,
    )

    expected = torch.tensor(
        [[1, 15, -1, -1], [1, 15, 3, 4], [1, 2, 3, 4]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(output.sampled_token_ids, expected)
def test_allowed_token_ids(rejection_sampler):
    """Rejection sampling with an allowed-token-ids mask applied."""
    spec_tokens = [[1, 2, 10], [10, 5, 3], [7, 10, 12]]
    output_tokens = [[1, 2, 10, 5], [10, 5, 10, 5], [7, 10, 12, 5]]
    # Disallowed token ranges per request:
    #   request 0: ids 0-4, request 1: ids 1-5, request 2: ids 2-6.
    num_allowed_token_ids = 5

    # Rejected positions fall back to token 15.
    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
    batch_size = len(output_tokens)
    _, vocab_size = logits.size()
    mask = create_allowed_token_ids(
        batch_size=batch_size,
        vocab_size=vocab_size,
        num_allowed_token_ids=num_allowed_token_ids,
        device=logits.device,
    )
    metadata = create_sampling_metadata(
        all_greedy=True,
        output_token_ids=[[], [], []],
        spec_token_ids=spec_tokens,
        allowed_token_ids_mask=mask,
    )
    bonus_token_tensor = torch.tensor(
        [seq[-1] for seq in output_tokens], device=logits.device
    )
    spec_decode_metadata = create_spec_decode_metadata(spec_tokens, logits)
    mock_sampler_output(rejection_sampler, bonus_token_tensor)

    output = rejection_sampler(
        spec_decode_metadata,
        draft_probs=None,
        logits=logits,
        sampling_metadata=metadata,
    )

    expected = torch.tensor(
        [[15, -1, -1, -1], [10, 5, 10, -1], [7, 10, 12, 5]],
        dtype=torch.int,
        device=logits.device,
    )
    assert torch.equal(output.sampled_token_ids, expected)

View File

@@ -0,0 +1,449 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import pytest
import torch
from tests.v1.sample.utils import create_allowed_token_ids
from vllm.platforms import current_platform
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.sampler import Sampler
# Whether pinned host memory is usable on this platform.
PIN_MEMORY_AVAILABLE = is_pin_memory_available()
MAX_NUM_REQS = 256
VOCAB_SIZE = 1024
# Number of previously generated tokens per request in the fake metadata.
NUM_OUTPUT_TOKENS = 20
# One device when a single accelerator is visible, otherwise two.
CUDA_DEVICES = [
    f"{current_platform.device_type}:{i}"
    for i in range(1 if current_platform.device_count() == 1 else 2)
]
# Prompt lengths are sampled from [1, MAX_NUM_PROMPT_TOKENS).
MAX_NUM_PROMPT_TOKENS = 64
def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
return fake_logits
def _create_penalty_tensor(
batch_size: int, penalty_value: float, device: torch.device
) -> torch.Tensor:
return torch.full(
(batch_size,), fill_value=penalty_value, dtype=torch.float, device=device
)
def _create_prompt_tokens_tensor(
    prompt_token_ids: list[list[int]],
    vocab_size: int,
    device: torch.device,
) -> torch.Tensor:
    """Pad ragged prompt lists into a single int64 tensor.

    ``vocab_size`` is used as the padding value, so padding can never
    collide with a real token id.
    """
    return make_tensor_with_pad(
        prompt_token_ids,
        pad=vocab_size,
        device=device,
        dtype=torch.int64,
        pin_memory=False,
    )
def _create_bad_words_token_ids(
batch_size: int,
vocab_size: int,
bad_words_lengths: tuple[int, ...],
) -> dict[int, list[list[int]]]:
bad_words_token_ids = {}
for batch_idx in range(batch_size):
token_ids_single_batch = []
for bad_words_length in bad_words_lengths:
token_ids = np.random.choice(
vocab_size, size=bad_words_length, replace=True
).tolist()
token_ids_single_batch.append(token_ids)
bad_words_token_ids[batch_idx] = token_ids_single_batch
if batch_size >= 2:
# Test no bad_words for some batch
no_bad_words_batch_idx = np.random.choice(batch_size)
bad_words_token_ids.pop(no_bad_words_batch_idx, None)
return bad_words_token_ids
# Returns all last tokens of bad word sequences that share the same prefix
# as `given_prefix` (excluding the last token).
def _collect_suffixes_with_same_prefix(
given_prefix: list[int], bad_words_token_ids: list[list[int]]
) -> list[int]:
return [bwt[-1] for bwt in bad_words_token_ids if bwt[:-1] == given_prefix]
# generate a valid token id that is not in bad_words_token_ids
def _generate_valid_token_id(
bad_words_token_ids: list[list[int]], vocab_size: int
) -> int:
forbidden_start_tokens = set()
for bad_word in bad_words_token_ids:
forbidden_start_tokens.add(bad_word[0])
# Get a safe token that's not in forbidden starts
safe_token_candidates = list(set(range(vocab_size)) - forbidden_start_tokens)
# Pick a random safe token
return np.random.choice(safe_token_candidates)
def _update_output_token_ids_for_bad_words(
    metadata: SamplingMetadata, vocab_size: int
) -> dict[int, list[int]]:
    """Mutate ``metadata.output_token_ids`` to control bad-word matches.

    For each request, either plant the prefix of one multi-token bad word
    at the end of the output (so its final token must then be banned), or
    overwrite the last output token to rule out accidental prefix matches.
    Returns, per request, the token ids the sampler is expected to mask.
    """
    bad_words_last_tokens = {}
    for batch_idx, bad_words_token_ids in metadata.bad_words_token_ids.items():
        output_token_ids = metadata.output_token_ids[batch_idx]
        bad_words_last_token: list[int] = []
        for i, bad_word_token_ids in enumerate(bad_words_token_ids):
            if len(bad_word_token_ids) == 1:
                # Single token id always affects logits
                bad_words_last_token.append(bad_word_token_ids[0])
            else:
                prefix_length = len(bad_word_token_ids) - 1
                # Randomly decide whether this request's output should end
                # with the bad word's prefix.
                has_bad_words = np.random.choice([True, False])
                if has_bad_words:
                    prefix = bad_word_token_ids[:-1]
                    output_token_ids[-prefix_length:] = prefix
                    # Collect all last tokens from other bad words
                    # that share this prefix
                    bad_words_last_token.extend(
                        _collect_suffixes_with_same_prefix(prefix, bad_words_token_ids)
                    )
                    break  # Maximum one update to output_token_ids
                else:  # Make sure no accidental match to bad words
                    output_token_ids[-1] = _generate_valid_token_id(
                        bad_words_token_ids, vocab_size
                    )
        bad_words_last_tokens[batch_idx] = bad_words_last_token
    return bad_words_last_tokens
def _create_default_sampling_metadata(
    num_output_tokens: int,
    batch_size: int,
    vocab_size: int,
    device: torch.device,
) -> SamplingMetadata:
    """Build greedy SamplingMetadata with random prompts and outputs.

    Penalties start at their no-op values (0.0 / 0.0 / 1.0) with
    ``no_penalties=True``; individual tests flip fields afterwards.
    """
    output_token_ids: list[list[int]] = []
    prompt_token_ids: list[list[int]] = []
    for _ in range(batch_size):
        output_token_ids.append(
            np.random.randint(0, vocab_size, size=num_output_tokens).tolist()
        )
        # Prompt length varies per request in [1, MAX_NUM_PROMPT_TOKENS).
        prompt_token_ids.append(
            np.random.randint(
                0, vocab_size, size=np.random.randint(1, MAX_NUM_PROMPT_TOKENS)
            ).tolist()
        )
    fake_sampling_metadata = SamplingMetadata(
        temperature=torch.full((batch_size,), 0.0),
        all_greedy=True,
        all_random=False,
        top_p=None,
        top_k=None,
        generators={},
        max_num_logprobs=0,
        prompt_token_ids=_create_prompt_tokens_tensor(
            prompt_token_ids, vocab_size, device
        ),
        output_token_ids=output_token_ids,
        spec_token_ids=[[] for _ in range(batch_size)],
        frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
        presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
        repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
        no_penalties=True,
        allowed_token_ids_mask=None,
        bad_words_token_ids={},
        logitsprocs=LogitsProcessors(),
    )
    return fake_sampling_metadata
def _create_weighted_output_token_list(
batch_size: int, vocab_size: int
) -> tuple[list[list[int]], list[list[int]]]:
"""
Creates an output token list where each token occurs a distinct
number of times.
For each batch, a random subset of token IDs is selected from the
vocabulary. The selected tokens are then added to the output token
list, each with a different frequency.
Returns:
tuple[list[list[int]], list[list[int]]]:
- The first element is the output token list, where each sublist
corresponds to a batch and contains tokens with weighted
frequencies.
- The second element is a list of distinct token IDs for each
batch, ordered by their frequency in the corresponding output
list.
"""
output_token_ids: list[list[int]] = []
sorted_token_ids_in_output: list[list[int]] = []
for _ in range(batch_size):
distinct_token_ids = np.random.choice(
vocab_size, size=np.random.randint(1, 10), replace=False
).tolist()
sorted_token_ids_in_output.append(distinct_token_ids)
output_token_ids_for_batch = []
for index, token_id in enumerate(distinct_token_ids):
output_token_ids_for_batch.extend([token_id for _ in range(index + 1)])
output_token_ids.append(output_token_ids_for_batch)
return output_token_ids, sorted_token_ids_in_output
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
def test_sampler_presence_penalty(
    device: str, batch_size: int, presence_penalty: float
):
    """Presence penalty must demote tokens already present in the output
    (or promote them, for a negative penalty) relative to unseen tokens.
    """
    torch.set_default_device(device)
    # Uniform starting logits: any ordering after penalization is
    # attributable to the penalty alone.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    output_token_ids = sampling_metadata.output_token_ids
    sampling_metadata.presence_penalties = _create_penalty_tensor(
        batch_size, presence_penalty, torch.device(device)
    )
    sampling_metadata.no_penalties = False

    sampler = Sampler()
    logits = sampler.apply_penalties(
        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
    ).cpu()

    for batch_idx in range(batch_size):
        # With uniform starting logits, argmax is a non-penalized token
        # and argmin a penalized one.
        non_penalized_token_id = logits[batch_idx].argmax().item()
        penalized_token_id = logits[batch_idx].argmin().item()
        seen = output_token_ids[batch_idx]
        if presence_penalty > 0:
            # Positive penalty: previously seen tokens are pushed down.
            assert penalized_token_id in seen
            assert non_penalized_token_id not in seen
        elif presence_penalty < 0:
            # Negative penalty: previously seen tokens are boosted.
            assert non_penalized_token_id in seen
            assert penalized_token_id not in seen
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
def test_sampler_frequency_penalty(
    device: str, batch_size: int, frequency_penalty: float
):
    """Frequency penalty must scale with how often a token has occurred."""
    torch.set_default_device(device)
    # Uniform starting logits so the penalty is the only differentiator.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    sampling_metadata.frequency_penalties = _create_penalty_tensor(
        batch_size, frequency_penalty, torch.device(device)
    )
    # Each request gets distinct tokens whose occurrence counts strictly
    # increase along `sorted_token_ids_in_output`.
    output_token_ids, sorted_token_ids_in_output = _create_weighted_output_token_list(
        batch_size,
        VOCAB_SIZE,
    )
    sampling_metadata.output_token_ids = output_token_ids
    sampling_metadata.no_penalties = False

    sampler = Sampler()
    logits = sampler.apply_penalties(
        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
    ).cpu()

    for batch_idx in range(batch_size):
        non_penalized_token_id = logits[batch_idx].argmax().item()
        penalized_token_id = logits[batch_idx].argmin().item()
        distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[batch_idx]
        most_frequent_token_id = distinct_sorted_token_ids_in_output[-1]
        if frequency_penalty > 0:
            # New tokens preferred: the most frequent token is hit hardest
            # and the top token must be unseen.
            assert non_penalized_token_id not in distinct_sorted_token_ids_in_output
            assert penalized_token_id == most_frequent_token_id
        elif frequency_penalty < 0:
            # Existing tokens preferred: the most frequent token wins and
            # the bottom token must be unseen.
            assert non_penalized_token_id == most_frequent_token_id
            assert penalized_token_id not in distinct_sorted_token_ids_in_output
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
def test_sampler_repetition_penalty(
    device: str, batch_size: int, repetition_penalty: float
):
    """Repetition penalty must act on tokens seen in the prompt or output."""
    torch.set_default_device(device)
    # Uniform starting logits so the penalty is the only differentiator.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    sampling_metadata.repetition_penalties = _create_penalty_tensor(
        batch_size, repetition_penalty, torch.device(device)
    )
    sampling_metadata.no_penalties = False

    sampler = Sampler()
    logits = sampler.apply_penalties(
        fake_logits, sampling_metadata, sampling_metadata.output_token_ids
    ).cpu()

    for batch_idx in range(batch_size):
        non_penalized_token_id = logits[batch_idx].argmax().item()
        penalized_token_id = logits[batch_idx].argmin().item()
        prompt_tokens = sampling_metadata.prompt_token_ids[batch_idx][:].tolist()
        output_tokens = sampling_metadata.output_token_ids[batch_idx]
        seen = set(prompt_tokens) | set(output_tokens)
        if repetition_penalty > 1.0:
            # Penalty > 1 discourages repetition: the winner is unseen
            # and the loser appeared in the prompt or output.
            assert non_penalized_token_id not in seen
            assert penalized_token_id in seen
        elif repetition_penalty < 1.0:
            # Penalty < 1 encourages repetition: the roles are reversed.
            assert penalized_token_id not in seen
            assert non_penalized_token_id in seen
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2])
def test_sampler_allowed_token_ids(
    device: str, batch_size: int, num_allowed_token_ids: int
):
    """
    Test to verify that the allowed-token-ids mask sets masked tokens'
    logits to -inf and leaves every other logit untouched.
    """
    torch.set_default_device(device)
    # Create fake logits where each token is assigned the same
    # logit value.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    mask = create_allowed_token_ids(
        batch_size=batch_size,
        vocab_size=VOCAB_SIZE,
        num_allowed_token_ids=num_allowed_token_ids,
        device=device,
    )
    sampling_metadata.allowed_token_ids_mask = mask
    sampler = Sampler()
    logits = sampler.apply_logits_processors(
        fake_logits, sampling_metadata, predict_bonus_token=False
    )
    logits = logits.cpu()
    for batch_idx in range(batch_size):
        logits_for_req = logits[batch_idx]
        if batch_idx % 2 == 1:
            # NOTE(review): this assumes create_allowed_token_ids leaves
            # odd-indexed requests unmasked — confirm against its
            # definition in tests/v1/sample/utils.py.
            assert torch.all(logits_for_req != -float("inf"))
            continue
        for token_id in range(VOCAB_SIZE):
            # Even-indexed request i is expected to mask token ids in
            # [i, i + num_allowed_token_ids), clamped to the vocab range.
            start = min(batch_idx, VOCAB_SIZE - 1)
            end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1)
            if token_id >= start and token_id < end:
                assert logits_for_req[token_id] == -float("inf"), (
                    f"{batch_idx}, {token_id}"
                )
            else:
                assert logits_for_req[token_id] != -float("inf")
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("batch_size", [1, 2, 32])
@pytest.mark.parametrize("bad_words_lengths", [(1,), (1, 3), (2, 2)])
def test_sampler_bad_words(
    device: str, batch_size: int, bad_words_lengths: tuple[int, ...]
):
    """
    Test to verify that when the bad words restriction is present, tokens
    are penalized based on their match with the bad words.
    """
    torch.set_default_device(device)
    # Create fake logits where each token is assigned the same
    # logit value.
    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
    sampling_metadata = _create_default_sampling_metadata(
        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)
    )
    sampling_metadata.bad_words_token_ids = _create_bad_words_token_ids(
        batch_size, VOCAB_SIZE, bad_words_lengths
    )
    # Plant bad-word prefixes in the outputs and collect, per request,
    # the token ids whose logits must be masked to -inf.
    bad_words_last_tokens = _update_output_token_ids_for_bad_words(
        sampling_metadata, VOCAB_SIZE
    )
    sampler = Sampler()
    logits = sampler.apply_logits_processors(
        fake_logits, sampling_metadata, predict_bonus_token=False
    )
    logits = logits.cpu()
    for batch_idx in range(batch_size):
        logits_for_req = logits[batch_idx]
        for token_id in range(VOCAB_SIZE):
            # Only the expected continuation tokens of matched bad words
            # may be masked; everything else must stay finite.
            if (
                batch_idx in bad_words_last_tokens
                and token_id in bad_words_last_tokens[batch_idx]
            ):
                assert logits_for_req[token_id] == -float("inf")
            else:
                assert logits_for_req[token_id] != -float("inf")

View File

@@ -0,0 +1,190 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import LLM, SamplingParams
# Tiny random model keeps these smoke tests fast; output quality is
# irrelevant — only sampling-parameter plumbing is under test.
MODEL = "hmellor/tiny-random-LlamaForCausalLM"
PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def llm() -> LLM:
    # One engine shared by the whole module; eager mode skips compilation.
    return LLM(MODEL, enforce_eager=True)
def test_n_gt_1(llm):
    """Parallel sampling (n > 1) yields one completion per sample."""
    sampling_params = SamplingParams(n=3)
    outputs = llm.generate(PROMPT, sampling_params)
    completions = outputs[0].outputs
    assert len(completions) == 3
def test_penalties(llm):
    """A request combining every penalty/filtering knob must not raise."""
    sampling_params = SamplingParams(
        temperature=1.2,
        presence_penalty=1.2,
        frequency_penalty=1.2,
        repetition_penalty=1.2,
        min_p=0.5,
        top_p=0.5,
        top_k=3,
    )
    llm.generate(PROMPT, sampling_params)
def test_stop(llm):
    """Generation must halt at a stop word, with or without keeping it."""
    STOP_IDX = 5
    # Generate once unconstrained to discover a word we can stop on.
    baseline = llm.generate(PROMPT, SamplingParams(temperature=0))
    words = baseline[0].outputs[0].text.split()

    # By default the stop string itself is excluded from the output.
    stopped = llm.generate(
        PROMPT, SamplingParams(temperature=0, stop=words[STOP_IDX])
    )
    assert len(stopped[0].outputs[0].text.split()) == STOP_IDX

    # With include_stop_str_in_output the stop word is kept.
    stopped = llm.generate(
        PROMPT,
        SamplingParams(
            temperature=0, stop=words[STOP_IDX], include_stop_str_in_output=True
        ),
    )
    assert len(stopped[0].outputs[0].text.split()) == STOP_IDX + 1
def test_stop_token_ids(llm):
    """Generation must halt on whichever stop token id is produced first."""
    baseline = llm.generate(PROMPT, SamplingParams(temperature=0))
    token_ids = baseline[0].outputs[0].token_ids
    stop_token_id_0 = token_ids[5]
    stop_token_id_1 = token_ids[6]

    # Regardless of the order the stop ids are listed in, generation must
    # end at the id produced first (position 5 precedes position 6).
    for stop_token_ids in (
        [stop_token_id_1, stop_token_id_0],
        [stop_token_id_0, stop_token_id_1],
    ):
        params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids)
        output = llm.generate(PROMPT, params)
        assert output[0].outputs[0].token_ids[-1] == stop_token_id_0
def test_detokenize_false(llm):
    """Check that detokenize=False suppresses text while keeping token ids."""
    plain = llm.generate(PROMPT, SamplingParams(detokenize=False))
    assert len(plain[0].outputs[0].token_ids) > 0
    assert len(plain[0].outputs[0].text) == 0

    with_logprobs = llm.generate(
        PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)
    )
    completion = with_logprobs[0].outputs[0]
    assert len(completion.token_ids) > 0
    assert len(completion.text) == 0

    prompt_lps = with_logprobs[0].prompt_logprobs
    sample_lps = completion.logprobs
    assert len(prompt_lps) > 1
    assert len(sample_lps) > 1
    # The first prompt position's entry is skipped, as in the len check above.
    for per_token in (*prompt_lps[1:], *sample_lps):
        # logprobs=3 yields the top-3 plus possibly the chosen token itself.
        assert 3 <= len(per_token) <= 4
        # detokenize=False must leave every logprob token undecoded.
        assert all(entry.decoded_token is None for entry in per_token.values())
def test_bad_words(llm):
    """Check that we respect bad words."""
    tokenizer = llm.get_tokenizer()

    def appears_in_output(text: str, token_ids: list[int], word: str) -> bool:
        """True iff `word` shows up in the text AND as a token subsequence."""
        if word not in text:
            return False
        # Try both the bare encoding and the leading-space encoding, since
        # tokenizers may emit different ids depending on preceding whitespace.
        for with_space in (False, True):
            prefix = " " if with_space else ""
            encoded = tokenizer.encode(
                prefix + word.lstrip(), add_special_tokens=False
            )
            if not encoded:
                continue
            span = len(encoded)
            if any(
                token_ids[start : start + span] == encoded
                for start in range(len(token_ids) - span + 1)
            ):
                return True
        return False

    baseline = llm.generate(PROMPT, SamplingParams(temperature=0))
    leading_words = baseline[0].outputs[0].text.split()
    banned_phrase = " ".join(leading_words[:2])

    out = llm.generate(
        PROMPT, SamplingParams(temperature=0, bad_words=[banned_phrase])
    )
    text = out[0].outputs[0].text
    tokens = out[0].outputs[0].token_ids
    assert not appears_in_output(text, tokens, banned_phrase)

    # Also ban the last word of the new output and regenerate.
    banned_word = text.split()[-1]
    out = llm.generate(
        PROMPT,
        SamplingParams(temperature=0, bad_words=[banned_phrase, banned_word]),
    )
    text = out[0].outputs[0].text
    tokens = out[0].outputs[0].token_ids
    assert not appears_in_output(text, tokens, banned_phrase)
    assert not appears_in_output(text, tokens, banned_word)
def test_logits_processor(llm):
    """Check that per-request logits processors are rejected."""

    # Sample processor that would force token i at step i (output would be
    # [0, 1, 2, ...]); the engine is expected to refuse it with ValueError.
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    with pytest.raises(ValueError):
        llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith]))
def test_allowed_token_ids(llm):
    """Check that we can use allowed_token_ids."""
    TOKEN_ID = 10
    out = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[TOKEN_ID]))
    assert out[0].outputs[0].token_ids[-1] == TOKEN_ID

    # Empty, negative, and out-of-vocabulary id lists are all rejected.
    for invalid_ids in ([], [-1], [10000000]):
        with pytest.raises(ValueError):
            llm.generate(PROMPT, SamplingParams(allowed_token_ids=invalid_ids))
def test_seed(llm):
    """Check that seed impacts randomness."""
    first = llm.generate(PROMPT, SamplingParams(seed=42))[0].outputs[0].text
    repeat = llm.generate(PROMPT, SamplingParams(seed=42))[0].outputs[0].text
    other = llm.generate(PROMPT, SamplingParams(seed=43))[0].outputs[0].text
    # Same seed reproduces the sample; a different seed should diverge.
    assert first == repeat
    assert first != other

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from torch import Generator
from vllm.platforms import current_platform
from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
DEVICE = current_platform.device_type
BATCH_SIZE = 1024
VOCAB_SIZE = 128 * 1024
@pytest.fixture(autouse=True)
def reset_default_device():
    """Restore torch's default device after each test.

    Tests in this file call torch.set_default_device, which would otherwise
    leak into subsequent tests.
    """
    saved_device = torch.get_default_device()
    yield
    torch.set_default_device(saved_device)
def test_topk_impl_equivalence():
    """Top-k-only path must match the fused top-k + no-op top-p path."""
    torch.set_default_device(DEVICE)
    rng = Generator(device=DEVICE).manual_seed(33)
    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=rng)

    # Random top-k in [1, 9], with ~half the batch disabled (k == vocab size).
    k = torch.randint(1, 10, (BATCH_SIZE,), generator=rng)
    disabled = torch.randint(0, 2, (BATCH_SIZE,), generator=rng, dtype=bool)
    k = k.masked_fill(disabled, VOCAB_SIZE)

    # p=None exercises the top-k-only implementation; p=1.0 is a no-op top-p.
    topk_only = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
    fused = apply_top_k_top_p(logits=logits.clone(), k=k, p=torch.tensor([1.0]))
    assert torch.allclose(topk_only, fused)
def test_flashinfer_sampler():
    """
    Verify that FlashInfer's top-k and top-p renormalization matches the
    Python implementation.

    NOTE: FlashInfer did not directly expose an interface for fused top-k and
    top-p prob renorm (it did provide fused sampling but we cannot compare
    sampling results due to randomness), so we compare the probabilities
    renormed consecutively by top-k and then top-p of the FlashInfer
    implementation.
    """
    try:
        from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
    except ImportError:
        pytest.skip("FlashInfer not installed or not available on this platform.")
    if not current_platform.is_cuda():
        pytest.skip("FlashInfer not installed or not available on this platform.")

    torch.set_default_device(DEVICE)
    rng = Generator(device=DEVICE).manual_seed(42)

    # Random logits plus per-request k in [1, 1000) and p in [0.5, 1.0].
    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=rng)
    k = torch.randint(1, 1000, (BATCH_SIZE,), generator=rng)
    p = torch.rand((BATCH_SIZE,), generator=rng) * 0.5 + 0.5

    # Randomly disable top-k (k == vocab size) and top-p (p == 1.0) so both
    # the filtered and the pass-through paths are exercised.
    k.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=rng, dtype=torch.bool),
        VOCAB_SIZE,
    )
    p.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=rng, dtype=torch.bool), 1.0
    )

    # Reference: Python top-k/top-p on logits, then softmax to probabilities.
    reference_probs = torch.softmax(
        apply_top_k_top_p(logits=logits.clone(), k=k, p=p), dim=-1
    )

    # FlashInfer renorms probabilities, so convert to probs first, then apply
    # its top-k renorm followed by its top-p renorm.
    fi_probs = torch.softmax(logits.clone(), dim=-1)
    fi_probs = top_k_renorm_probs(probs=fi_probs, top_k=k)
    fi_probs = top_p_renorm_probs(probs=fi_probs, top_p=p)

    assert torch.allclose(reference_probs, fi_probs, atol=2e-2), (
        "FlashInfer and Python sampling implementations do not match!"
    )

237
tests/v1/sample/utils.py Normal file
View File

@@ -0,0 +1,237 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterator
from enum import Enum
from typing import NamedTuple
import regex as re
import torch
from vllm import CompletionOutput
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor
from vllm.v1.sample.metadata import SamplingMetadata
class BatchLogprobsComposition(Enum):
    """Types of logprobs configs to include in test batch"""

    # No requests in the batch ask for sample or prompt logprobs.
    NONE = 0
    # Mix of sample-logprobs-only requests and no-logprobs requests.
    SAMPLE = 1
    # Mix of prompt-logprobs-only requests and no-logprobs requests.
    PROMPT = 2
    # Mix of requests with both, either, or neither kind of logprobs.
    SAMPLE_PROMPT = 3
# Per-request logprobs spec: (num_sample_logprobs, num_prompt_logprobs),
# where None disables that logprobs type for the request.
BatchLogprobsSpecType = list[tuple[int | None, int | None]]
def get_test_batch(
    batch_logprobs_composition: BatchLogprobsComposition,
) -> BatchLogprobsSpecType:
    """Generate logprobs configs for a batch of requests.

    Each request's config is a (num_sample_logprobs, num_prompt_logprobs)
    tuple, where None disables that logprobs type. The composition selects
    which mixtures appear in the batch:

    * NONE: no request asks for sample or prompt logprobs
    * SAMPLE: sample-logprobs-only requests mixed with no-logprobs requests
    * PROMPT: prompt-logprobs-only requests mixed with no-logprobs requests
    * SAMPLE_PROMPT: every combination (neither, sample-only, prompt-only,
      both sample and prompt logprobs)

    Args:
        batch_logprobs_composition: types of logprobs configs to include in
            the batch

    Returns:
        list of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs])
        tuples

    Raises:
        ValueError: if the composition is not a recognized member
    """
    per_composition: dict[BatchLogprobsComposition, BatchLogprobsSpecType] = {
        BatchLogprobsComposition.NONE: [
            (None, None),
        ],
        BatchLogprobsComposition.SAMPLE: [
            (None, None),
            (0, None),
            (5, None),
            (3, None),
        ],
        BatchLogprobsComposition.PROMPT: [
            (None, None),
            (None, 0),
            (None, 6),
            (None, 5),
        ],
        BatchLogprobsComposition.SAMPLE_PROMPT: [
            (None, None),
            (0, None),
            (5, None),
            (3, None),
            (0, 3),
            (6, 0),
            (6, 3),
            (None, 6),
            (None, 5),
            (None, 0),
        ],
    }
    try:
        # Return a fresh list so callers can safely mutate their copy.
        return list(per_composition[batch_logprobs_composition])
    except KeyError:
        raise ValueError("Invalid logprobs batch configuration for test.") from None
def assert_incr_detok_str_matches_non_incr_detok_str(
    incremental_detokenization_str: str,
    non_incremental_detokenization_str: str,
    msg: str,
) -> None:
    """Compare incrementally detok. text to non-incrementally detok. text.

    Fails if the two strings differ once all non-alphanumeric characters are
    stripped out.

    Rationale: incremental detokenization lets the tokenizer adapt each
    token's text to its context, while logprobs detokenization decodes each
    token in isolation and may emit placeholder characters where whitespace
    would be. Comparing only the alphanumeric content therefore validates
    logprobs detokenization without false mismatches on spacing.

    Args:
        incremental_detokenization_str: incrementally-detokenized generated
            text
        non_incremental_detokenization_str: non-incrementally-detokenized
            logprob tokens
        msg: error message if `assert` fails
    """
    strip_non_alnum = re.compile(r"[^a-zA-Z0-9]+")
    lhs = strip_non_alnum.sub("", incremental_detokenization_str)
    rhs = strip_non_alnum.sub("", non_incremental_detokenization_str)
    assert lhs == rhs, msg
def compute_correct_cumulative_logprob(completion_output: CompletionOutput) -> float:
    """Compute known-good value for evaluating cumulative logprob.

    Sums the logprob of each sampled token, looked up in that position's
    logprobs dict.

    Args:
        completion_output: completion output from engine

    Returns:
        Known-good cumulative logprob value
    """
    per_position_logprobs = completion_output.logprobs
    assert per_position_logprobs is not None
    total = 0
    for token_id, position_logprobs in zip(
        completion_output.token_ids, per_position_logprobs
    ):
        total += position_logprobs[token_id].logprob
    return total
def create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
    """Return a (batch_size, vocab_size) float32 tensor uniformly filled with 1e-2."""
    return torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
def create_penalty_tensor(
    batch_size: int, penalty_value: float, device: torch.device
) -> torch.Tensor:
    """Broadcast one penalty value into a (batch_size,) float tensor on `device`."""
    shape = (batch_size,)
    return torch.full(
        shape, fill_value=penalty_value, dtype=torch.float, device=device
    )
def create_prompt_tokens_tensor(
    prompt_token_ids: list[list[int]],
    vocab_size: int,
    device: torch.device,
) -> torch.Tensor:
    """Pad variable-length prompts into a single int64 tensor on `device`."""
    # Use vocab_size as the pad id: one past the largest valid token id.
    pad_token_id = vocab_size
    return make_tensor_with_pad(
        prompt_token_ids,
        pad=pad_token_id,
        dtype=torch.int64,
        device=device,
        pin_memory=False,
    )
class LogitsprocsTestFakes(NamedTuple):
    """Wraps fake data structures to support testing"""

    # Fake logits tensor fed through the logits processors.
    logits: torch.Tensor
    # Fake sampling metadata carrying the logits processors under test.
    sampling_metadata: SamplingMetadata

    def get_logitsprocs_by_cls(
        self,
        cls: type[LogitsProcessor],
    ) -> Iterator[LogitsProcessor]:
        """Yield only the logits processors that are instances of `cls`.

        Args:
            cls: :class:`LogitsProcessor` subclass

        Returns:
            Iterator over matching logits processors
        """
        all_procs = self.sampling_metadata.logitsprocs.all
        return (proc for proc in all_procs if isinstance(proc, cls))

    def get_logitsprocs(self) -> Iterator[LogitsProcessor]:
        """Iterator over all logits processors."""
        return self.sampling_metadata.logitsprocs.all
def fake_update_logitsprocs_state(
    test_fakes: LogitsprocsTestFakes,
    batch_update: BatchUpdate,
) -> None:
    """Imitate the engine core pushing a persistent-batch state update
    to every logits processor."""
    for processor in test_fakes.get_logitsprocs():
        processor.update_state(batch_update)
def fake_apply_logitsprocs(
    test_fakes: LogitsprocsTestFakes,
    slice_indices: list[int],
) -> torch.Tensor:
    """Imitate application of logits processors in engine core.

    Selects the rows given by `slice_indices` (cloned, so the fake logits
    stay pristine) and runs every processor over them in order.
    """
    row_index = torch.tensor(slice_indices, dtype=torch.long)
    out = test_fakes.logits[row_index].clone()
    for processor in test_fakes.get_logitsprocs():
        out = processor.apply(out)
    return out
def create_allowed_token_ids(
batch_size: int,
vocab_size: int,
num_allowed_token_ids: int,
device: torch.device,
) -> torch.Tensor | None:
mask: torch.Tensor | None = None
for i in range(batch_size):
if i % 2 == 1:
continue
if mask is None:
mask = torch.zeros(
(batch_size, vocab_size), dtype=torch.bool, device=device
)
start = min(i, vocab_size - 1)
end = min(i + num_allowed_token_ids, vocab_size - 1)
mask[i, start:end] = True
return mask