init

2026-01-09 13:34:11 +08:00
parent dfa6476b58
commit b2ef04d792
538 changed files with 105693 additions and 2 deletions
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -0,0 +1,124 @@
+import pytest
+import torch
+
+from tests.conftest import VllmRunner
+from vllm import SamplingParams
+
+MODELS = ["facebook/opt-125m"]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
+@pytest.mark.parametrize("num_top_logprobs", [6])  # 32000 == vocab_size
+def test_get_prompt_logprobs(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype,
+    chunked_prefill_token_size: int,
+    num_top_logprobs: int,
+    example_prompts,
+):
+    max_num_seqs = 256
+    enable_chunked_prefill = False
+    max_num_batched_tokens = None
+    if chunked_prefill_token_size != -1:
+        enable_chunked_prefill = True
+        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
+        max_num_batched_tokens = chunked_prefill_token_size
+
+    max_tokens = 5
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_logprobs = hf_model.generate_greedy_logprobs(
+        example_prompts,
+        max_tokens=max_tokens,
+    )
+    del hf_model
+
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_logprobs=num_top_logprobs,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                          logprobs=num_top_logprobs,
+                                          prompt_logprobs=num_top_logprobs,
+                                          temperature=0.0)
+    vllm_results = vllm_model.model.generate(
+        example_prompts, sampling_params=vllm_sampling_params)
+
+    # Test whether logprobs are included in the results.
+    for result in vllm_results:
+        assert result.prompt_logprobs is not None
+        assert result.outputs[0].logprobs is not None
+        assert len(result.outputs[0].logprobs) == max_tokens
+        for logprobs in result.outputs[0].logprobs:
+            assert len(logprobs) == num_top_logprobs
+        output_text = result.outputs[0].text
+        output_string_from_most_likely_tokens = []
+        for top_logprobs in result.outputs[0].logprobs:
+            top_logprob = next(iter(top_logprobs.values()))
+            output_string_from_most_likely_tokens.append(
+                top_logprob.decoded_token)
+        output_string_from_most_likely_tokens = "".join(
+            output_string_from_most_likely_tokens)
+        assert output_text == output_string_from_most_likely_tokens, (
+            "The output text from the top logprob for each token position "
+            "should be the same as the output text in the result.")
+
+        # The first prompt logprob is always None
+        assert result.prompt_logprobs[0] is None
+        for prompt_logprobs in result.prompt_logprobs[1:]:
+            # If the prompt token is not included in the top X
+            # logprob, it can return 1 more data
+            assert (len(prompt_logprobs) == num_top_logprobs
+                    or len(prompt_logprobs) == num_top_logprobs + 1)
+
+    # Test whether prompt logprobs are consistent with HF
+    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
+        # Check prompt logprobs
+        # The first prompt logprob is always None, so we compare it from 1:.
+        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
+        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
+            for token_id, logprob in vllm_prompt_logprob_dict.items():
+                torch.testing.assert_close(logprob.logprob,
+                                           hf_logprob[0][i][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
+        for i, top_logprobs in enumerate(vllm_sample_logprobs):
+            for token_id, sample_logprob in top_logprobs.items():
+                logprob = sample_logprob.logprob
+                torch.testing.assert_close(logprob,
+                                           hf_logprob[i][-1][token_id].item(),
+                                           atol=1e-2,
+                                           rtol=1e-2)
+                assert isinstance(sample_logprob.decoded_token, str), (
+                    "The token should be decoded by the time it is returned "
+                    " to the user.")
+
+    # Test if prompt logprobs are correctly set.
+    for vllm_result in vllm_results:
+        token_ids = vllm_result.prompt_token_ids
+        prompt_logprobs = vllm_result.prompt_logprobs
+
+        # The first token doesn't have logprob.
+        assert prompt_logprobs[0] is None
+
+        for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
+            assert token_id in logprob_dict
+
+
+def test_max_logprobs():
+    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
+    vllm_sampling_params = SamplingParams(logprobs=1)
+    # should pass
+    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
+
+    bad_sampling_params = SamplingParams(logprobs=2)
+    with pytest.raises(ValueError):
+        runner.generate(["Hello world"], sampling_params=bad_sampling_params)