Sync from v0.13

2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+
+import pytest
+
+from vllm import LLM
+from vllm.platforms import current_platform
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
+def test_sampler_different(model_name: str):
+    """
+    Test significantly different sampling params to assert the model produces
+    different results.
+    """
+    llm = LLM(
+        model_name,
+        enforce_eager=False,
+        max_num_seqs=1,
+        max_model_len=512,
+        max_num_batched_tokens=256,
+    )
+    prompts = ["Write a short story about a robot that dreams for the first time."]
+    sampling_params = SamplingParams(temperature=0.9, min_p=0.2, max_tokens=64)
+    output = llm.generate(prompts, sampling_params)
+
+    sampling_params = SamplingParams(temperature=0.1, min_p=0.8, max_tokens=64)
+    output2 = llm.generate(prompts, sampling_params)
+    assert output[0].outputs[0].text != output2[0].outputs[0].text
+
+    with pytest.raises(ValueError):
+        # Unsupported `seed` param.
+        sampling_params = SamplingParams(temperature=0.3, seed=42)
+        output2 = llm.generate(prompts, sampling_params)
+
+    # Batch-case with TopK/P
+    for B in [4, 16]:
+        p = prompts * B
+        sampling_params = [
+            SamplingParams(
+                temperature=0.1,
+                min_p=0.8,
+                max_tokens=64,
+                # Vary number of ks
+                top_k=random.randint(4, 12),
+                top_p=random.random(),
+            )
+            for _ in range(B)
+        ]
+        # Make sure first two reqs have the same K/P
+        sampling_params[0] = sampling_params[1]
+        output = llm.generate(p, sampling_params)
+        # There are natural numerical instabilities that make it difficult
+        # to have deterministic results over many tokens, tests the first ~20
+        # tokens match.
+        assert output[0].outputs[0].text[:20] == output[1].outputs[0].text[:20]
+
+
+@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
+# TODO TPU will appear busy if we fan-out test params here
+@pytest.mark.parametrize("n_prompts", [1])
+@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
+def test_logprobs(model_name: str, n_prompts: int):
+    """
+    Request top logprobs with different sampling settings and check
+    that results contains the requested number, ordered ascendingly.
+    """
+
+    def check_num_logprobs(logprobs, expected_num: int):
+        for step in logprobs:
+            prev_logp = 1.0
+            # order by rank
+            sorted_step = dict(sorted(step.items(), key=lambda item: item[1].rank))
+
+            # Can contain the sampled token
+            assert len(step) == expected_num or len(step) == expected_num + 1
+            # Check results are ordered by prob value
+            for rankno, (tid, logp) in enumerate(sorted_step.items()):
+                assert logp.logprob <= prev_logp
+                prev_logp = logp.logprob
+                assert logp.rank == rankno + 1
+
+    llm = LLM(
+        model_name,
+        enforce_eager=False,
+        max_num_seqs=1,
+        max_model_len=128,
+        max_num_batched_tokens=128,
+    )
+    prompts = [
+        "Write a short story about a robot that dreams for the first time."
+    ] * n_prompts
+    greedy_sampling_params = SamplingParams(temperature=0.0, max_tokens=64, logprobs=4)
+    regular_sampling_params = SamplingParams(temperature=0.4, max_tokens=64, logprobs=4)
+    topkp_sampling_params = SamplingParams(
+        temperature=0.4, max_tokens=64, logprobs=4, top_k=12, top_p=0.5
+    )
+
+    for sp in [greedy_sampling_params, regular_sampling_params, topkp_sampling_params]:
+        output = llm.generate(prompts, sp)
+        for o in output:
+            check_num_logprobs(o.outputs[0].logprobs, 4)