Sync from v0.13
tests/v1/core/test_scheduler_e2e.py | 37 (new file)
@@ -0,0 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import LLM

MODEL = "hmellor/tiny-random-LlamaForCausalLM"
PROMPT = "Hello my name is Robert and I"


@pytest.fixture(scope="module")
def llm() -> LLM:
    return LLM(
        MODEL,
        enforce_eager=True,
        enable_prefix_caching=True,
        long_prefill_token_threshold=2,
        max_num_batched_tokens=6,
        max_num_seqs=3,
        block_size=16,
    )


def test_concurrent_partial_prefill(llm):
    outputs = llm.generate([PROMPT] * 3)
    assert len(outputs) == 3
    for output in outputs:
        assert len(output.outputs) == 1


def test_prefix_cache_stats_is_recorded(llm):
    # A 17-token prompt ensures the first 16 tokens are cached in one full block
    input_tokens = {"prompt_token_ids": [101] * 17}
    _ = llm.generate([input_tokens])
    outputs = llm.generate([input_tokens])
    assert outputs[0].num_cached_tokens == 16
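A note on the fixture settings (editorial context, not part of the diff): max_num_batched_tokens=6 caps each scheduler step at six prompt tokens, and long_prefill_token_threshold=2 caps how many prefill tokens a single request may claim per step, so all three requests in test_concurrent_partial_prefill advance in small chunks rather than one request monopolizing the batch. A simplified sketch of that capping behavior, assuming chunks are clipped first to the threshold and then to the remaining per-step budget (a model of the effect, not the scheduler's actual code):

def schedule_prefill_chunks(remaining, budget=6, threshold=2):
    # remaining: prompt tokens still to prefill for each running request.
    # Each request's chunk is clipped to the threshold, then to what is
    # left of the per-step token budget.
    chunks = []
    for tokens_left in remaining:
        chunk = min(tokens_left, threshold, budget)
        chunks.append(chunk)
        budget -= chunk
    return chunks

# Three prompts of eight tokens each advance two tokens per step.
assert schedule_prefill_chunks([8, 8, 8]) == [2, 2, 2]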
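Similarly, the prefix-cache assertion follows from block granularity: with block_size=16, only complete blocks are reusable from the cache, so a 17-token prompt leaves exactly one full block cached and the repeat call reports num_cached_tokens == 16. The arithmetic, as a sketch:

def full_block_tokens(prompt_len, block_size=16):
    # Only tokens covering complete blocks can be served from the
    # prefix cache; the trailing partial block is recomputed.
    return (prompt_len // block_size) * block_size

assert full_block_tokens(17) == 16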