Sync from v0.13

This commit is contained in:
2026-01-19 10:38:50 +08:00
parent b2ef04d792
commit 5aef6c175a
3714 changed files with 854317 additions and 89342 deletions

View File

View File

@@ -0,0 +1,249 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import deque
import pytest
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import RequestStatus
from vllm.v1.utils import ConstantList
from .utils import create_requests, create_scheduler
pytestmark = pytest.mark.cpu_test
def _make_model_runner_output(
    scheduler_output: SchedulerOutput,
) -> ModelRunnerOutput:
    """Fabricate a minimal ModelRunnerOutput for the scheduled requests.

    Each scheduled request receives exactly one sampled token (its batch
    index), which is enough to drive the scheduler's bookkeeping in tests.
    """
    scheduled_ids = [req_id for req_id in scheduler_output.num_scheduled_tokens]
    index_by_id: dict[str, int] = {}
    sampled: list[list[int]] = []
    for idx, req_id in enumerate(scheduled_ids):
        index_by_id[req_id] = idx
        sampled.append([idx])
    return ModelRunnerOutput(
        req_ids=scheduled_ids,
        req_id_to_index=index_by_id,
        sampled_token_ids=sampled,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],
    )
@pytest.mark.parametrize("max_tokens", [1, 2, 3, 5])
def test_stop_by_max_tokens(max_tokens: int):
    """Requests finish exactly at max_tokens under async scheduling, and the
    scheduler never schedules more tokens than strictly necessary."""
    scheduler = create_scheduler(async_scheduling=True)
    requests = create_requests(num_requests=2, max_tokens=max_tokens)
    req0, req1 = requests
    expected_total_num_scheduled_tokens = 0
    sched_outputs: deque[SchedulerOutput] = deque()
    scheduler.add_request(req0)
    sched_outputs.append(scheduler.schedule())
    # Prompt plus max_tokens - 1 decode steps: presumably the first output
    # token is produced by the prefill step itself — confirm against the
    # async scheduler's accounting.
    expected_total_num_scheduled_tokens += req0.num_prompt_tokens + max_tokens - 1
    scheduler.add_request(req1)
    sched_outputs.append(scheduler.schedule())
    expected_total_num_scheduled_tokens += req1.num_prompt_tokens + max_tokens - 1
    total_num_scheduled_tokens = 0
    # Drain the pipeline: apply each pending step's (mock) model output, then
    # schedule the next step while any request remains unfinished.
    while sched_outputs:
        sched_output = sched_outputs.popleft()
        total_num_scheduled_tokens += sched_output.total_num_scheduled_tokens
        model_runner_output = _make_model_runner_output(sched_output)
        scheduler.update_from_output(sched_output, model_runner_output)
        sched_output = scheduler.schedule()
        if sched_output.num_scheduled_tokens:
            sched_outputs.append(sched_output)
    assert scheduler.get_num_unfinished_requests() == 0
    assert req0.num_output_tokens == max_tokens
    assert req1.num_output_tokens == max_tokens
    # Ensure we aren't scheduling more tokens than necessary.
    assert total_num_scheduled_tokens == expected_total_num_scheduled_tokens
def test_abort():
    """Aborting requests mid-flight under async scheduling leaves no
    unfinished requests, and each request's output length equals the number
    of steps it survived before being aborted."""
    scheduler = create_scheduler(async_scheduling=True)
    requests = create_requests(num_requests=10, max_tokens=20)
    for req in requests:
        scheduler.add_request(req)
    sched_outputs: deque[SchedulerOutput] = deque()
    # Async scheduling keeps two steps in flight at once.
    sched_outputs.append(scheduler.schedule())
    sched_outputs.append(scheduler.schedule())
    abort_order = [0, 8, 3, 1, 6, 4, 2, 5, 7, 9]
    abort_order_copy = abort_order.copy()
    def abort_request():
        # Abort the next request in the predetermined order, one per step.
        if not abort_order:
            return
        req = requests[abort_order.pop(0)]
        scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
    while sched_outputs:
        # Abort a scheduled request.
        abort_request()
        sched_output = sched_outputs.popleft()
        model_runner_output = _make_model_runner_output(sched_output)
        scheduler.update_from_output(sched_output, model_runner_output)
        sched_output = scheduler.schedule()
        if sched_output.num_scheduled_tokens:
            sched_outputs.append(sched_output)
    for i, req in enumerate(requests):
        assert req.status == RequestStatus.FINISHED_ABORTED
        # A request aborted at step k has produced exactly k output tokens.
        assert req.num_output_tokens == abort_order_copy.index(i)
def test_preempt():
    """Scheduler behavior when requests are finished in a fixed order.

    NOTE(review): this body is byte-identical to test_abort above — it
    aborts requests and never appears to trigger preemption (e.g. via KV
    cache pressure). Confirm whether a distinct preemption scenario was
    intended here.
    """
    scheduler = create_scheduler(async_scheduling=True)
    requests = create_requests(num_requests=10, max_tokens=20)
    for req in requests:
        scheduler.add_request(req)
    sched_outputs: deque[SchedulerOutput] = deque()
    # Async scheduling keeps two steps in flight at once.
    sched_outputs.append(scheduler.schedule())
    sched_outputs.append(scheduler.schedule())
    abort_order = [0, 8, 3, 1, 6, 4, 2, 5, 7, 9]
    abort_order_copy = abort_order.copy()
    def abort_request():
        # Abort the next request in the predetermined order, one per step.
        if not abort_order:
            return
        req = requests[abort_order.pop(0)]
        scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
    while sched_outputs:
        # Abort a scheduled request.
        abort_request()
        sched_output = sched_outputs.popleft()
        model_runner_output = _make_model_runner_output(sched_output)
        scheduler.update_from_output(sched_output, model_runner_output)
        sched_output = scheduler.schedule()
        if sched_output.num_scheduled_tokens:
            sched_outputs.append(sched_output)
    for i, req in enumerate(requests):
        assert req.status == RequestStatus.FINISHED_ABORTED
        # A request aborted at step k has produced exactly k output tokens.
        assert req.num_output_tokens == abort_order_copy.index(i)
def test_prefix_caching_for_prefill_dedup():
    """Prefix caching de-duplicates identical prompts, both within a single
    scheduling step and for later requests with the same prompt."""
    CHUNK_SIZE = 1000
    BLOCK_SIZE = 16
    num_prompt_tokens = 100
    scheduler = create_scheduler(
        async_scheduling=True,
        max_num_batched_tokens=CHUNK_SIZE,
        enable_prefix_caching=True,
        block_size=BLOCK_SIZE,
    )
    # All 5 requests share one prompt (same_prompt=True).
    requests = create_requests(
        num_requests=5,
        num_tokens=num_prompt_tokens,
        max_tokens=3,
        same_prompt=True,
        block_size=BLOCK_SIZE,
    )
    requests_copy = requests.copy()
    # Two requests with the same prompt.
    req0 = requests.pop(0)
    req1 = requests.pop(0)
    scheduler.add_request(req0)
    scheduler.add_request(req1)
    sched_outputs: deque[SchedulerOutput] = deque()
    sched_output = scheduler.schedule()
    sched_outputs.append(sched_output)
    # Make sure prefix caching de-duplicates the prompts in the same step,
    # so all the blocks except the last are shared between the two requests.
    assert len(sched_output.num_scheduled_tokens) == 2
    num_blocks = num_prompt_tokens // BLOCK_SIZE
    # req0 is the first occurrence, so it hits nothing; req1 reuses all of
    # req0's full blocks.
    assert req0.num_cached_tokens == 0
    assert req1.num_cached_tokens >= num_blocks * BLOCK_SIZE
    sched_outputs.append(scheduler.schedule())
    # Run everything to completion, feeding the remaining 3 requests in as
    # the pipeline advances.
    while sched_outputs:
        if requests:
            scheduler.add_request(requests.pop(0))
        sched_output = sched_outputs.popleft()
        model_runner_output = _make_model_runner_output(sched_output)
        scheduler.update_from_output(sched_output, model_runner_output)
        sched_output = scheduler.schedule()
        if sched_output.num_scheduled_tokens:
            sched_outputs.append(sched_output)
    # Other requests scheduled after the two requests should also get
    # prefix cache hit.
    assert scheduler.get_num_unfinished_requests() == 0
    for req in requests_copy[1:]:
        assert req.num_cached_tokens >= num_blocks * BLOCK_SIZE
def test_prefix_caching_for_multi_turn():
    """Multi-turn conversations hit the prefix cache: a second-turn prompt
    equal to a first turn's prompt + output reuses the first turn's KV
    blocks."""
    CHUNK_SIZE = 1000
    BLOCK_SIZE = 16
    num_prompt_tokens = 100
    num_output_tokens = 200
    scheduler = create_scheduler(
        async_scheduling=True,
        max_num_batched_tokens=CHUNK_SIZE,
        enable_prefix_caching=True,
        block_size=BLOCK_SIZE,
    )
    requests = create_requests(
        num_requests=5,
        num_tokens=num_prompt_tokens,
        max_tokens=num_output_tokens,
        block_size=BLOCK_SIZE,
    )
    for req in requests:
        scheduler.add_request(req)
    sched_outputs: deque[SchedulerOutput] = deque()
    # Async scheduling keeps two steps in flight at once.
    sched_outputs.append(scheduler.schedule())
    sched_outputs.append(scheduler.schedule())
    # Process the requests.
    while sched_outputs:
        sched_output = sched_outputs.popleft()
        model_runner_output = _make_model_runner_output(sched_output)
        scheduler.update_from_output(sched_output, model_runner_output)
        sched_output = scheduler.schedule()
        if sched_output.num_scheduled_tokens:
            sched_outputs.append(sched_output)
    assert scheduler.get_num_unfinished_requests() == 0
    # Create next-turn requests whose prompts are the full output of the
    # previous turn.
    next_turn_requests = create_requests(
        num_requests=5,
        num_tokens=num_prompt_tokens + num_output_tokens,
        max_tokens=num_output_tokens,
        block_size=BLOCK_SIZE,
    )
    for i, req in enumerate(next_turn_requests):
        # Overwrite the generated prompt with turn 1's prompt + output, then
        # rebuild the derived token list and block hashes to match.
        req.prompt_token_ids = requests[i].prompt_token_ids + list(
            requests[i].output_token_ids
        )
        req._all_token_ids = req.prompt_token_ids.copy()
        req.all_token_ids = ConstantList(req._all_token_ids)
        req.block_hashes = []
        req.block_hashes = req.get_hash_new_full_blocks()
    # Schedule the next-turn requests.
    for req in next_turn_requests:
        scheduler.add_request(req)
    sched_outputs.append(scheduler.schedule())
    # Make sure the next-turn requests get prefix cache hit by the previous
    # requests.
    for req in next_turn_requests:
        # Cache hits are block-aligned: all full prompt blocks are reused.
        assert req.num_cached_tokens == req.num_prompt_tokens // BLOCK_SIZE * BLOCK_SIZE

View File

@@ -0,0 +1,249 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
pytestmark = pytest.mark.cpu_test
# ------------------ Mock Classes ------------------ #
class MockRequest:
    """Minimal stand-in for a request carrying multimodal features.

    Each entry of ``mm_hashes`` becomes one "image" feature whose placeholder
    length comes from the matching entry of ``token_counts``.
    """

    def __init__(self, request_id, mm_hashes, token_counts):
        self.request_id = request_id
        self._token_counts = token_counts
        self.mm_features = [
            MultiModalFeatureSpec(
                data=None,
                modality="image",
                identifier=mm_hash,
                mm_position=PlaceholderRange(
                    offset=0, length=self._token_counts[i]
                ),
            )
            for i, mm_hash in enumerate(mm_hashes)
        ]

    def get_num_encoder_embeds(self, input_id: int) -> int:
        """Number of encoder embeddings produced by the given mm input."""
        return self._token_counts[input_id]
# ------------------ Unit Tests ------------------ #
def test_basic_allocate_and_reuse():
    """Allocate an encoder input, hit it in the cache, then free it back to
    the freeable pool while the slots stay occupied until eviction."""
    cache = EncoderCacheManager(cache_size=10)
    req = MockRequest("r1", ["imgA"], [4])
    # Not cached yet.
    assert not cache.check_and_update_cache(req, 0)
    assert cache.can_allocate(req, 0, int(1e9), 0)
    cache.allocate(req, 0)
    # Cache hit registers a second reference from this request.
    assert cache.check_and_update_cache(req, 0)
    assert "r1" in cache.cached["imgA"]
    # 10 total slots - 4 occupied by imgA.
    assert cache.num_free_slots == 6
    # Free twice to bring refcount to 0.
    cache.free_encoder_input(req, 0)
    cache.free_encoder_input(req, 0)
    assert not cache.cached["imgA"]
    assert "imgA" in cache.freeable
    # All slots are reclaimable, but none are truly free until eviction.
    assert cache.num_freeable_slots == 10
    assert cache.num_free_slots == 6
def test_freeing_decreases_refcount_and_moves_to_freeable():
    """Freeing the sole reference moves the entry into the freeable pool."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("req2", ["img3"], [5])
    assert cache.can_allocate(request, 0, int(1e9), 0)
    cache.allocate(request, 0)
    assert len(cache.cached["img3"]) == 1
    cache.free_encoder_input(request, 0)
    # No requests reference img3 anymore; its slots become reclaimable.
    assert not cache.cached["img3"]
    assert "img3" in cache.freeable
    assert cache.num_freeable_slots == 10
def test_free_request_frees_all_inputs():
    """free() releases every cached input of a request in a single call."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("req3", ["a", "b"], [2, 3])
    for input_id in (0, 1):
        assert cache.can_allocate(request, input_id, int(1e9), 0)
        cache.allocate(request, input_id)
    for mm_hash in ("a", "b"):
        assert len(cache.cached[mm_hash]) == 1
    cache.free(request)
    # Both entries lose their references and become reclaimable.
    for mm_hash in ("a", "b"):
        assert not cache.cached[mm_hash]
        assert mm_hash in cache.freeable
    assert cache.num_freeable_slots == 10
def test_eviction_when_cache_is_full():
    """A freeable entry is evicted to make room for a new allocation.

    'x' occupies 6/10 slots and is freed (refcount 0, still resident);
    allocating 'y' (5 slots) requires reclaiming 'x'.
    """
    manager = EncoderCacheManager(cache_size=10)
    req1 = MockRequest("req1", ["x"], [6])
    req2 = MockRequest("req2", ["y"], [5])
    assert manager.can_allocate(req1, 0, int(1e9), 0)
    manager.allocate(req1, 0)
    manager.free_encoder_input(req1, 0)
    assert manager.can_allocate(req2, 0, int(1e9), 0)
    manager.allocate(req2, 0)
    # 'x' should have been evicted.
    assert "x" not in manager.cached
    assert "x" in manager.get_freed_mm_hashes()
def test_get_cached_input_ids():
    """Only input ids that were actually allocated are reported as cached."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("reqX", ["m", "n", "o"], [2, 4, 3])
    # Allocate inputs 0 and 2, deliberately skipping input 1.
    for input_id in (0, 2):
        assert cache.can_allocate(request, input_id, int(1e9), 0)
        cache.allocate(request, input_id)
    assert cache.get_cached_input_ids(request) == {0, 2}
def test_has_cache_restores_from_freeable():
    """A cache hit on a freeable entry restores it to referenced state."""
    manager = EncoderCacheManager(cache_size=10)
    req = MockRequest("reqY", ["imgZ"], [4])
    assert manager.can_allocate(req, 0, int(1e9), 0)
    manager.allocate(req, 0)
    manager.free_encoder_input(req, 0)
    # Should restore from freeable.
    assert manager.check_and_update_cache(req, 0)
    assert len(manager.cached["imgZ"]) == 1
    assert "imgZ" not in manager.freeable
    # imgZ's 4 slots are pinned again: 10 - 4 remain reclaimable.
    assert manager.num_freeable_slots == 6
def test_get_freed_mm_hashes_clears_freed_list():
    """get_freed_mm_hashes() drains the freed list: a second call is empty."""
    manager = EncoderCacheManager(cache_size=10)
    req1 = MockRequest("reqA", ["a"], [5])
    req2 = MockRequest("reqB", ["b"], [6])
    assert manager.can_allocate(req1, 0, int(1e9), 0)
    manager.allocate(req1, 0)
    manager.free_encoder_input(req1, 0)
    # Should trigger eviction of 'a'.
    assert manager.can_allocate(req2, 0, int(1e9), 0)
    manager.allocate(req2, 0)
    freed = manager.get_freed_mm_hashes()
    assert "a" in freed
    assert manager.get_freed_mm_hashes() == []
def test_schedule_request_multi_images_respect_space_limit():
    """The second image is rejected once cache space would be exceeded."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("reqA", ["a", "b"], [5, 6])
    budget = 100
    scheduled = 0
    assert cache.can_allocate(request, 0, budget, scheduled)
    first_size = request.get_num_encoder_embeds(0)
    scheduled += first_size
    budget -= first_size
    # 5 + 6 > 10 cache slots, so image 'b' must not be allocatable.
    assert not cache.can_allocate(request, 1, budget, scheduled)
def test_schedule_request_multi_images_respect_compute_limit():
    """The second image is rejected once the compute budget is exhausted."""
    cache = EncoderCacheManager(cache_size=100)
    request = MockRequest("reqA", ["a", "b"], [5, 6])
    budget = 10
    scheduled = 0
    assert cache.can_allocate(request, 0, budget, scheduled)
    first_size = request.get_num_encoder_embeds(0)
    scheduled += first_size
    budget -= first_size
    # Cache space is plentiful (100), but 5 + 6 > 10 compute budget.
    assert not cache.can_allocate(request, 1, budget, scheduled)
def test_encoder_cache_with_is_embed_mask():
    """With an is_embed mask, cache usage is charged per True position, not
    per placeholder token (8 embeds instead of 100 here)."""
    class MockRequestWithMask(MockRequest):
        def get_num_encoder_embeds(self, input_id: int) -> int:
            # NOTE(review): accessed without parentheses — presumably
            # PlaceholderRange.get_num_embeds is a property; confirm.
            return self.mm_features[input_id].mm_position.get_num_embeds
    # Only 8 of the 100 placeholder positions are real embeddings.
    is_embed = torch.zeros(100, dtype=torch.bool)
    is_embed[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True
    request = MockRequestWithMask("r1", ["img1"], [100])
    request.mm_features[0] = MultiModalFeatureSpec(
        data=None,
        modality="image",
        identifier="img1",
        mm_position=PlaceholderRange(offset=0, length=100, is_embed=is_embed),
    )
    manager = EncoderCacheManager(cache_size=100)
    manager.allocate(request, 0)
    # 100 slots - 8 masked-in embeddings.
    assert manager.num_free_slots == 92
    assert "img1" in manager.cached
    old_size = 100
    new_size = request.mm_features[0].mm_position.get_num_embeds
    assert new_size == 8
    savings_ratio = old_size / new_size
    assert savings_ratio == 12.5
def test_encoder_cache_mask_based_retrieval():
    """Counting embeddings within token ranges of an is_embed mask matches
    the mask's True positions (used for partial retrieval of cached inputs)."""
    class MockRequestWithMask(MockRequest):
        def get_num_encoder_embeds(self, input_id: int) -> int:
            # NOTE(review): accessed without parentheses — presumably
            # PlaceholderRange.get_num_embeds is a property; confirm.
            return self.mm_features[input_id].mm_position.get_num_embeds
    # True at positions 2, 3, 5, 6, 7 -> 5 embeddings total.
    is_embed = torch.tensor(
        [False, False, True, True, False, True, True, True, False, False]
    )
    request = MockRequestWithMask("r1", ["img1"], [10])
    request.mm_features[0] = MultiModalFeatureSpec(
        data=None,
        modality="image",
        identifier="img1",
        mm_position=PlaceholderRange(offset=0, length=10, is_embed=is_embed),
    )
    manager = EncoderCacheManager(cache_size=50)
    manager.allocate(request, 0)
    assert request.mm_features[0].mm_position.get_num_embeds == 5
    # Range [2, 8): nothing before it, all 5 embeddings inside it.
    start_idx = 2
    end_idx = 8
    num_embeds_before = is_embed[:start_idx].sum().item()
    num_embeds_in_range = is_embed[start_idx:end_idx].sum().item()
    assert num_embeds_before == 0
    assert num_embeds_in_range == 5
    # Range [0, 5): only positions 2 and 3 fall inside.
    start_idx = 0
    end_idx = 5
    num_embeds_before = is_embed[:start_idx].sum().item() if start_idx > 0 else 0
    num_embeds_in_range = is_embed[start_idx:end_idx].sum().item()
    assert num_embeds_before == 0
    assert num_embeds_in_range == 2

View File

@@ -0,0 +1,224 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.v1.core.kv_cache_metrics import (
BlockMetricsState,
KVCacheMetricsCollector,
)
from vllm.v1.core.kv_cache_utils import KVCacheBlock
class TestBlockMetricsState:
    """Unit tests for BlockMetricsState's per-block timing bookkeeping."""
    def test_init(self):
        """A fresh state stamps birth/last-access with the current time."""
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        assert state.birth_time_ns == 1000000000
        assert state.last_access_ns == 1000000000
        assert len(state.access_history) == 0
    def test_access_tracking(self):
        """record_access() updates last_access_ns and appends to history."""
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        with patch("time.monotonic_ns", return_value=2000000000):
            state.record_access()
        assert state.last_access_ns == 2000000000
        assert list(state.access_history) == [2000000000]
    def test_ring_buffer_wraps_at_4(self):
        """access_history is a size-4 ring buffer; the oldest entry drops."""
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        for i in range(5):
            # Accesses at t = 2s, 3s, 4s, 5s, 6s.
            t = 1000000000 + (i + 1) * 1000000000
            with patch("time.monotonic_ns", return_value=t):
                state.record_access()
        assert len(state.access_history) == 4
        # The first access (2s) was pushed out by the fifth.
        assert list(state.access_history) == [
            3000000000,
            4000000000,
            5000000000,
            6000000000,
        ]
    def test_lifetime(self):
        """Lifetime is the elapsed time since birth, in seconds."""
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        with patch("time.monotonic_ns", return_value=6500000000):
            assert abs(state.get_lifetime_seconds() - 5.5) < 0.001
    def test_idle_time(self):
        """Idle time is the elapsed time since the last access, in seconds."""
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        state.last_access_ns = 2000000000
        with patch("time.monotonic_ns", return_value=5200000000):
            assert abs(state.get_idle_time_seconds() - 3.2) < 0.001
    def test_reuse_gaps(self):
        """Reuse gaps are pairwise deltas between recorded accesses."""
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        base = 1000000000
        # Accesses at +0s, +1.5s, +3.0s, +5.5s -> gaps 1.5, 1.5, 2.5.
        for offset in [0, 1.5, 3.0, 5.5]:
            state.access_history.append(base + int(offset * 1e9))
        gaps = state.get_reuse_gaps_seconds()
        assert len(gaps) == 3
        assert gaps[0] == 1.5 and gaps[1] == 1.5 and gaps[2] == 2.5
    def test_ring_wrap_only_gives_3_gaps(self):
        # 5 accesses in size-4 buffer = 3 gaps
        with patch("time.monotonic_ns", return_value=1000000000):
            state = BlockMetricsState()
        for i in range(5):
            state.access_history.append(1000000000 + i * 1000000000)
        assert len(state.get_reuse_gaps_seconds()) == 3
class TestKVCacheMetricsCollector:
    """Unit tests for KVCacheMetricsCollector's sampling and event lifecycle."""
    def test_sample_rate_validation(self):
        """sample_rate must lie in (0, 1]; invalid values assert."""
        with pytest.raises(AssertionError):
            KVCacheMetricsCollector(sample_rate=-0.1)
        with pytest.raises(AssertionError):
            KVCacheMetricsCollector(sample_rate=1.5)
        with pytest.raises(AssertionError):
            KVCacheMetricsCollector(sample_rate=0.0)
    def test_sampling(self):
        """Rate 1.0 samples everything; rate 0.5 samples roughly half."""
        c = KVCacheMetricsCollector(sample_rate=1.0)
        assert sum(1 for _ in range(100) if c.should_sample_block()) == 100
        c = KVCacheMetricsCollector(sample_rate=0.5)
        samples = sum(1 for _ in range(1000) if c.should_sample_block())
        # Loose statistical bound; flaky only with astronomically bad luck.
        assert 400 < samples < 600
    def test_alloc(self):
        """Each allocated block gets a metrics entry at rate 1.0."""
        c = KVCacheMetricsCollector(sample_rate=1.0)
        blocks = [KVCacheBlock(block_id=i) for i in range(5)]
        with patch("time.monotonic_ns", return_value=1000000000):
            for block in blocks:
                c.on_block_allocated(block)
        assert len(c.block_metrics) == 5
    def test_access(self):
        """Accesses are recorded in the tracked block's history."""
        c = KVCacheMetricsCollector(sample_rate=1.0)
        block = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=1000000000):
            c.on_block_allocated(block)
        for i in range(3):
            t = 1000000000 + (i + 1) * 1000000000
            with patch("time.monotonic_ns", return_value=t):
                c.on_block_accessed(block)
        assert len(c.block_metrics[0].access_history) == 3
    def test_evict_no_accesses(self):
        # lifetime should equal idle if never accessed
        c = KVCacheMetricsCollector(sample_rate=1.0)
        block = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=1000000000):
            c.on_block_allocated(block)
        with patch("time.monotonic_ns", return_value=6000000000):
            c.on_block_evicted(block)
        events = c.drain_events()
        assert len(events) == 1
        assert abs(events[0].lifetime_seconds - 5.0) < 0.001
        assert abs(events[0].idle_seconds - 5.0) < 0.001
    def test_evict(self):
        """Eviction emits one event with lifetime, idle time, and reuse gaps,
        and removes the block's metrics entry."""
        c = KVCacheMetricsCollector(sample_rate=1.0)
        block = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=1000000000):
            c.on_block_allocated(block)
        with patch("time.monotonic_ns", return_value=2000000000):
            c.on_block_accessed(block)
        with patch("time.monotonic_ns", return_value=3000000000):
            c.on_block_accessed(block)
        with patch("time.monotonic_ns", return_value=4000000000):
            c.on_block_evicted(block)
        events = c.drain_events()
        assert len(events) == 1
        sample = events[0]
        # Allocated at 1s, evicted at 4s.
        assert abs(sample.lifetime_seconds - 3.0) < 0.001
        # Last access at 3s, evicted at 4s.
        assert abs(sample.idle_seconds - 1.0) < 0.001
        assert sample.reuse_gaps_seconds == (1.0,)
        assert 0 not in c.block_metrics
    def test_reset(self):
        """reset() drops all tracked blocks; tracking resumes afterwards."""
        c = KVCacheMetricsCollector(sample_rate=1.0)
        with patch("time.monotonic_ns", return_value=1000000000):
            for i in range(5):
                c.on_block_allocated(KVCacheBlock(block_id=i))
        assert len(c.block_metrics) == 5
        c.reset()
        assert len(c.block_metrics) == 0
        with patch("time.monotonic_ns", return_value=2000000000):
            c.on_block_allocated(KVCacheBlock(block_id=10))
        assert 10 in c.block_metrics
    def test_huge_time_jump(self):
        """An extreme clock jump still yields a positive lifetime, not an
        overflow or error."""
        c = KVCacheMetricsCollector(sample_rate=1.0)
        block = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=1000000000):
            c.on_block_allocated(block)
        with patch("time.monotonic_ns", return_value=9999999999999999):
            c.on_block_evicted(block)
        events = c.drain_events()
        assert len(events) == 1
        assert events[0].lifetime_seconds > 0
def test_kv_cache_metrics_collector_smoke() -> None:
    """Simple smoke test for KVCacheMetricsCollector on CPU."""
    collector = KVCacheMetricsCollector(sample_rate=1.0)
    block = KVCacheBlock(block_id=123)
    # Drive the block through allocate -> access x2 -> evict at fixed times.
    timeline = (
        (1_000_000_000, collector.on_block_allocated),
        (2_000_000_000, collector.on_block_accessed),
        (3_000_000_000, collector.on_block_accessed),
        (4_000_000_000, collector.on_block_evicted),
    )
    for timestamp_ns, action in timeline:
        with patch("time.monotonic_ns", return_value=timestamp_ns):
            action(block)
    events = collector.drain_events()
    assert len(events) == 1
    event = events[0]
    # Lifetime spans allocation (1.0s) to eviction (4.0s).
    assert abs(event.lifetime_seconds - 3.0) < 1e-6
    # Idle: last access at 3.0s, evicted at 4.0s.
    assert abs(event.idle_seconds - 1.0) < 1e-6
    # Exactly one reuse gap between the two accesses.
    assert event.reuse_gaps_seconds == (1.0,)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec():
    """Return a small full-attention KV cache spec for these tests."""
    # Positional args: presumably (block_size=16, num_kv_heads=1, head_size=1,
    # dtype, use_mla=False) — confirm against FullAttentionSpec's definition.
    return FullAttentionSpec(16, 1, 1, torch.float32, False)
def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
    """
    Test initializing KV cache sharing with different attention groups.

    Layers in the same KV cache group might be placed in different attn
    groups if they have different attention backends.
    """
    # Layers 2 and 3 reuse the KV caches of layers 0 and 1 respectively.
    shared_kv_cache_layers = {
        "model.layers.2": "model.layers.0",
        "model.layers.3": "model.layers.1",
    }
    # Layers 0 and 1 both belong in KV cache group 0; with differing
    # attention backends they would land in different attention groups.
    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
    ]
    add_kv_sharing_layers_to_kv_cache_groups(
        shared_kv_cache_layers=shared_kv_cache_layers,
        kv_cache_groups=kv_cache_groups,
    )
    # Both sharing layers must be appended to the single existing group.
    assert len(kv_cache_groups) == 1
    (only_group,) = kv_cache_groups
    assert only_group.layer_names == [
        "model.layers.0",
        "model.layers.1",
        "model.layers.2",
        "model.layers.3",
    ]
def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
    """
    Test case assuming that all layers in the same KV cache group have the same
    attention backends. This is true for most models.
    """
    # Layers 2 and 3 reuse the KV caches of layers 0 and 1 respectively.
    shared_kv_cache_layers = {
        "model.layers.2": "model.layers.0",
        "model.layers.3": "model.layers.1",
    }
    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
    ]
    add_kv_sharing_layers_to_kv_cache_groups(
        shared_kv_cache_layers=shared_kv_cache_layers,
        kv_cache_groups=kv_cache_groups,
    )
    # Both sharing layers must be appended to the single existing group.
    assert len(kv_cache_groups) == 1
    (only_group,) = kv_cache_groups
    assert only_group.layer_names == [
        "model.layers.0",
        "model.layers.1",
        "model.layers.2",
        "model.layers.3",
    ]
def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
    """
    Test KV sharing set up when no attention groups are provided.
    This is the case for the TPU model runner, which doesn't have
    support for attention groups yet.
    """
    # Layers 2 and 3 reuse the KV caches of layers 0 and 1 respectively.
    shared_kv_cache_layers = {
        "model.layers.2": "model.layers.0",
        "model.layers.3": "model.layers.1",
    }
    # Here each target layer sits in its own KV cache group.
    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0"], new_kv_cache_spec()),
        KVCacheGroupSpec(["model.layers.1"], new_kv_cache_spec()),
    ]
    add_kv_sharing_layers_to_kv_cache_groups(
        shared_kv_cache_layers=shared_kv_cache_layers,
        kv_cache_groups=kv_cache_groups,
    )
    # Each sharing layer joins the group of its target layer.
    assert len(kv_cache_groups) == 2
    first_group, second_group = kv_cache_groups
    assert first_group.layer_names == ["model.layers.0", "model.layers.2"]
    assert second_group.layer_names == ["model.layers.1", "model.layers.3"]

View File

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.v1.core.sched.output import NewRequestData
def _create_new_requests_data(prompt_embeds: torch.Tensor | None) -> NewRequestData:
    """Build a minimal NewRequestData whose only varying field is
    ``prompt_embeds``; every other field is a placeholder."""
    return NewRequestData(
        req_id="test_req",
        prompt_token_ids=None,
        mm_features=[],
        sampling_params=None,
        pooling_params=None,
        block_ids=([],),
        num_computed_tokens=0,
        lora_request=None,
        prompt_embeds=prompt_embeds,
    )
def test_repr_with_none() -> None:
    """Both repr variants report a None shape when no embeds are given."""
    data = _create_new_requests_data(None)
    for rendered in (repr(data), data.anon_repr()):
        assert "prompt_embeds_shape=None" in rendered
def test_repr_with_multi_element_tensor() -> None:
    """Both repr variants report the tensor's shape, never its contents."""
    data = _create_new_requests_data(torch.randn(10, 768))
    expected = "prompt_embeds_shape=torch.Size([10, 768])"
    assert expected in repr(data)
    assert expected in data.anon_repr()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,262 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import uuid
import pytest
from vllm.config import VllmConfig
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
MultiModalKwargsItem,
PlaceholderRange,
)
from vllm.sampling_params import SamplingParams
from vllm.utils.hashing import get_hash_fn_by_name
from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
from vllm.v1.request import Request
from .test_scheduler import create_scheduler_with_priority
from .utils import EOS_TOKEN_ID
pytestmark = pytest.mark.cpu_test
def _create_random_request(
    max_tokens_range: tuple[int, int],
    num_tokens_range: tuple[int, int],
    arrival_time_range: tuple[float, float],
    priority_range: tuple[int, int],
    num_mm_item_range: tuple[int, int],
    vllm_config: VllmConfig,
):
    """Build a Request with randomized prompt length, max_tokens, arrival
    time, priority, and 0..N dummy image features (each 10 tokens long).

    All randomness flows through the module-level `random` generator so a
    seeded test reproduces the same stream of requests.
    """
    max_tokens = random.randint(*max_tokens_range)
    num_tokens = random.randint(*num_tokens_range)
    priority = random.randint(*priority_range)
    arrival_time = random.uniform(*arrival_time_range)
    num_mm_item = random.randint(*num_mm_item_range)
    mm_positions: list[PlaceholderRange] = []
    # Sample distinct start offsets and drop any placeholder that would run
    # past the end of the prompt.
    # NOTE(review): offsets closer than 10 apart would produce overlapping
    # placeholders — confirm whether the scheduler tolerates that.
    for mm_start in sorted(
        random.sample(range(num_tokens), min(num_mm_item, num_tokens))
    ):
        if mm_start + 10 > num_tokens:
            continue
        mm_positions.append(PlaceholderRange(offset=mm_start, length=10))
    request_id = uuid.uuid4().hex
    sampling_params = SamplingParams(
        ignore_eos=False,
        max_tokens=max_tokens,
    )
    mm_features = []
    for j, position in enumerate(mm_positions):
        # Identifier is unique per request and per mm item.
        identifier = f"{request_id}_hash_{j}"
        mm_feature = MultiModalFeatureSpec(
            data=MultiModalKwargsItem.dummy("dummy_m"),
            mm_position=position,
            identifier=identifier,
            modality="image",
        )
        mm_features.append(mm_feature)
    prompt_token_ids = random.choices(range(100), k=num_tokens)
    caching_hash_fn = get_hash_fn_by_name(
        vllm_config.cache_config.prefix_caching_hash_algo
    )
    init_none_hash(caching_hash_fn)
    block_hasher = get_request_block_hasher(
        vllm_config.cache_config.block_size, caching_hash_fn
    )
    request = Request(
        request_id=request_id,
        prompt_token_ids=prompt_token_ids,
        sampling_params=sampling_params,
        pooling_params=None,
        mm_features=mm_features if mm_features else None,
        eos_token_id=EOS_TOKEN_ID,
        arrival_time=arrival_time,
        priority=priority,
        block_hasher=block_hasher,
    )
    return request
def _mock_execute_model(
    scheduler_output: SchedulerOutput, num_output_tokens_range: tuple[int, int]
) -> ModelRunnerOutput:
    """Fabricate a ModelRunnerOutput for every scheduled request, sampling a
    random number of random token ids per request in shuffled order."""
    request_ids: list[str] = [
        req.req_id for req in scheduler_output.scheduled_new_reqs
    ]
    request_ids.extend(scheduler_output.scheduled_cached_reqs.req_ids)
    random.shuffle(request_ids)
    # Draw all per-request token counts first, then the token ids, so the
    # RNG consumption order matches a seeded reference run.
    counts = [random.randint(*num_output_tokens_range) for _ in request_ids]
    sampled_token_ids = [
        [random.randint(0, 100) for _ in range(count)] for count in counts
    ]
    return ModelRunnerOutput(
        req_ids=request_ids,
        req_id_to_index={req_id: i for i, req_id in enumerate(request_ids)},
        sampled_token_ids=sampled_token_ids,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],
    )
def _mock_draft_token_ids(
    scheduler_output: SchedulerOutput,
    num_output_tokens_range: tuple[int, int],
    seen_request_prompt_length: dict[str, int],
) -> DraftTokenIds:
    """Fabricate draft tokens for requests that have finished prefill.

    Drafts are only produced for requests whose num_computed_tokens has
    reached their prompt length (i.e. they are decoding). Prompt lengths of
    new requests are recorded in `seen_request_prompt_length`, which the
    caller persists across steps so cached requests can be checked later.
    """
    request_ids: list[str] = []
    sampled_token_ids: list[list[int]] = []
    for request in scheduler_output.scheduled_new_reqs:
        # Each request may appear as "new" only once.
        assert request.req_id not in seen_request_prompt_length
        seen_request_prompt_length[request.req_id] = len(request.prompt_token_ids or [])
        if request.num_computed_tokens >= seen_request_prompt_length[request.req_id]:
            num_tokens = random.randint(*num_output_tokens_range)
            request_ids.append(request.req_id)
            sampled_token_ids.append(
                [random.randint(0, 100) for _ in range(num_tokens)]
            )
    for req_id, num_computed_tokens in zip(
        scheduler_output.scheduled_cached_reqs.req_ids,
        scheduler_output.scheduled_cached_reqs.num_computed_tokens,
    ):
        # Cached requests were recorded when first seen as new requests.
        if num_computed_tokens >= seen_request_prompt_length[req_id]:
            num_tokens = random.randint(*num_output_tokens_range)
            request_ids.append(req_id)
            sampled_token_ids.append(
                [random.randint(0, 100) for _ in range(num_tokens)]
            )
    return DraftTokenIds(req_ids=request_ids, draft_token_ids=sampled_token_ids)
def _chech_valid_scheduler_output(
    scheduler_output: SchedulerOutput,
    seen_request_ids: set[str],
    seen_mm_hashes: set[str],
):
    """Assert structural invariants of a SchedulerOutput across steps.

    Mutates `seen_request_ids` / `seen_mm_hashes` so invariants spanning
    multiple steps (no request scheduled as "new" twice, only known mm
    hashes freed) can be checked by the caller's loop.

    NOTE(review): "chech" is a typo for "check"; left as-is because the
    caller below uses this exact name.
    """
    for req in scheduler_output.scheduled_new_reqs:
        # A request may only be scheduled as new once over its lifetime.
        assert req.req_id not in seen_request_ids
        seen_request_ids.add(req.req_id)
    for req_id in scheduler_output.scheduled_cached_reqs.req_ids:
        # Cached requests must have been seen as new in an earlier step.
        assert req_id in seen_request_ids
    req_ids = set[str]()
    req_ids.update(req.req_id for req in scheduler_output.scheduled_new_reqs)
    req_ids.update(scheduler_output.scheduled_cached_reqs.req_ids)
    # Token accounting must exactly cover the scheduled requests.
    assert set(scheduler_output.num_scheduled_tokens.keys()) == req_ids
    assert (
        sum(scheduler_output.num_scheduled_tokens.values())
        == scheduler_output.total_num_scheduled_tokens
    )
    # Spec-decode and encoder inputs may only reference scheduled requests.
    assert set(scheduler_output.scheduled_spec_decode_tokens.keys()) <= req_ids
    assert set(scheduler_output.scheduled_encoder_inputs.keys()) <= req_ids
    for req in scheduler_output.scheduled_new_reqs:
        for mm_feature in req.mm_features:
            seen_mm_hashes.add(mm_feature.identifier)
    for mm_hash in scheduler_output.free_encoder_mm_hashes:
        # Only mm inputs that were actually scheduled may be freed.
        assert mm_hash in seen_mm_hashes
    assert scheduler_output.finished_req_ids <= seen_request_ids
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
@pytest.mark.parametrize("num_speculative_tokens", [None, 1, 5])
@pytest.mark.parametrize(
    ("max_input_tokens", "max_output_tokens", "max_num_seqs", "num_blocks"),
    [
        # Standard profile
        (5000, 500, 256, 10000),
        # Generation heavy + high max_num_seqs + low num_blocks -> Many preemptions
        (500, 5000, 1024, 1000),
    ],
    ids=["standard", "preemption"],
)
def test_priority_scheduling_blast(
    enable_prefix_caching: bool,
    num_speculative_tokens: int | None,
    max_input_tokens: int,
    max_output_tokens: int,
    max_num_seqs: int,
    num_blocks: int,
):
    """Fuzz the priority scheduler with a long stream of random requests.

    Repeatedly schedules, mock-executes, and (when speculative decoding is
    configured) feeds mock draft tokens back to the scheduler, validating
    every SchedulerOutput for internal consistency along the way.
    """
    # Fixed seed so the randomized request stream is reproducible.
    random.seed(42)
    # Bookkeeping shared across iterations by the validation helpers.
    # NOTE(review): seen_request_prompt_length is only read here; it is
    # presumably populated inside _mock_draft_token_ids — verify.
    seen_request_prompt_length = dict[str, int]()
    seen_request_ids = set[str]()
    seen_mm_hashes = set[str]()
    scheduler = create_scheduler_with_priority(
        model="Qwen/Qwen2.5-VL-3B-Instruct",
        max_num_seqs=max_num_seqs,
        enable_prefix_caching=enable_prefix_caching,
        num_blocks=num_blocks,
        num_speculative_tokens=num_speculative_tokens,
    )
    # Seed the scheduler with a batch of mixed-priority requests.
    num_initial_requests = 10
    for _ in range(num_initial_requests):
        req = _create_random_request(
            max_tokens_range=(1, max_output_tokens),
            num_tokens_range=(1, max_input_tokens),
            arrival_time_range=(0, 1),
            priority_range=(-3, 3),
            num_mm_item_range=(0, 2),
            vllm_config=scheduler.vllm_config,
        )
        scheduler.add_request(req)
    # Add two more requests at a fixed priority (4) and arrival time (0).
    num_initial_requests = 2
    for _ in range(num_initial_requests):
        req = _create_random_request(
            max_tokens_range=(1, max_output_tokens),
            num_tokens_range=(1, max_input_tokens),
            arrival_time_range=(0, 0),
            priority_range=(4, 4),
            num_mm_item_range=(0, 2),
            vllm_config=scheduler.vllm_config,
        )
        scheduler.add_request(req)
    # Drive the scheduler for many steps, randomly injecting new requests
    # whenever the waiting queue drains.
    for _ in range(20000):
        if len(scheduler.waiting) == 0:
            num_new_requests = random.randint(0, 2)
            for _ in range(num_new_requests):
                req = _create_random_request(
                    max_tokens_range=(1, max_output_tokens),
                    num_tokens_range=(1, max_input_tokens),
                    arrival_time_range=(0, 1),
                    priority_range=(-3, 3),
                    num_mm_item_range=(0, 2),
                    vllm_config=scheduler.vllm_config,
                )
                scheduler.add_request(req)
        scheduler_output = scheduler.schedule()
        # Validate every output before executing it.
        _chech_valid_scheduler_output(
            scheduler_output, seen_request_ids, seen_mm_hashes
        )
        model_output = _mock_execute_model(
            scheduler_output,
            num_output_tokens_range=(1, 1 + (num_speculative_tokens or 0)),
        )
        scheduler.update_from_output(scheduler_output, model_output)
        if num_speculative_tokens is not None:
            # Feed mock draft tokens back for the next speculative step.
            scheduler.update_draft_token_ids(
                _mock_draft_token_ids(
                    scheduler_output,
                    (0, num_speculative_tokens),
                    seen_request_prompt_length,
                )
            )

View File

@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import EngineArgs, LLMEngine, SamplingParams
# Prompts shared by the baseline ("ground_truth") and preempted runs;
# outputs from the two runs must match exactly.
PROMPTS = [
    "A robot may not injure a human being ",
    "To be or not to be,",
    "What is the meaning of life?",
    "What does the fox say? " * 20,  # Test long prompt
]
def test_reset_prefix_cache_e2e(monkeypatch):
    """End-to-end check that resetting the prefix cache mid-run (which
    preempts running requests) does not change the generated text."""
    # "spawn" is required for the test to be deterministic.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    engine = LLMEngine.from_engine_args(
        EngineArgs(
            model="Qwen/Qwen3-0.6B",
            gpu_memory_utilization=0.2,
            async_scheduling=True,
            max_num_batched_tokens=32,
            max_model_len=2048,
            compilation_config={"mode": 0},
            dtype="float16",
        )
    )
    # Greedy decoding so the two runs are directly comparable.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=16,
    )

    # Baseline run: no preemption.
    for idx, prompt in enumerate(PROMPTS):
        engine.add_request(f"ground_truth_{idx}", prompt, sampling_params)
    ground_truth_results = {}
    while engine.has_unfinished_requests():
        for out in engine.step():
            if out.finished:
                ground_truth_results[out.request_id] = out

    # Second run: reset the prefix cache (preempting running requests)
    # after 10 engine steps.
    for idx, prompt in enumerate(PROMPTS):
        engine.add_request(f"preempted_{idx}", prompt, sampling_params)
    preempted_results = {}
    step_id = 0
    while engine.has_unfinished_requests():
        if step_id == 10:
            engine.reset_prefix_cache(reset_running_requests=True)
        for out in engine.step():
            if out.finished:
                preempted_results[out.request_id] = out
        step_id += 1

    # Outputs must be identical with and without the mid-run reset.
    for idx in range(len(PROMPTS)):
        baseline_text = ground_truth_results[f"ground_truth_{idx}"].outputs[0].text
        preempted_text = preempted_results[f"preempted_{idx}"].outputs[0].text
        assert baseline_text == preempted_text, (
            f"ground_truth_results['ground_truth_{idx}'].outputs[0].text="
            f"{baseline_text} "
            f"preempted_results['preempted_{idx}'].outputs[0].text="
            f"{preempted_text}"
        )

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import LLM
# Tiny random model so the tests run quickly.
MODEL = "hmellor/tiny-random-LlamaForCausalLM"
# Prompt shared by all requests in these tests.
PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def llm() -> LLM:
    """Module-scoped LLM with prefix caching and very tight batching limits,
    forcing concurrent requests into chunked/partial prefills."""
    engine = LLM(
        MODEL,
        enforce_eager=True,
        enable_prefix_caching=True,
        long_prefill_token_threshold=2,
        max_num_batched_tokens=6,
        max_num_seqs=3,
        block_size=16,
    )
    return engine
def test_concurrent_partial_prefill(llm):
    """Three identical prompts submitted together should each yield exactly
    one completion despite partial prefill."""
    results = llm.generate([PROMPT, PROMPT, PROMPT])
    assert len(results) == 3
    for result in results:
        assert len(result.outputs) == 1
def test_prefix_cache_stats_is_recorded(llm):
    """A repeat generation with the same 17-token prompt should report the
    first full block (16 tokens) as cached."""
    # 17 tokens will make sure first 16 tokens are cached in a block.
    input_tokens = {"prompt_token_ids": [101] * 17}
    llm.generate([input_tokens])
    second_pass = llm.generate([input_tokens])
    assert second_pass[0].num_cached_tokens == 16

View File

@@ -0,0 +1,366 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import pytest
import torch
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (
BlockHash,
KVCacheBlock,
make_block_hash_with_group_id,
)
from vllm.v1.core.single_type_kv_cache_manager import (
ChunkedLocalAttentionManager,
SlidingWindowManager,
)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowSpec
pytestmark = pytest.mark.cpu_test
def get_sliding_window_manager(sliding_window_spec, block_pool):
    """Build a SlidingWindowManager bound to KV-cache group 0."""
    manager = SlidingWindowManager(
        sliding_window_spec, block_pool, kv_cache_group_id=0
    )
    return manager
def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
    """Build a ChunkedLocalAttentionManager bound to KV-cache group 0."""
    manager = ChunkedLocalAttentionManager(
        chunked_local_attention_spec, block_pool, kv_cache_group_id=0
    )
    return manager
def test_chunked_local_attention_possible_cached_prefix():
    """Exercise ``find_longest_cache_hit`` for chunked local attention.

    Uses block_size=2 and attention_chunk_size=4; each case mocks a cache
    state per block and checks both the length of the reported hit and that
    its leading portion is served by the null block.
    """
    block_size = 2
    chunked_local_attention_spec = ChunkedLocalAttentionSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        attention_chunk_size=4,
    )
    block_pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_chunked_local_attention_manager(
        chunked_local_attention_spec, block_pool
    )

    def run_one_case(block_is_cached, tail_token, expect_length):
        # One block hash per (maybe-)cached block.
        block_hash_list = [
            BlockHash(str(i).encode()) for i in range(len(block_is_cached))
        ]
        # Start each case from a clean cache.
        block_pool.cached_block_hash_to_block._cache.clear()
        # Mock the block pool with the cached blocks
        for i, (block_hash, is_cached) in enumerate(
            zip(block_hash_list, block_is_cached)
        ):
            if is_cached:
                block_pool.cached_block_hash_to_block.insert(
                    make_block_hash_with_group_id(block_hash, 0),
                    block_pool.blocks[i + 10],
                )
        computed_blocks = manager.find_longest_cache_hit(
            block_hashes=block_hash_list,
            max_length=len(block_hash_list) * block_size + tail_token,
            kv_cache_group_ids=[0],
            block_pool=block_pool,
            kv_cache_spec=chunked_local_attention_spec,
            use_eagle=False,
            alignment_tokens=block_size,
        )[0]
        assert len(computed_blocks) == expect_length
        # The leading part of the hit is expected to be the null block.
        assert all(
            block == block_pool.null_block
            for block in computed_blocks[: (expect_length - 1) // 2]
        )

    run_one_case([True], 0, 1)
    run_one_case([True], 1, 1)
    run_one_case([True, False], 0, 2)
    run_one_case([True, False], 1, 2)
    run_one_case([True, True], 0, 2)
    run_one_case([True, True], 1, 2)
    run_one_case([True, True, False], 0, 2)
    run_one_case([True, True, False], 1, 2)
    run_one_case([True, True, True], 0, 3)
    run_one_case([True, True, True], 1, 3)
    run_one_case([True, True, True, False], 0, 4)
    run_one_case([True, True, True, False], 1, 4)
    # Beyond two full chunks (8 blocks) only the trailing blocks matter.
    run_one_case([random.choice([True, False])] * 8 + [True], 1, 9)
    run_one_case([random.choice([True, False])] * 8 + [False], 1, 8)
    run_one_case([random.choice([True, False])] * 8 + [True, True], 1, 10)
    run_one_case([random.choice([True, False])] * 8 + [True, False], 0, 10)
    run_one_case([random.choice([True, False])] * 8 + [True, False], 1, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, True], 0, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, True], 1, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, False], 0, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, False], 1, 10)
def test_sliding_window_possible_cached_prefix():
    """Exercise ``find_longest_cache_hit`` for sliding-window attention.

    Uses block_size=2 and sliding_window=4; each case mocks a cache state
    per block and checks the hit length, that the leading part of the hit
    is the null block, and that the trailing (up to two) blocks are the
    real cached blocks.
    """
    block_size = 2
    sliding_window_spec = SlidingWindowSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
    )
    block_pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_sliding_window_manager(sliding_window_spec, block_pool)

    def run_one_case(block_is_cached, expect_length):
        # One block hash per (maybe-)cached block.
        block_hash_list = [
            BlockHash(str(i).encode()) for i in range(len(block_is_cached))
        ]
        # Start each case from a clean cache.
        block_pool.cached_block_hash_to_block._cache.clear()
        # Mock the block pool with the cached blocks
        for i, (block_hash, is_cached) in enumerate(
            zip(block_hash_list, block_is_cached)
        ):
            if is_cached:
                block_pool.cached_block_hash_to_block.insert(
                    make_block_hash_with_group_id(block_hash, 0),
                    block_pool.blocks[i + 10],
                )
        computed_blocks = manager.find_longest_cache_hit(
            block_hashes=block_hash_list,
            max_length=len(block_hash_list) * block_size,
            kv_cache_group_ids=[0],
            block_pool=block_pool,
            kv_cache_spec=sliding_window_spec,
            use_eagle=False,
            alignment_tokens=block_size,
        )[0]
        assert len(computed_blocks) == expect_length
        # All but the last two blocks of the hit should be the null block.
        assert all(
            block == block_pool.null_block
            for block in computed_blocks[: expect_length - 2]
        )
        # The trailing blocks must be the mocked cached blocks (ids offset
        # by 10 above).
        for i in range(2):
            if i < expect_length:
                block_index = expect_length - i - 1
                assert computed_blocks[block_index].block_id == block_index + 10

    run_one_case([False] * 10, 0)
    run_one_case([True], 1)
    run_one_case([True, False], 1)
    run_one_case([True, True], 2)
    run_one_case([True, True, False], 2)
    run_one_case([True, True, True], 3)
    run_one_case([True, True, True, False], 3)
    run_one_case(
        [True, True, False, True, False, False, True, True, False, True, True, True], 12
    )
    run_one_case(
        [True, True, False, True, False, False, True, True, False, False, False], 8
    )
    run_one_case(
        [True, True, False, True, False, False, True, True, False, False, False, True],
        8,
    )
def test_chunked_local_attention_remove_skipped_blocks():
    """Blocks that fall entirely outside the current local-attention chunk
    should be replaced by the null block as computed tokens advance."""
    attention_spec = ChunkedLocalAttentionSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        attention_chunk_size=4,
    )
    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
    manager = get_chunked_local_attention_manager(attention_spec, block_pool)
    null_block_id = block_pool.null_block.block_id

    def id_to_block_table(ids) -> list[KVCacheBlock]:
        # Build a block table, mapping the null id to the pool's null block.
        return [
            KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
            for id_ in ids
        ]

    def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
        # Compare the (prefix of the) table against the expected ids.
        for block, id_ in zip(block_table, ids):
            if id_ == null_block_id:
                assert block == block_pool.null_block
            else:
                assert block.block_id == id_

    original_block_ids = [
        1000,
        1001,
        1002,
        1003,
        1004,
        1005,
        1006,
        1007,
        1008,
        1009,
        1010,
    ]
    block_table = id_to_block_table(original_block_ids)
    manager.req_to_blocks["test"] = block_table
    # Nothing computed yet: nothing can be skipped.
    manager.remove_skipped_blocks("test", 0)
    assert_block_id(block_table, original_block_ids)
    # For 4th token (0-indexed), token 0-3 is out of the local attention window.
    manager.remove_skipped_blocks("test", 4)
    assert_block_id(block_table, [null_block_id] * 2)
    # For 6th token (0-indexed), token 4 - 6 are in local attention window,
    # token 0 - 3 are out, 2 blocks can be removed.
    manager.remove_skipped_blocks("test", 6)
    assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
    # For 12th token (0-indexed),
    # token 0-11 are out, 6 block can be removed.
    manager.remove_skipped_blocks("test", 12)
    assert_block_id(block_table, [null_block_id] * 6)
def test_sliding_window_remove_skipped_blocks():
    """Blocks whose tokens all left the sliding window should be replaced by
    the null block; a block survives while any of its tokens is in-window."""
    sliding_window_spec = SlidingWindowSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
    )
    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)
    manager = get_sliding_window_manager(sliding_window_spec, block_pool)
    null_block_id = block_pool.null_block.block_id

    def id_to_block_table(ids) -> list[KVCacheBlock]:
        # Build a block table, mapping the null id to the pool's null block.
        return [
            KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
            for id_ in ids
        ]

    def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
        # Compare the (prefix of the) table against the expected ids.
        for block, id_ in zip(block_table, ids):
            if id_ == null_block_id:
                assert block == block_pool.null_block
            else:
                assert block.block_id == id_

    original_block_ids = [
        1000,
        1001,
        1002,
        1003,
        1004,
        1005,
        1006,
        1007,
        1008,
        1009,
        1010,
    ]
    block_table = id_to_block_table(original_block_ids)
    manager.req_to_blocks["test"] = block_table
    # Nothing computed yet: nothing can be skipped.
    manager.remove_skipped_blocks("test", 0)
    assert_block_id(block_table, original_block_ids)
    # 4 tokens are computed. Only token 0 is out of the sliding window. As
    # block 1000 also contains token 1 that is in the sliding window, block 1000
    # cannot be removed.
    manager.remove_skipped_blocks("test", 4)
    assert_block_id(block_table, original_block_ids)
    # 5 tokens are computed. Token 0 & 1 are out of the sliding window.
    # Block 1000 can be removed.
    manager.remove_skipped_blocks("test", 5)
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
    # 6 tokens are computed. Token 0-2 are out of the sliding window.
    # Cannot remove new block as the block 1001 is still used by token 3.
    manager.remove_skipped_blocks("test", 6)
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
    # 7 tokens are computed. Token 0-3 are out of the sliding window.
    # Block 1001 can be removed and block 1000 is already removed.
    manager.remove_skipped_blocks("test", 7)
    assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
    # 11 tokens are computed. Token 0-7 are out of the sliding window.
    # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
    # sequence, and is expected to be evicted earlier than 1002, so the order
    # of removed blocks should be [1003, 1002].
    manager.remove_skipped_blocks("test", 11)
    assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
def test_get_num_blocks_to_allocate():
    """Null-block placeholders in the cached prefix must still be counted as
    blocks needing a fresh allocation by the sliding-window manager."""
    block_size = 2
    spec = SlidingWindowSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,  # Placeholder value, not related to test result
    )
    pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_sliding_window_manager(spec, pool)

    # 10 real cached blocks.
    fully_cached = [KVCacheBlock(idx + 1) for idx in range(10)]
    # 5 null placeholders followed by 5 real cached blocks.
    partially_null = [pool.null_block] * 5 + [
        KVCacheBlock(idx + 1) for idx in range(5)
    ]

    num_tokens = 20 * block_size
    assert manager.get_num_blocks_to_allocate("1", num_tokens, fully_cached) == 20
    assert manager.get_num_blocks_to_allocate("2", num_tokens, partially_null) == 15
def test_chunked_local_attention_get_num_blocks_to_allocate():
    """Null-block placeholders in the cached prefix must still be counted as
    blocks needing a fresh allocation by the chunked-local-attention manager."""
    block_size = 2
    spec = ChunkedLocalAttentionSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        attention_chunk_size=4,  # Placeholder value, not related to test result
    )
    pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_chunked_local_attention_manager(spec, pool)

    # 10 real cached blocks.
    fully_cached = [KVCacheBlock(idx + 1) for idx in range(10)]
    # 5 null placeholders followed by 5 real cached blocks.
    partially_null = [pool.null_block] * 5 + [
        KVCacheBlock(idx + 1) for idx in range(5)
    ]

    num_tokens = 20 * block_size
    assert manager.get_num_blocks_to_allocate("1", num_tokens, fully_cached) == 20
    assert manager.get_num_blocks_to_allocate("2", num_tokens, partially_null) == 15

248
tests/v1/core/utils.py Normal file
View File

@@ -0,0 +1,248 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from tests.v1.kv_connector.unit.utils import MockKVConfig
from vllm.config import (
CacheConfig,
ECTransferConfig,
KVTransferConfig,
ModelConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
)
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
MultiModalKwargsItem,
PlaceholderRange,
)
from vllm.sampling_params import SamplingParams
from vllm.utils.hashing import sha256
from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
from vllm.v1.core.sched.async_scheduler import AsyncScheduler
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (
FullAttentionSpec,
KVCacheConfig,
KVCacheGroupSpec,
)
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
EOS_TOKEN_ID = 50256
def mock_kv(matched_tokens: int, is_async: bool):
    """Shorthand for building a MockKVConfig for scheduler tests."""
    config = MockKVConfig(
        matched_tokens=matched_tokens,
        is_async=is_async,
    )
    return config
def create_scheduler(
    model: str = "facebook/opt-125m",
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
    enable_chunked_prefill: bool = True,
    enable_prefix_caching: bool = False,
    long_prefill_token_threshold: int = 0,
    disable_chunked_mm_input: bool = False,
    use_kv_connector: None | bool | MockKVConfig = None,
    num_blocks: int = 10000,
    block_size: int = 16,
    max_model_len: int | None = None,
    num_speculative_tokens: int | None = None,
    skip_tokenizer_init: bool = False,
    async_scheduling: bool = False,
    use_ec_connector: bool = False,
    ec_role: str | None = None,
) -> Scheduler | AsyncScheduler:
    """Create a scheduler under test.

    Args:
        model: model under test.
        max_num_seqs: max sequences to schedule concurrently.
        max_num_batched_tokens: max num tokens to batch per step.
        enable_chunked_prefill: passed through to SchedulerConfig.
        enable_prefix_caching: optionally force APC on/off (default False).
        long_prefill_token_threshold: passed through to SchedulerConfig.
        disable_chunked_mm_input: passed through to SchedulerConfig.
        use_kv_connector: a MockKVConfig for the mock connector, True for the
            example shared-storage connector, or None/False for no connector.
        num_blocks: number of KV-cache blocks to configure.
        block_size: tokens per KV-cache block.
        max_model_len: context length; defaults to max_num_batched_tokens.
        num_speculative_tokens: if set, enables "ngram" speculative decoding.
        skip_tokenizer_init: passed through to ModelConfig.
        async_scheduling: build an AsyncScheduler instead of a Scheduler.
        use_ec_connector: enable the example encoder-cache connector.
        ec_role: role passed to the encoder-cache connector config.

    Returns:
        {class}`Scheduler` (or {class}`AsyncScheduler`) instance
    """
    model_config = ModelConfig(
        model=model,
        trust_remote_code=True,
        dtype="float16",
        seed=42,
        skip_tokenizer_init=skip_tokenizer_init,
    )
    # Default the context length to the per-step token budget.
    if max_model_len is None:
        max_model_len = max_num_batched_tokens
    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        max_model_len=max_model_len,
        long_prefill_token_threshold=long_prefill_token_threshold,
        disable_chunked_mm_input=disable_chunked_mm_input,
        enable_chunked_prefill=enable_chunked_prefill,
        async_scheduling=async_scheduling,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )
    # Cache config, optionally force APC
    cache_config = CacheConfig(
        block_size=block_size,
        gpu_memory_utilization=0.9,
        swap_space=0,
        cache_dtype="auto",
        enable_prefix_caching=enable_prefix_caching,
    )
    # KV connector: either the fully-specified mock or the example
    # shared-storage connector, depending on use_kv_connector's type.
    kv_transfer_config = None
    if isinstance(use_kv_connector, MockKVConfig):
        kv_transfer_config = KVTransferConfig(
            kv_connector="MockKVConnector",
            kv_role="kv_both",
            kv_connector_extra_config={
                "matched_tokens": use_kv_connector.matched_tokens,
                "is_async": use_kv_connector.is_async,
            },
        )
    elif use_kv_connector:
        kv_transfer_config = KVTransferConfig(
            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        )
    speculative_config: SpeculativeConfig | None = None
    if num_speculative_tokens is not None:
        speculative_config = SpeculativeConfig(
            model="ngram", num_speculative_tokens=num_speculative_tokens
        )
    ec_transfer_config = (
        ECTransferConfig(
            ec_connector="ECExampleConnector",
            ec_role=ec_role,
            ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"},
        )
        if use_ec_connector
        else None
    )
    vllm_config = VllmConfig(
        scheduler_config=scheduler_config,
        model_config=model_config,
        cache_config=cache_config,
        kv_transfer_config=kv_transfer_config,
        speculative_config=speculative_config,
        ec_transfer_config=ec_transfer_config,
    )
    kv_cache_config = KVCacheConfig(
        num_blocks=num_blocks,  # A large number of blocks to hold all requests
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
            )
        ],
    )
    cache_config.num_gpu_blocks = num_blocks
    scheduler_cls = AsyncScheduler if async_scheduling else Scheduler
    return scheduler_cls(
        vllm_config=vllm_config,
        kv_cache_config=kv_cache_config,
        block_size=block_size,
        log_stats=True,
        structured_output_manager=StructuredOutputManager(vllm_config),
    )
# Tracks whether init_none_hash() has been run; create_requests() performs
# the initialization lazily on its first call.
_none_hash_initialized = False
def create_requests(
    num_requests: int,
    num_tokens: int = 10,
    mm_hashes_list: list[list[str]] | None = None,
    mm_positions: list[list[PlaceholderRange]] | None = None,
    max_tokens: int = 16,
    stop_token_ids: list[int] | None = None,
    prompt_logprobs: int | None = None,
    same_prompt: bool = False,
    block_size: int = 16,
    req_ids: list[str] | None = None,
) -> list[Request]:
    """Create a batch of dummy Requests for scheduler tests.

    Args:
        num_requests: number of requests to build.
        num_tokens: prompt length of each request.
        mm_hashes_list: optional per-request multimodal item identifiers;
            items sharing an identifier must have equal position lengths.
        mm_positions: optional per-request multimodal placeholder ranges;
            required (and length-matched) when mm_hashes_list is provided.
        max_tokens: generation budget for the shared SamplingParams.
        stop_token_ids: optional stop tokens for the shared SamplingParams.
        prompt_logprobs: optional prompt-logprobs count.
        same_prompt: if True all prompts are [0] * num_tokens; otherwise
            request i uses [i] * num_tokens.
        block_size: block size used for the request block hasher.
        req_ids: optional explicit request ids (defaults to "0", "1", ...).

    Returns:
        List of Request objects sharing one SamplingParams instance.
    """
    global _none_hash_initialized
    # Lazily initialize the module-wide "none" hash exactly once.
    if not _none_hash_initialized:
        init_none_hash(sha256)
        _none_hash_initialized = True
    block_hasher = get_request_block_hasher(block_size, sha256)
    sampling_params = SamplingParams(
        ignore_eos=False,
        max_tokens=max_tokens,
        stop_token_ids=stop_token_ids,
        prompt_logprobs=prompt_logprobs,
    )
    requests = []
    if mm_hashes_list is not None:
        # NOTE: allow manual input; some mm items can have the same identifier
        # no. of mm_hashes and mm_positions for each request should be identical
        assert mm_positions is not None, (
            "mm_positions must be provided when mm_hashes_list is provided"
        )
        assert len(mm_hashes_list) == len(mm_positions) == num_requests
        assert [len(h) for h in mm_hashes_list] == [len(p) for p in mm_positions]
    # Since same identifier would imply they are identical encoder output
    # Verify mm items with identical identifier are having mm_position.length
    seen_hashes: dict[str, int] = {}
    if req_ids:
        assert len(req_ids) == num_requests
    else:
        req_ids = [f"{i}" for i in range(num_requests)]
    for i in range(num_requests):
        mm_features = []
        for j, position in enumerate(
            mm_positions[i] if mm_positions is not None else []
        ):
            if mm_hashes_list is not None:
                identifier = mm_hashes_list[i][j]
                # Verify if position length is identical
                position_length = position.length
                if identifier in seen_hashes:
                    assert seen_hashes[identifier] == position_length, (
                        f"mm_hash '{identifier}' has inconsistent position lengths: "
                        f"previously {seen_hashes[identifier]}, now {position_length} "
                        f"at request {i}, position {j}"
                    )
                else:
                    seen_hashes[identifier] = position_length
            else:
                # Unique dummy hash for each mm item
                identifier = f"hash{i}_{j}"
            mm_feature = MultiModalFeatureSpec(
                data=MultiModalKwargsItem.dummy("dummy_m"),
                mm_position=position,
                identifier=identifier,
                modality="image",
            )
            mm_features.append(mm_feature)
        # Distinct prompts per request unless same_prompt is requested.
        prompt_token_ids = [0] * num_tokens if same_prompt else [i] * num_tokens
        request = Request(
            request_id=req_ids[i],
            prompt_token_ids=prompt_token_ids,
            sampling_params=sampling_params,
            pooling_params=None,
            mm_features=mm_features if mm_features else None,
            eos_token_id=EOS_TOKEN_ID,
            block_hasher=block_hasher,
        )
        requests.append(request)
    return requests