Sync from v0.13
This commit is contained in:
0
tests/v1/core/__init__.py
Normal file
0
tests/v1/core/__init__.py
Normal file
249
tests/v1/core/test_async_scheduler.py
Normal file
249
tests/v1/core/test_async_scheduler.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections import deque
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import RequestStatus
|
||||
from vllm.v1.utils import ConstantList
|
||||
|
||||
from .utils import create_requests, create_scheduler
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def _make_model_runner_output(
    scheduler_output: SchedulerOutput,
) -> ModelRunnerOutput:
    """Build a minimal ModelRunnerOutput echoing the scheduled request ids.

    Each scheduled request gets one fake sampled token whose value is its
    position in the batch.
    """
    scheduled_ids = list(scheduler_output.num_scheduled_tokens)
    index_by_id = {}
    fake_tokens = []
    for position, request_id in enumerate(scheduled_ids):
        index_by_id[request_id] = position
        fake_tokens.append([position])
    return ModelRunnerOutput(
        req_ids=scheduled_ids,
        req_id_to_index=index_by_id,
        sampled_token_ids=fake_tokens,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("max_tokens", [1, 2, 3, 5])
def test_stop_by_max_tokens(max_tokens: int):
    """Requests must finish exactly at max_tokens under async scheduling."""
    scheduler = create_scheduler(async_scheduling=True)
    requests = create_requests(num_requests=2, max_tokens=max_tokens)
    req0, req1 = requests

    pending: deque[SchedulerOutput] = deque()
    expected_tokens = 0
    # Enqueue each request and immediately run one scheduling step, so two
    # steps are in flight as async scheduling allows.
    for req in (req0, req1):
        scheduler.add_request(req)
        pending.append(scheduler.schedule())
        expected_tokens += req.num_prompt_tokens + max_tokens - 1

    observed_tokens = 0
    while pending:
        step = pending.popleft()
        observed_tokens += step.total_num_scheduled_tokens
        scheduler.update_from_output(step, _make_model_runner_output(step))
        follow_up = scheduler.schedule()
        if follow_up.num_scheduled_tokens:
            pending.append(follow_up)

    assert scheduler.get_num_unfinished_requests() == 0
    assert req0.num_output_tokens == max_tokens
    assert req1.num_output_tokens == max_tokens
    # No more tokens should have been scheduled than strictly needed.
    assert observed_tokens == expected_tokens
|
||||
|
||||
|
||||
def test_abort():
    """Aborting requests mid-flight leaves each with the expected output count."""
    scheduler = create_scheduler(async_scheduling=True)
    requests = create_requests(num_requests=10, max_tokens=20)
    for request in requests:
        scheduler.add_request(request)

    pending: deque[SchedulerOutput] = deque()
    pending.append(scheduler.schedule())
    pending.append(scheduler.schedule())

    abort_sequence = [0, 8, 3, 1, 6, 4, 2, 5, 7, 9]
    remaining = deque(abort_sequence)

    def abort_next():
        # Abort one scheduled request per step until none remain.
        if remaining:
            victim = requests[remaining.popleft()]
            scheduler.finish_requests(victim.request_id, RequestStatus.FINISHED_ABORTED)

    while pending:
        abort_next()
        step = pending.popleft()
        scheduler.update_from_output(step, _make_model_runner_output(step))
        follow_up = scheduler.schedule()
        if follow_up.num_scheduled_tokens:
            pending.append(follow_up)

    for idx, request in enumerate(requests):
        assert request.status == RequestStatus.FINISHED_ABORTED
        # A request aborted on step k has produced exactly k output tokens.
        assert request.num_output_tokens == abort_sequence.index(idx)
|
||||
|
||||
|
||||
def test_preempt():
    """Exercise request termination under async scheduling.

    NOTE(review): this test is currently an exact duplicate of test_abort and
    never triggers preemption — presumably a preemption scenario was intended;
    confirm and update.
    """
    scheduler = create_scheduler(async_scheduling=True)
    requests = create_requests(num_requests=10, max_tokens=20)
    for request in requests:
        scheduler.add_request(request)

    in_flight: deque[SchedulerOutput] = deque()
    in_flight.append(scheduler.schedule())
    in_flight.append(scheduler.schedule())

    finish_order = [0, 8, 3, 1, 6, 4, 2, 5, 7, 9]
    queue = list(finish_order)

    def finish_one():
        if not queue:
            return
        target = requests[queue.pop(0)]
        scheduler.finish_requests(target.request_id, RequestStatus.FINISHED_ABORTED)

    while in_flight:
        # Finish one scheduled request before processing the step.
        finish_one()
        step_output = in_flight.popleft()
        runner_output = _make_model_runner_output(step_output)
        scheduler.update_from_output(step_output, runner_output)
        nxt = scheduler.schedule()
        if nxt.num_scheduled_tokens:
            in_flight.append(nxt)

    for position, request in enumerate(requests):
        assert request.status == RequestStatus.FINISHED_ABORTED
        assert request.num_output_tokens == finish_order.index(position)
|
||||
|
||||
|
||||
def test_prefix_caching_for_prefill_dedup():
    """Prefix caching should de-duplicate identical prompts scheduled together."""
    CHUNK_SIZE = 1000
    BLOCK_SIZE = 16
    num_prompt_tokens = 100
    scheduler = create_scheduler(
        async_scheduling=True,
        max_num_batched_tokens=CHUNK_SIZE,
        enable_prefix_caching=True,
        block_size=BLOCK_SIZE,
    )
    all_requests = create_requests(
        num_requests=5,
        num_tokens=num_prompt_tokens,
        max_tokens=3,
        same_prompt=True,
        block_size=BLOCK_SIZE,
    )
    backlog = list(all_requests)

    # Two requests with the same prompt go in first.
    first = backlog.pop(0)
    second = backlog.pop(0)
    scheduler.add_request(first)
    scheduler.add_request(second)

    pending: deque[SchedulerOutput] = deque()
    step = scheduler.schedule()
    pending.append(step)
    # Prefix caching must de-duplicate the prompts within the same step:
    # all full blocks except the last are shared between the two requests.
    assert len(step.num_scheduled_tokens) == 2
    full_blocks = num_prompt_tokens // BLOCK_SIZE
    assert first.num_cached_tokens == 0
    assert second.num_cached_tokens >= full_blocks * BLOCK_SIZE

    pending.append(scheduler.schedule())
    while pending:
        if backlog:
            scheduler.add_request(backlog.pop(0))
        step = pending.popleft()
        scheduler.update_from_output(step, _make_model_runner_output(step))
        step = scheduler.schedule()
        if step.num_scheduled_tokens:
            pending.append(step)

    # Requests scheduled after the first should also see a prefix cache hit.
    assert scheduler.get_num_unfinished_requests() == 0
    for request in all_requests[1:]:
        assert request.num_cached_tokens >= full_blocks * BLOCK_SIZE
|
||||
|
||||
|
||||
def test_prefix_caching_for_multi_turn():
    """Second-turn prompts built from first-turn outputs should hit the cache."""
    CHUNK_SIZE = 1000
    BLOCK_SIZE = 16
    num_prompt_tokens = 100
    num_output_tokens = 200
    scheduler = create_scheduler(
        async_scheduling=True,
        max_num_batched_tokens=CHUNK_SIZE,
        enable_prefix_caching=True,
        block_size=BLOCK_SIZE,
    )
    first_turn = create_requests(
        num_requests=5,
        num_tokens=num_prompt_tokens,
        max_tokens=num_output_tokens,
        block_size=BLOCK_SIZE,
    )

    for request in first_turn:
        scheduler.add_request(request)
    pending: deque[SchedulerOutput] = deque()
    pending.append(scheduler.schedule())
    pending.append(scheduler.schedule())

    # Drive the first turn to completion.
    while pending:
        step = pending.popleft()
        scheduler.update_from_output(step, _make_model_runner_output(step))
        step = scheduler.schedule()
        if step.num_scheduled_tokens:
            pending.append(step)
    assert scheduler.get_num_unfinished_requests() == 0

    # Second-turn prompts are the previous prompt plus all generated tokens.
    second_turn = create_requests(
        num_requests=5,
        num_tokens=num_prompt_tokens + num_output_tokens,
        max_tokens=num_output_tokens,
        block_size=BLOCK_SIZE,
    )
    for prev, new in zip(first_turn, second_turn):
        new.prompt_token_ids = prev.prompt_token_ids + list(prev.output_token_ids)
        new._all_token_ids = new.prompt_token_ids.copy()
        new.all_token_ids = ConstantList(new._all_token_ids)
        new.block_hashes = []
        new.block_hashes = new.get_hash_new_full_blocks()

    # Schedule the next-turn requests.
    for request in second_turn:
        scheduler.add_request(request)
    pending.append(scheduler.schedule())

    # Every full prompt block of the new turn should be a prefix cache hit.
    for request in second_turn:
        assert (
            request.num_cached_tokens
            == request.num_prompt_tokens // BLOCK_SIZE * BLOCK_SIZE
        )
|
||||
249
tests/v1/core/test_encoder_cache_manager.py
Normal file
249
tests/v1/core/test_encoder_cache_manager.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
|
||||
from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
# ------------------ Mock Classes ------------------ #
|
||||
class MockRequest:
    """Lightweight stand-in for a request carrying multimodal features.

    Each hash in ``mm_hashes`` is paired positionally with a token count in
    ``token_counts`` to build one image feature spec.
    """

    def __init__(self, request_id, mm_hashes, token_counts):
        self.request_id = request_id
        self._token_counts = token_counts
        self.mm_features = [
            MultiModalFeatureSpec(
                data=None,
                modality="image",
                identifier=mm_hash,
                mm_position=PlaceholderRange(offset=0, length=count),
            )
            for mm_hash, count in zip(mm_hashes, token_counts)
        ]

    def get_num_encoder_embeds(self, input_id: int) -> int:
        """Return the number of encoder embeddings for the given input."""
        return self._token_counts[input_id]
|
||||
|
||||
|
||||
# ------------------ Unit Tests ------------------ #
|
||||
def test_basic_allocate_and_reuse():
    """Allocate one image, hit it again, then free it back to the pool."""
    manager = EncoderCacheManager(cache_size=10)
    request = MockRequest("r1", ["imgA"], [4])

    # Cold cache: no hit yet, but plenty of room.
    assert not manager.check_and_update_cache(request, 0)
    assert manager.can_allocate(request, 0, 10**9, 0)

    manager.allocate(request, 0)

    # Now the lookup hits and the 4 slots are charged.
    assert manager.check_and_update_cache(request, 0)
    assert "r1" in manager.cached["imgA"]
    assert manager.num_free_slots == 6

    # Free twice to bring the refcount to zero.
    manager.free_encoder_input(request, 0)
    manager.free_encoder_input(request, 0)

    assert not manager.cached["imgA"]
    assert "imgA" in manager.freeable
    assert manager.num_freeable_slots == 10
    assert manager.num_free_slots == 6
|
||||
|
||||
|
||||
def test_freeing_decreases_refcount_and_moves_to_freeable():
    """Freeing the only reference moves the entry into the freeable pool."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("req2", ["img3"], [5])

    assert cache.can_allocate(request, 0, 10**9, 0)
    cache.allocate(request, 0)
    assert len(cache.cached["img3"]) == 1

    cache.free_encoder_input(request, 0)

    assert not cache.cached["img3"]
    assert "img3" in cache.freeable
    assert cache.num_freeable_slots == 10
|
||||
|
||||
|
||||
def test_free_request_frees_all_inputs():
    """free() releases every allocated input of a request at once."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("req3", ["a", "b"], [2, 3])

    for input_id in (0, 1):
        assert cache.can_allocate(request, input_id, 10**9, 0)
        cache.allocate(request, input_id)

    assert len(cache.cached["a"]) == 1
    assert len(cache.cached["b"]) == 1

    cache.free(request)

    for mm_hash in ("a", "b"):
        assert not cache.cached[mm_hash]
        assert mm_hash in cache.freeable
    assert cache.num_freeable_slots == 10
|
||||
|
||||
|
||||
def test_eviction_when_cache_is_full():
    """Allocating past capacity evicts freeable entries."""
    cache = EncoderCacheManager(cache_size=10)
    first = MockRequest("req1", ["x"], [6])
    second = MockRequest("req2", ["y"], [5])

    assert cache.can_allocate(first, 0, 10**9, 0)
    cache.allocate(first, 0)
    cache.free_encoder_input(first, 0)

    # 6 + 5 > 10 slots, so allocating 'y' must push 'x' out.
    assert cache.can_allocate(second, 0, 10**9, 0)
    cache.allocate(second, 0)

    assert "x" not in cache.cached
    assert "x" in cache.get_freed_mm_hashes()
|
||||
|
||||
|
||||
def test_get_cached_input_ids():
    """Only inputs that were actually allocated are reported as cached."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("reqX", ["m", "n", "o"], [2, 4, 3])

    for input_id in (0, 2):  # skip input 1 on purpose
        assert cache.can_allocate(request, input_id, 10**9, 0)
        cache.allocate(request, input_id)

    assert cache.get_cached_input_ids(request) == {0, 2}
|
||||
|
||||
|
||||
def test_has_cache_restores_from_freeable():
    """A cache hit on a freeable entry pulls it back into active use."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("reqY", ["imgZ"], [4])

    assert cache.can_allocate(request, 0, 10**9, 0)
    cache.allocate(request, 0)
    cache.free_encoder_input(request, 0)

    # The hit should restore the entry from the freeable pool.
    assert cache.check_and_update_cache(request, 0)
    assert len(cache.cached["imgZ"]) == 1
    assert "imgZ" not in cache.freeable
    assert cache.num_freeable_slots == 6
|
||||
|
||||
|
||||
def test_get_freed_mm_hashes_clears_freed_list():
    """get_freed_mm_hashes drains the freed list: a second call is empty."""
    cache = EncoderCacheManager(cache_size=10)
    first = MockRequest("reqA", ["a"], [5])
    second = MockRequest("reqB", ["b"], [6])

    assert cache.can_allocate(first, 0, 10**9, 0)
    cache.allocate(first, 0)
    cache.free_encoder_input(first, 0)

    # 5 + 6 > 10 slots, so this allocation evicts 'a'.
    assert cache.can_allocate(second, 0, 10**9, 0)
    cache.allocate(second, 0)

    assert "a" in cache.get_freed_mm_hashes()
    assert cache.get_freed_mm_hashes() == []
|
||||
|
||||
|
||||
def test_schedule_request_multi_images_respect_space_limit():
    """A second image must be rejected once cache space is exhausted."""
    cache = EncoderCacheManager(cache_size=10)
    request = MockRequest("reqA", ["a", "b"], [5, 6])
    budget = 100

    scheduled = 0
    assert cache.can_allocate(request, 0, budget, scheduled)
    first_size = request.get_num_encoder_embeds(0)
    scheduled += first_size
    budget -= first_size

    # 5 already scheduled + 6 requested > 10 cache slots.
    assert not cache.can_allocate(request, 1, budget, scheduled)
|
||||
|
||||
|
||||
def test_schedule_request_multi_images_respect_compute_limit():
    """A second image must be rejected once the compute budget runs out."""
    cache = EncoderCacheManager(cache_size=100)
    request = MockRequest("reqA", ["a", "b"], [5, 6])
    budget = 10

    scheduled = 0
    assert cache.can_allocate(request, 0, budget, scheduled)
    first_size = request.get_num_encoder_embeds(0)
    scheduled += first_size
    budget -= first_size

    # Remaining budget of 5 cannot cover the 6-token second image.
    assert not cache.can_allocate(request, 1, budget, scheduled)
|
||||
|
||||
|
||||
def test_encoder_cache_with_is_embed_mask():
    """Allocation should charge only for positions marked True in is_embed."""

    class MockRequestWithMask(MockRequest):
        def get_num_encoder_embeds(self, input_id: int) -> int:
            return self.mm_features[input_id].mm_position.get_num_embeds

    mask = torch.zeros(100, dtype=torch.bool)
    mask[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True

    request = MockRequestWithMask("r1", ["img1"], [100])
    request.mm_features[0] = MultiModalFeatureSpec(
        data=None,
        modality="image",
        identifier="img1",
        mm_position=PlaceholderRange(offset=0, length=100, is_embed=mask),
    )

    cache = EncoderCacheManager(cache_size=100)
    cache.allocate(request, 0)

    # Only the 8 masked positions consume cache slots.
    assert cache.num_free_slots == 92
    assert "img1" in cache.cached

    dense_size = 100
    masked_size = request.mm_features[0].mm_position.get_num_embeds
    assert masked_size == 8
    assert dense_size / masked_size == 12.5
|
||||
|
||||
|
||||
def test_encoder_cache_mask_based_retrieval():
    """Verify embed counts derived from an is_embed mask over sub-ranges."""

    class MockRequestWithMask(MockRequest):
        def get_num_encoder_embeds(self, input_id: int) -> int:
            return self.mm_features[input_id].mm_position.get_num_embeds

    mask = torch.tensor(
        [False, False, True, True, False, True, True, True, False, False]
    )

    request = MockRequestWithMask("r1", ["img1"], [10])
    request.mm_features[0] = MultiModalFeatureSpec(
        data=None,
        modality="image",
        identifier="img1",
        mm_position=PlaceholderRange(offset=0, length=10, is_embed=mask),
    )

    cache = EncoderCacheManager(cache_size=50)
    cache.allocate(request, 0)

    assert request.mm_features[0].mm_position.get_num_embeds == 5

    # Range [2, 8): no True positions precede it, all five fall inside.
    assert mask[:2].sum().item() == 0
    assert mask[2:8].sum().item() == 5

    # Range [0, 5): nothing precedes index 0; two True positions inside.
    assert mask[:5].sum().item() == 2
|
||||
224
tests/v1/core/test_kv_cache_metrics.py
Normal file
224
tests/v1/core/test_kv_cache_metrics.py
Normal file
@@ -0,0 +1,224 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.core.kv_cache_metrics import (
|
||||
BlockMetricsState,
|
||||
KVCacheMetricsCollector,
|
||||
)
|
||||
from vllm.v1.core.kv_cache_utils import KVCacheBlock
|
||||
|
||||
|
||||
class TestBlockMetricsState:
    """Unit tests for BlockMetricsState timing and access bookkeeping."""

    _SEC = 1_000_000_000  # one second in nanoseconds

    def test_init(self):
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()
        assert metrics.birth_time_ns == self._SEC
        assert metrics.last_access_ns == self._SEC
        assert len(metrics.access_history) == 0

    def test_access_tracking(self):
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()

        with patch("time.monotonic_ns", return_value=2 * self._SEC):
            metrics.record_access()

        assert metrics.last_access_ns == 2 * self._SEC
        assert list(metrics.access_history) == [2 * self._SEC]

    def test_ring_buffer_wraps_at_4(self):
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()

        # Record five accesses at t = 2s..6s; the ring keeps only the last 4.
        for step in range(2, 7):
            with patch("time.monotonic_ns", return_value=step * self._SEC):
                metrics.record_access()

        assert len(metrics.access_history) == 4
        assert list(metrics.access_history) == [n * self._SEC for n in (3, 4, 5, 6)]

    def test_lifetime(self):
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()
        with patch("time.monotonic_ns", return_value=6_500_000_000):
            # 6.5s - 1.0s = 5.5s of lifetime.
            assert abs(metrics.get_lifetime_seconds() - 5.5) < 0.001

    def test_idle_time(self):
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()
        metrics.last_access_ns = 2 * self._SEC
        with patch("time.monotonic_ns", return_value=5_200_000_000):
            # 5.2s - 2.0s = 3.2s idle.
            assert abs(metrics.get_idle_time_seconds() - 3.2) < 0.001

    def test_reuse_gaps(self):
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()

        for offset_s in (0, 1.5, 3.0, 5.5):
            metrics.access_history.append(self._SEC + int(offset_s * 1e9))

        gaps = metrics.get_reuse_gaps_seconds()
        assert list(gaps) == [1.5, 1.5, 2.5]

    def test_ring_wrap_only_gives_3_gaps(self):
        # Five appends into a size-4 ring leave only three measurable gaps.
        with patch("time.monotonic_ns", return_value=self._SEC):
            metrics = BlockMetricsState()

        for step in range(5):
            metrics.access_history.append(self._SEC + step * self._SEC)

        assert len(metrics.get_reuse_gaps_seconds()) == 3
|
||||
|
||||
|
||||
class TestKVCacheMetricsCollector:
    """Unit tests for KVCacheMetricsCollector lifecycle tracking."""

    _SEC = 1_000_000_000  # one second in nanoseconds

    def test_sample_rate_validation(self):
        # Out-of-range and zero sample rates are rejected.
        for bad_rate in (-0.1, 1.5, 0.0):
            with pytest.raises(AssertionError):
                KVCacheMetricsCollector(sample_rate=bad_rate)

    def test_sampling(self):
        always = KVCacheMetricsCollector(sample_rate=1.0)
        assert sum(1 for _ in range(100) if always.should_sample_block()) == 100

        half = KVCacheMetricsCollector(sample_rate=0.5)
        hits = sum(1 for _ in range(1000) if half.should_sample_block())
        # Loose statistical bound; the exact count is random.
        assert 400 < hits < 600

    def test_alloc(self):
        collector = KVCacheMetricsCollector(sample_rate=1.0)

        blocks = [KVCacheBlock(block_id=i) for i in range(5)]
        with patch("time.monotonic_ns", return_value=self._SEC):
            for blk in blocks:
                collector.on_block_allocated(blk)

        assert len(collector.block_metrics) == 5

    def test_access(self):
        collector = KVCacheMetricsCollector(sample_rate=1.0)
        blk = KVCacheBlock(block_id=0)

        with patch("time.monotonic_ns", return_value=self._SEC):
            collector.on_block_allocated(blk)

        for step in range(2, 5):
            with patch("time.monotonic_ns", return_value=step * self._SEC):
                collector.on_block_accessed(blk)

        assert len(collector.block_metrics[0].access_history) == 3

    def test_evict_no_accesses(self):
        # With no accesses, idle time equals total lifetime.
        collector = KVCacheMetricsCollector(sample_rate=1.0)

        blk = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=self._SEC):
            collector.on_block_allocated(blk)

        with patch("time.monotonic_ns", return_value=6 * self._SEC):
            collector.on_block_evicted(blk)

        events = collector.drain_events()
        assert len(events) == 1
        assert abs(events[0].lifetime_seconds - 5.0) < 0.001
        assert abs(events[0].idle_seconds - 5.0) < 0.001

    def test_evict(self):
        collector = KVCacheMetricsCollector(sample_rate=1.0)

        blk = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=self._SEC):
            collector.on_block_allocated(blk)

        for step in (2, 3):
            with patch("time.monotonic_ns", return_value=step * self._SEC):
                collector.on_block_accessed(blk)

        with patch("time.monotonic_ns", return_value=4 * self._SEC):
            collector.on_block_evicted(blk)

        events = collector.drain_events()
        assert len(events) == 1
        sample = events[0]
        assert abs(sample.lifetime_seconds - 3.0) < 0.001
        assert abs(sample.idle_seconds - 1.0) < 0.001
        assert sample.reuse_gaps_seconds == (1.0,)
        # Eviction removes the per-block state.
        assert 0 not in collector.block_metrics

    def test_reset(self):
        collector = KVCacheMetricsCollector(sample_rate=1.0)

        with patch("time.monotonic_ns", return_value=self._SEC):
            for i in range(5):
                collector.on_block_allocated(KVCacheBlock(block_id=i))

        assert len(collector.block_metrics) == 5
        collector.reset()
        assert len(collector.block_metrics) == 0

        # The collector remains usable after a reset.
        with patch("time.monotonic_ns", return_value=2 * self._SEC):
            collector.on_block_allocated(KVCacheBlock(block_id=10))
        assert 10 in collector.block_metrics

    def test_huge_time_jump(self):
        collector = KVCacheMetricsCollector(sample_rate=1.0)

        blk = KVCacheBlock(block_id=0)
        with patch("time.monotonic_ns", return_value=self._SEC):
            collector.on_block_allocated(blk)

        # Simulate an extreme clock advance before eviction.
        with patch("time.monotonic_ns", return_value=9_999_999_999_999_999):
            collector.on_block_evicted(blk)

        events = collector.drain_events()
        assert len(events) == 1
        assert events[0].lifetime_seconds > 0
|
||||
|
||||
|
||||
def test_kv_cache_metrics_collector_smoke() -> None:
    """Smoke-test the allocate -> access -> evict lifecycle on CPU."""
    collector = KVCacheMetricsCollector(sample_rate=1.0)
    block = KVCacheBlock(block_id=123)

    second = 1_000_000_000

    # Allocate at t=1s, access at t=2s and t=3s, evict at t=4s.
    with patch("time.monotonic_ns", return_value=1 * second):
        collector.on_block_allocated(block)
    for t in (2 * second, 3 * second):
        with patch("time.monotonic_ns", return_value=t):
            collector.on_block_accessed(block)
    with patch("time.monotonic_ns", return_value=4 * second):
        collector.on_block_evicted(block)

    events = collector.drain_events()
    assert len(events) == 1

    sample = events[0]
    assert abs(sample.lifetime_seconds - 3.0) < 1e-6  # alive 1s -> 4s
    assert abs(sample.idle_seconds - 1.0) < 1e-6  # last touched at 3s
    assert sample.reuse_gaps_seconds == (1.0,)  # one gap between accesses
|
||||
1800
tests/v1/core/test_kv_cache_utils.py
Normal file
1800
tests/v1/core/test_kv_cache_utils.py
Normal file
File diff suppressed because it is too large
Load Diff
103
tests/v1/core/test_kv_sharing.py
Normal file
103
tests/v1/core/test_kv_sharing.py
Normal file
@@ -0,0 +1,103 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
|
||||
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def new_kv_cache_spec():
    """Create a fresh full-attention KV cache spec used by these tests."""
    spec = FullAttentionSpec(16, 1, 1, torch.float32, False)
    return spec
|
||||
|
||||
|
||||
def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
    """
    KV sharing when layers of one KV cache group sit in different attention
    groups (e.g. because they use different attention backends).
    """
    shared_kv_cache_layers = {
        "model.layers.2": "model.layers.0",
        "model.layers.3": "model.layers.1",
    }

    # Layers 0 and 1 both belong to KV cache group 0; with different
    # backends they would land in different attention groups for it.
    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
    ]

    add_kv_sharing_layers_to_kv_cache_groups(
        shared_kv_cache_layers=shared_kv_cache_layers,
        kv_cache_groups=kv_cache_groups,
    )

    # The shared layers are appended to the single existing group.
    assert len(kv_cache_groups) == 1
    assert kv_cache_groups[0].layer_names == [f"model.layers.{i}" for i in range(4)]
|
||||
|
||||
|
||||
def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
    """
    KV sharing when all layers of a KV cache group use the same attention
    backend — the common case for most models.
    """
    shared_kv_cache_layers = {
        "model.layers.2": "model.layers.0",
        "model.layers.3": "model.layers.1",
    }

    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
    ]

    add_kv_sharing_layers_to_kv_cache_groups(
        shared_kv_cache_layers=shared_kv_cache_layers,
        kv_cache_groups=kv_cache_groups,
    )

    # The shared layers are appended to the single existing group.
    assert len(kv_cache_groups) == 1
    assert kv_cache_groups[0].layer_names == [f"model.layers.{i}" for i in range(4)]
|
||||
|
||||
|
||||
def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
    """
    KV sharing set up without attention groups — the TPU model runner path,
    which has no attention-group support yet.
    """
    shared_kv_cache_layers = {
        "model.layers.2": "model.layers.0",
        "model.layers.3": "model.layers.1",
    }

    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0"], new_kv_cache_spec()),
        KVCacheGroupSpec(["model.layers.1"], new_kv_cache_spec()),
    ]

    add_kv_sharing_layers_to_kv_cache_groups(
        shared_kv_cache_layers=shared_kv_cache_layers,
        kv_cache_groups=kv_cache_groups,
    )

    # Each shared layer lands in its target layer's group.
    assert len(kv_cache_groups) == 2
    first_group, second_group = kv_cache_groups
    assert first_group.layer_names == ["model.layers.0", "model.layers.2"]
    assert second_group.layer_names == ["model.layers.1", "model.layers.3"]
|
||||
36
tests/v1/core/test_output.py
Normal file
36
tests/v1/core/test_output.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
|
||||
from vllm.v1.core.sched.output import NewRequestData
|
||||
|
||||
|
||||
def _create_new_requests_data(prompt_embeds: torch.Tensor | None) -> NewRequestData:
    """Construct a minimal NewRequestData carrying the given prompt embeds."""
    return NewRequestData(
        req_id="test_req",
        prompt_token_ids=None,
        prompt_embeds=prompt_embeds,
        mm_features=[],
        sampling_params=None,
        pooling_params=None,
        lora_request=None,
        block_ids=([],),
        num_computed_tokens=0,
    )
|
||||
|
||||
|
||||
def test_repr_with_none() -> None:
    """prompt_embeds=None must render as a None shape in both reprs."""
    data = _create_new_requests_data(None)

    for rendered in (repr(data), data.anon_repr()):
        assert "prompt_embeds_shape=None" in rendered
|
||||
|
||||
|
||||
def test_repr_with_multi_element_tensor() -> None:
    """Both repr variants should report the tensor's shape, not its data."""
    embeds = torch.randn(10, 768)
    data = _create_new_requests_data(embeds)
    expected = "prompt_embeds_shape=torch.Size([10, 768])"
    for rendered in (repr(data), data.anon_repr()):
        assert expected in rendered
|
||||
1688
tests/v1/core/test_prefix_caching.py
Normal file
1688
tests/v1/core/test_prefix_caching.py
Normal file
File diff suppressed because it is too large
Load Diff
262
tests/v1/core/test_priority_scheduler_random.py
Normal file
262
tests/v1/core/test_priority_scheduler_random.py
Normal file
@@ -0,0 +1,262 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem,
|
||||
PlaceholderRange,
|
||||
)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.hashing import get_hash_fn_by_name
|
||||
from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
|
||||
from vllm.v1.request import Request
|
||||
|
||||
from .test_scheduler import create_scheduler_with_priority
|
||||
from .utils import EOS_TOKEN_ID
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def _create_random_request(
    max_tokens_range: tuple[int, int],
    num_tokens_range: tuple[int, int],
    arrival_time_range: tuple[float, float],
    priority_range: tuple[int, int],
    num_mm_item_range: tuple[int, int],
    vllm_config: VllmConfig,
) -> Request:
    """Build a Request with randomly drawn length, priority, and mm items.

    Each ``*_range`` argument is an inclusive (low, high) pair sampled with
    the module-level ``random`` generator, so results are reproducible when
    the caller seeds it.
    """
    max_tokens = random.randint(*max_tokens_range)
    num_tokens = random.randint(*num_tokens_range)
    priority = random.randint(*priority_range)
    arrival_time = random.uniform(*arrival_time_range)
    num_mm_item = random.randint(*num_mm_item_range)

    # Pick distinct multimodal start offsets; each placeholder is 10 tokens
    # long, so starts that would overrun the prompt are dropped.
    mm_positions: list[PlaceholderRange] = []
    for mm_start in sorted(
        random.sample(range(num_tokens), min(num_mm_item, num_tokens))
    ):
        if mm_start + 10 > num_tokens:
            continue
        mm_positions.append(PlaceholderRange(offset=mm_start, length=10))

    request_id = uuid.uuid4().hex

    sampling_params = SamplingParams(
        ignore_eos=False,
        max_tokens=max_tokens,
    )
    # One dummy image feature per surviving placeholder; identifiers are
    # unique per request and per item.
    mm_features = []
    for j, position in enumerate(mm_positions):
        identifier = f"{request_id}_hash_{j}"
        mm_feature = MultiModalFeatureSpec(
            data=MultiModalKwargsItem.dummy("dummy_m"),
            mm_position=position,
            identifier=identifier,
            modality="image",
        )
        mm_features.append(mm_feature)

    prompt_token_ids = random.choices(range(100), k=num_tokens)

    # Set up the same prefix-caching block hasher the engine would use for
    # this config.
    caching_hash_fn = get_hash_fn_by_name(
        vllm_config.cache_config.prefix_caching_hash_algo
    )
    init_none_hash(caching_hash_fn)
    block_hasher = get_request_block_hasher(
        vllm_config.cache_config.block_size, caching_hash_fn
    )

    request = Request(
        request_id=request_id,
        prompt_token_ids=prompt_token_ids,
        sampling_params=sampling_params,
        pooling_params=None,
        mm_features=mm_features if mm_features else None,
        eos_token_id=EOS_TOKEN_ID,
        arrival_time=arrival_time,
        priority=priority,
        block_hasher=block_hasher,
    )
    return request
|
||||
|
||||
|
||||
def _mock_execute_model(
    scheduler_output: SchedulerOutput, num_output_tokens_range: tuple[int, int]
) -> ModelRunnerOutput:
    """Fabricate a ModelRunnerOutput with random tokens for every scheduled
    request (new and cached), in a shuffled order."""
    req_ids: list[str] = [req.req_id for req in scheduler_output.scheduled_new_reqs]
    req_ids += scheduler_output.scheduled_cached_reqs.req_ids
    random.shuffle(req_ids)

    # Draw every per-request count first, then the token values, so the
    # random stream matches the original call order under a fixed seed.
    counts = [random.randint(*num_output_tokens_range) for _ in req_ids]
    tokens = [[random.randint(0, 100) for _ in range(count)] for count in counts]

    return ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index={rid: idx for idx, rid in enumerate(req_ids)},
        sampled_token_ids=tokens,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],
    )
|
||||
|
||||
|
||||
def _mock_draft_token_ids(
    scheduler_output: SchedulerOutput,
    num_output_tokens_range: tuple[int, int],
    seen_request_prompt_length: dict[str, int],
) -> DraftTokenIds:
    """Fabricate random draft tokens for requests that are past prefill.

    ``seen_request_prompt_length`` is both read and updated in place: newly
    scheduled requests register their prompt length so that later (cached)
    steps can tell whether a request has computed its full prompt yet.
    """
    request_ids: list[str] = []
    sampled_token_ids: list[list[int]] = []
    for request in scheduler_output.scheduled_new_reqs:
        # Each request may appear as "new" at most once per run.
        assert request.req_id not in seen_request_prompt_length
        seen_request_prompt_length[request.req_id] = len(request.prompt_token_ids or [])
        # Only draft for requests whose whole prompt is already computed.
        if request.num_computed_tokens >= seen_request_prompt_length[request.req_id]:
            num_tokens = random.randint(*num_output_tokens_range)
            request_ids.append(request.req_id)
            sampled_token_ids.append(
                [random.randint(0, 100) for _ in range(num_tokens)]
            )
    for req_id, num_computed_tokens in zip(
        scheduler_output.scheduled_cached_reqs.req_ids,
        scheduler_output.scheduled_cached_reqs.num_computed_tokens,
    ):
        # Cached requests were registered above in an earlier step.
        if num_computed_tokens >= seen_request_prompt_length[req_id]:
            num_tokens = random.randint(*num_output_tokens_range)
            request_ids.append(req_id)
            sampled_token_ids.append(
                [random.randint(0, 100) for _ in range(num_tokens)]
            )
    return DraftTokenIds(req_ids=request_ids, draft_token_ids=sampled_token_ids)
|
||||
|
||||
|
||||
def _chech_valid_scheduler_output(
    scheduler_output: SchedulerOutput,
    seen_request_ids: set[str],
    seen_mm_hashes: set[str],
) -> None:
    """Assert structural invariants on one SchedulerOutput step.

    NOTE(review): "chech" is a typo for "check"; the name is kept because it
    is referenced by the blast test in this module.

    ``seen_request_ids`` and ``seen_mm_hashes`` accumulate across steps and
    are mutated in place.
    """
    # New requests must be genuinely new; cached ones must have been seen.
    for req in scheduler_output.scheduled_new_reqs:
        assert req.req_id not in seen_request_ids
        seen_request_ids.add(req.req_id)
    for req_id in scheduler_output.scheduled_cached_reqs.req_ids:
        assert req_id in seen_request_ids

    req_ids = set[str]()
    req_ids.update(req.req_id for req in scheduler_output.scheduled_new_reqs)
    req_ids.update(scheduler_output.scheduled_cached_reqs.req_ids)

    # Token accounting must cover exactly the scheduled requests and sum up.
    assert set(scheduler_output.num_scheduled_tokens.keys()) == req_ids
    assert (
        sum(scheduler_output.num_scheduled_tokens.values())
        == scheduler_output.total_num_scheduled_tokens
    )

    # Spec-decode and encoder bookkeeping only refer to scheduled requests.
    assert set(scheduler_output.scheduled_spec_decode_tokens.keys()) <= req_ids
    assert set(scheduler_output.scheduled_encoder_inputs.keys()) <= req_ids

    # The encoder cache may only free multimodal items that were scheduled.
    for req in scheduler_output.scheduled_new_reqs:
        for mm_feature in req.mm_features:
            seen_mm_hashes.add(mm_feature.identifier)
    for mm_hash in scheduler_output.free_encoder_mm_hashes:
        assert mm_hash in seen_mm_hashes

    assert scheduler_output.finished_req_ids <= seen_request_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
@pytest.mark.parametrize("num_speculative_tokens", [None, 1, 5])
@pytest.mark.parametrize(
    ("max_input_tokens", "max_output_tokens", "max_num_seqs", "num_blocks"),
    [
        # Standard profile
        (5000, 500, 256, 10000),
        # Generation heavy + high max_num_seqs + low num_blocks -> Many preemptions
        (500, 5000, 1024, 1000),
    ],
    ids=["standard", "preemption"],
)
def test_priority_scheduling_blast(
    enable_prefix_caching: bool,
    num_speculative_tokens: int | None,
    max_input_tokens: int,
    max_output_tokens: int,
    max_num_seqs: int,
    num_blocks: int,
):
    """Stress the priority scheduler with a long stream of random requests.

    Runs 20k schedule/execute cycles with a mocked model, validating each
    SchedulerOutput's structural invariants via _chech_valid_scheduler_output.
    Seeded so failures are reproducible.
    """
    random.seed(42)
    seen_request_prompt_length = dict[str, int]()
    seen_request_ids = set[str]()
    seen_mm_hashes = set[str]()

    scheduler = create_scheduler_with_priority(
        model="Qwen/Qwen2.5-VL-3B-Instruct",
        max_num_seqs=max_num_seqs,
        enable_prefix_caching=enable_prefix_caching,
        num_blocks=num_blocks,
        num_speculative_tokens=num_speculative_tokens,
    )

    # Seed the queue with a mixed-priority batch...
    num_initial_requests = 10
    for _ in range(num_initial_requests):
        req = _create_random_request(
            max_tokens_range=(1, max_output_tokens),
            num_tokens_range=(1, max_input_tokens),
            arrival_time_range=(0, 1),
            priority_range=(-3, 3),
            num_mm_item_range=(0, 2),
            vllm_config=scheduler.vllm_config,
        )
        scheduler.add_request(req)
    # ...plus a couple of lowest-priority (priority 4) requests that should
    # be scheduled last.
    num_initial_requests = 2
    for _ in range(num_initial_requests):
        req = _create_random_request(
            max_tokens_range=(1, max_output_tokens),
            num_tokens_range=(1, max_input_tokens),
            arrival_time_range=(0, 0),
            priority_range=(4, 4),
            num_mm_item_range=(0, 2),
            vllm_config=scheduler.vllm_config,
        )
        scheduler.add_request(req)
    for _ in range(20000):
        # Top up with fresh random requests whenever the waiting queue drains.
        if len(scheduler.waiting) == 0:
            num_new_requests = random.randint(0, 2)
            for _ in range(num_new_requests):
                req = _create_random_request(
                    max_tokens_range=(1, max_output_tokens),
                    num_tokens_range=(1, max_input_tokens),
                    arrival_time_range=(0, 1),
                    priority_range=(-3, 3),
                    num_mm_item_range=(0, 2),
                    vllm_config=scheduler.vllm_config,
                )
                scheduler.add_request(req)
        scheduler_output = scheduler.schedule()
        _chech_valid_scheduler_output(
            scheduler_output, seen_request_ids, seen_mm_hashes
        )
        model_output = _mock_execute_model(
            scheduler_output,
            num_output_tokens_range=(1, 1 + (num_speculative_tokens or 0)),
        )
        scheduler.update_from_output(scheduler_output, model_output)
        if num_speculative_tokens is not None:
            scheduler.update_draft_token_ids(
                _mock_draft_token_ids(
                    scheduler_output,
                    (0, num_speculative_tokens),
                    seen_request_prompt_length,
                )
            )
|
||||
69
tests/v1/core/test_reset_prefix_cache_e2e.py
Normal file
69
tests/v1/core/test_reset_prefix_cache_e2e.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams
|
||||
|
||||
PROMPTS = [
|
||||
"A robot may not injure a human being ",
|
||||
"To be or not to be,",
|
||||
"What is the meaning of life?",
|
||||
"What does the fox say? " * 20, # Test long prompt
|
||||
]
|
||||
|
||||
|
||||
def test_reset_prefix_cache_e2e(monkeypatch):
    """Resetting the prefix cache mid-run must not change generated text.

    Runs the same prompts twice: once undisturbed (ground truth) and once
    with ``reset_prefix_cache(reset_running_requests=True)`` fired at step
    10, then compares greedy outputs request by request.
    """
    # "spawn" is required for test to be deterministic
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    engine_args = EngineArgs(
        model="Qwen/Qwen3-0.6B",
        gpu_memory_utilization=0.2,
        async_scheduling=True,
        max_num_batched_tokens=32,
        max_model_len=2048,
        compilation_config={"mode": 0},
        dtype="float16",
    )
    engine = LLMEngine.from_engine_args(engine_args)
    # temperature=0 -> greedy decoding, so the two runs are comparable.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=16,
    )

    # No preempt case:
    for i, prompt in enumerate(PROMPTS):
        engine.add_request("ground_truth_" + str(i), prompt, sampling_params)

    ground_truth_results = {}
    while engine.has_unfinished_requests():
        request_outputs = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                ground_truth_results[request_output.request_id] = request_output

    # Preempt case:
    for i, prompt in enumerate(PROMPTS):
        engine.add_request("preempted_" + str(i), prompt, sampling_params)

    step_id = 0
    preempted_results = {}
    while engine.has_unfinished_requests():
        # Force the running requests to be preempted partway through.
        if step_id == 10:
            engine.reset_prefix_cache(reset_running_requests=True)

        request_outputs = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                preempted_results[request_output.request_id] = request_output
        step_id += 1

    # Preempted outputs must match ground truth exactly, request by request.
    for i in range(len(PROMPTS)):
        assert (
            ground_truth_results["ground_truth_" + str(i)].outputs[0].text
            == preempted_results["preempted_" + str(i)].outputs[0].text
        ), (
            f"ground_truth_results['ground_truth_{i}'].outputs[0].text="
            f"{ground_truth_results['ground_truth_' + str(i)].outputs[0].text} "
            f"preempted_results['preempted_{i}'].outputs[0].text="
            f"{preempted_results['preempted_' + str(i)].outputs[0].text}"
        )
|
||||
3328
tests/v1/core/test_scheduler.py
Normal file
3328
tests/v1/core/test_scheduler.py
Normal file
File diff suppressed because it is too large
Load Diff
37
tests/v1/core/test_scheduler_e2e.py
Normal file
37
tests/v1/core/test_scheduler_e2e.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
MODEL = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def llm() -> LLM:
    """Module-scoped LLM with deliberately tiny batching limits.

    max_num_batched_tokens=6 with long_prefill_token_threshold=2 forces
    prompts to prefill in small chunks across several scheduler steps.
    """
    return LLM(
        MODEL,
        enforce_eager=True,
        enable_prefix_caching=True,
        long_prefill_token_threshold=2,
        max_num_batched_tokens=6,
        max_num_seqs=3,
        block_size=16,
    )
|
||||
|
||||
|
||||
def test_concurrent_partial_prefill(llm):
    """Three identical prompts scheduled together each yield one completion."""
    results = llm.generate([PROMPT] * 3)
    assert len(results) == 3
    assert all(len(result.outputs) == 1 for result in results)
|
||||
|
||||
|
||||
def test_prefix_cache_stats_is_recorded(llm):
    """A repeated prompt reports one full cached block of 16 tokens."""
    # 17 tokens: exactly one complete 16-token block gets cached, plus one
    # extra token that spills into a second, uncached block.
    prompt = {"prompt_token_ids": [101] * 17}
    llm.generate([prompt])  # first pass warms the cache
    second_run = llm.generate([prompt])
    assert second_run[0].num_cached_tokens == 16
|
||||
366
tests/v1/core/test_single_type_kv_cache_manager.py
Normal file
366
tests/v1/core/test_single_type_kv_cache_manager.py
Normal file
@@ -0,0 +1,366 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_utils import (
|
||||
BlockHash,
|
||||
KVCacheBlock,
|
||||
make_block_hash_with_group_id,
|
||||
)
|
||||
from vllm.v1.core.single_type_kv_cache_manager import (
|
||||
ChunkedLocalAttentionManager,
|
||||
SlidingWindowManager,
|
||||
)
|
||||
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowSpec
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def get_sliding_window_manager(sliding_window_spec, block_pool):
    """Build a SlidingWindowManager pinned to KV cache group 0."""
    manager = SlidingWindowManager(
        sliding_window_spec,
        block_pool,
        kv_cache_group_id=0,
    )
    return manager
|
||||
|
||||
|
||||
def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
    """Build a ChunkedLocalAttentionManager pinned to KV cache group 0."""
    manager = ChunkedLocalAttentionManager(
        chunked_local_attention_spec,
        block_pool,
        kv_cache_group_id=0,
    )
    return manager
|
||||
|
||||
|
||||
def test_chunked_local_attention_possible_cached_prefix():
    """Exercise find_longest_cache_hit for chunked local attention.

    With block_size=2 and attention_chunk_size=4, each chunk spans two
    blocks; blocks from fully expired chunks should come back as the null
    block in the computed prefix.
    """
    block_size = 2
    chunked_local_attention_spec = ChunkedLocalAttentionSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        attention_chunk_size=4,
    )

    block_pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_chunked_local_attention_manager(
        chunked_local_attention_spec, block_pool
    )

    def run_one_case(block_is_cached, tail_token, expect_length):
        # One synthetic hash per prompt block; cached ones are mapped to
        # pool blocks 10, 11, ... so their ids are recognizable.
        block_hash_list = [
            BlockHash(str(i).encode()) for i in range(len(block_is_cached))
        ]

        block_pool.cached_block_hash_to_block._cache.clear()

        # Mock the block pool with the cached blocks
        for i, (block_hash, is_cached) in enumerate(
            zip(block_hash_list, block_is_cached)
        ):
            if is_cached:
                block_pool.cached_block_hash_to_block.insert(
                    make_block_hash_with_group_id(block_hash, 0),
                    block_pool.blocks[i + 10],
                )

        computed_blocks = manager.find_longest_cache_hit(
            block_hashes=block_hash_list,
            max_length=len(block_hash_list) * block_size + tail_token,
            kv_cache_group_ids=[0],
            block_pool=block_pool,
            kv_cache_spec=chunked_local_attention_spec,
            use_eagle=False,
            alignment_tokens=block_size,
        )[0]
        assert len(computed_blocks) == expect_length

        # Blocks before the current chunk are outside the local window and
        # must be the null block.
        assert all(
            block == block_pool.null_block
            for block in computed_blocks[: (expect_length - 1) // 2]
        )

    run_one_case([True], 0, 1)
    run_one_case([True], 1, 1)
    run_one_case([True, False], 0, 2)
    run_one_case([True, False], 1, 2)
    run_one_case([True, True], 0, 2)
    run_one_case([True, True], 1, 2)
    run_one_case([True, True, False], 0, 2)
    run_one_case([True, True, False], 1, 2)
    run_one_case([True, True, True], 0, 3)
    run_one_case([True, True, True], 1, 3)
    run_one_case([True, True, True, False], 0, 4)
    run_one_case([True, True, True, False], 1, 4)
    # With 8 leading blocks (two full chunks) expired, only the tail blocks
    # determine the hit length regardless of the random prefix.
    run_one_case([random.choice([True, False])] * 8 + [True], 1, 9)
    run_one_case([random.choice([True, False])] * 8 + [False], 1, 8)
    run_one_case([random.choice([True, False])] * 8 + [True, True], 1, 10)
    run_one_case([random.choice([True, False])] * 8 + [True, False], 0, 10)
    run_one_case([random.choice([True, False])] * 8 + [True, False], 1, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, True], 0, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, True], 1, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, False], 0, 10)
    run_one_case([random.choice([True, False])] * 8 + [False, False], 1, 10)
|
||||
|
||||
|
||||
def test_sliding_window_possible_cached_prefix():
    """Exercise find_longest_cache_hit for sliding-window attention.

    With block_size=2 and sliding_window=4, only the last two blocks of the
    hit need real cache entries; everything before them may be the null
    block.
    """
    block_size = 2
    sliding_window_spec = SlidingWindowSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
    )

    block_pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_sliding_window_manager(sliding_window_spec, block_pool)

    def run_one_case(block_is_cached, expect_length):
        # One synthetic hash per prompt block; cached ones are mapped to
        # pool blocks 10, 11, ... so their ids are recognizable.
        block_hash_list = [
            BlockHash(str(i).encode()) for i in range(len(block_is_cached))
        ]

        block_pool.cached_block_hash_to_block._cache.clear()

        # Mock the block pool with the cached blocks
        for i, (block_hash, is_cached) in enumerate(
            zip(block_hash_list, block_is_cached)
        ):
            if is_cached:
                block_pool.cached_block_hash_to_block.insert(
                    make_block_hash_with_group_id(block_hash, 0),
                    block_pool.blocks[i + 10],
                )

        computed_blocks = manager.find_longest_cache_hit(
            block_hashes=block_hash_list,
            max_length=len(block_hash_list) * block_size,
            kv_cache_group_ids=[0],
            block_pool=block_pool,
            kv_cache_spec=sliding_window_spec,
            use_eagle=False,
            alignment_tokens=block_size,
        )[0]
        assert len(computed_blocks) == expect_length

        # All but the last two blocks fall outside the window -> null block.
        assert all(
            block == block_pool.null_block
            for block in computed_blocks[: expect_length - 2]
        )
        # The last (up to) two blocks must be the real cached blocks whose
        # ids were offset by 10 above.
        for i in range(2):
            if i < expect_length:
                block_index = expect_length - i - 1
                assert computed_blocks[block_index].block_id == block_index + 10

    run_one_case([False] * 10, 0)
    run_one_case([True], 1)
    run_one_case([True, False], 1)
    run_one_case([True, True], 2)
    run_one_case([True, True, False], 2)
    run_one_case([True, True, True], 3)
    run_one_case([True, True, True, False], 3)
    run_one_case(
        [True, True, False, True, False, False, True, True, False, True, True, True], 12
    )
    run_one_case(
        [True, True, False, True, False, False, True, True, False, False, False], 8
    )
    run_one_case(
        [True, True, False, True, False, False, True, True, False, False, False, True],
        8,
    )
|
||||
|
||||
|
||||
def test_chunked_local_attention_remove_skipped_blocks():
    """remove_skipped_blocks frees whole attention chunks that fell out of
    the local window, replacing their entries with the null block."""
    attention_spec = ChunkedLocalAttentionSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        attention_chunk_size=4,
    )

    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)

    manager = get_chunked_local_attention_manager(attention_spec, block_pool)

    null_block_id = block_pool.null_block.block_id

    def id_to_block_table(ids) -> list[KVCacheBlock]:
        # Map raw ids to KVCacheBlock objects, sharing the pool's singleton
        # null block for the null id.
        return [
            KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
            for id_ in ids
        ]

    def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
        # Compare a (prefix of the) block table against expected ids; zip
        # stops at the shorter of the two.
        for block, id_ in zip(block_table, ids):
            if id_ == null_block_id:
                assert block == block_pool.null_block
            else:
                assert block.block_id == id_

    # Ids >= 1000 so they cannot collide with real pool block ids.
    original_block_ids = [
        1000,
        1001,
        1002,
        1003,
        1004,
        1005,
        1006,
        1007,
        1008,
        1009,
        1010,
    ]
    block_table = id_to_block_table(original_block_ids)
    manager.req_to_blocks["test"] = block_table

    manager.remove_skipped_blocks("test", 0)
    assert_block_id(block_table, original_block_ids)

    # For 4th token (0-indexed), token 0-3 is out of the local attention window.
    manager.remove_skipped_blocks("test", 4)
    assert_block_id(block_table, [null_block_id] * 2)

    # For 6th token (0-indexed), token 4 - 6 are in local attention window,
    # token 0 - 3 are out, 2 blocks can be removed.
    manager.remove_skipped_blocks("test", 6)
    assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
    # For 12th token (0-indexed),
    # token 0-11 are out, 6 block can be removed.
    manager.remove_skipped_blocks("test", 12)
    assert_block_id(block_table, [null_block_id] * 6)
|
||||
|
||||
|
||||
def test_sliding_window_remove_skipped_blocks():
    """remove_skipped_blocks frees blocks whose every token has left the
    sliding window, replacing their entries with the null block."""
    sliding_window_spec = SlidingWindowSpec(
        block_size=2,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,
    )

    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2)

    manager = get_sliding_window_manager(sliding_window_spec, block_pool)

    null_block_id = block_pool.null_block.block_id

    def id_to_block_table(ids) -> list[KVCacheBlock]:
        # Map raw ids to KVCacheBlock objects, sharing the pool's singleton
        # null block for the null id.
        return [
            KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
            for id_ in ids
        ]

    def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
        # Compare a (prefix of the) block table against expected ids; zip
        # stops at the shorter of the two.
        for block, id_ in zip(block_table, ids):
            if id_ == null_block_id:
                assert block == block_pool.null_block
            else:
                assert block.block_id == id_

    # Ids >= 1000 so they cannot collide with real pool block ids.
    original_block_ids = [
        1000,
        1001,
        1002,
        1003,
        1004,
        1005,
        1006,
        1007,
        1008,
        1009,
        1010,
    ]
    block_table = id_to_block_table(original_block_ids)
    manager.req_to_blocks["test"] = block_table

    manager.remove_skipped_blocks("test", 0)
    assert_block_id(block_table, original_block_ids)

    # 4 tokens are computed. Only token 0 is out of the sliding window. As
    # block 1000 also contains token 1 that is in the sliding window, block 1000
    # cannot be removed.
    manager.remove_skipped_blocks("test", 4)
    assert_block_id(block_table, original_block_ids)

    # 5 tokens are computed. Token 0 & 1 are out of the sliding window.
    # Block 1000 can be removed.
    manager.remove_skipped_blocks("test", 5)
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])

    # 6 tokens are computed. Token 0-2 are out of the sliding window.
    # Cannot remove new block as the block 1001 is still used by token 3.
    manager.remove_skipped_blocks("test", 6)
    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])

    # 7 tokens are computed. Token 0-3 are out of the sliding window.
    # Block 1001 can be removed and block 1000 is already removed.
    manager.remove_skipped_blocks("test", 7)
    assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])

    # 11 tokens are computed. Token 0-7 are out of the sliding window.
    # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
    # sequence, and is expected to be evicted earlier than 1002, so the order
    # of removed blocks should be [1003, 1002].
    manager.remove_skipped_blocks("test", 11)
    assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
|
||||
|
||||
|
||||
def test_get_num_blocks_to_allocate():
    """Null blocks in the cached prefix reduce the sliding-window
    allocation count by exactly their number."""
    block_size = 2
    spec = SlidingWindowSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        sliding_window=4,  # Placeholder value, not related to test result
    )

    pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_sliding_window_manager(spec, pool)

    # Ten real cached blocks vs. five null blocks followed by five real ones.
    all_real = [KVCacheBlock(i + 1) for i in range(10)]
    half_null = [pool.null_block] * 5 + [KVCacheBlock(i + 1) for i in range(5)]

    num_tokens = 20 * block_size
    assert manager.get_num_blocks_to_allocate("1", num_tokens, all_real) == 20
    assert manager.get_num_blocks_to_allocate("2", num_tokens, half_null) == 15
|
||||
|
||||
|
||||
def test_chunked_local_attention_get_num_blocks_to_allocate():
    """Null blocks in the cached prefix reduce the chunked-local-attention
    allocation count by exactly their number."""
    block_size = 2
    spec = ChunkedLocalAttentionSpec(
        block_size=block_size,
        num_kv_heads=1,
        head_size=1,
        dtype=torch.float32,
        attention_chunk_size=4,  # Placeholder value, not related to test result
    )

    pool = BlockPool(
        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
    )
    manager = get_chunked_local_attention_manager(spec, pool)

    # Ten real cached blocks vs. five null blocks followed by five real ones.
    all_real = [KVCacheBlock(i + 1) for i in range(10)]
    half_null = [pool.null_block] * 5 + [KVCacheBlock(i + 1) for i in range(5)]

    num_tokens = 20 * block_size
    assert manager.get_num_blocks_to_allocate("1", num_tokens, all_real) == 20
    assert manager.get_num_blocks_to_allocate("2", num_tokens, half_null) == 15
|
||||
248
tests/v1/core/utils.py
Normal file
248
tests/v1/core/utils.py
Normal file
@@ -0,0 +1,248 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
|
||||
from tests.v1.kv_connector.unit.utils import MockKVConfig
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
ECTransferConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem,
|
||||
PlaceholderRange,
|
||||
)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.hashing import sha256
|
||||
from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
|
||||
from vllm.v1.core.sched.async_scheduler import AsyncScheduler
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.kv_cache_interface import (
|
||||
FullAttentionSpec,
|
||||
KVCacheConfig,
|
||||
KVCacheGroupSpec,
|
||||
)
|
||||
from vllm.v1.request import Request
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
|
||||
|
||||
def mock_kv(matched_tokens: int, is_async: bool):
    """Shorthand for building a MockKVConfig for scheduler tests."""
    config = MockKVConfig(matched_tokens=matched_tokens, is_async=is_async)
    return config
|
||||
|
||||
|
||||
def create_scheduler(
    model: str = "facebook/opt-125m",
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
    enable_chunked_prefill: bool = True,
    enable_prefix_caching: bool = False,
    long_prefill_token_threshold: int = 0,
    disable_chunked_mm_input: bool = False,
    use_kv_connector: None | bool | MockKVConfig = None,
    num_blocks: int = 10000,
    block_size: int = 16,
    max_model_len: int | None = None,
    num_speculative_tokens: int | None = None,
    skip_tokenizer_init: bool = False,
    async_scheduling: bool = False,
    use_ec_connector: bool = False,
    ec_role: str | None = None,
) -> Scheduler | AsyncScheduler:
    """Create scheduler under test.

    Args:
        model: model under test
        max_num_seqs: max sequences to schedule
        max_num_batched_tokens: max num tokens to batch
        enable_prefix_caching: optionally force APC config
            (True/False) or use default
            (False)
        use_kv_connector: False/None for no connector, True for the example
            storage connector, or a MockKVConfig for the mock connector
        num_blocks: size of the (synthetic) KV cache in blocks
        async_scheduling: return an AsyncScheduler instead of a Scheduler

    Returns:
        {class}`Scheduler` instance
    """
    model_config = ModelConfig(
        model=model,
        trust_remote_code=True,
        dtype="float16",
        seed=42,
        skip_tokenizer_init=skip_tokenizer_init,
    )
    # Default the context length to the batch budget so a single step can
    # cover a maximal prompt.
    if max_model_len is None:
        max_model_len = max_num_batched_tokens
    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        max_model_len=max_model_len,
        long_prefill_token_threshold=long_prefill_token_threshold,
        disable_chunked_mm_input=disable_chunked_mm_input,
        enable_chunked_prefill=enable_chunked_prefill,
        async_scheduling=async_scheduling,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )
    # Cache config, optionally force APC
    cache_config = CacheConfig(
        block_size=block_size,
        gpu_memory_utilization=0.9,
        swap_space=0,
        cache_dtype="auto",
        enable_prefix_caching=enable_prefix_caching,
    )
    kv_transfer_config = None
    if isinstance(use_kv_connector, MockKVConfig):
        # Test-controlled mock connector with explicit match behavior.
        kv_transfer_config = KVTransferConfig(
            kv_connector="MockKVConnector",
            kv_role="kv_both",
            kv_connector_extra_config={
                "matched_tokens": use_kv_connector.matched_tokens,
                "is_async": use_kv_connector.is_async,
            },
        )
    elif use_kv_connector:
        # Plain boolean True selects the example shared-storage connector.
        kv_transfer_config = KVTransferConfig(
            kv_connector="ExampleConnector",
            kv_role="kv_both",
            kv_connector_extra_config={"shared_storage_path": "local_storage"},
        )

    speculative_config: SpeculativeConfig | None = None
    if num_speculative_tokens is not None:
        speculative_config = SpeculativeConfig(
            model="ngram", num_speculative_tokens=num_speculative_tokens
        )

    ec_transfer_config = (
        ECTransferConfig(
            ec_connector="ECExampleConnector",
            ec_role=ec_role,
            ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"},
        )
        if use_ec_connector
        else None
    )

    vllm_config = VllmConfig(
        scheduler_config=scheduler_config,
        model_config=model_config,
        cache_config=cache_config,
        kv_transfer_config=kv_transfer_config,
        speculative_config=speculative_config,
        ec_transfer_config=ec_transfer_config,
    )
    # Single full-attention KV cache group backing one layer.
    kv_cache_config = KVCacheConfig(
        num_blocks=num_blocks,  # A large number of blocks to hold all requests
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
            )
        ],
    )
    cache_config.num_gpu_blocks = num_blocks
    scheduler_cls = AsyncScheduler if async_scheduling else Scheduler
    return scheduler_cls(
        vllm_config=vllm_config,
        kv_cache_config=kv_cache_config,
        block_size=block_size,
        log_stats=True,
        structured_output_manager=StructuredOutputManager(vllm_config),
    )
|
||||
|
||||
|
||||
_none_hash_initialized = False
|
||||
|
||||
|
||||
def create_requests(
    num_requests: int,
    num_tokens: int = 10,
    mm_hashes_list: list[list[str]] | None = None,
    mm_positions: list[list[PlaceholderRange]] | None = None,
    max_tokens: int = 16,
    stop_token_ids: list[int] | None = None,
    prompt_logprobs: int | None = None,
    same_prompt: bool = False,
    block_size: int = 16,
    req_ids: list[str] | None = None,
) -> list[Request]:
    """Build a batch of dummy ``Request`` objects for scheduler tests.

    Each request gets ``num_tokens`` prompt tokens (all ``0`` when
    ``same_prompt`` is set, otherwise all equal to the request index) and
    shares a single ``SamplingParams`` instance. When ``mm_hashes_list`` is
    given, ``mm_positions`` must be given too and both must have one entry
    per request; repeated hash identifiers are checked for consistent
    ``mm_position.length`` values, since identical identifiers imply
    identical encoder output.
    """
    global _none_hash_initialized
    # init_none_hash() must only run once per process; guard with the
    # module-level flag.
    if not _none_hash_initialized:
        init_none_hash(sha256)
        _none_hash_initialized = True

    block_hasher = get_request_block_hasher(block_size, sha256)

    # A single SamplingParams object is deliberately shared by every request.
    sampling_params = SamplingParams(
        ignore_eos=False,
        max_tokens=max_tokens,
        stop_token_ids=stop_token_ids,
        prompt_logprobs=prompt_logprobs,
    )

    if mm_hashes_list is not None:
        # NOTE: manual input is allowed; some mm items may share an identifier,
        # but the per-request counts of hashes and positions must line up.
        assert mm_positions is not None, (
            "mm_positions must be provided when mm_hashes_list is provided"
        )
        assert len(mm_hashes_list) == len(mm_positions) == num_requests
        assert [len(h) for h in mm_hashes_list] == [len(p) for p in mm_positions]

    # Maps each identifier to the mm_position.length first seen for it, so
    # that reuse of an identifier with a different length is caught below.
    seen_hashes: dict[str, int] = {}

    if req_ids:
        assert len(req_ids) == num_requests
    else:
        req_ids = [f"{i}" for i in range(num_requests)]

    requests: list[Request] = []
    for req_idx in range(num_requests):
        positions = mm_positions[req_idx] if mm_positions is not None else []
        mm_features: list[MultiModalFeatureSpec] = []

        for item_idx, position in enumerate(positions):
            if mm_hashes_list is None:
                # No explicit hashes: give each mm item a unique dummy one.
                identifier = f"hash{req_idx}_{item_idx}"
            else:
                identifier = mm_hashes_list[req_idx][item_idx]
                # Identical identifiers must describe placeholders of
                # identical length.
                length = position.length
                if identifier in seen_hashes:
                    assert seen_hashes[identifier] == length, (
                        f"mm_hash '{identifier}' has inconsistent position lengths: "
                        f"previously {seen_hashes[identifier]}, now {length} "
                        f"at request {req_idx}, position {item_idx}"
                    )
                else:
                    seen_hashes[identifier] = length

            mm_features.append(
                MultiModalFeatureSpec(
                    data=MultiModalKwargsItem.dummy("dummy_m"),
                    mm_position=position,
                    identifier=identifier,
                    modality="image",
                )
            )

        fill_token = 0 if same_prompt else req_idx
        requests.append(
            Request(
                request_id=req_ids[req_idx],
                prompt_token_ids=[fill_token] * num_tokens,
                sampling_params=sampling_params,
                pooling_params=None,
                mm_features=mm_features if mm_features else None,
                eos_token_id=EOS_TOKEN_ID,
                block_hasher=block_hasher,
            )
        )
    return requests
|
||||
Reference in New Issue
Block a user