add qwen3

This commit is contained in:
Chranos
2026-02-04 17:22:39 +08:00
parent d1c0f68ab4
commit 8511fe8530
1932 changed files with 300426 additions and 0 deletions

View File

View File

@@ -0,0 +1,12 @@
import pytest
@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
    """Opt the tests in this directory out of the global cleanup fixture.

    Skipping the global cleanup gives roughly a 10x speedup for unit tests
    that never load a model onto the GPU. The trade-off is that any test in
    this directory that does use the GPU has to clean up after itself.
    """
    run_global_cleanup = False
    return run_global_cleanup

View File

@@ -0,0 +1,67 @@
from typing import Callable, Iterable, Optional
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    """Provide the generator that lazily builds the baseline LLM.

    The baseline-specific kwargs are layered on top of the shared ones by
    ``create_llm_generator``. That helper is a generator function, so no LLM
    is constructed until the returned object is iterated.
    """
    return create_llm_generator(
        common_llm_kwargs=common_llm_kwargs,
        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
        distinct_llm_kwargs=baseline_llm_kwargs,
        seed=seed,
    )
@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    """Provide the generator that lazily builds the LLM under test.

    The test-specific kwargs are layered on top of the shared ones by
    ``create_llm_generator``. That helper is a generator function, so no LLM
    is constructed until the returned object is iterated.
    """
    return create_llm_generator(
        common_llm_kwargs=common_llm_kwargs,
        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
        distinct_llm_kwargs=test_llm_kwargs,
        seed=seed,
    )
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    """Yield one freshly constructed LLM, then release its resources.

    This is a generator function: nothing is built until the caller starts
    iterating, and cleanup runs once the caller is done with the yielded LLM.

    Args:
        common_llm_kwargs: Keyword arguments shared by every LLM in a test
            module.
        per_test_common_llm_kwargs: Keyword arguments shared by the baseline
            and test LLM of one test; override ``common_llm_kwargs``.
        distinct_llm_kwargs: Keyword arguments specific to this LLM; override
            both of the above.
        seed: Value passed to ``set_random_seed`` right after construction.

    Yields:
        The constructed ``LLM`` (exactly once).
    """
    # Later dicts win on key collisions.
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    llm = LLM(**kwargs)
    try:
        set_random_seed(seed)
        yield llm
    finally:
        # Runs on normal exhaustion and also when the generator is closed
        # early (e.g. the consumer raised while using the LLM). Tests in this
        # directory opt out of the global cleanup fixture, so skipping this
        # step would leave the engine behind for the following tests.
        # Drop our own reference first so the cleanup can reclaim memory.
        del llm
        cleanup_dist_env_and_memory()
def get_text_from_llm_generator(llm_generator: Iterable[LLM],
                                prompts,
                                sampling_params,
                                llm_cb: Optional[Callable[[LLM],
                                                          None]] = None):
    """Generate text for ``prompts`` with every LLM the generator yields.

    Args:
        llm_generator: Iterable yielding LLM instances.
        prompts: Prompts forwarded to ``llm.generate``.
        sampling_params: Sampling parameters forwarded to ``llm.generate``.
        llm_cb: Optional callback invoked with each LLM before generation.

    Returns:
        The text of the first completion of each request, taken from the last
        LLM yielded by ``llm_generator``.
    """
    for engine in llm_generator:
        if llm_cb:
            llm_cb(engine)

        generated_text = []
        for request_output in engine.generate(prompts,
                                              sampling_params,
                                              use_tqdm=True):
            generated_text.append(request_output.outputs[0].text)

        # Release our reference before the generator resumes and cleans up.
        del engine

    return generated_text
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    """Generate token ids for ``prompts`` with every LLM the generator yields.

    Args:
        llm_generator: Iterable yielding LLM instances.
        prompts: Prompts forwarded to ``llm.generate``.
        sampling_params: Sampling parameters forwarded to ``llm.generate``.

    Returns:
        The token ids of the first completion of each request, taken from the
        last LLM yielded by ``llm_generator``.
    """
    for engine in llm_generator:
        generated_token_ids = []
        for request_output in engine.generate(prompts,
                                              sampling_params,
                                              use_tqdm=True):
            generated_token_ids.append(request_output.outputs[0].token_ids)

        # Release our reference before the generator resumes and cleans up.
        del engine

    return generated_token_ids

View File

@@ -0,0 +1,489 @@
from itertools import cycle
import pytest
from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, which also keeps it fast.
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in the worst case.
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "preemption_mode": "swap"
}, {
    "preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_with_preemption(baseline_llm_generator,
                                       test_llm_generator, batch_size):
    """Check that preemption does not change greedy outputs.

    Two LLMs are built with the same, deliberately limited number of GPU
    blocks, chosen so that sequences must be preempted and removed from the
    cache as the batch grows. The test LLM additionally pins the preemption
    mode. Identical output token ids give confidence that the KV cache is not
    corrupted.

    NOTE: A large number of tokens is generated so that any incorrect KV
    mapping has time to build up error.

    NOTE(Kuntai): Though block manager v1 has been removed, this test still
    asserts that block manager v2 (now called SelfAttnBlockSpaceManager)
    behaves the same when swapping / preempting, so it is kept.
    """
    seed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the seed prompts round-robin until the batch is full.
    prompt_source = cycle(seed_prompts)
    prompts = [next(prompt_source) for _ in range(batch_size)]

    # Greedy sampling with a fixed length, so outputs must match exactly.
    sampling_params = SamplingParams(
        max_tokens=1024,
        ignore_eos=True,
        temperature=0.0,
    )

    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    assert baseline_token_ids == test_token_ids
'''
=============================
Modify by vllm_mlu
=============================
@brief(block_size): MLU paged attention only supports block_size=16
'''
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Each request generates 128 tokens from a short prompt, so little
        # KV space is needed beyond 128.
        "max_model_len": 160,

        # Eager mode skips CUDA graph creation, which also keeps it fast.
        "enforce_eager": True,
    }])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        {
            "block_size": 16,

            # Allow only 2 sequences of ~128 tokens in the worst case.
            # Note 8 = 128 / block_size.
            "num_gpu_blocks_override": 2 * (8 + 1),
        },
        {
            "block_size": 16,

            # NOTE(review): the "16 = 128 / block_size" arithmetic behind
            # this value matches block_size=8, not the 16 used here. Confirm
            # this budget is still the intended one.
            "num_gpu_blocks_override": 2 * (16 + 2),
        }
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "num_lookahead_slots": 0,
}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            # Both cases use the same lookahead (10) with block_size 16;
            # they differ only in the preemption mode.
            "num_lookahead_slots": 10,
            "preemption_mode": "swap",
        },
        {
            "num_lookahead_slots": 10,
            "preemption_mode": "recompute",
        }
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
                                                   test_llm_generator,
                                                   batch_size):
    """Check that lookahead scheduling does not change greedy outputs.

    Lookahead scheduling is not expected to modify the output, as it simply
    allocates empty slots ahead of the known token ids in a sliding fashion.
    The baseline runs with no lookahead slots; the test LLM runs with 10.

    The total number of blocks is constrained with the aim of forcing
    preemption. The block size is fixed at 16 in every configuration here.
    """
    seed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the seed prompts round-robin until the batch is full.
    prompt_source = cycle(seed_prompts)
    prompts = [next(prompt_source) for _ in range(batch_size)]

    # Greedy sampling with a fixed length, so outputs must match exactly.
    sampling_params = SamplingParams(
        max_tokens=128,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids without lookahead scheduling')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with lookahead scheduling')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    assert baseline_token_ids == test_token_ids
'''
=============================
Modify by vllm_mlu
=============================
@brief(block_size): Only paged block_size 16 is supported, so block_size is changed from 8 to 16
'''
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [
        {
            # A small model keeps the test fast.
            "model": "facebook/opt-125m",

            # Eager mode skips CUDA graph creation, which also keeps it fast.
            "enforce_eager": True,
            "enable_chunked_prefill": True,
            "gpu_memory_utilization": 0.6,
        },
    ])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
                         [{
                             "block_size": 16,
                             "max_num_batched_tokens": 2,
                             "max_num_seqs": 2,
                         }, {
                             "block_size": 16,
                             "max_num_batched_tokens": 3,
                             "max_num_seqs": 2,
                         }, {
                             "block_size": 16,
                             "max_num_batched_tokens": 256,
                             "max_num_seqs": 10,
                         }])
@pytest.mark.parametrize("baseline_llm_kwargs", [
    {},
])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "num_lookahead_slots": 0,
    },
    {
        "num_lookahead_slots": 5,
    },
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_chunked_prefill_block_manager(baseline_llm_generator,
                                       test_llm_generator, batch_size):
    """Check chunked prefill with SelfAttnBlockSpaceManager.

    The baseline uses the shared chunked-prefill configuration as is; the
    test LLM additionally sets the number of lookahead slots (0 or 5). Greedy
    outputs of both must be identical.
    """
    seed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        ("1 + " * 50) + " 1 = ",  # Longer prompt.
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the seed prompts round-robin until the batch is full.
    prompt_source = cycle(seed_prompts)
    prompts = [next(prompt_source) for _ in range(batch_size)]

    # Greedy sampling with a fixed length, so outputs must match exactly.
    sampling_params = SamplingParams(
        max_tokens=32,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids with BlockManager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with BlockManager, with lookahead slots.')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, which also keeps it fast.
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in the worst case.
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),

        # Prefix caching is on for both the baseline and the test LLM.
        "enable_prefix_caching": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "preemption_mode": "swap"
}, {
    "preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_prefix_caching_enabled_with_preemption(
        baseline_llm_generator, test_llm_generator, batch_size):
    """Check that preemption keeps outputs stable with prefix caching on.

    Two LLMs are built with the same, deliberately limited number of GPU
    blocks, chosen so that sequences must be preempted and removed from the
    cache as the batch grows. The test LLM additionally pins the preemption
    mode. Identical output token ids give confidence that the KV cache is not
    corrupted.

    NOTE: A large number of tokens is generated so that any incorrect KV
    mapping has time to build up error.

    NOTE(Kuntai): Though block manager v1 has been removed, this test still
    asserts that block manager v2 (now called SelfAttnBlockSpaceManager)
    behaves the same when swapping / preempting, so it is kept.
    """
    seed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the seed prompts round-robin until the batch is full.
    prompt_source = cycle(seed_prompts)
    prompts = [next(prompt_source) for _ in range(batch_size)]

    # Greedy sampling with a fixed length, so outputs must match exactly.
    sampling_params = SamplingParams(
        max_tokens=1024,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids from block manager')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids from block manager, with preemption')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, which also keeps it fast.
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in the worst case.
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
    "preemption_mode": "swap"
}, {
    "enable_prefix_caching": True,
    "preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
                                             test_llm_generator, batch_size):
    """Check that auto prefix caching (APC) does not change greedy outputs.

    The baseline runs with APC disabled and the test LLM with APC enabled
    (and a pinned preemption mode). Both get the same, deliberately limited
    number of GPU blocks, chosen so that sequences must be preempted and
    removed from the cache as the batch grows.

    Identical output token ids give confidence that APC by itself does not
    cause wrong results.
    """
    seed_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Repeat the seed prompts round-robin until the batch is full.
    prompt_source = cycle(seed_prompts)
    prompts = [next(prompt_source) for _ in range(batch_size)]

    # Greedy sampling with a fixed length, so outputs must match exactly.
    sampling_params = SamplingParams(
        max_tokens=1024,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids with APC disabled')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with APC enabled')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    assert baseline_token_ids == test_token_ids
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, which also keeps it fast.
        "enforce_eager": True,

        # Keep the block budget tiny so that eviction starts quickly.
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
                                                 test_llm_generator):
    """Check that auto prefix caching (APC) still works once eviction starts.

    With APC enabled, all blocks are initially held by the native block
    allocator and are later managed by the evictor instead. A cache hit on a
    block held by the evictor lets it be reused; otherwise its KV cache has
    to be recomputed. Outputs must match those obtained with APC disabled.
    """
    # The two prompts share a long common prefix.
    prompts = [
        "You are a helpful assistant. Please answer truthfully and write "
        "out your thinking step by step to be sure you get the right answer. "
        "If you make a mistake, attempt to correct it. who are you?",
        "You are a helpful assistant. Please answer truthfully and write out "
        "your thinking step by step to be sure you get the right answer. You "
        "are helpful and harmless and you follow ethical guidelines. "
        "who are you?"
    ]

    # Greedy sampling with a fixed length, so outputs must match exactly.
    sampling_params = SamplingParams(
        max_tokens=10,
        ignore_eos=True,
        temperature=0.0,
    )

    print('Getting token ids with APC disabled')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids with APC enabled')
    test_token_ids = get_token_ids_from_llm_generator(
        test_llm_generator, prompts, sampling_params)

    # Per-sequence comparison first, so a failure points at one sequence.
    for expected_ids, actual_ids in zip(baseline_token_ids, test_token_ids):
        assert expected_ids == actual_ids

    assert baseline_token_ids == test_token_ids

View File

@@ -0,0 +1,180 @@
import random
from typing import List
import pytest
from vllm import LLM, SamplingParams
from .conftest import get_text_from_llm_generator
# relatively small model with 4k sliding window.
'''
=============================
Modify by vllm_mlu
=============================
Currently tmo.apply_rotary does not support offsets, so bigcode/starcoder2-3b cannot run.
Use mistralai/Mistral-7B-v0.1 instead, which also has a 4k sliding window.
'''
# The original model is: MODEL = "bigcode/starcoder2-3b"
# Model name passed as the "model" kwarg to every LLM built in this module.
MODEL = "mistralai/Mistral-7B-v0.1"
# Passed as "block_size" to every LLM in this module and used to derive
# "num_gpu_blocks_override" (100000 // BLOCK_SIZE).
BLOCK_SIZE = 16
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": MODEL,

        # Eager mode skips CUDA graph creation, which keeps the test fast.
        "enforce_eager": True,
        "block_size": BLOCK_SIZE,
        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
                                 batch_size, seed):
    """
    Check retrieval of a value that lies outside the sliding window.

    Each prompt makes a series of assignments "x1 = 10\nx2 = 33\n..." and then
    asks for the value of one of them. Because the prompt announces upfront
    which variable will be asked about, the model answers correctly (mostly).

    The texts from the baseline and the test generator are also compared with
    each other; both are configured with empty extra kwargs here.
    """
    sampling_params = SamplingParams(
        max_tokens=1024,
        ignore_eos=True,
        temperature=0.0,
    )

    prompts, answer, indices = prep_prompts(batch_size)

    # The callback verifies that a prompt actually exceeds the window.
    window_check = check_window(prompts)
    baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
                                                 prompts,
                                                 sampling_params,
                                                 llm_cb=window_check)
    check_answers(indices, answer, baseline_texts)

    print('Getting token ids from block manager v2')
    test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
                                             sampling_params)
    check_answers(indices, answer, test_texts)

    matches = []
    for expected_text, actual_text in zip(baseline_texts, test_texts):
        matches.append(expected_text == actual_text)
    print(matches)

    # make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
    # however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
    # states that xformers and flash_attn have different ideas about the window
    # size anyways
    num_matching = sum(matches)
    assert num_matching > 0.7 * len(matches)
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": MODEL,

        # Eager mode skips CUDA graph creation, which keeps the test fast.
        "enforce_eager": True,
        "block_size": BLOCK_SIZE,
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
    """
    Check sliding-window retrieval with chunked prefill enabled.

    This is similar to test_sliding_window_retrival, but there is no
    comparison against the v1 block manager, since v1 doesn't support chunked
    prefill with a sliding window.

    The results with and without chunked prefill are not the same due to
    numerical instabilities.
    """
    sampling_params = SamplingParams(
        max_tokens=10,
        ignore_eos=True,
        temperature=0.0,
    )

    prompts, answer, indices = prep_prompts(batch_size)

    # No baseline comparison here: the results differ slightly because of
    # different tiling in attention.
    window_check = check_window(prompts)
    test_texts = get_text_from_llm_generator(test_llm_generator,
                                             prompts,
                                             sampling_params,
                                             llm_cb=window_check)
    check_answers(indices, answer, test_texts)
def prep_prompts(batch_size: int):
    """
    Generate prompts made of many assignments, each ending with a question
    about the value of one of the assigned variables.

    Every prompt assigns random two-digit values to ``x30 .. x{ln - 1}``,
    where ``ln`` is drawn from [400, 500], and finishes with an unfinished
    ``assert x{idx} == `` line for an ``idx`` drawn from [30, 90]. The prompt
    is meant to be longer than the model's sliding window, so that the asked
    for assignment lies outside it (``check_window`` asserts this).

    The output is deterministic: a private generator seeded with 1 is used,
    so the global ``random`` state is left untouched.

    Args:
        batch_size: Number of prompts to generate.

    Returns:
        A tuple ``(prompts, answer, indices)`` of equal-length lists: the
        prompt texts, the value assigned to the queried variable, and the
        index of the queried variable.
    """
    prompts: List[str] = []
    answer: List[int] = []
    indices: List[int] = []

    # Same sequence as `random.seed(1)` followed by module-level calls, but
    # without reseeding the process-wide generator as a side effect.
    rng = random.Random(1)

    for _ in range(batch_size):
        idx = rng.randint(30, 90)
        indices.append(idx)

        # =============================
        # Modify by vllm_mlu
        # =============================
        # Since a different model is used, the prompt length is reset to a
        # suitable value as well. The original value is 800~1100.
        ln = rng.randint(400, 500)

        # Collect the pieces and join once instead of repeated `+=`.
        parts = [
            "```python\n# We set a number of variables, ",
            f"x{idx} will be important later\n",
        ]
        for k in range(30, ln):
            v = rng.randint(10, 99)
            if k == idx:
                answer.append(v)
            parts.append(f"x{k} = {v}\n")
        parts.append(f"# Now, we check the value of x{idx}:\n")
        parts.append(f"assert x{idx} == ")
        prompts.append("".join(parts))

    return prompts, answer, indices
def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
    """Assert that enough generated outputs start with the expected number.

    The first two characters of each output are parsed as an integer and
    compared with the corresponding expected answer. An output whose first
    two characters are not a number counts as a wrong answer rather than
    aborting the check.

    Args:
        indices: Index of the queried variable for each prompt (only used
            for the diagnostic print).
        answer: Expected value for each prompt.
        outputs: Generated text for each prompt.

    Raises:
        AssertionError: If fewer than 40% of the answers are correct.
        ZeroDivisionError: If ``answer`` is empty.
    """

    def parse_leading_number(text: str):
        # The expected values are two-digit numbers, so only the first two
        # characters are relevant. Garbage parses to None, which never equals
        # an expected answer.
        try:
            return int(text[0:2].strip())
        except ValueError:
            return None

    answer2 = [parse_leading_number(text) for text in outputs]
    print(list(zip(indices, zip(answer, answer2))))

    numok = sum(a1 == a2 for a1, a2 in zip(answer, answer2))
    frac_ok = numok / len(answer)
    print(f"Num OK: {numok}/{len(answer)} {frac_ok}")

    # The original value is 0.7
    assert frac_ok >= 0.4
def check_window(prompts: List[str]):
    """Build a callback that validates the sliding-window setup of an LLM.

    The returned callable takes an LLM and asserts that its model config
    reports a positive sliding window, and that at least one of ``prompts``
    tokenizes to more tokens than that window.
    """

    def inner(llm: LLM):
        window = llm.llm_engine.model_config.get_sliding_window()
        assert window and window > 0

        # Stop at the first prompt that is longer than the window.
        exceeds_window = False
        for prompt in prompts:
            num_tokens = len(llm.get_tokenizer().tokenize(prompt))
            if num_tokens > window:
                exceeds_window = True
                break
        assert exceeds_window

    return inner

View File

@@ -0,0 +1,491 @@
import pytest
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
STR_NOT_IMPL_ENC_DEC_SWA)
from vllm.core.block_manager import SelfAttnBlockSpaceManager
from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, SequenceStatus
from vllm.utils import chunk_list
from ..utils import (create_dummy_prompt, create_seq_group,
create_seq_group_encoder_decoder)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
    """Check ``can_allocate`` for sequence groups of growing prompt size.

    For every prompt size the expected status is derived from the number of
    required blocks, the total number of GPU blocks and the watermark.
    """
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    num_watermark_blocks = int(watermark * num_gpu_blocks)

    num_output_blocks_per_seq = 1

    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't have
    # different output lens.
    num_output_blocks = num_output_blocks_per_seq

    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
        output_len = block_size * num_output_blocks_per_seq
        seq_group = create_seq_group(
            seq_prompt_len=block_size * num_prompt_blocks,
            seq_output_lens=[output_len] * num_seqs_per_group,
        )

        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

        can_allocate_result = block_manager.can_allocate(seq_group)

        # Work out which status the block manager is expected to report.
        num_required_blocks = num_prompt_blocks + num_output_blocks
        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
            expected_status = AllocStatus.NEVER
        elif num_gpu_blocks >= num_required_blocks:
            expected_status = AllocStatus.OK
        else:
            expected_status = AllocStatus.LATER

        assert can_allocate_result == expected_status
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
                                                num_seqs_per_group: int,
                                                num_gpu_blocks: int,
                                                watermark: float):
    """Check ``can_allocate`` for encoder/decoder sequence groups.

    Same as test_can_allocate_seq_group, except that the required block count
    also includes cross-attention blocks, taken to equal the number of prompt
    blocks.
    """
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    num_watermark_blocks = int(watermark * num_gpu_blocks)

    num_output_blocks_per_seq = 1

    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't have
    # different output lens.
    num_output_blocks = num_output_blocks_per_seq

    prompt_block_counts = range(1, num_gpu_blocks - num_output_blocks)
    for bdx, num_prompt_blocks in enumerate(prompt_block_counts):
        num_cross_blocks_per_seq = num_prompt_blocks
        output_len = block_size * num_output_blocks_per_seq

        # Each group gets its own request id.
        seq_group = create_seq_group_encoder_decoder(
            seq_prompt_len=block_size * num_prompt_blocks,
            seq_output_lens=[output_len] * num_seqs_per_group,
            request_id=str(bdx))

        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

        can_allocate_result = block_manager.can_allocate(seq_group)

        # Work out which status the block manager is expected to report.
        num_required_blocks = (num_prompt_blocks + num_output_blocks +
                               num_cross_blocks_per_seq)
        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
            expected_status = AllocStatus.NEVER
        elif num_gpu_blocks >= num_required_blocks:
            expected_status = AllocStatus.OK
        else:
            expected_status = AllocStatus.LATER

        assert can_allocate_result == expected_status
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
                                                     num_seqs_per_group: int,
                                                     num_gpu_blocks: int,
                                                     watermark: float):
    '''
    Check that encoder/decoder plus sliding window attention (SWA) fails.

    At time of writing the block manager does not support SWA. Even once SWA
    is implemented for the block manager, a separate workstream will most
    likely be required to enable SWA for encoder/decoder models.

    Therefore this test enforces that one of the following holds:
    1. The block manager does not support SWA at all (true at time of
       writing).
    2. The block manager fails with NotImplementedError when SWA is enabled
       AND a SequenceGroup with an encoder sequence (i.e. in support of an
       encoder/decoder model) is passed to can_allocate().

    The setup is a stripped-down version of
    test_can_allocate_seq_group_encoder_decoder().
    '''
    expected_errors = (NotImplementedError, AssertionError)
    with pytest.raises(expected_errors) as exc_info:
        block_manager = SelfAttnBlockSpaceManager(
            block_size=block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=1024,
            watermark=watermark,
            sliding_window=5  # SWA
        )

        num_output_blocks_per_seq = 1
        num_prompt_blocks = 1
        num_output_blocks = num_output_blocks_per_seq
        output_len = block_size * num_output_blocks_per_seq
        seq_group = create_seq_group_encoder_decoder(
            seq_prompt_len=block_size * num_prompt_blocks,
            seq_output_lens=[output_len] * num_seqs_per_group,
            request_id="0")

        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
        block_manager.can_allocate(seq_group)

    # Either
    # 1. the block manager constructor failed with the assertion that sliding
    #    window is not yet supported (most likely near-term outcome at time
    #    of writing), or
    # 2. can_allocate() failed with NotImplementedError due to the
    #    combination of encoder/decoder and sliding window attention.
    raised = exc_info.value
    if isinstance(raised, NotImplementedError):
        assert str(raised) == STR_NOT_IMPL_ENC_DEC_SWA
    elif isinstance(raised, AssertionError):
        assert str(raised) == "Sliding window not yet supported"
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
        watermark: float):
    """Check that encoder/decoder plus prefix caching is rejected.

    With prefix caching enabled, ``can_allocate`` must raise
    NotImplementedError with the dedicated message for an encoder/decoder
    sequence group.
    """
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
        enable_caching=True  # Prefix cache
    )

    num_output_blocks_per_seq = 1
    num_prompt_blocks = 1
    num_output_blocks = num_output_blocks_per_seq
    output_len = block_size * num_output_blocks_per_seq
    seq_group = create_seq_group_encoder_decoder(
        seq_prompt_len=block_size * num_prompt_blocks,
        seq_output_lens=[output_len] * num_seqs_per_group,
        request_id="0")

    assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

    # can_allocate() must fail with NotImplementedError due to the
    # combination of encoder/decoder and prefix cache.
    with pytest.raises(NotImplementedError) as exc_info:
        block_manager.can_allocate(seq_group)

    error_message = str(exc_info.value)
    assert error_message == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
def test_append_slots(block_size, prompt_len, num_slots_to_append,
                      num_lookahead_slots):
    """Verify append_slots consumes the correct number of blocks from the
    block table.

    A sequence with ``prompt_len`` prompt tokens is allocated, then
    ``num_slots_to_append`` tokens are appended to it and ``append_slots`` is
    called with ``num_lookahead_slots``. The drop in free GPU blocks must
    equal the number of extra blocks needed to hold the longer sequence.
    """
    num_gpu_blocks = 1024
    watermark = 0.1
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        watermark=watermark,
    )

    seq_group = create_seq_group(
        seq_prompt_len=prompt_len,
        seq_output_lens=[0],
    )

    # Allocate seq.
    # Compare against AllocStatus.OK explicitly: can_allocate returns an
    # AllocStatus (see test_can_allocate_seq_group), and a bare truthiness
    # check would not tell OK apart from the other statuses if all enum
    # members are truthy.
    assert block_manager.can_allocate(seq_group) == AllocStatus.OK
    block_manager.allocate(seq_group)

    # Set seq to RUNNING
    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    # Append tokens to the sequence
    for token_id in range(num_slots_to_append):
        seq.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Append slots for new tokens and lookahead slots.
    free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
    block_manager.append_slots(seq, num_lookahead_slots)
    num_consumed_blocks = (free_blocks_before_append -
                           block_manager.get_num_free_gpu_blocks())

    def num_blocks_for(num_tokens):
        # Number of block-sized chunks needed to hold `num_tokens` tokens.
        return len(list(chunk_list(list(range(num_tokens)), block_size)))

    # Expect consumed blocks to be new blocks required to support the new
    # slots.
    total_slots = prompt_len + num_slots_to_append + num_lookahead_slots
    expected_consumed_blocks = (num_blocks_for(total_slots) -
                                num_blocks_for(prompt_len))
    assert num_consumed_blocks == expected_consumed_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_cpu_blocks", [4])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
              enable_caching):
    """Verify that the number of blocks on the source and destination device
    is correct after swapping a sequence group out and back in (no missing or
    extra blocks).
    """
    # NOTE(review): these arguments are positional, whereas other tests in
    # this file pass num_gpu_blocks / num_cpu_blocks by keyword. Both are 4
    # here, but confirm the positional order matches the constructor.
    block_manager = SelfAttnBlockSpaceManager(block_size,
                                              num_cpu_blocks,
                                              num_gpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)
    seq, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    seq.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    seq.status = SequenceStatus.RUNNING
    seq.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap seq group from GPU -> CPU.
    gpu_blocks = block_manager.get_block_table(seq)
    assert block_manager.can_swap_out(seq_group)
    free_cpu_before = block_manager.get_num_free_cpu_blocks()
    free_gpu_before = block_manager.get_num_free_gpu_blocks()
    swap_out_mapping = block_manager.swap_out(seq_group)
    swapped_out_sources = [src for src, _ in swap_out_mapping]
    assert swapped_out_sources == gpu_blocks
    free_cpu_after = block_manager.get_num_free_cpu_blocks()
    free_gpu_after = block_manager.get_num_free_gpu_blocks()
    num_swapped_out = len(gpu_blocks)
    assert free_cpu_before == free_cpu_after + num_swapped_out
    assert free_gpu_before + num_swapped_out == free_gpu_after
    seq.status = SequenceStatus.SWAPPED

    # Swap seq group from CPU -> GPU.
    # NOTE(review): can_swap_in appears to return an AllocStatus (see
    # test_can_swap), in which case this truthiness check may be vacuous.
    # Confirm whether it should compare against AllocStatus.OK.
    assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
    free_cpu_before = block_manager.get_num_free_cpu_blocks()
    free_gpu_before = block_manager.get_num_free_gpu_blocks()
    swap_in_mapping = block_manager.swap_in(seq_group)
    cpu_blocks = block_manager.get_block_table(seq)
    swapped_in_sources = [src for src, _ in swap_in_mapping]
    assert swapped_in_sources == [cpu_blocks[0]]
    free_cpu_after = block_manager.get_num_free_cpu_blocks()
    free_gpu_after = block_manager.get_num_free_gpu_blocks()
    assert free_gpu_before == free_gpu_after + len(cpu_blocks)
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
@pytest.mark.parametrize("enable_caching", [True, False])
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
                  enable_caching):
    """Check the swap-in / swap-out feasibility answers of the block manager.

    A first sequence group is allocated and swapped out. ``can_swap_in`` is
    then queried twice for it: right after the swap-out, and again after a
    second sequence group has been allocated.
    """
    num_cpu_blocks = num_gpu_blocks
    manager = SelfAttnBlockSpaceManager(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        enable_caching=enable_caching)

    first_prompt_len = (num_gpu_blocks - 1) * block_size - 1
    prompt, seq_group = create_dummy_prompt("1",
                                            prompt_length=first_prompt_len)
    prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)
    prompt.status = SequenceStatus.RUNNING

    # --- GPU -> CPU ---------------------------------------------------------
    gpu_block_ids = manager.get_block_table(prompt)
    assert manager.can_swap_out(seq_group)
    free_cpu_before = manager.get_num_free_cpu_blocks()
    free_gpu_before = manager.get_num_free_gpu_blocks()
    swap_mapping = manager.swap_out(seq_group)

    # The source side of the mapping must be exactly the GPU block table.
    assert [src for src, _dst in swap_mapping] == gpu_block_ids

    free_cpu_after = manager.get_num_free_cpu_blocks()
    free_gpu_after = manager.get_num_free_gpu_blocks()
    num_swapped = len(gpu_block_ids)
    assert free_cpu_before - free_cpu_after == num_swapped
    assert free_gpu_after - free_gpu_before == num_swapped
    prompt.status = SequenceStatus.SWAPPED

    # The same lookahead threshold decides the expected status in both
    # queries below.
    lookahead_fits = num_lookahead_slots <= block_size

    # Right after the swap-out there are still enough free blocks to bring
    # the sequence group back.
    expected_now = AllocStatus.OK if lookahead_fits else AllocStatus.NEVER
    assert manager.can_swap_in(seq_group,
                               num_lookahead_slots) == expected_now

    # Allocate a second prompt with distinct token ids. Per the original
    # test's note, this leaves too little room to swap prompt 1 back in
    # for now.
    prompt2_len = 2 * block_size - 1
    prompt2, seq_group2 = create_dummy_prompt(
        "2",
        prompt_length=prompt2_len,
        prompt_tokens=list(range(10000, 10000 + prompt2_len)))
    prompt2.status = SequenceStatus.WAITING
    manager.allocate(seq_group2)

    # --- CPU -> GPU ---------------------------------------------------------
    expected_later = (AllocStatus.LATER
                      if lookahead_fits else AllocStatus.NEVER)
    assert manager.can_swap_in(seq_group,
                               num_lookahead_slots) == expected_later
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
    """Check that swap-in is refused when the GPU can never hold the sequence.

    With a single GPU block, the swap-in is expected to be ``OK`` only when
    the prompt tokens, the one unseen token and the lookahead slots together
    fit in that block; otherwise ``NEVER`` is expected.
    """
    block_size = 8
    num_cpu_blocks = 1
    num_gpu_blocks = 1
    manager = SelfAttnBlockSpaceManager(block_size,
                                        num_cpu_blocks,
                                        num_gpu_blocks,
                                        watermark=0,
                                        enable_caching=enable_caching)

    prompt_length = block_size - 3
    assert prompt_length > 0
    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
    prompt.status = SequenceStatus.WAITING
    manager.allocate(seq_group)

    # Mimic one forward pass: append a single token so the block manager
    # knows how many unprocessed tokens the next pass will write.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # GPU -> CPU.
    assert manager.can_swap_out(seq_group)
    manager.swap_out(seq_group)
    prompt.status = SequenceStatus.SWAPPED

    # CPU -> GPU. Exactly one token is still unseen; the swap can only
    # succeed if existing + unseen + lookahead tokens fit in the GPU blocks.
    num_unseen_tokens = 1
    tokens_needed = num_lookahead_slots + num_unseen_tokens + prompt_length
    gpu_capacity = block_size * num_gpu_blocks
    expected_status = (AllocStatus.OK
                       if tokens_needed <= gpu_capacity else AllocStatus.NEVER)
    assert manager.can_swap_in(seq_group,
                               num_lookahead_slots) == expected_status
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
                        sliding_window):
    """Check GPU block usage while appending slots under a sliding window.

    The number of used GPU blocks is checked after allocation and after every
    single-token ``append_slots`` call.
    """
    num_gpu_blocks = 1024
    watermark = 0.1
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        watermark=watermark,
        sliding_window=sliding_window,
    )

    def assert_used_between(lower, upper=None):
        # With a single argument the used-block count must match exactly.
        upper = lower if upper is None else upper
        used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
        assert lower <= used
        assert used <= upper

    def blocks_for(num_tokens):
        # Ceiling division: blocks needed to hold ``num_tokens`` tokens.
        return (num_tokens + block_size - 1) // block_size

    assert_used_between(0)

    seq_group = create_seq_group(
        seq_prompt_len=prompt_len,
        seq_output_lens=[0],
    )

    # Creating the sequence group alone must not consume any block.
    assert_used_between(0)

    # Allocate the sequence.
    assert block_manager.can_allocate(seq_group)
    block_manager.allocate(seq_group)
    assert_used_between(blocks_for(prompt_len))

    # Move the sequence to RUNNING and mark the whole prompt as computed.
    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING
    seq.data.update_num_computed_tokens(prompt_len)
    assert_used_between(blocks_for(prompt_len))

    # Per the original test's note, this mirrors the computation done in
    # SelfAttnBlockSpaceManager.__init__, plus one block for the null block.
    sliding_blocks = (sliding_window // block_size) + 2 + 1

    # Short prompts may use fewer blocks than the window; the upper bound is
    # the same in both cases.
    lower_bound = 0 if prompt_len < sliding_window + 10 else sliding_blocks
    upper_bound = sliding_blocks + 1

    # Append tokens to the sequence one at a time.
    for token_id in range(num_slots_to_append):
        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
        seq.data.update_num_computed_tokens(1)
        block_manager.append_slots(seq, num_lookahead_slots=0)
        assert_used_between(lower_bound, upper_bound)

View File

@@ -0,0 +1,576 @@
from typing import List
import pytest
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, cdiv, chunk_list
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
    """Check free-block accounting for the naive allocator.

    Five BlockTables are allocated for the same token sequence. Before each
    allocation, the number of free GPU blocks must have dropped by one
    sequence's worth of blocks per table allocated so far.
    """
    assert block_size > 1
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="naive",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    blocks_per_table = len(list(chunk_list(token_ids, block_size)))

    # Every table is kept referenced for the duration of the test.
    block_tables: List[BlockTable] = []
    for num_allocated in range(5):
        expected_free = num_gpu_blocks - num_allocated * blocks_per_table
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == expected_free

        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    """Check free-block accounting for the prefix caching allocator.

    Five BlockTables are allocated for the same token sequence. The expected
    usage counts the full (immutable) blocks once in total, and the trailing
    partially filled (mutable) block, if any, once per table.
    """
    assert block_size > 1
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    chunks = list(chunk_list(token_ids, block_size))

    # Only a last chunk shorter than block_size yields a mutable block.
    mutable_per_table = int(len(chunks[-1]) != block_size)
    immutable_per_table = len(chunks) - mutable_per_table

    block_tables: List[BlockTable] = []
    for num_tables in range(1, 6):
        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)

        # All tables are expected to share their full blocks; only the last
        # (possibly mutable) block is counted per table.
        expected_used = immutable_per_table + mutable_per_table * num_tables
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == num_gpu_blocks - expected_used
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
                       device: str):
    """Check that allocate/free cycles leave the free-block count unchanged.

    One BlockTable is allocated and freed five times with the same tokens on
    the requested device. After each allocation the expected number of blocks
    is in use and every physical block id is set; after each free, all blocks
    of that device are free again.
    """
    # Translate the parametrized device name into the Device enum member.
    target_device = Device[device.upper()]

    num_device_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_device_blocks,
        num_cpu_blocks=num_device_blocks,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    for _ in range(5):
        block_table.allocate(token_ids=token_ids, device=target_device)
        free_after_alloc = allocator.get_num_free_blocks(target_device)
        assert free_after_alloc == num_device_blocks - blocks_per_alloc
        assert all(block_id is not None
                   for block_id in block_table.physical_block_ids)

        block_table.free()
        assert allocator.get_num_free_blocks(
            target_device) == num_device_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
                                     append_len: int, allocator_type: str):
    """Check the block count of a BlockTable before and after an append.

    The table is allocated with an initial sequence, then more token ids are
    appended. The number of physical block ids must match the number of
    block-sized chunks of the tokens held at each stage.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    initial_tokens = list(range(sequence_len))
    extra_tokens = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    blocks_before = len(list(chunk_list(initial_tokens, block_size)))
    blocks_after = len(
        list(chunk_list(initial_tokens + extra_tokens, block_size)))

    block_table.allocate(token_ids=initial_tokens, device=Device.GPU)
    assert len(block_table.physical_block_ids) == blocks_before

    block_table.append_token_ids(extra_tokens)
    assert len(block_table.physical_block_ids) == blocks_after
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
                                           num_empty_slots: int,
                                           allocator_type: str):
    """Check block usage when reserving empty slots in a BlockTable.

    After allocating an initial sequence, ``ensure_num_empty_slots`` must
    grow the table to the number of blocks needed for the sequence plus the
    reserved slots. Filling those slots afterwards must not change the
    number of free GPU blocks.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    blocks_before = len(list(chunk_list(token_ids, block_size)))
    # -1 entries stand in for the reserved slots when counting chunks.
    padded_tokens = token_ids + [-1] * num_empty_slots
    blocks_with_slots = len(list(chunk_list(padded_tokens, block_size)))

    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # The initial allocation uses exactly the blocks the sequence needs...
    assert len(block_table.physical_block_ids) == blocks_before

    # ...and reserving the empty slots adds the expected extra blocks.
    block_table.ensure_num_empty_slots(num_empty_slots)
    assert len(block_table.physical_block_ids) == blocks_with_slots

    # Filling the reserved slots must not consume further blocks.
    free_before_fill = allocator.get_num_free_blocks(device=Device.GPU)
    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
    assert free_before_fill == allocator.get_num_free_blocks(
        device=Device.GPU)
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
                                          append_len: int, allocator_type: str,
                                          append_size: int):
    """Check that appended token ids are stored in order.

    ``append_len`` token ids are appended in pieces of at most
    ``append_size``. After every piece, and once more at the end, the table
    content must equal the initial tokens followed by everything appended.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    initial_tokens = list(range(sequence_len))
    tokens_to_append = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=initial_tokens, device=Device.GPU)

    appended: List[int] = []
    for piece in chunk_list(tokens_to_append, append_size):
        block_table.append_token_ids(piece)
        appended.extend(piece)

        expected_so_far = initial_tokens + appended
        assert block_table._get_all_token_ids() == expected_so_far

    expected_final = initial_tokens + tokens_to_append
    assert block_table._get_all_token_ids() == expected_final
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
    """Check block sharing between a BlockTable and its fork.

    1. Forking must not change the number of free blocks.
    2. The fork must have the same physical block ids and token ids.
    3. Freeing the original must not change the number of free blocks, and
       the fork must keep all of its block ids.
    4. Freeing the fork must return every GPU block to the free pool.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(seq_len))

    original = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    original.allocate(token_ids)

    free_before_fork = allocator.get_num_free_blocks(device=Device.GPU)

    fork = original.fork()

    # Both tables must map to the same physical blocks and tokens.
    assert original.physical_block_ids == fork.physical_block_ids
    assert original._get_all_token_ids() == fork._get_all_token_ids()

    # Forking must not allocate anything.
    assert allocator.get_num_free_blocks(
        device=Device.GPU) == free_before_fork

    # Freeing the original must not release blocks the fork still uses.
    original.free()
    assert allocator.get_num_free_blocks(
        device=Device.GPU) == free_before_fork

    # The fork is unaffected by the free of the original.
    assert all(block_id is not None for block_id in fork.physical_block_ids)

    # Once the fork is freed as well, every block is free again.
    fork.free()
    assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
             allocator_type: str, appender: str):
    """Fork a sequence, append to one of the two tables, and check the CoW.

    ``appender`` selects whether the forked or the original table receives
    the appended tokens; the other table must remain unchanged.
    """
    if appender not in ("forked", "original"):
        # Note: the original raised this only after the fork; the
        # parametrization never passes another value.
        raise ValueError(f"unknown test config {appender=}")

    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Blocks held by the initial allocation.
    blocks_initial = cdiv(sequence_len, block_size)
    # Extra blocks expected after the append: the final block count minus
    # the number of completely filled blocks of the initial sequence.
    blocks_from_append = cdiv(sequence_len + append_len,
                              block_size) - (sequence_len // block_size)

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
    original_block_ids = original_block_table.physical_block_ids[:]

    print("original_block_ids = {}".format(original_block_ids))
    forked_block_table = original_block_table.fork()

    # Forking alone must not allocate (the copy happens on write).
    assert allocator.get_num_free_blocks(
        Device.GPU) == (num_gpu_blocks - blocks_initial)

    if appender == "forked":
        writer, bystander = forked_block_table, original_block_table
    else:
        writer, bystander = original_block_table, forked_block_table

    # Write tokens.
    writer.append_token_ids(token_ids_to_append)

    # Only the table that was written to may change its block ids.
    assert bystander.physical_block_ids == original_block_ids
    assert writer.physical_block_ids != original_block_ids

    # The append must have consumed exactly the expected extra blocks.
    assert allocator.get_num_free_blocks(
        Device.GPU) == num_gpu_blocks - (blocks_initial + blocks_from_append)

    cows = allocator.clear_copy_on_writes()
    if sequence_len % block_size > 0:
        # The last block of the sequence was only partially filled, so the
        # append has to copy it before writing.
        assert cows
        cow_index = sequence_len // block_size
        expected_src = bystander.physical_block_ids[cow_index]
        expected_dst = writer.physical_block_ids[cow_index]
        assert (expected_src, expected_dst) in cows
    else:
        # The sequence ended on a block boundary: nothing to copy.
        assert not cows

    bystander.free()
    writer.free()

    # Once both tables are freed, every block must be free again.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
                              append_len: int, lookahead_slots: int,
                              allocator_type: str, appender: str):
    """Variant of test_cow in which lookahead slots are reserved before forking.

    The assertions are less strict than in test_cow: a copy-on-write is
    always expected, and its (source, destination) pair is only checked when
    the initial sequence does not end on a block boundary.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Allocate lookahead slots.
    original_block_table.ensure_num_empty_slots(lookahead_slots)
    original_block_ids = original_block_table.physical_block_ids[:]

    forked_block_table = original_block_table.fork()

    if appender == "forked":
        appender_block_table = forked_block_table
        static_block_table = original_block_table
    elif appender == "original":
        appender_block_table = original_block_table
        static_block_table = forked_block_table
    else:
        raise ValueError(f"unknown test config {appender=}")

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Expect the non-appending block table to have no change.
    assert static_block_table.physical_block_ids == original_block_ids
    assert appender_block_table.physical_block_ids != original_block_ids

    # A copy-on-write is expected for every parametrization.
    cows = allocator.clear_copy_on_writes()
    assert cows

    if sequence_len % block_size > 0:
        # The last block of the initial sequence is partially filled, so the
        # copy must involve that block in both tables.
        cow_block_id = sequence_len // block_size
        expected_src = static_block_table.physical_block_ids[cow_block_id]
        expected_dst = appender_block_table.physical_block_ids[cow_block_id]
        assert (expected_src, expected_dst) in cows

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
                                            num_new_tokens: int,
                                            num_lookahead_slots: int,
                                            allocator_type: str):
    """Check ``get_num_blocks_touched_by_append_slots`` against real usage.

    The table is forked so that its blocks are shared, which forces an
    append to copy every block it modifies. The number of blocks consumed by
    the append is then compared with the value predicted by
    ``get_num_blocks_touched_by_append_slots``.
    """
    num_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    new_token_ids = list(range(num_new_tokens))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Reserve the lookahead slots before forking so that both tables hold
    # the same lookahead blocks.
    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)

    # Fork so that every block has refcount > 1. The fork is only kept
    # referenced; it is not used afterwards.
    _forked_table = block_table.fork()

    # Prediction under test.
    predicted_touched = block_table.get_num_blocks_touched_by_append_slots(
        token_ids=new_token_ids, num_lookahead_slots=num_lookahead_slots)

    # Actual consumption: difference in free blocks around the append.
    # append_token_ids is expected to CoW all mutated blocks with refcount>1.
    free_before = allocator.get_num_free_blocks(Device.GPU)
    block_table.append_token_ids(new_token_ids, num_lookahead_slots)
    free_after = allocator.get_num_free_blocks(Device.GPU)
    consumed = free_before - free_after

    # TODO(cade) ensure equality when num_lookahead_slots > 0.
    # With lookahead only an upper bound is asserted: lookahead blocks are
    # not copied eagerly but on first write. This will cause issues for beam
    # search + speculative decoding, which is accepted for now because
    # combining the two is a large effort. A possible fix is to ensure single
    # sequence ownership of lookahead blocks by appending empty slots to each
    # block, which will trigger the CoW.
    if num_lookahead_slots > 0:
        assert consumed <= predicted_touched
    else:
        assert consumed == predicted_touched

View File

@@ -0,0 +1,42 @@
import random
import pytest
from vllm.core.block.common import RefCounter
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
    """Check that ``RefCounter.incr`` returns 1, 2, ... on repeated calls.

    The block id is drawn at random (seeded) from the valid index range.
    """
    random.seed(seed)

    block_indices = list(range(num_blocks))
    counter = RefCounter(all_block_indices=block_indices)

    block_id = random.randint(0, num_blocks - 1)
    for expected_refcount in range(1, num_incrs + 1):
        assert counter.incr(block_id) == expected_refcount
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
    """Check ``RefCounter`` increments, decrements and the extra-decrement error.

    After ``num_incrs`` increments, the same number of decrements must count
    back down to zero; one more decrement must raise ``AssertionError``.
    """
    random.seed(seed)

    block_indices = list(range(num_blocks))
    counter = RefCounter(all_block_indices=block_indices)

    block_id = random.randint(0, num_blocks - 1)

    # Count up: 1, 2, ..., num_incrs.
    for expected_refcount in range(1, num_incrs + 1):
        assert counter.incr(block_id) == expected_refcount

    # Count down: num_incrs - 1, ..., 1, 0.
    for expected_refcount in range(num_incrs - 1, -1, -1):
        assert counter.decr(block_id) == expected_refcount

    # One decrement too many must be rejected.
    with pytest.raises(AssertionError):
        counter.decr(block_id)

View File

@@ -0,0 +1,93 @@
import pytest
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.utils import Device, chunk_list
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
                                block_size: int, allocator_type: str):
    """Check per-device free-block accounting for mutable blocks.

    All CPU blocks and then all GPU blocks are allocated as mutable blocks,
    then freed in the same order. After each step the free count of both
    devices is checked, so that a change on one device never shows up on
    the other.
    """
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        block_size=block_size,
    )

    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    # Exhaust the CPU pool; the GPU pool must stay untouched.
    cpu_blocks = [
        allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
        for _ in range(num_cpu_blocks)
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    # Exhaust the GPU pool as well.
    gpu_blocks = [
        allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
        for _ in range(num_gpu_blocks)
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    # Free with plain loops: the calls are made for their side effect only.
    for block in cpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    for block in gpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
                                  block_size: int, allocator_type: str):
    """Check per-device free-block accounting for immutable blocks.

    Every block receives its own chunk of unique token ids. All CPU blocks
    and then all GPU blocks are allocated as immutable blocks, then freed in
    the same order, with the free count of both devices checked after each
    step.
    """
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        block_size=block_size,
    )

    # One distinct block-sized chunk of token ids per block: the first
    # num_gpu_blocks chunks go to the GPU, the remainder to the CPU.
    unique_token_ids = list(
        range((num_cpu_blocks + num_gpu_blocks) * block_size))
    gpu_token_ids = list(
        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
    cpu_token_ids = list(
        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))

    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    # Exhaust the CPU pool; the GPU pool must stay untouched.
    cpu_blocks = [
        allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=token_ids,
                                           device=Device.CPU)
        for token_ids in cpu_token_ids
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    # Exhaust the GPU pool as well.
    gpu_blocks = [
        allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=token_ids,
                                           device=Device.GPU)
        for token_ids in gpu_token_ids
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    # Free with plain loops: the calls are made for their side effect only.
    for block in cpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    for block in gpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

View File

@@ -0,0 +1,145 @@
from typing import List, Optional
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
class TestNaiveBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str,
allocator: NaiveBlockAllocator,
prev_block: Optional[Block],
token_ids: List[int]):
if allocate_type == "immutable":
allocate_block = lambda: allocator.allocate_immutable_block(
prev_block=prev_block, token_ids=token_ids)
elif allocate_type == "mutable":
allocate_block = lambda: allocator.allocate_mutable_block(
prev_block=prev_block)
else:
raise ValueError()
return allocate_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_ooms(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
[allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(allocate_type: str, num_blocks: int,
block_size: int):
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
num_blocks=num_blocks,
block_size=block_size)
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
allocate_type,
allocator,
prev_block=None,
token_ids=list(range(block_size)))
blocks = [allocate_block() for _ in range(num_blocks)]
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = blocks.pop()
for _ in range(100):
block_id = block_to_free.block_id
allocator.free(block_to_free)
assert block_to_free.block_id is None
new_block = allocate_block()
assert new_block.block_id == block_id
with pytest.raises(BlockAllocator.NoFreeBlocksError):
allocate_block()
block_to_free = new_block
@staticmethod
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
                             block_size: int):
    """Check the reported free-block count before allocating and while
    freeing.

    A fresh allocator must report ``num_blocks`` free blocks. Once every
    block has been allocated, each ``free`` must raise the reported count
    by one, starting from zero.
    """
    naive_allocator = NaiveBlockAllocator(block_size=block_size,
                                          num_blocks=num_blocks,
                                          create_block=NaiveBlock)
    allocate_one = TestNaiveBlockAllocator.create_allocate_lambda(
        allocate_type,
        naive_allocator,
        prev_block=None,
        token_ids=list(range(block_size)))

    assert naive_allocator.get_num_free_blocks() == num_blocks

    allocated = [allocate_one() for _ in range(num_blocks)]

    expected_free = 0
    for block in allocated:
        assert naive_allocator.get_num_free_blocks() == expected_free
        naive_allocator.free(block)
        expected_free += 1
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
    """Verify the allocator can correctly return the number of full blocks
    touched.

    Blocks are allocated from a source allocator and the count is queried
    on a separate destination allocator built with the same parameters.
    """

    def make_allocator():
        return NaiveBlockAllocator(create_block=NaiveBlock,
                                   num_blocks=num_blocks,
                                   block_size=block_size)

    allocator_src = make_allocator()
    allocator_dst = make_allocator()

    # Allocate all but one block from the source, each holding block_size
    # tokens.
    num_full_blocks = num_blocks - 1
    allocate_full_block = TestNaiveBlockAllocator.create_allocate_lambda(
        "immutable",
        allocator_src,
        prev_block=None,
        token_ids=list(range(block_size)))
    src_blocks = [allocate_full_block() for _ in range(num_full_blocks)]

    # Every source block is full, so each one is expected to be counted.
    assert allocator_dst.get_num_full_blocks_touched(
        src_blocks) == num_full_blocks

    # Add one mutable block to the source and give it a single token. A
    # partially filled block is not expected to change the count.
    allocate_partial_block = TestNaiveBlockAllocator.create_allocate_lambda(
        "mutable", allocator_src, prev_block=src_blocks[-1], token_ids=[])
    partial_block = allocate_partial_block()
    src_blocks.append(partial_block)
    partial_block.append_token_ids([0])
    assert allocator_dst.get_num_full_blocks_touched(
        src_blocks) == num_full_blocks

    # Top the last source block up to block_size tokens; it is now expected
    # to be counted by get_num_full_blocks_touched as well.
    partial_block.append_token_ids([0] * (block_size - 1))
    assert allocator_dst.get_num_full_blocks_touched(
        src_blocks) == num_blocks

View File

@@ -0,0 +1,764 @@
import math
import random
from typing import List, Optional
from unittest.mock import MagicMock
import pytest
from vllm.core.block.interfaces import Block, BlockAllocator
from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
PrefixCachingBlockAllocator)
class TestPrefixCachingBlock:
    """Tests for the content hash exposed by ``PrefixCachingBlock``."""

    @staticmethod
    @pytest.mark.parametrize("seed", list(range(10)))
    @pytest.mark.parametrize("block_size", [1, 16])
    @pytest.mark.parametrize("is_curr_block_full", [True, False])
    def test_first_block_has_correct_content_hash(seed: int, block_size: int,
                                                  is_curr_block_full: bool):
        """Check the content hash of a block that has no predecessor.

        A full block must hash like ``hash_block_tokens`` called with
        ``is_first_block=True``; a partially filled block must have no
        hash.
        """
        random.seed(seed)
        if is_curr_block_full:
            token_count = block_size
        else:
            token_count = random.randint(0, block_size - 1)
        token_ids = list(range(token_count))

        first_block = PrefixCachingBlock(
            prev_block=None,
            token_ids=token_ids,
            block_size=block_size,
            allocator=MagicMock(spec=PrefixCachingBlockAllocator),
        )

        if not is_curr_block_full:
            # No hash is expected while the block is not full.
            assert first_block.content_hash is None
            return

        # The block is full, so a hash is expected.
        actual_hash = first_block.content_hash
        expected_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block=True,
            prev_block_hash=None,
            cur_block_token_ids=token_ids)
        assert actual_hash == expected_hash

    @staticmethod
    @pytest.mark.parametrize("seed", list(range(10)))
    @pytest.mark.parametrize("block_size", [1, 16])
    @pytest.mark.parametrize("is_curr_block_full", [True, False])
    @pytest.mark.parametrize("prev_block_has_hash", [True, False])
    def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
                                                is_curr_block_full: bool,
                                                prev_block_has_hash: bool):
        """Check the content hash of a block that follows another block.

        A hash is only expected when the block is full and its predecessor
        (a mock here) reports a hash of its own.
        """
        random.seed(seed)

        predecessor = MagicMock(spec=PrefixCachingBlock)
        predecessor_hash = random.randint(0, 1000)
        if prev_block_has_hash:
            predecessor.content_hash = predecessor_hash
        else:
            predecessor.content_hash = None

        if is_curr_block_full:
            token_count = block_size
        else:
            token_count = random.randint(0, block_size - 1)
        token_ids = list(range(token_count))

        nth_block = PrefixCachingBlock(
            prev_block=predecessor,
            token_ids=token_ids,
            block_size=block_size,
            allocator=MagicMock(spec=PrefixCachingBlockAllocator),
        )

        if not (is_curr_block_full and prev_block_has_hash):
            # Either the block is not full or the predecessor has no hash,
            # so no hash is expected.
            assert nth_block.content_hash is None
            return

        actual_hash = nth_block.content_hash
        expected_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block=False,
            prev_block_hash=predecessor_hash,
            cur_block_token_ids=token_ids)
        assert actual_hash == expected_hash

    @staticmethod
    @pytest.mark.parametrize("block_size", [1, 2, 16])
    @pytest.mark.parametrize("num_tokens", list(range(3)))
    @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
    def test_blocks_have_correct_hash_in_chain(block_size: int,
                                               num_tokens: int,
                                               num_empty_trailing_blocks: int):
        """Build two block chains from identical tokens and expect matching
        content hashes, block by block.
        """
        random.seed(0)
        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]

        chains = [
            TestPrefixCachingBlock.create_chain(
                block_size=block_size,
                token_ids=token_ids,
                num_empty_trailing_blocks=num_empty_trailing_blocks)
            for _ in range(2)
        ]
        first_chain, second_chain = chains

        for left, right in zip(first_chain, second_chain):
            assert left.content_hash == right.content_hash

        # An empty chain is only expected when there were no tokens, and
        # then both chains must be empty.
        if not (first_chain and second_chain):
            assert first_chain == second_chain
            assert num_tokens == 0

    @staticmethod
    def create_chain(block_size: int,
                     token_ids: List[int],
                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
        """Build a linked list of blocks holding ``token_ids``.

        Each block is created empty, linked to the previous one, and then
        given its ``block_size`` slice of ``token_ids`` (if that slice is
        non-empty). ``num_empty_trailing_blocks`` extra blocks are added to
        the count computed from the token length. All blocks share one
        mocked allocator.
        """
        chain: List[PrefixCachingBlock] = []
        total_blocks = math.ceil(
            len(token_ids) / block_size) + num_empty_trailing_blocks
        if total_blocks == 0:
            return []

        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
        tail = None
        for index in range(total_blocks):
            begin = index * block_size
            chunk = token_ids[begin:begin + block_size]

            block = PrefixCachingBlock(
                prev_block=tail,
                token_ids=[],
                block_size=block_size,
                allocator=mock_allocator,
            )
            if chunk:
                block.append_token_ids(chunk)

            chain.append(block)
            tail = block

        return chain
class TestPrefixCachingBlockAllocator:
@staticmethod
def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
                           prev_block: Optional[Block],
                           token_ids: List[int]):
    """Build a zero-argument callable that allocates one block.

    Args:
        allocate_type: Either "immutable" or "mutable".
        allocator: Allocator whose ``allocate_immutable_block`` or
            ``allocate_mutable_block`` method is invoked on each call.
        prev_block: Forwarded as ``prev_block`` on every call.
        token_ids: Forwarded as ``token_ids`` for immutable allocations;
            not used for mutable ones.

    Returns:
        A callable taking no arguments that returns whatever the
        allocator's method returns.

    Raises:
        ValueError: If ``allocate_type`` is neither supported value.
    """
    if allocate_type == "immutable":

        def allocate_block():
            return allocator.allocate_immutable_block(prev_block=prev_block,
                                                      token_ids=token_ids)

    elif allocate_type == "mutable":

        def allocate_block():
            return allocator.allocate_mutable_block(prev_block=prev_block)

    else:
        # Name the offending value so a bad parametrization is easy to spot.
        raise ValueError(
            f"Unknown allocate_type {allocate_type!r}; "
            "expected 'immutable' or 'mutable'")

    return allocate_block
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
    """Allocate ``num_blocks`` mutable blocks and expect the next mutable
    allocation to raise ``BlockAllocator.NoFreeBlocksError``.
    """
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)
    allocate_one = TestPrefixCachingBlockAllocator.create_allocate_lambda(
        allocate_type="mutable",
        allocator=prefix_allocator,
        prev_block=None,
        token_ids=list(range(block_size)),
    )

    # Use up every block; the returned blocks are not needed afterwards.
    for _ in range(num_blocks):
        allocate_one()

    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        allocate_one()
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_does_not_oom_single_hash(
        num_blocks: int, block_size: int):
    """Allocate more immutable blocks with identical content than the
    allocator has blocks, and expect all of them to share one block id.
    """
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)
    allocate_one = TestPrefixCachingBlockAllocator.create_allocate_lambda(
        allocate_type="immutable",
        allocator=prefix_allocator,
        prev_block=None,
        token_ids=list(range(block_size)),
    )

    same_content_blocks = [allocate_one() for _ in range(num_blocks)]

    # One allocation beyond num_blocks must still succeed; with mutable
    # blocks this would run out of blocks.
    extra_block = allocate_one()

    # Every block is expected to map to the same physical block index.
    for candidate in same_content_blocks:
        assert candidate.block_id == extra_block.block_id
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_ooms_many_hash(num_blocks: int,
                                           block_size: int):
    """Fill the allocator with one long chain of distinct blocks.

    Afterwards, an immutable allocation with content not seen at that
    position and a mutable allocation must both fail, while re-creating
    the identical chain must succeed and reuse the same block ids.
    """
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)

    # Enough distinct tokens to occupy every block.
    token_ids = list(range(num_blocks * block_size))

    def build_chain():
        return TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=prefix_allocator,
        )

    first_chain = build_chain()

    # A block whose hash has not been seen needs a new physical block.
    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        prefix_allocator.allocate_immutable_block(
            prev_block=first_chain[-1], token_ids=list(range(block_size)))

    # So does a mutable block.
    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        prefix_allocator.allocate_mutable_block(prev_block=first_chain[-1])

    # The identical chain can be allocated again.
    second_chain = build_chain()

    # Both chains must be non-empty and use the same physical blocks.
    assert first_chain
    assert second_chain
    for original, duplicate in zip(first_chain, second_chain):
        assert original.block_id == duplicate.block_id
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(num_blocks: int, block_size: int):
    """Fill the allocator with an immutable chain, then repeatedly free one
    block and allocate a mutable one in its place.

    Each round expects the new block to reuse the freed block id and the
    allocator to be full again afterwards.
    """
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)

    # Enough distinct tokens to occupy every block.
    token_ids = list(range(num_blocks * block_size))
    chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=prefix_allocator,
    )

    # No block is left for a mutable allocation.
    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        prefix_allocator.allocate_mutable_block(prev_block=None)

    victim = chain[-1]

    # The round number is used as the assertion message to identify the
    # failing iteration.
    for attempt in range(100):
        freed_id = victim.block_id
        prefix_allocator.free(victim)
        assert victim.block_id is None, attempt

        replacement = prefix_allocator.allocate_mutable_block(
            prev_block=None)
        assert replacement.block_id == freed_id, attempt

        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            prefix_allocator.allocate_mutable_block(prev_block=None)

        victim = replacement
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
    """Allocate a chain of random length and check that the free-block
    count grows by one with every block of the chain that is freed.
    """
    random.seed(seed)
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)
    num_blocks_to_consume = random.randint(1, num_blocks - 1)

    # Distinct tokens filling exactly num_blocks_to_consume blocks.
    token_ids = list(range(num_blocks_to_consume * block_size))
    chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=prefix_allocator,
    )

    free_before_any_free = num_blocks - num_blocks_to_consume
    for freed_so_far, block in enumerate(chain):
        # The count must include every block freed in earlier iterations.
        assert prefix_allocator.get_num_free_blocks() == (
            free_before_any_free + freed_so_far)
        prefix_allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_prefix_caching_block_get_num_full_blocks_touched(
        num_blocks, block_size):
    """Verify the allocator can correctly return the number of blocks
    touched, when there are cached prefixes.

    The same token chain is allocated in a destination and in a source
    allocator; the destination is then asked how many of the source's
    blocks it would touch.
    """
    allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
    allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)

    # Tokens for every block except the last one.
    token_ids = list(range((num_blocks - 1) * block_size))

    def build_chain(target_allocator):
        return TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=target_allocator,
        )

    # Destination first, then the identical chain in the source.
    dst_blocks = build_chain(allocator_dst)
    src_blocks = build_chain(allocator_src)

    # The destination already holds every block of the chain.
    assert allocator_dst.get_num_full_blocks_touched(src_blocks) == 0

    # After the destination frees its first block, that one block is
    # expected to be counted as touched.
    allocator_dst.free(dst_blocks[0])
    assert allocator_dst.get_num_full_blocks_touched(src_blocks) == 1

    # A source block holding a single token is not expected to change the
    # count.
    partial_block = allocator_src.allocate_mutable_block(src_blocks[-1])
    partial_block.append_token_ids([0])
    src_blocks.append(partial_block)
    assert allocator_dst.get_num_full_blocks_touched(src_blocks) == 1

    # Once that block holds block_size tokens it is counted too, since the
    # destination has no matching block for it.
    partial_block.append_token_ids([0] * (block_size - 1))
    assert allocator_dst.get_num_full_blocks_touched(src_blocks) == 2
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
                                    seed: int):
    """Verify block sharing between two chains built from the same tokens.

    Freeing the first chain must leave the free count unchanged; freeing
    the second chain must then raise it by one per freed block.
    """
    random.seed(seed)
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)
    num_blocks_to_consume = random.randint(1, num_blocks - 1)

    # Distinct tokens filling exactly num_blocks_to_consume blocks.
    token_ids = list(range(num_blocks_to_consume * block_size))

    def build_chain():
        return TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=prefix_allocator,
        )

    first_chain = build_chain()
    second_chain = build_chain()

    free_while_shared = num_blocks - num_blocks_to_consume

    # Blocks are still referenced by the second chain, so the free count
    # is expected to stay constant.
    for block in first_chain:
        assert prefix_allocator.get_num_free_blocks() == free_while_shared
        prefix_allocator.free(block)

    # No other chain holds these blocks any more, so each free is expected
    # to add one free block.
    for freed_so_far, block in enumerate(second_chain):
        assert prefix_allocator.get_num_free_blocks() == (
            free_while_shared + freed_so_far)
        prefix_allocator.free(block)
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
                                       seed: int):
    """Check ``get_common_computed_block_ids`` on two chains that share a
    prefix.

    The second chain equals the first up to a random token position and
    holds -1 from there on. The number of common block ids must equal the
    number of whole blocks before that position.
    """
    random.seed(seed)
    # Twice num_blocks so that both chains fit without sharing every block.
    prefix_allocator = PrefixCachingBlockAllocator(
        num_blocks=num_blocks * 2, block_size=block_size)
    num_blocks_to_consume = random.randint(1, num_blocks - 1)

    def build_chain(tokens):
        return TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=tokens,
            allocator=prefix_allocator,
        )

    token_ids = list(range(num_blocks_to_consume * block_size))
    first_chain = build_chain(token_ids)

    # From divergence_point onwards the second chain holds -1, so its
    # content differs from the first chain starting there.
    divergence_point = random.randint(1, len(token_ids) - 1)
    shared_full_blocks = divergence_point // block_size
    diverged_token_ids = token_ids[:divergence_point] + [-1] * (
        len(token_ids) - divergence_point)
    second_chain = build_chain(diverged_token_ids)

    first_computed_ids = [
        block.block_id for block in first_chain[:num_blocks_to_consume]
    ]
    second_computed_ids = [
        block.block_id for block in second_chain[:num_blocks_to_consume]
    ]
    common_ids = prefix_allocator.get_common_computed_block_ids(
        [first_computed_ids, second_computed_ids])

    assert len(common_ids) == shared_full_blocks
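# Test case: when a mutable block fills up and is promoted to immutable while
# a block with the same content hash already exists, its physical block is
# expected to be released to the hashless allocator and the existing block's
# refcount to be increased.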
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
    """Fill a mutable block with the same tokens as an existing immutable
    block and check the allocator state after it has been filled.
    """
    random.seed(seed)
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)

    immutable_block = prefix_allocator.allocate_immutable_block(
        prev_block=None, token_ids=list(range(block_size)))
    assert prefix_allocator._refcounter.get(immutable_block.block_id) == 1

    mutable_block = prefix_allocator.allocate_mutable_block(prev_block=None)
    original_mutable_id = mutable_block.block_id

    # Append the same tokens one at a time until the block is full.
    for token in range(block_size):
        mutable_block.append_token_ids([token])

    # The filled block is expected to point at the existing block with the
    # same content ...
    assert mutable_block.block_id == immutable_block.block_id
    # ... its former physical block to be back in the hashless allocator ...
    free_indices = prefix_allocator._hashless_allocator._free_block_indices
    assert original_mutable_id in free_indices
    # ... and the shared block to be referenced twice.
    assert prefix_allocator._refcounter.get(immutable_block.block_id) == 2
# Test case: interleave eviction and allocation and make sure both work as
# expected.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
    """Walk the allocator through allocate / free / evict / re-allocate and
    check its internal bookkeeping after every step.

    The test reads private allocator state (``_hashless_allocator``,
    ``_block_tracker``, ``_cached_blocks``, ``_refcounter``) and the
    evictor's ``free_table``.
    """
    random.seed(seed)
    all_blocks_list = list(range(num_blocks))
    zero_ref = {block_id: 0 for block_id in all_blocks_list}
    one_ref = {block_id: 1 for block_id in all_blocks_list}
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(num_blocks * block_size))

    # Verify initial/pre-alloc state

    # Ensure all blocks are free inside hashless allocator
    assert list(allocator._hashless_allocator._free_block_indices
                ) == all_blocks_list
    # Ensure every block has a tracker entry and none is active
    assert len(allocator._block_tracker.keys()) == num_blocks
    for block_id in range(num_blocks):
        assert not allocator._block_tracker[block_id].active
    # Ensure no cached blocks
    assert len(allocator._cached_blocks.values()) == 0
    # Ensure no evicted blocks
    assert len(allocator.evictor.free_table.keys()) == 0
    # Ensure 0s ref counts for all blocks
    assert allocator._refcounter._refcounts == zero_ref

    # Allocate num_blocks independent immutable blocks (no prev_block),
    # each holding a different block_size slice of token_ids.
    new_block = [
        allocator.allocate_immutable_block(
            prev_block=None,
            token_ids=token_ids[block_size * i:block_size * (i + 1)])
        for i in range(num_blocks)
    ]

    # Verify post-alloc state

    # Ensure no blocks are free inside hashless allocator
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    # Ensure all blocks are tracked and active
    assert len(allocator._block_tracker.keys()) == num_blocks
    for block_id in range(num_blocks):
        assert allocator._block_tracker[block_id].active
    # Ensure all blocks are cached (all promoted)
    assert len(allocator._cached_blocks.values()) == num_blocks
    # Ensure no evicted blocks
    assert len(allocator.evictor.free_table.keys()) == 0
    # Ensure 1s ref counts for all blocks
    assert allocator._refcounter._refcounts == one_ref

    # Free all blocks. They are expected to end up in the evictor while
    # staying in _cached_blocks, with inactive trackers and zero refcounts.
    for block in new_block:
        allocator.free(block)

    # Verify post-free state

    # Ensure every block still has a tracker entry and none is active
    assert len(allocator._block_tracker.keys()) == num_blocks
    for block_id in range(num_blocks):
        assert not allocator._block_tracker[block_id].active
    # Ensure no blocks in hashless allocator (all promoted)
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    # Ensure all blocks are cached
    assert list(allocator._cached_blocks.values()) == all_blocks_list
    # Ensure all blocks are inside the evictor
    assert list(allocator.evictor.free_table.keys()) == all_blocks_list
    # Ensure 0s refcounts
    assert allocator._refcounter._refcounts == zero_ref

    # Allocate a mutable block: block 0 is expected to be evicted and
    # handed out with no content hash and a refcount of 1.
    mutable = allocator.allocate_mutable_block(prev_block=None)

    assert mutable.block_id == 0
    assert mutable.content_hash is None
    assert allocator._block_tracker[0].active
    assert allocator._refcounter.get(0) == 1
    # _cached_blocks holds block ids as its values (see the list(...values())
    # comparison above), so membership has to be tested against the values;
    # testing the mapping itself would only look at its keys.
    assert 0 not in allocator._cached_blocks.values()
    assert 0 not in allocator.evictor

    # Since this mutable block has no hash yet, it shall be released into
    # hashless allocator
    allocator.free(mutable)

    assert not allocator._block_tracker[0].active
    assert allocator._refcounter._refcounts == zero_ref
    assert 0 not in allocator._cached_blocks.values()
    assert 0 not in allocator.evictor
    assert 0 in allocator._hashless_allocator._free_block_indices

    # Allocating an immutable block with the first block_size tokens is
    # expected to take block 0 from the hashless allocator, leaving it
    # empty again.
    block = allocator.allocate_immutable_block(
        prev_block=None, token_ids=token_ids[:block_size])

    assert block.block_id == 0
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert allocator._block_tracker[0].active
    assert 0 in allocator._cached_blocks.values()
    assert allocator._refcounter.get(0) == 1
    assert 0 not in allocator.evictor

    # Allocate a mutable block again; with the hashless allocator empty it
    # is expected to come out of the evictor.
    mutable = allocator.allocate_mutable_block(prev_block=None)
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert mutable.block_id not in allocator.evictor.free_table
    assert allocator._refcounter.get(mutable.block_id) == 1
# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
    """Check which block is reused once every block has been freed.

    Two chains together consume all ``num_blocks`` blocks. The first chain
    (two blocks) is marked as accessed at time 1 and freed first; the
    second chain is marked as accessed at time 2 and freed afterwards. A
    new block with unseen content is then expected to reuse the last block
    of the first chain.

    NOTE(review): the original author attributes this to the first chain
    having the older access time and, between its two blocks, the last one
    covering more tokens -- confirm against the evictor implementation.
    """
    random.seed(seed)
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                   block_size=block_size)

    def build_chain(tokens):
        return TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=tokens,
            allocator=prefix_allocator,
        )

    # One block's worth of tokens more than the allocator can hold; the
    # final block_size tokens are reserved for the last allocation below.
    num_blocks_to_consume = num_blocks + 1
    token_ids = list(range(num_blocks_to_consume * block_size))

    num_blocks_in_first_chain = 2
    num_tokens_in_first_chain = block_size * num_blocks_in_first_chain

    # The first chain takes the first two blocks.
    first_chain = build_chain(token_ids[:num_tokens_in_first_chain])
    assert prefix_allocator.get_num_free_blocks() == (
        num_blocks - num_blocks_in_first_chain)

    # Set the last accessed time of the first chain's blocks to 1.
    first_chain_ids = [block.block_id for block in first_chain]
    prefix_allocator.mark_blocks_as_accessed(first_chain_ids, 1)

    # The second chain takes all remaining blocks.
    second_chain = build_chain(
        token_ids[num_tokens_in_first_chain:-block_size])
    assert prefix_allocator.get_num_free_blocks() == 0

    assert len(first_chain) == num_blocks_in_first_chain
    expected_reused_id = first_chain[-1].block_id

    # Free the first chain.
    for block in first_chain:
        prefix_allocator.free(block)

    # Set the last accessed time of the second chain's blocks to 2, then
    # free them as well.
    second_chain_ids = [block.block_id for block in second_chain]
    prefix_allocator.mark_blocks_as_accessed(second_chain_ids, 2)
    for block in second_chain:
        prefix_allocator.free(block)

    # A block with unseen content must reuse the last block of the first
    # chain.
    final_chain = build_chain(token_ids[-block_size:])
    assert final_chain[0].block_id == expected_reused_id
# Test case for cache metrics
@staticmethod
def test_metric():
    """Check ``get_prefix_cache_hit_rate`` as the same immutable block is
    requested over and over.
    """
    block_size = 16
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=4,
                                                   block_size=block_size)
    token_ids = list(range(block_size))

    def request_same_block():
        prefix_allocator.allocate_immutable_block(prev_block=None,
                                                  token_ids=token_ids)

    # No query yet (0/0).
    assert prefix_allocator.get_prefix_cache_hit_rate() == 0.0

    # First request cannot hit: 0/1.
    request_same_block()
    assert prefix_allocator.get_prefix_cache_hit_rate() == 0.0

    # Second request hits: 1/2.
    request_same_block()
    assert prefix_allocator.get_prefix_cache_hit_rate() == 0.5

    # 1003 further requests for the same content.
    for _ in range(1003):
        request_same_block()
    assert prefix_allocator.get_prefix_cache_hit_rate() > 0.99
# Test case for marking cache hit blocks as computed right after
# a batch of prefill sequences are scheduled.
@staticmethod
def test_touch_block():
    """Allocate the same four-block chain three times and check that its
    blocks are only reported as computed after
    ``mark_blocks_as_computed`` has been called.
    """
    block_size = 16
    common_blocks = 4
    prefix_allocator = PrefixCachingBlockAllocator(num_blocks=8,
                                                   block_size=block_size)
    common_token_ids = list(range(block_size * common_blocks))

    def computed_ids_for(ids):
        return prefix_allocator.get_computed_block_ids(
            [], ids, skip_last_block_id=False)

    # Mimic a batch of 3 prefill sequences that share one prefix: the same
    # chain is allocated three times.
    for _ in range(3):
        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=common_token_ids,
            allocator=prefix_allocator,
        )
        block_ids = [block.block_id for block in chain]
        # Before mark_blocks_as_computed is called, none of the blocks may
        # be reported as computed.
        assert len(computed_ids_for(block_ids)) == 0

    prefix_allocator.mark_blocks_as_computed([])

    # Now every block of the shared chain is expected to be reported.
    assert len(computed_ids_for(block_ids)) == common_blocks
@staticmethod
def create_immutable_chain(
    block_size: int,
    token_ids: List[int],
    allocator: PrefixCachingBlockAllocator,
) -> List[PrefixCachingBlock]:
    """Allocate a chain of immutable blocks covering ``token_ids``.

    The tokens are cut into consecutive slices of ``block_size`` (the last
    slice may be shorter). Each slice is allocated with
    ``allocate_immutable_block``, using the previously allocated block as
    ``prev_block``. Returns the blocks in allocation order, or an empty
    list when there are no tokens.
    """
    chain: List[Block] = []
    total_blocks = math.ceil(len(token_ids) / block_size)

    tail = None
    for index in range(total_blocks):
        begin = index * block_size
        tail = allocator.allocate_immutable_block(
            prev_block=tail, token_ids=token_ids[begin:begin + block_size])
        chain.append(tail)

    return chain