forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/core/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/block/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/block/__init__.py
Normal file
12
vllm-v0.6.2/tests/core/block/conftest.py
Normal file
12
vllm-v0.6.2/tests/core/block/conftest.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
    """Opt the tests in this directory out of the global cleanup fixture.

    Returning ``False`` disables the global cleanup, which gives roughly a
    10x speedup for unit tests that do not load a model onto the GPU.

    Tests in this directory that do use the GPU must therefore clean up
    after themselves.
    """
    return False
|
||||
0
vllm-v0.6.2/tests/core/block/e2e/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/block/e2e/__init__.py
Normal file
67
vllm-v0.6.2/tests/core/block/e2e/conftest.py
Normal file
67
vllm-v0.6.2/tests/core/block/e2e/conftest.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from typing import Callable, Iterable, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
|
||||
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    """Provide the generator that yields the baseline ``LLM``.

    The baseline-specific kwargs are passed as the last (highest priority)
    kwargs dict to ``create_llm_generator``, together with the seed.
    """
    llm_generator = create_llm_generator(
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        seed,
    )
    return llm_generator
|
||||
|
||||
|
||||
@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    """Provide the generator that yields the ``LLM`` under test.

    The test-specific kwargs are passed as the last (highest priority)
    kwargs dict to ``create_llm_generator``, together with the seed.
    """
    llm_generator = create_llm_generator(
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        test_llm_kwargs,
        seed,
    )
    return llm_generator
|
||||
|
||||
|
||||
def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    """Yield one ``LLM`` built from the merged kwargs, then clean up.

    This is a generator function, so the ``LLM`` is only constructed once
    the caller starts iterating.

    Args:
        common_llm_kwargs: Base keyword arguments for ``LLM``.
        per_test_common_llm_kwargs: Per-test keyword arguments; override
            ``common_llm_kwargs`` on key collisions.
        distinct_llm_kwargs: Keyword arguments specific to this LLM
            (baseline or test); override both other dicts on collisions.
        seed: Value passed to ``set_random_seed`` after the LLM is built.

    Yields:
        The constructed ``LLM`` instance, exactly once.
    """
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    llm = LLM(**kwargs)
    try:
        set_random_seed(seed)
        yield llm
    finally:
        # The finally clause runs on normal exhaustion and also when the
        # generator is closed early (the consumer raised or stopped
        # iterating), so the engine is not leaked into later tests.
        del llm
        cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def get_text_from_llm_generator(llm_generator: Iterable[LLM],
                                prompts,
                                sampling_params,
                                llm_cb: Optional[Callable[[LLM],
                                                          None]] = None):
    """Generate with each LLM from ``llm_generator`` and return the texts.

    Args:
        llm_generator: Iterable yielding ``LLM`` instances. It is iterated
            to exhaustion so that a generator can run its post-yield
            cleanup.
        prompts: Prompts forwarded to ``LLM.generate``.
        sampling_params: Sampling parameters forwarded to ``LLM.generate``.
        llm_cb: Optional callback invoked with each LLM before generating.

    Returns:
        The text of the first completion of every request, taken from the
        last LLM yielded by ``llm_generator``.

    Raises:
        RuntimeError: If ``llm_generator`` yields no LLM at all.
    """
    text = None
    for llm in llm_generator:
        if llm_cb:
            llm_cb(llm)
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        text = [output.outputs[0].text for output in outputs]
        # Drop our reference before the generator is resumed.
        del llm

    if text is None:
        # Previously this surfaced as an UnboundLocalError on `text`.
        raise RuntimeError("llm_generator did not yield any LLM")
    return text
|
||||
|
||||
|
||||
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    """Generate with each LLM from ``llm_generator`` and return token ids.

    Args:
        llm_generator: Iterable yielding objects with a ``generate`` method.
            It is iterated to exhaustion so that a generator can run its
            post-yield cleanup.
        prompts: Prompts forwarded to ``generate``.
        sampling_params: Sampling parameters forwarded to ``generate``.

    Returns:
        The token ids of the first completion of every request, taken from
        the last LLM yielded by ``llm_generator``.

    Raises:
        RuntimeError: If ``llm_generator`` yields no LLM at all.
    """
    token_ids = None
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
        # Drop our reference before the generator is resumed.
        del llm

    if token_ids is None:
        # Previously this surfaced as an UnboundLocalError on `token_ids`.
        raise RuntimeError("llm_generator did not yield any LLM")
    return token_ids
|
||||
489
vllm-v0.6.2/tests/core/block/e2e/test_correctness.py
Normal file
489
vllm-v0.6.2/tests/core/block/e2e/test_correctness.py
Normal file
@@ -0,0 +1,489 @@
|
||||
from itertools import cycle
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from .conftest import get_token_ids_from_llm_generator
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, again for speed.
        "enforce_eager": True,

        # Room for only 5 sequences of ~1024 tokens in the worst case
        # (64 blocks of 16 tokens plus one extra block per sequence).
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "preemption_mode": "swap"
}, {
    "preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_with_preemption(baseline_llm_generator,
                                       test_llm_generator, batch_size):
    """Check that greedy outputs are unchanged when preemption occurs.

    Two LLMs are built with a deliberately small GPU block budget, so that
    sequences have to be preempted as the batch grows. The baseline uses the
    default preemption mode; the LLM under test uses "swap" or "recompute".

    Identical token ids give confidence that the KV cache is not corrupted.
    A long generation (1024 tokens) is used so that a wrong KV mapping has
    time to accumulate visible error.

    NOTE(Kuntai): block manager v1 has been removed, but this test is kept
    because it still asserts that block manager v2 (now called
    SelfAttnBlockSpaceManager) behaves the same under swapping / preemption.
    """
    base_prompts = (
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    )
    # Repeat the base prompts round-robin until the batch is full.
    batch = [
        base_prompts[i % len(base_prompts)] for i in range(batch_size)
    ]

    # Greedy decoding that ignores EOS so every sequence reaches full length.
    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=1024)

    expected_ids = get_token_ids_from_llm_generator(baseline_llm_generator,
                                                    batch, greedy_params)
    actual_ids = get_token_ids_from_llm_generator(test_llm_generator, batch,
                                                  greedy_params)

    # Compare per sequence first so a failure points at one sequence.
    for want, got in zip(expected_ids, actual_ids):
        assert want == got

    assert expected_ids == actual_ids
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(block_size): MLU paged attention only supports block_size=16
|
||||
'''
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # 128 tokens are generated and the prompts are short, so little KV
        # space beyond 128 tokens is needed.
        "max_model_len": 160,

        # Eager mode skips CUDA graph creation, again for speed.
        "enforce_eager": True,
    }])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        {
            "block_size": 16,

            # Room for only 2 sequences of ~128 tokens in the worst case.
            # 8 = 128 / block_size.
            "num_gpu_blocks_override": 2 * (8 + 1),
        },
        {
            "block_size": 16,

            # NOTE(review): 16 = 128 / 8, so this budget looks derived for
            # a block size of 8; with block_size=16 it is simply a larger
            # budget (36 blocks) - confirm intent.
            "num_gpu_blocks_override": 2 * (16 + 2),
        }
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "num_lookahead_slots": 0,
}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        {
            # NOTE(review): both per-test configurations use block_size=16,
            # so only the block_size > num_lookahead_slots case is covered.
            "num_lookahead_slots": 10,
            "preemption_mode": "swap",
        },
        {
            "num_lookahead_slots": 10,
            "preemption_mode": "recompute",
        }
    ])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
                                                   test_llm_generator,
                                                   batch_size):
    """Check that lookahead scheduling does not change greedy outputs.

    Lookahead scheduling is not expected to modify the output: it only
    allocates empty slots ahead of the known token ids in a sliding fashion.

    The total number of blocks is constrained to force preemption. The
    baseline runs with 0 lookahead slots, the LLM under test with 10.

    NOTE(review): the original description also says the block size is
    varied around the lookahead size; here both configurations use
    block_size=16 and only the block budget differs.
    """
    base_prompts = (
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    )
    # Repeat the base prompts round-robin until the batch is full.
    batch = [
        base_prompts[i % len(base_prompts)] for i in range(batch_size)
    ]

    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=128)

    print('Getting token ids without lookahead scheduling')
    expected_ids = get_token_ids_from_llm_generator(baseline_llm_generator,
                                                    batch, greedy_params)

    print('Getting token ids with lookahead scheduling')
    actual_ids = get_token_ids_from_llm_generator(test_llm_generator, batch,
                                                  greedy_params)

    # Compare per sequence first so a failure points at one sequence.
    for want, got in zip(expected_ids, actual_ids):
        assert want == got

    assert expected_ids == actual_ids
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(block_size): Only paged block_size 16 is supported, so block_size is changed from 8 to 16
|
||||
'''
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [
        {
            # A small model keeps the test fast.
            "model": "facebook/opt-125m",

            # Eager mode skips CUDA graph creation, again for speed.
            "enforce_eager": True,
            "enable_chunked_prefill": True,
            "gpu_memory_utilization": 0.6,
        },
    ])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
                         [{
                             "block_size": 16,
                             "max_num_batched_tokens": 2,
                             "max_num_seqs": 2,
                         }, {
                             "block_size": 16,
                             "max_num_batched_tokens": 3,
                             "max_num_seqs": 2,
                         }, {
                             "block_size": 16,
                             "max_num_batched_tokens": 256,
                             "max_num_seqs": 10,
                         }])
@pytest.mark.parametrize("baseline_llm_kwargs", [
    {},
])
@pytest.mark.parametrize("test_llm_kwargs", [
    {
        "num_lookahead_slots": 0,
    },
    {
        "num_lookahead_slots": 5,
    },
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_chunked_prefill_block_manager(baseline_llm_generator,
                                       test_llm_generator, batch_size):
    """Check chunked prefill with SelfAttnBlockSpaceManager.

    Both LLMs run with chunked prefill enabled; the LLM under test
    additionally sets ``num_lookahead_slots`` to 0 or 5. Greedy outputs of
    both must be identical.
    """
    base_prompts = (
        "Hello, my name is",
        "The president of the United States is",
        ("1 + " * 50) + " 1 = ",  # Longer prompt.
        "The capital of France is",
        "The future of AI is",
    )
    # Repeat the base prompts round-robin until the batch is full.
    batch = [
        base_prompts[i % len(base_prompts)] for i in range(batch_size)
    ]

    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=32)

    print('Getting token ids with BlockManager')
    expected_ids = get_token_ids_from_llm_generator(baseline_llm_generator,
                                                    batch, greedy_params)

    print('Getting token ids with BlockManager, with lookahead slots.')
    actual_ids = get_token_ids_from_llm_generator(test_llm_generator, batch,
                                                  greedy_params)

    # Compare per sequence first so a failure points at one sequence.
    for want, got in zip(expected_ids, actual_ids):
        assert want == got

    assert expected_ids == actual_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, again for speed.
        "enforce_eager": True,

        # Room for only 5 sequences of ~1024 tokens in the worst case
        # (64 blocks of 16 tokens plus one extra block per sequence).
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),

        # Prefix caching is on for both the baseline and the test LLM.
        "enable_prefix_caching": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "preemption_mode": "swap"
}, {
    "preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_block_manager_prefix_caching_enabled_with_preemption(
        baseline_llm_generator, test_llm_generator, batch_size):
    """Check greedy outputs under preemption with prefix caching enabled.

    Two LLMs are built with a deliberately small GPU block budget, so that
    sequences have to be preempted as the batch grows. Both enable prefix
    caching; the LLM under test also selects the "swap" or "recompute"
    preemption mode.

    Identical token ids give confidence that the KV cache is not corrupted.
    A long generation (1024 tokens) is used so that a wrong KV mapping has
    time to accumulate visible error.

    NOTE(Kuntai): block manager v1 has been removed, but this test is kept
    because it still asserts that block manager v2 (now called
    SelfAttnBlockSpaceManager) behaves the same under swapping / preemption.
    """
    base_prompts = (
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    )
    # Repeat the base prompts round-robin until the batch is full.
    batch = [
        base_prompts[i % len(base_prompts)] for i in range(batch_size)
    ]

    # Greedy decoding that ignores EOS so every sequence reaches full length.
    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=1024)

    print('Getting token ids from block manager')
    expected_ids = get_token_ids_from_llm_generator(baseline_llm_generator,
                                                    batch, greedy_params)

    print('Getting token ids from block manager, with preemption')
    actual_ids = get_token_ids_from_llm_generator(test_llm_generator, batch,
                                                  greedy_params)

    # Compare per sequence first so a failure points at one sequence.
    for want, got in zip(expected_ids, actual_ids):
        assert want == got

    assert expected_ids == actual_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, again for speed.
        "enforce_eager": True,

        # Room for only 5 sequences of ~1024 tokens in the worst case
        # (64 blocks of 16 tokens plus one extra block per sequence).
        "block_size": 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
    "preemption_mode": "swap"
}, {
    "enable_prefix_caching": True,
    "preemption_mode": "recompute"
}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
                                             test_llm_generator, batch_size):
    """Check that auto prefix caching (APC) does not change greedy outputs.

    The baseline LLM runs with APC disabled; the LLM under test enables APC
    and selects the "swap" or "recompute" preemption mode. Both have a
    deliberately small GPU block budget, so that sequences have to be
    preempted as the batch grows.

    Identical token ids give confidence that APC itself does not introduce
    wrong results.
    """
    base_prompts = (
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    )
    # Repeat the base prompts round-robin until the batch is full.
    batch = [
        base_prompts[i % len(base_prompts)] for i in range(batch_size)
    ]

    # Greedy decoding that ignores EOS so every sequence reaches full length.
    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=1024)

    print('Getting token ids with APC disabled')
    expected_ids = get_token_ids_from_llm_generator(baseline_llm_generator,
                                                    batch, greedy_params)

    print('Getting token ids with APC enabled')
    actual_ids = get_token_ids_from_llm_generator(test_llm_generator, batch,
                                                  greedy_params)

    # Compare per sequence first so a failure points at one sequence.
    for want, got in zip(expected_ids, actual_ids):
        assert want == got

    assert expected_ids == actual_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # A small model keeps the test fast.
        "model": "facebook/opt-125m",

        # Eager mode skips CUDA graph creation, again for speed.
        "enforce_eager": True,

        # Keep the block budget tiny so that eviction is reached quickly.
        "max_model_len": 48,
        "block_size": 16,
        "num_gpu_blocks_override": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
                                                 test_llm_generator):
    """Check that auto prefix caching (APC) still works once eviction starts.

    With APC enabled, all blocks are initially held as native blocks and are
    later managed by the evictor. On a cache hit in one of the evictor's
    blocks the block can be reused; otherwise its KV cache has to be
    recomputed. Outputs must match those produced with APC disabled.
    """
    # The two prompts share a long common prefix and then diverge.
    prompts = [
        "You are a helpful assistant. Please answer truthfully and write "
        "out your thinking step by step to be sure you get the right answer. "
        "If you make a mistake, attempt to correct it. who are you?",
        "You are a helpful assistant. Please answer truthfully and write out "
        "your thinking step by step to be sure you get the right answer. You "
        "are helpful and harmless and you follow ethical guidelines. "
        "who are you?"
    ]

    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=10)

    print('Getting token ids with APC disabled')
    expected_ids = get_token_ids_from_llm_generator(baseline_llm_generator,
                                                    prompts, greedy_params)

    print('Getting token ids with APC enabled')
    actual_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                  prompts, greedy_params)

    # Compare per sequence first so a failure points at one sequence.
    for want, got in zip(expected_ids, actual_ids):
        assert want == got

    assert expected_ids == actual_ids
|
||||
@@ -0,0 +1,180 @@
|
||||
import random
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
from .conftest import get_text_from_llm_generator
|
||||
|
||||
# relatively small model with 4k sliding window.
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
Currently tmo.apply_rotary does not support offsets, so bigcode/starcoder2-3b cannot run.
|
||||
Use mistralai/Mistral-7B-v0.1 instead, which also has a 4k sliding window.
|
||||
'''
|
||||
# The original model is: MODEL = "bigcode/starcoder2-3b"
MODEL = "mistralai/Mistral-7B-v0.1"
# Passed as "block_size" to the LLMs built by the tests below and used to
# derive their "num_gpu_blocks_override".
BLOCK_SIZE = 16
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": MODEL,

        # Eager mode skips CUDA graph creation for a faster test.
        "enforce_eager": True,
        "block_size": BLOCK_SIZE,
        # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
                                 batch_size, seed):
    """Check retrieval of a value from a long prompt with a sliding window.

    Each prompt is a series of assignments "x1 = 10\nx2 = 33\n..." followed
    by a question about the value of one of them. The prompt announces up
    front which variable will be asked for, in which case the model answers
    correctly (mostly).

    The baseline and test LLMs are both checked against the expected
    answers, and their generated texts are additionally compared with each
    other; more than 70% of them must match exactly.
    """
    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=1024)

    prompts, answer, indices = prep_prompts(batch_size)

    # The callback verifies the sliding-window setup on the baseline LLM.
    baseline_texts = get_text_from_llm_generator(
        baseline_llm_generator,
        prompts,
        greedy_params,
        llm_cb=check_window(prompts),
    )
    check_answers(indices, answer, baseline_texts)

    print('Getting token ids from block manager v2')
    test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
                                             greedy_params)
    check_answers(indices, answer, test_texts)

    matches = [want == got for want, got in zip(baseline_texts, test_texts)]
    print(matches)
    # make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
    # however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
    # states that xformers and flash_attn have different ideas about the window
    # size anyways
    assert sum(matches) > 0.7 * len(matches)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": MODEL,

        # Eager mode skips CUDA graph creation for a faster test.
        "enforce_eager": True,
        "block_size": BLOCK_SIZE,
        "num_gpu_blocks_override": 100000 // BLOCK_SIZE,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
    """Check sliding-window retrieval with chunked prefill enabled.

    This mirrors test_sliding_window_retrival but uses only the LLM under
    test. No baseline comparison is made: per the original notes, the v1
    block manager does not support chunked prefill with a sliding window,
    and results with and without chunked prefill differ due to numerical
    instabilities.
    """
    greedy_params = SamplingParams(temperature=0.0,
                                   ignore_eos=True,
                                   max_tokens=10)

    prompts, answer, indices = prep_prompts(batch_size)

    # We don't compare with the baseline model here, since the results are
    # slightly different due to different tiling in attention.
    generated = get_text_from_llm_generator(
        test_llm_generator,
        prompts,
        greedy_params,
        llm_cb=check_window(prompts),
    )
    check_answers(indices, answer, generated)
|
||||
|
||||
|
||||
def prep_prompts(batch_size: int):
    """Build prompts made of many assignments plus one question.

    Every prompt announces a variable ``x<idx>`` (30 <= idx <= 90), lists
    assignments ``x30 = ..`` up to a random end between 400 and 500
    (exclusive), and ends with an ``assert x<idx> == `` line that the model
    is expected to complete.

    The global ``random`` generator is re-seeded with 1 on every call, so
    the result depends only on ``batch_size``.

    Returns:
        A tuple ``(prompts, answer, indices)`` where ``answer[i]`` is the
        value assigned to ``x<indices[i]>`` in ``prompts[i]``.
    """
    prompts: List[str] = []
    answer: List[int] = []
    indices: List[int] = []
    random.seed(1)
    for _ in range(batch_size):
        target = random.randint(30, 90)
        indices.append(target)
        pieces = [
            "```python\n# We set a number of variables, "
            f"x{target} will be important later\n"
        ]

        # =============================
        # Modify by vllm_mlu
        # =============================
        # Since a different model is used, the length of the prompt has to
        # be reset to a suitable value as well.
        # The original value is 800~1100
        upper = random.randint(400, 500)
        for var_id in range(30, upper):
            value = random.randint(10, 99)
            if var_id == target:
                answer.append(value)
            pieces.append(f"x{var_id} = {value}\n")
        pieces.append(f"# Now, we check the value of x{target}:\n")
        pieces.append(f"assert x{target} == ")
        prompts.append("".join(pieces))
    return prompts, answer, indices
|
||||
|
||||
|
||||
def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
    """Assert that enough generated outputs start with the expected number.

    The first two characters of every output are parsed as an integer and
    compared with the expected answer at the same position.

    Args:
        indices: Variable index asked about in each prompt (only printed).
        answer: Expected integer answer per prompt.
        outputs: Generated text per prompt.

    Raises:
        AssertionError: If fewer than 40% of the answers are correct.
    """
    answer2 = []
    for text in outputs:
        try:
            answer2.append(int(text[0:2].strip()))
        except ValueError:
            # A non-numeric or empty output is a wrong answer; it must not
            # abort a check that tolerates some wrong answers.
            answer2.append(None)
    print(list(zip(indices, zip(answer, answer2))))
    numok = sum(1 for a1, a2 in zip(answer, answer2) if a1 == a2)
    frac_ok = numok / len(answer)
    print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
    # The original value is 0.7
    assert frac_ok >= 0.4
|
||||
|
||||
|
||||
def check_window(prompts: List[str]):
    """Return a callback that validates the sliding-window test setup.

    The callback asserts that the LLM's model config reports a positive
    sliding window and that at least one prompt tokenizes to more tokens
    than that window.
    """

    def inner(llm: LLM):
        window = llm.llm_engine.model_config.get_sliding_window()
        assert window and window > 0
        # Lazily evaluated so tokenization stops at the first long prompt.
        token_counts = (len(llm.get_tokenizer().tokenize(text))
                        for text in prompts)
        assert any(count > window for count in token_counts)

    return inner
|
||||
491
vllm-v0.6.2/tests/core/block/test_block_manager.py
Normal file
491
vllm-v0.6.2/tests/core/block/test_block_manager.py
Normal file
@@ -0,0 +1,491 @@
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
||||
STR_NOT_IMPL_ENC_DEC_SWA)
|
||||
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.sequence import Logprob, SequenceStatus
|
||||
from vllm.utils import chunk_list
|
||||
|
||||
from ..utils import (create_dummy_prompt, create_seq_group,
|
||||
create_seq_group_encoder_decoder)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
    """Check ``can_allocate`` for decoder-only groups of growing prompt size.

    For every prompt length (in whole blocks) the status returned by the
    block manager is compared with the status expected from the block
    budget and the watermark.
    """
    manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    watermark_blocks = int(watermark * num_gpu_blocks)

    output_blocks_per_seq = 1

    # NOTE: This should be output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't
    # have different output lens.
    output_blocks = output_blocks_per_seq

    for prompt_blocks in range(1, num_gpu_blocks - output_blocks):
        group = create_seq_group(
            seq_prompt_len=block_size * prompt_blocks,
            seq_output_lens=[block_size * output_blocks_per_seq] *
            num_seqs_per_group,
        )

        required_blocks = prompt_blocks + output_blocks
        assert required_blocks <= num_gpu_blocks

        status = manager.can_allocate(group)

        if num_gpu_blocks - required_blocks < watermark_blocks:
            expected_status = AllocStatus.NEVER
        elif num_gpu_blocks >= required_blocks:
            expected_status = AllocStatus.OK
        else:
            expected_status = AllocStatus.LATER
        assert status == expected_status
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
                                                num_seqs_per_group: int,
                                                num_gpu_blocks: int,
                                                watermark: float):
    """Check ``can_allocate`` for encoder/decoder groups of growing size.

    Same scheme as the decoder-only test, except that the required block
    count additionally includes the cross-attention blocks, which equal the
    number of prompt blocks.
    """
    manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    watermark_blocks = int(watermark * num_gpu_blocks)

    output_blocks_per_seq = 1

    # NOTE: This should be output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't
    # have different output lens.
    output_blocks = output_blocks_per_seq

    for prompt_blocks in range(1, num_gpu_blocks - output_blocks):
        cross_blocks_per_seq = prompt_blocks

        group = create_seq_group_encoder_decoder(
            seq_prompt_len=block_size * prompt_blocks,
            seq_output_lens=[block_size * output_blocks_per_seq] *
            num_seqs_per_group,
            # Zero-based position in the loop, as a string.
            request_id=str(prompt_blocks - 1))

        assert prompt_blocks + output_blocks <= num_gpu_blocks

        status = manager.can_allocate(group)

        required_blocks = (prompt_blocks + output_blocks +
                           cross_blocks_per_seq)

        if num_gpu_blocks - required_blocks < watermark_blocks:
            expected_status = AllocStatus.NEVER
        elif num_gpu_blocks >= required_blocks:
            expected_status = AllocStatus.OK
        else:
            expected_status = AllocStatus.LATER
        assert status == expected_status
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
                                                     num_seqs_per_group: int,
                                                     num_gpu_blocks: int,
                                                     watermark: float):
    '''
    SWA short for Sliding Window Attention.

    At time of writing block manager does not support SWA.

    However even when SWA is implemented for block manager,
    there will still most likely be a separate workstream required
    to enable SWA for encoder/decoder models.

    Therefore this test enforces that one of the following cases
    hold true:
    1. Block manager does not support SWA at all (true at time of writing)
    2. Block manager fails with NotImplementedError when SWA is enabled
       AND a SequenceGroup with an encoder sequence (i.e. in support of an
       encoder/decoder model) is passed into can_allocate() as an argument

    The setup for this test is stripped down version of
    test_can_allocate_seq_group_encoder_decoder()
    '''

    num_output_blocks_per_seq = 1
    num_prompt_blocks = 1
    num_output_blocks = num_output_blocks_per_seq

    # Sanity-check the parametrization *outside* pytest.raises(): the context
    # manager below accepts AssertionError, so a failure of this check must
    # not be captured as if it were the expected exception.
    assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

    # Building the sequence group does not need the block manager, so it is
    # also kept out of the pytest.raises() body.
    seq_group = create_seq_group_encoder_decoder(
        seq_prompt_len=block_size * num_prompt_blocks,
        seq_output_lens=[
            block_size * num_output_blocks_per_seq
            for _ in range(num_seqs_per_group)
        ],
        request_id="0")

    # Only the two calls that are expected to raise live inside the context.
    with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
        block_manager = SelfAttnBlockSpaceManager(
            block_size=block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=1024,
            watermark=watermark,
            sliding_window=5  # SWA
        )
        block_manager.can_allocate(seq_group)

    # Assert that either
    # 1. Block manager constructor fails with assertion that sliding window
    #    is not yet supported (most likely near-term outcome at time of
    #    writing), or
    # 2. can_allocate() fails with NotImplementedError due to combination of
    #    encoder/decoder and sliding window attention
    if isinstance(exc_info.value, NotImplementedError):
        assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
    elif isinstance(exc_info.value, AssertionError):
        assert str(exc_info.value) == "Sliding window not yet supported"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [16])
@pytest.mark.parametrize("num_seqs_per_group", [1])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
        watermark: float):
    """Check that can_allocate() raises NotImplementedError for an
    encoder/decoder sequence group when prefix caching is enabled.

    The setup mirrors the sliding-window variant of this test: a one-block
    prompt with one output block per sequence.
    """

    manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
        enable_caching=True  # Prefix cache
    )

    prompt_blocks = 1
    output_blocks_per_seq = 1
    output_blocks = output_blocks_per_seq

    # Every sequence in the group gets the same output length.
    output_lens = [block_size * output_blocks_per_seq] * num_seqs_per_group
    enc_dec_group = create_seq_group_encoder_decoder(
        seq_prompt_len=block_size * prompt_blocks,
        seq_output_lens=output_lens,
        request_id="0")

    # The request must fit into the GPU pool for the test to be meaningful.
    assert prompt_blocks + output_blocks <= num_gpu_blocks

    # Encoder/decoder combined with prefix caching is expected to be
    # rejected by can_allocate().
    with pytest.raises(NotImplementedError) as exc_info:
        manager.can_allocate(enc_dec_group)

    raised = exc_info.value
    assert str(raised) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
def test_append_slots(block_size, prompt_len, num_slots_to_append,
                      num_lookahead_slots):
    """Verify append_slots consumes the correct number of blocks from the block
    table.

    A single sequence is allocated, tokens are appended to it, and the drop in
    free GPU blocks caused by append_slots() is compared with the number of
    extra blocks needed to hold the appended tokens plus the lookahead slots.
    """

    num_gpu_blocks = 1024
    watermark = 0.1
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        watermark=watermark,
    )

    seq_group = create_seq_group(
        seq_prompt_len=prompt_len,
        seq_output_lens=[0],
    )

    # Allocate seq. Compare against AllocStatus.OK explicitly: a bare
    # `assert can_allocate(...)` would pass for any enum member.
    assert block_manager.can_allocate(seq_group) == AllocStatus.OK
    block_manager.allocate(seq_group)

    # Set seq to RUNNING
    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    # Append tokens to the sequence
    for token_id in range(num_slots_to_append):
        seq.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Append slots for new tokens and lookahead slots.
    free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
    block_manager.append_slots(seq, num_lookahead_slots)
    num_consumed_blocks = (free_blocks_before_append -
                           block_manager.get_num_free_gpu_blocks())

    # Expect consumed blocks to be new blocks required to support the new slots:
    # blocks needed for prompt + appended + lookahead, minus blocks the prompt
    # already occupied.
    num_total_slots = prompt_len + num_slots_to_append + num_lookahead_slots
    num_blocks_after = len(
        list(chunk_list(list(range(num_total_slots)), block_size)))
    num_blocks_before = len(
        list(chunk_list(list(range(prompt_len)), block_size)))
    expected_consumed_blocks = num_blocks_after - num_blocks_before
    assert num_consumed_blocks == expected_consumed_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_cpu_blocks", [4])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
              enable_caching):
    """Verify blocks number on src/dest device is correct after swapping in/out
    sequence group (not missing or extra blocks).
    """
    # Pool sizes are passed by keyword so the GPU and CPU counts cannot be
    # silently transposed if the two parametrized values ever differ.
    block_manager = SelfAttnBlockSpaceManager(block_size=block_size,
                                              num_gpu_blocks=num_gpu_blocks,
                                              num_cpu_blocks=num_cpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)
    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)

    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap seq group from GPU -> CPU.
    gpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_out(seq_group)
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_out(seq_group)
    mapping_keys = [key for key, _ in mapping]
    assert mapping_keys == gpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    # Swapping out moves len(gpu_blocks) blocks from the GPU to the CPU pool.
    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
    prompt.status = SequenceStatus.SWAPPED

    # Swap seq group from CPU -> GPU.
    # Compare against AllocStatus.OK explicitly; a bare truthiness check
    # would also accept LATER / NEVER.
    assert block_manager.can_swap_in(seq_group,
                                     num_lookahead_slots) == AllocStatus.OK
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_in(seq_group)
    cpu_blocks = block_manager.get_block_table(prompt)
    mapping_keys = [key for key, _ in mapping]
    assert mapping_keys == [cpu_blocks[0]]
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
@pytest.mark.parametrize("enable_caching", [True, False])
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
                  enable_caching):
    """ Verify the block manager can correctly determine if a sequence group
    can be swapped in/out.
    """
    num_cpu_blocks = num_gpu_blocks
    # Pool sizes are passed by keyword so the GPU and CPU counts cannot be
    # silently transposed if they ever stop being equal.
    block_manager = SelfAttnBlockSpaceManager(block_size=block_size,
                                              num_gpu_blocks=num_gpu_blocks,
                                              num_cpu_blocks=num_cpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)
    prompt, seq_group = create_dummy_prompt(
        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)
    prompt.status = SequenceStatus.RUNNING

    # Swap seq group from GPU -> CPU.
    gpu_blocks = block_manager.get_block_table(prompt)
    assert block_manager.can_swap_out(seq_group)
    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    mapping = block_manager.swap_out(seq_group)
    mapping_keys = [key for key, _ in mapping]
    assert mapping_keys == gpu_blocks
    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
    prompt.status = SequenceStatus.SWAPPED

    # At this moment, we still have enough free blocks to swap in the seq group.
    if num_lookahead_slots <= block_size:
        assert block_manager.can_swap_in(seq_group,
                                         num_lookahead_slots) == AllocStatus.OK
    else:
        assert block_manager.can_swap_in(
            seq_group, num_lookahead_slots) == AllocStatus.NEVER

    # Allocate a second prompt (two blocks' worth of distinct tokens) on the
    # GPU, so there is no longer enough free GPU space to swap the first
    # prompt back in right away.
    prompt2_len = 2 * block_size - 1
    prompt2, seq_group2 = create_dummy_prompt(
        "2",
        prompt_length=prompt2_len,
        prompt_tokens=[10000 + i for i in range(prompt2_len)])
    prompt2.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group2)

    # Swap seq group from CPU -> GPU.
    if num_lookahead_slots <= block_size:
        assert block_manager.can_swap_in(
            seq_group, num_lookahead_slots) == AllocStatus.LATER
    else:
        assert block_manager.can_swap_in(
            seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
    """Verifies that swapping fails if there is not enough free blocks
    to account for unseen tokens and lookahead_slots.
    """
    block_size = 8
    num_cpu_blocks = 1
    num_gpu_blocks = 1
    # Pool sizes are passed by keyword so the GPU and CPU counts cannot be
    # silently transposed if they ever stop being equal.
    block_manager = SelfAttnBlockSpaceManager(block_size=block_size,
                                              num_gpu_blocks=num_gpu_blocks,
                                              num_cpu_blocks=num_cpu_blocks,
                                              watermark=0,
                                              enable_caching=enable_caching)
    prompt_length = block_size - 3
    assert prompt_length > 0
    prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
    prompt.status = SequenceStatus.WAITING
    block_manager.allocate(seq_group)
    # Emulate a forward pass by appending a single token.
    # The block manager then knows how many unprocessed
    # tokens will be written in the next forward pass.
    token_id = 0
    prompt.status = SequenceStatus.RUNNING
    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})

    # Swap seq group from GPU -> CPU.
    assert block_manager.can_swap_out(seq_group)
    block_manager.swap_out(seq_group)
    prompt.status = SequenceStatus.SWAPPED

    # Swap seq group from CPU -> GPU.
    # The number of unseen tokens is 1. If the number of existing
    # tokens plus the unseen ones and number of lookahead slots exceeds
    # the total number of available GPU blocks then the swap
    # should fail.
    num_unseen_tokens = 1
    num_slots_needed = num_lookahead_slots + num_unseen_tokens + prompt_length
    if num_slots_needed <= block_size * num_gpu_blocks:
        assert block_manager.can_swap_in(seq_group,
                                         num_lookahead_slots) == AllocStatus.OK
    else:
        assert block_manager.can_swap_in(
            seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
|
||||
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
                        sliding_window):
    """Verify the number of GPU blocks in use stays bounded while tokens are
    appended to a sequence when a sliding window is configured.

    After the prompt is allocated, tokens are appended one at a time and the
    count of used GPU blocks is checked against a bound derived from the
    sliding window size after every append.
    """

    num_gpu_blocks = 1024
    watermark = 0.1
    block_manager = SelfAttnBlockSpaceManager(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        watermark=watermark,
        sliding_window=sliding_window,
    )

    def check_used(min_n, max_n=None):
        # Assert the number of used GPU blocks lies in [min_n, max_n];
        # with a single argument the count must match exactly.
        if max_n is None:
            max_n = min_n
        used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
        assert min_n <= used
        assert used <= max_n

    def num_blocks(num_tokens):
        # Blocks needed to hold num_tokens (ceiling division).
        return (num_tokens + block_size - 1) // block_size

    check_used(0)

    seq_group = create_seq_group(
        seq_prompt_len=prompt_len,
        seq_output_lens=[0],
    )

    # Creating the sequence group alone must not consume any blocks.
    check_used(0)

    # Allocate seq. Compare against AllocStatus.OK explicitly: a bare
    # `assert can_allocate(...)` would pass for any enum member.
    assert block_manager.can_allocate(seq_group) == AllocStatus.OK
    block_manager.allocate(seq_group)

    check_used(num_blocks(prompt_len))

    # Set seq to RUNNING
    seq = seq_group.get_seqs()[0]
    seq.status = SequenceStatus.RUNNING

    seq.data.update_num_computed_tokens(prompt_len)
    check_used(num_blocks(prompt_len))

    # this is how we compute it in SelfAttnBlockSpaceManager.__init__
    sliding_blocks = (sliding_window // block_size) + 2
    # plus one block for null block
    sliding_blocks += 1

    # Append tokens to the sequence
    for token_id in range(num_slots_to_append):
        seq.append_token_id(token_id, {token_id: Logprob(0.0)})
        seq.data.update_num_computed_tokens(1)
        block_manager.append_slots(seq, num_lookahead_slots=0)
        if prompt_len < sliding_window + 10:
            # Short prompts: only the upper bound is checked.
            check_used(0, sliding_blocks + 1)
        else:
            check_used(sliding_blocks, sliding_blocks + 1)
|
||||
576
vllm-v0.6.2/tests/core/block/test_block_table.py
Normal file
576
vllm-v0.6.2/tests/core/block/test_block_table.py
Normal file
@@ -0,0 +1,576 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.block_table import BlockTable
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.utils import Device, cdiv, chunk_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_naive(block_size: int, sequence_len: int):
    """Allocate several block tables through the naive allocator.

    Builds a CpuGpuBlockAllocator of type "naive" and allocates five
    BlockTables on the GPU from the same token ids. Before each allocation
    the free GPU block count must equal the pool size minus the blocks taken
    by the allocations made so far.
    """
    assert block_size > 1
    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="naive",
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    # Number of blocks one table occupies for this sequence.
    blocks_per_table = len(list(chunk_list(token_ids, block_size)))

    # Keep every table referenced for the duration of the test.
    block_tables: List[BlockTable] = []
    for num_allocated in range(5):
        expected_free = total_gpu_blocks - num_allocated * blocks_per_table
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == expected_free

        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
    """Allocate several block tables through the prefix caching allocator.

    Builds a CpuGpuBlockAllocator of type "prefix_caching" and allocates five
    BlockTables on the GPU from the same token ids, checking the free GPU
    block count after each allocation.

    The expectation encoded here is that full blocks are counted once no
    matter how many tables hold them, while a trailing partially filled block
    is counted once per table.
    """
    assert block_size > 1
    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type="prefix_caching",
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_chunks = list(chunk_list(token_ids, block_size))

    # A table has one mutable block iff its last chunk is not full.
    last_chunk_is_full = len(token_chunks[-1]) == block_size
    mutable_per_table = 0 if last_chunk_is_full else 1
    immutable_per_table = len(token_chunks) - mutable_per_table

    block_tables: List[BlockTable] = []
    for num_tables in range(1, 6):
        table = BlockTable(
            block_size=block_size,
            block_allocator=allocator,
        )
        block_tables.append(table)
        table.allocate(token_ids=token_ids, device=Device.GPU)

        # Immutable blocks are counted once; mutable ones once per table.
        expected_used = immutable_per_table + mutable_per_table * num_tables
        assert allocator.get_num_free_blocks(
            device=Device.GPU) == total_gpu_blocks - expected_used
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
@pytest.mark.parametrize("device", ["cpu", "gpu"])
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
                       device: str):
    """Repeatedly allocate and free one BlockTable on a given device.

    For each allocator type and device, the same token ids are allocated into
    a single BlockTable five times. After each allocation the free block count
    must drop by the number of blocks the sequence needs and every physical
    block id must be set. After each free the count must return to the full
    pool size.
    """
    # Map the parametrized string onto the Device enum member.
    device = Device[device.upper()]

    pool_size = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=pool_size,
        num_cpu_blocks=pool_size,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
    free_while_allocated = pool_size - blocks_per_alloc

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    for _ in range(5):
        block_table.allocate(token_ids=token_ids, device=device)
        assert allocator.get_num_free_blocks(device) == free_while_allocated

        physical_ids = block_table.physical_block_ids
        assert all(block_id is not None for block_id in physical_ids)

        block_table.free()
        assert allocator.get_num_free_blocks(device) == pool_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
                                     append_len: int, allocator_type: str):
    """Check block counts of a BlockTable before and after appending tokens.

    A BlockTable is allocated on the GPU with an initial sequence, then more
    token ids are appended. The number of physical blocks must match the
    chunk count of the initial sequence before the append, and the chunk
    count of the combined sequence afterwards.
    """

    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    initial_ids = list(range(sequence_len))
    extra_ids = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Expected block counts, derived by chunking the token lists.
    blocks_before = len(list(chunk_list(initial_ids, block_size)))
    blocks_after = len(list(chunk_list(initial_ids + extra_ids, block_size)))
    blocks_added = blocks_after - blocks_before

    block_table.allocate(token_ids=initial_ids, device=Device.GPU)
    assert len(block_table.physical_block_ids) == blocks_before

    block_table.append_token_ids(extra_ids)
    assert len(block_table.physical_block_ids) == blocks_before + blocks_added
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
                                           num_empty_slots: int,
                                           allocator_type: str):
    """Check block counts of a BlockTable around ensure_num_empty_slots().

    A BlockTable is allocated on the GPU with an initial sequence, then a
    number of empty slots is reserved. The physical block count must match the
    chunk count of the initial sequence before the reservation, and the chunk
    count of the sequence padded with the empty slots afterwards. Finally,
    filling the reserved slots with tokens must not change the number of free
    GPU blocks.
    """
    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Expected block counts; empty slots are modelled as -1 placeholders.
    padded_ids = token_ids + [-1] * num_empty_slots
    blocks_before = len(list(chunk_list(token_ids, block_size)))
    blocks_added = len(list(chunk_list(padded_ids,
                                       block_size))) - blocks_before

    block_table.allocate(token_ids=token_ids, device=Device.GPU)

    # Reserving the empty slots must add exactly the expected blocks.
    assert len(block_table.physical_block_ids) == blocks_before
    block_table.ensure_num_empty_slots(num_empty_slots)
    assert len(block_table.physical_block_ids) == blocks_before + blocks_added

    # Filling the reserved slots must not consume any further blocks.
    free_before_fill = allocator.get_num_free_blocks(device=Device.GPU)
    block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
    free_after_fill = allocator.get_num_free_blocks(device=Device.GPU)
    assert free_before_fill == free_after_fill
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 9])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("append_size", [1, 4, 129])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
                                          append_len: int, allocator_type: str,
                                          append_size: int):
    """Check that appended token ids end up in the BlockTable in order.

    The tokens to append are fed in chunks of append_size. After every chunk,
    and once more at the end, the table's full token list must equal the
    initial sequence followed by everything appended so far.
    """
    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=1024,
        block_size=block_size,
    )

    initial_ids = list(range(sequence_len))
    ids_to_append = list(range(append_len))

    block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    block_table.allocate(token_ids=initial_ids, device=Device.GPU)

    appended: List[int] = []
    for piece in chunk_list(ids_to_append, append_size):
        block_table.append_token_ids(piece)
        appended.extend(piece)

        # Content must be correct after every partial append.
        expected_so_far = initial_ids + appended
        assert block_table._get_all_token_ids() == expected_so_far

    assert block_table._get_all_token_ids() == initial_ids + ids_to_append
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_len", [1, 9, 129])
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_fork(seq_len: int, block_size: int, allocator_type: str):
    """Fork a BlockTable and check block accounting through both frees.

    1. After forking, the free GPU block count is unchanged.
    2. The fork has the same physical block ids and token ids as the
        original.
    3. Freeing the original leaves the free block count unchanged.
    4. Freeing the fork returns the free block count to the full pool size.
    """
    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(seq_len))

    source_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )
    source_table.allocate(token_ids)

    free_before_fork = allocator.get_num_free_blocks(device=Device.GPU)

    forked_table = source_table.fork()

    # The fork must mirror the original's block ids and token content.
    assert source_table.physical_block_ids == forked_table.physical_block_ids
    source_tokens = source_table._get_all_token_ids()
    assert source_tokens == forked_table._get_all_token_ids()

    # Forking must not allocate anything new.
    free_after_fork = allocator.get_num_free_blocks(device=Device.GPU)
    assert free_after_fork == free_before_fork

    # Freeing the original must not release blocks the fork still holds.
    source_table.free()
    free_after_source_free = allocator.get_num_free_blocks(device=Device.GPU)
    assert free_after_source_free == free_before_fork

    # The fork's block ids must still all be set.
    assert all(block_id is not None
               for block_id in forked_table.physical_block_ids)

    # Freeing the fork releases everything back to the pool.
    forked_table.free()
    assert allocator.get_num_free_blocks(device=Device.GPU) == total_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow(block_size: int, sequence_len: int, append_len: int,
             allocator_type: str, appender: str):
    """Fork a block table, append to one of the two copies, and check the
    resulting block accounting and recorded copy-on-writes.

    The `appender` parameter selects whether the fork or the original
    receives the appended tokens; the other table must stay unchanged.
    """
    total_gpu_blocks = 1024

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    token_ids = list(range(sequence_len))
    token_ids_to_append = list(range(append_len))

    original_block_table = BlockTable(
        block_size=block_size,
        block_allocator=allocator,
    )

    # Blocks held by the initial sequence, and blocks the append is expected
    # to add (counted from the last block that is not completely full).
    num_full_blocks = sequence_len // block_size
    num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
    num_expected_cow_blocks = cdiv(sequence_len + append_len,
                                   block_size) - num_full_blocks

    original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
    original_block_ids = original_block_table.physical_block_ids[:]

    print("original_block_ids = {}".format(original_block_ids))
    forked_block_table = original_block_table.fork()

    # The fork itself must not allocate anything.
    free_after_fork = allocator.get_num_free_blocks(Device.GPU)
    assert free_after_fork == (total_gpu_blocks - num_expected_non_cow_blocks)

    # (table that appends, table that stays untouched) for each test config.
    roles = {
        "forked": (forked_block_table, original_block_table),
        "original": (original_block_table, forked_block_table),
    }
    if appender not in roles:
        raise ValueError(f"unknown test config {appender=}")
    appender_block_table, static_block_table = roles[appender]

    # Write tokens.
    appender_block_table.append_token_ids(token_ids_to_append)

    # Only the appending table's block ids may change.
    assert static_block_table.physical_block_ids == original_block_ids
    assert appender_block_table.physical_block_ids != original_block_ids

    # The append must have consumed exactly the expected extra blocks.
    expected_used = num_expected_non_cow_blocks + num_expected_cow_blocks
    assert allocator.get_num_free_blocks(
        Device.GPU) == total_gpu_blocks - expected_used

    cows = allocator.clear_copy_on_writes()
    if sequence_len % block_size == 0:
        # The last block was full, so nothing needed copying.
        assert not cows
    else:
        # The partially filled last block is expected to be copied from the
        # static table's block to the appender's block.
        assert cows

        cow_block_id = num_full_blocks
        expected_src = static_block_table.physical_block_ids[cow_block_id]
        expected_dst = appender_block_table.physical_block_ids[cow_block_id]

        assert (expected_src, expected_dst) in cows

    static_block_table.free()
    appender_block_table.free()

    # After free, expect all blocks to be freed.
    assert allocator.get_num_free_blocks(Device.GPU) == total_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("append_len", [1, 16, 129])
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
@pytest.mark.parametrize("appender", ["forked", "original"])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
                              append_len: int, lookahead_slots: int,
                              allocator_type: str, appender: str):
    """Check copy-on-write bookkeeping when lookahead slots are reserved.

    Follows the same outline as test_cow, but empty lookahead slots are
    reserved before the block table is forked. The property under test is
    more complex, so the assertions here are deliberately looser.

    Args:
        block_size: Number of token slots per block.
        sequence_len: Number of tokens allocated before the fork.
        append_len: Number of tokens appended after the fork.
        lookahead_slots: Number of empty slots reserved before the fork.
        allocator_type: Allocator flavour passed to CpuGpuBlockAllocator.
        appender: Which table appends after the fork ("forked" or
            "original"); the other one must remain unchanged.
    """
    num_gpu_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    prompt_token_ids = list(range(sequence_len))
    new_token_ids = list(range(append_len))

    original_block_table = BlockTable(block_size=block_size,
                                      block_allocator=allocator)
    original_block_table.allocate(token_ids=prompt_token_ids,
                                  device=Device.GPU)

    # Reserve the lookahead slots, then snapshot the block ids so that
    # post-append state can be compared against the pre-fork layout.
    original_block_table.ensure_num_empty_slots(lookahead_slots)
    ids_before_fork = original_block_table.physical_block_ids[:]

    forked_block_table = original_block_table.fork()

    # Map the test configuration to (appending table, untouched table).
    roles = {
        "forked": (forked_block_table, original_block_table),
        "original": (original_block_table, forked_block_table),
    }
    if appender not in roles:
        raise ValueError(f"unknown test config {appender=}")
    appender_block_table, static_block_table = roles[appender]

    # Write tokens through the appending table only.
    appender_block_table.append_token_ids(new_token_ids)

    # The table that did not append keeps its blocks; the other one must
    # have diverged from the pre-fork layout.
    assert static_block_table.physical_block_ids == ids_before_fork
    assert appender_block_table.physical_block_ids != ids_before_fork

    cows = allocator.clear_copy_on_writes()

    # With lookahead slots reserved, a copy-on-write is always expected.
    assert cows

    if sequence_len % block_size > 0:
        # The last block of the sequence was only partially filled, so the
        # append must have copied exactly that block.
        cow_index = sequence_len // block_size
        src_block_id = static_block_table.physical_block_ids[cow_index]
        dst_block_id = appender_block_table.physical_block_ids[cow_index]
        assert (src_block_id, dst_block_id) in cows

    static_block_table.free()
    appender_block_table.free()

    # Freeing both tables must return every GPU block to the allocator.
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
                                            num_new_tokens: int,
                                            num_lookahead_slots: int,
                                            allocator_type: str):
    """Cross-check get_num_blocks_touched_by_append_slots against reality.

    The block table is forked so that its blocks are shared; copy-on-write
    then has to copy every block that an append modifies. The number of free
    blocks is sampled before and after the append, and the difference is
    compared with what `get_num_blocks_touched_by_append_slots` predicted.
    """
    num_gpu_blocks = 1024
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )

    prompt_token_ids = list(range(sequence_len))
    new_token_ids = list(range(num_new_tokens))

    block_table = BlockTable(block_size=block_size, block_allocator=allocator)
    block_table.allocate(token_ids=prompt_token_ids, device=Device.GPU)

    # Reserve lookahead before forking so that both sequences end up with
    # the same lookahead blocks.
    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)

    # Fork so that every block has refcount > 1; the fork is kept alive in a
    # local for the rest of the test.
    forked_block_table = block_table.fork()

    # Prediction: how many blocks should the append touch?
    expected_num_touched_blocks = (
        block_table.get_num_blocks_touched_by_append_slots(
            token_ids=new_token_ids,
            num_lookahead_slots=num_lookahead_slots))

    # Measurement: append_token_ids is expected to CoW every mutated block
    # with refcount > 1, so the drop in free blocks is the number of blocks
    # actually touched.
    free_before = allocator.get_num_free_blocks(Device.GPU)
    block_table.append_token_ids(new_token_ids, num_lookahead_slots)
    free_after = allocator.get_num_free_blocks(Device.GPU)
    num_consumed_blocks = free_before - free_after

    # TODO(cade) ensure equality when num_lookahead_slots > 0.
    # Lookahead blocks are not copied eagerly; they are copied on first
    # write, which is why only an upper bound can be asserted below. This
    # will cause issues for beam search + speculative decoding, which is
    # acceptable for now as combining the two is a large effort. A possible
    # fix is to give each sequence sole ownership of its lookahead blocks by
    # appending empty slots to each block, which would trigger the CoW.
    #
    # Until then, accept consumed <= expected when appending with lookahead.
    if num_lookahead_slots > 0:
        assert num_consumed_blocks <= expected_num_touched_blocks
    else:
        assert num_consumed_blocks == expected_num_touched_blocks
|
||||
42
vllm-v0.6.2/tests/core/block/test_common.py
Normal file
42
vllm-v0.6.2/tests/core/block/test_common.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import random
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.common import RefCounter
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
    """Incrementing one block's refcount returns 1, 2, ..., num_incrs."""
    random.seed(seed)

    counter = RefCounter(all_block_indices=list(range(num_blocks)))

    # Pick one block at random (seeded, so reproducible) and bump it.
    block_id = random.randint(0, num_blocks - 1)
    for expected_count in range(1, num_incrs + 1):
        assert counter.incr(block_id) == expected_count
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
    """Refcount goes up to num_incrs, back down to 0, then refuses to drop.

    After the count has returned to zero, one more decr is expected to
    raise AssertionError.
    """
    random.seed(seed)

    counter = RefCounter(all_block_indices=list(range(num_blocks)))
    block_id = random.randint(0, num_blocks - 1)

    # Count up: each incr returns the new refcount.
    for expected_count in range(1, num_incrs + 1):
        assert counter.incr(block_id) == expected_count

    # Count down: each decr returns the remaining refcount.
    for expected_count in range(num_incrs - 1, -1, -1):
        assert counter.decr(block_id) == expected_count

    # The refcount is zero now; decrementing further must fail.
    with pytest.raises(AssertionError):
        counter.decr(block_id)
|
||||
93
vllm-v0.6.2/tests/core/block/test_cpu_gpu_block_allocator.py
Normal file
93
vllm-v0.6.2/tests/core/block/test_cpu_gpu_block_allocator.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.utils import Device, chunk_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
                                block_size: int, allocator_type: str):
    """Exhaust and release mutable blocks per device, checking free counts.

    All CPU blocks are allocated first, then all GPU blocks, and they are
    freed in the same order. After each step the free-block count of both
    devices is checked, so an allocation or free on one device must not
    affect the other device's count.
    """
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        block_size=block_size,
    )

    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    cpu_blocks = [
        allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
        for _ in range(num_cpu_blocks)
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    gpu_blocks = [
        allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
        for _ in range(num_gpu_blocks)
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    # free() is called purely for its side effect, so use a plain loop
    # instead of building a throwaway list.
    for block in cpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    for block in gpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
@pytest.mark.parametrize("num_gpu_blocks", [1024])
@pytest.mark.parametrize("block_size", [2])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
                                  block_size: int, allocator_type: str):
    """Exhaust and release immutable blocks per device, checking free counts.

    Each block receives its own chunk of a single range of unique token
    ids, so no two blocks in the test have the same content. CPU blocks are
    allocated first, then GPU blocks, and they are freed in the same order;
    after each step the free-block count of both devices is checked.
    """
    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        block_size=block_size,
    )

    # One block_size-long chunk of distinct token ids per block: the first
    # num_gpu_blocks chunks go to the GPU, the remainder to the CPU.
    unique_token_ids = list(
        range((num_cpu_blocks + num_gpu_blocks) * block_size))
    gpu_token_ids = list(
        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
    cpu_token_ids = list(
        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))

    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    cpu_blocks = [
        allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=token_ids,
                                           device=Device.CPU)
        for token_ids in cpu_token_ids
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks

    gpu_blocks = [
        allocator.allocate_immutable_block(prev_block=None,
                                           token_ids=token_ids,
                                           device=Device.GPU)
        for token_ids in gpu_token_ids
    ]
    assert allocator.get_num_free_blocks(Device.CPU) == 0
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    # free() is called purely for its side effect, so use a plain loop
    # instead of building a throwaway list.
    for block in cpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == 0

    for block in gpu_blocks:
        allocator.free(block)
    assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
    assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
145
vllm-v0.6.2/tests/core/block/test_naive_block.py
Normal file
145
vllm-v0.6.2/tests/core/block/test_naive_block.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
|
||||
|
||||
|
||||
class TestNaiveBlockAllocator:
    """Tests for NaiveBlockAllocator: exhaustion, reuse and accounting."""

    @staticmethod
    def create_allocate_lambda(allocate_type: str,
                               allocator: NaiveBlockAllocator,
                               prev_block: Optional[Block],
                               token_ids: List[int]):
        """Build a zero-argument callable that allocates one block.

        Args:
            allocate_type: "immutable" to allocate via
                allocate_immutable_block, "mutable" to allocate via
                allocate_mutable_block.
            allocator: Allocator the returned callable allocates from.
            prev_block: Forwarded as ``prev_block`` on every allocation.
            token_ids: Forwarded as ``token_ids`` for immutable allocations;
                ignored for mutable ones.

        Returns:
            A callable taking no arguments that performs one allocation per
            call and returns the allocated block.

        Raises:
            ValueError: If ``allocate_type`` is not one of the two supported
                values.
        """
        if allocate_type == "immutable":
            return lambda: allocator.allocate_immutable_block(
                prev_block=prev_block, token_ids=token_ids)
        if allocate_type == "mutable":
            return lambda: allocator.allocate_mutable_block(
                prev_block=prev_block)
        raise ValueError(f"unknown allocate_type {allocate_type!r}")

    @staticmethod
    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_allocate_ooms(allocate_type: str, num_blocks: int,
                           block_size: int):
        """Allocating one block more than the capacity must raise."""
        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            allocate_type,
            allocator,
            prev_block=None,
            token_ids=list(range(block_size)))

        # Consume the whole pool; the returned blocks are not needed.
        for _ in range(num_blocks):
            allocate_block()

        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocate_block()

    @staticmethod
    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
    @pytest.mark.parametrize("num_blocks", [1, 1024])
    @pytest.mark.parametrize("block_size", [1, 16])
    def test_free_prevents_oom(allocate_type: str, num_blocks: int,
                               block_size: int):
        """Freeing a block makes exactly that block id allocatable again."""
        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            allocate_type,
            allocator,
            prev_block=None,
            token_ids=list(range(block_size)))

        blocks = [allocate_block() for _ in range(num_blocks)]

        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocate_block()

        block_to_free = blocks.pop()

        for _ in range(100):
            block_id = block_to_free.block_id
            allocator.free(block_to_free)
            # free() is expected to clear the id on the block object.
            assert block_to_free.block_id is None

            # Only one block is free, so the new block must reuse its id.
            new_block = allocate_block()
            assert new_block.block_id == block_id

            # The pool is full again.
            with pytest.raises(BlockAllocator.NoFreeBlocksError):
                allocate_block()

            block_to_free = new_block

    @staticmethod
    @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
    @pytest.mark.parametrize("num_blocks", [1024])
    @pytest.mark.parametrize("block_size", [16])
    def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
                                 block_size: int):
        """The free count starts at capacity and grows by one per free."""
        allocator = NaiveBlockAllocator(create_block=NaiveBlock,
                                        num_blocks=num_blocks,
                                        block_size=block_size)
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            allocate_type,
            allocator,
            prev_block=None,
            token_ids=list(range(block_size)))

        assert allocator.get_num_free_blocks() == num_blocks

        blocks = [allocate_block() for _ in range(num_blocks)]

        # With everything allocated, i blocks are free after i frees.
        for i, block in enumerate(blocks):
            assert allocator.get_num_free_blocks() == i
            allocator.free(block)

    @staticmethod
    @pytest.mark.parametrize("num_blocks", [4])
    @pytest.mark.parametrize("block_size", [8])
    def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
        """ Verify the allocator can correctly return the number of
        full blocks touched.
        """
        allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
                                            num_blocks=num_blocks,
                                            block_size=block_size)
        allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
                                            num_blocks=num_blocks,
                                            block_size=block_size)

        # Create num_blocks - 1 full immutable blocks in the src allocator.
        allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
            "immutable",
            allocator_src,
            prev_block=None,
            token_ids=list(range(block_size)))
        src_blocks = [allocate_block() for _ in range(num_blocks - 1)]

        # Every src block is full, so every one of them is counted.
        assert allocator_dst.get_num_full_blocks_touched(
            src_blocks) == num_blocks - 1

        # Add one mutable block to the src chain and fill a single slot.
        allocate_non_full_block = \
            TestNaiveBlockAllocator.create_allocate_lambda(
                "mutable", allocator_src,
                prev_block=src_blocks[-1], token_ids=[]
            )
        src_blocks.append(allocate_non_full_block())
        src_blocks[-1].append_token_ids([0])

        # The partially filled block must not be counted.
        assert allocator_dst.get_num_full_blocks_touched(
            src_blocks) == num_blocks - 1

        # Fill up the last source block; now it is counted as well.
        src_blocks[-1].append_token_ids([0] * (block_size - 1))
        assert allocator_dst.get_num_full_blocks_touched(
            src_blocks) == num_blocks
|
||||
764
vllm-v0.6.2/tests/core/block/test_prefix_caching_block.py
Normal file
764
vllm-v0.6.2/tests/core/block/test_prefix_caching_block.py
Normal file
@@ -0,0 +1,764 @@
|
||||
import math
|
||||
import random
|
||||
from typing import List, Optional
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
|
||||
PrefixCachingBlockAllocator)
|
||||
|
||||
|
||||
class TestPrefixCachingBlock:
    """Tests for the content hash maintained by PrefixCachingBlock."""

    @staticmethod
    @pytest.mark.parametrize("seed", list(range(10)))
    @pytest.mark.parametrize("block_size", [1, 16])
    @pytest.mark.parametrize("is_curr_block_full", [True, False])
    def test_first_block_has_correct_content_hash(seed: int, block_size: int,
                                                  is_curr_block_full: bool):
        """Verify a block which is first in the sequence has the correct hash.
        """
        random.seed(seed)
        # A non-full block gets a random fill level strictly below
        # block_size.
        num_to_fill = block_size if is_curr_block_full else random.randint(
            0, block_size - 1)
        token_ids = list(range(num_to_fill))
        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)

        first_block = PrefixCachingBlock(prev_block=None,
                                         token_ids=token_ids,
                                         block_size=block_size,
                                         allocator=mock_allocator)

        if is_curr_block_full:
            # Expect hash since block is full.
            assert first_block.content_hash == (
                PrefixCachingBlock.hash_block_tokens(
                    is_first_block=True,
                    prev_block_hash=None,
                    cur_block_token_ids=token_ids))
        else:
            # Do not expect hash since block is not full.
            assert first_block.content_hash is None

    @staticmethod
    @pytest.mark.parametrize("seed", list(range(10)))
    @pytest.mark.parametrize("block_size", [1, 16])
    @pytest.mark.parametrize("is_curr_block_full", [True, False])
    @pytest.mark.parametrize("prev_block_has_hash", [True, False])
    def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
                                                is_curr_block_full: bool,
                                                prev_block_has_hash: bool):
        """Verify a block which is not first in the sequence has the correct
        hash.
        """
        random.seed(seed)

        # The predecessor is a mock whose content_hash is either a random
        # integer or None, depending on the test configuration.
        previous_block = MagicMock(spec=PrefixCachingBlock)
        prev_block_hash = random.randint(0, 1000)
        previous_block.content_hash = (prev_block_hash
                                       if prev_block_has_hash else None)

        num_to_fill = block_size if is_curr_block_full else random.randint(
            0, block_size - 1)
        token_ids = list(range(num_to_fill))
        mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)

        block_with_prev = PrefixCachingBlock(
            prev_block=previous_block,
            token_ids=token_ids,
            block_size=block_size,
            allocator=mock_allocator,
        )

        if is_curr_block_full and prev_block_has_hash:
            # Expect hash since block is full and previous block has hash.
            assert (block_with_prev.content_hash ==
                    PrefixCachingBlock.hash_block_tokens(
                        is_first_block=False,
                        prev_block_hash=prev_block_hash,
                        cur_block_token_ids=token_ids))
        else:
            # Do not expect hash since block is not full or the previous block
            # does not have a hash.
            assert block_with_prev.content_hash is None

    @staticmethod
    @pytest.mark.parametrize("block_size", [1, 2, 16])
    @pytest.mark.parametrize("num_tokens", list(range(3)))
    @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
    def test_blocks_have_correct_hash_in_chain(block_size: int,
                                               num_tokens: int,
                                               num_empty_trailing_blocks: int):
        """Create two chains of logical blocks with the same contents.
        Assert the hashes are equal.
        """
        random.seed(0)

        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]

        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
            block_size=block_size,
            token_ids=token_ids,
            num_empty_trailing_blocks=num_empty_trailing_blocks)
                                     for _ in range(2))

        # zip() stops at the shorter input, so check the lengths explicitly;
        # otherwise a missing block in one chain would go unnoticed.
        assert len(first_chain) == len(second_chain)

        for first_chain_block, second_chain_block in zip(
                first_chain, second_chain):
            assert (first_chain_block.content_hash ==
                    second_chain_block.content_hash)

        # An empty chain is only legitimate when there was nothing to store.
        if not first_chain or not second_chain:
            assert first_chain == second_chain
            assert num_tokens == 0

    @staticmethod
    def create_chain(block_size: int,
                     token_ids: List[int],
                     num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
        """Helper method which creates a chain of blocks.

        Args:
            block_size: Number of token slots per block.
            token_ids: Tokens to spread over the chain, block_size per block
                in order.
            num_empty_trailing_blocks: Number of additional blocks appended
                to the chain after the ones needed for ``token_ids``; no
                tokens are written to them.

        Returns:
            The blocks in chain order; each block is constructed with the
            preceding block as its ``prev_block``. Empty when no block is
            required at all.
        """
        blocks: List[PrefixCachingBlock] = []
        num_blocks = math.ceil(
            len(token_ids) / block_size) + num_empty_trailing_blocks

        if num_blocks == 0:
            return []

        allocator = MagicMock(spec=PrefixCachingBlockAllocator)

        prev_block = None
        for block_number in range(num_blocks):
            # Each block starts empty and is filled via append_token_ids.
            prev_block = PrefixCachingBlock(
                prev_block=prev_block,
                token_ids=[],
                block_size=block_size,
                allocator=allocator,
            )

            start = block_number * block_size
            tokens_to_append = token_ids[start:start + block_size]
            if tokens_to_append:
                prev_block.append_token_ids(tokens_to_append)

            blocks.append(prev_block)

        return blocks
|
||||
|
||||
|
||||
class TestPrefixCachingBlockAllocator:
|
||||
|
||||
@staticmethod
|
||||
def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
|
||||
prev_block: Optional[Block],
|
||||
token_ids: List[int]):
|
||||
if allocate_type == "immutable":
|
||||
allocate_block = lambda: allocator.allocate_immutable_block(
|
||||
prev_block=prev_block, token_ids=token_ids)
|
||||
elif allocate_type == "mutable":
|
||||
allocate_block = lambda: allocator.allocate_mutable_block(
|
||||
prev_block=prev_block)
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
return allocate_block
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
    """Allocating one mutable block beyond the capacity must raise."""
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
        allocate_type="mutable",
        allocator=allocator,
        prev_block=None,
        token_ids=list(range(block_size)),
    )

    # Consume the whole pool; the returned blocks are not needed, so a
    # plain loop is used rather than a discarded list comprehension.
    for _ in range(num_blocks):
        allocate_block()

    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        allocate_block()
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_does_not_oom_single_hash(
        num_blocks: int, block_size: int):
    """Identical immutable blocks share one physical block and never OOM."""
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    same_content = list(range(block_size))
    allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
        allocate_type="immutable",
        allocator=allocator,
        prev_block=None,
        token_ids=same_content,
    )

    # Request as many blocks as the pool holds, all with the same content.
    blocks = [allocate_block() for _ in range(num_blocks)]

    # One more request must still succeed; with mutable blocks this
    # allocation would run out of memory.
    non_oom_block = allocate_block()

    # Every allocation is expected to map to the same physical block index.
    for earlier_block in blocks:
        assert (earlier_block.block_id == non_oom_block.block_id)
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_allocate_immutable_ooms_many_hash(num_blocks: int,
                                           block_size: int):
    """Fill the pool with blocks of distinct content, then expect OOM.

    A single long sequence is allocated so that every block has different
    content. Any further allocation of new content must fail, whereas
    re-allocating the very same sequence must succeed and map onto the
    same physical blocks.
    """
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)

    # Exactly enough distinct tokens to occupy every block.
    token_ids = list(range(num_blocks * block_size))

    chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )
    chain_tail = chain[-1]

    # An immutable block with a hash not seen before cannot be placed.
    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        allocator.allocate_immutable_block(prev_block=chain_tail,
                                           token_ids=list(
                                               range(block_size)))

    # Neither can a mutable block.
    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        allocator.allocate_mutable_block(prev_block=chain_tail)

    # Allocating the exact same chain again is expected to pass.
    second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )

    # Both chains must be non-empty and use identical physical blocks.
    assert chain and second_chain
    for first_chain_block, second_chain_block in zip(chain, second_chain):
        assert (first_chain_block.block_id == second_chain_block.block_id)
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1, 1024])
@pytest.mark.parametrize("block_size", [1, 16])
def test_free_prevents_oom(num_blocks: int, block_size: int):
    """Freeing one block of a full pool allows exactly one new allocation."""
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)

    # Exactly enough distinct tokens to occupy every block.
    token_ids = list(range(num_blocks * block_size))

    chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )

    # The pool is exhausted, so a mutable allocation must fail.
    with pytest.raises(BlockAllocator.NoFreeBlocksError):
        allocator.allocate_mutable_block(prev_block=None)

    victim = chain[-1]

    # Repeat the free/allocate cycle many times; it must keep working.
    for i in range(100):
        freed_block_id = victim.block_id
        allocator.free(victim)
        # free() is expected to clear the id on the block object.
        assert victim.block_id is None, i

        # Only one block is free, so the replacement must reuse its id.
        replacement = allocator.allocate_mutable_block(prev_block=None)
        assert replacement.block_id == freed_block_id, i

        # And the pool is full once more.
        with pytest.raises(BlockAllocator.NoFreeBlocksError):
            allocator.allocate_mutable_block(prev_block=None)

        victim = replacement
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
    """The free-block count grows by one for every block that is freed."""
    random.seed(seed)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    num_blocks_to_consume = random.randint(1, num_blocks - 1)

    # Enough distinct tokens to occupy num_blocks_to_consume blocks.
    token_ids = list(range(num_blocks_to_consume * block_size))

    chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )

    # Free the chain block by block; before each free, the count must
    # already include all blocks freed so far.
    expected_free = num_blocks - num_blocks_to_consume
    for block in chain:
        assert allocator.get_num_free_blocks() == expected_free
        allocator.free(block)
        expected_free += 1
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [4])
@pytest.mark.parametrize("block_size", [8])
def test_prefix_caching_block_get_num_full_blocks_touched(
        num_blocks, block_size):
    """Check the touched-block count when the destination holds cached
    prefixes.
    """
    allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)
    allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                                block_size=block_size)

    # Enough tokens to fill every block except the last one.
    token_ids = list(range((num_blocks - 1) * block_size))

    # Populate the dst allocator with a chain of cacheable blocks ...
    cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator_dst,
    )

    # ... and build the same chain of blocks in the src allocator.
    blocks_to_swap_in = \
        TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=token_ids,
            allocator=allocator_src,
        )

    # Every block is already cached in the dst: nothing is touched.
    assert allocator_dst.get_num_full_blocks_touched(
        blocks_to_swap_in) == 0

    # Release the first block in the dst.
    allocator_dst.free(cached_blocks[0])

    # The first block is now dangling, so swapping in has to reclaim it.
    assert allocator_dst.get_num_full_blocks_touched(
        blocks_to_swap_in) == 1

    # Append one partially filled block to the src chain; it is not full,
    # so the count stays the same.
    partial_block = allocator_src.allocate_mutable_block(
        blocks_to_swap_in[-1])
    partial_block.append_token_ids([0])
    blocks_to_swap_in.append(partial_block)
    assert allocator_dst.get_num_full_blocks_touched(
        blocks_to_swap_in) == 1

    # Fill up that last mutable block. It is not cached in the dst, so it
    # is now counted as touched as well.
    partial_block.append_token_ids([0] * (block_size - 1))
    assert allocator_dst.get_num_full_blocks_touched(
        blocks_to_swap_in) == 2
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
                                    seed: int):
    """Two identical sequences share blocks; verify via the free count.

    The same token sequence is allocated twice. Freeing the first chain
    must leave the free count unchanged, and freeing the second chain must
    then raise it by one per block.
    """
    random.seed(seed)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    num_blocks_to_consume = random.randint(1, num_blocks - 1)

    # Enough distinct tokens to occupy num_blocks_to_consume blocks.
    token_ids = list(range(num_blocks_to_consume * block_size))

    first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )
    second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )

    free_while_shared = num_blocks - num_blocks_to_consume

    # Free the first chain. All of its blocks are shared with the second
    # chain, so the free count should stay constant.
    for block in first_chain:
        assert allocator.get_num_free_blocks() == free_while_shared
        allocator.free(block)

    # Free the second chain. These were the last references, so the free
    # count should increment with each free.
    for num_freed, block in enumerate(second_chain):
        assert allocator.get_num_free_blocks() == (free_while_shared +
                                                   num_freed)
        allocator.free(block)
|
||||
|
||||
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
                                       seed: int):
    """Check get_common_computed_block_ids on two chains sharing a prefix.

    Two immutable chains are created whose token ids agree up to a randomly
    chosen position. The number of common block ids reported by the allocator
    must equal the number of whole blocks that precede that position.
    """
    random.seed(seed)
    alloc = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
                                        block_size=block_size)
    chain_len = random.randint(1, num_blocks - 1)
    build_chain = TestPrefixCachingBlockAllocator.create_immutable_chain

    # Token ids covering chain_len full blocks.
    tokens = list(range(chain_len * block_size))

    chain_a = build_chain(
        block_size=block_size,
        token_ids=tokens,
        allocator=alloc,
    )

    # From `split` onwards the token ids are overwritten with -1, so the
    # second chain diverges from the first one starting at that position.
    split = random.randint(1, len(tokens) - 1)
    expected_common_blocks = split // block_size
    tokens[split:] = [-1] * (len(tokens) - split)

    chain_b = build_chain(
        block_size=block_size,
        token_ids=tokens,
        allocator=alloc,
    )

    ids_a = [blk.block_id for blk in chain_a]
    ids_b = [blk.block_id for blk in chain_b]
    common = alloc.get_common_computed_block_ids([ids_a, ids_b])

    assert len(common) == expected_common_blocks
|
||||
|
||||
# Test case for promotion: a mutable block that fills up with the same content
# as an existing immutable block is expected to be released to the hashless
# allocator, while the existing immutable block gets its refcount increased.
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
    """Fill a mutable block with the tokens of an already cached block and
    check the block id, the hashless free list and the refcount afterwards.
    """
    random.seed(seed)

    alloc = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                        block_size=block_size)
    full_block_tokens = list(range(block_size))

    cached = alloc.allocate_immutable_block(prev_block=None,
                                            token_ids=full_block_tokens)
    assert alloc._refcounter.get(cached.block_id) == 1

    mutable = alloc.allocate_mutable_block(prev_block=None)
    # Remember the id the mutable block had before it was filled.
    original_id = mutable.block_id
    for token_id in full_block_tokens:
        mutable.append_token_ids([token_id])

    # Once full, the mutable block carries the id of the cached block, its
    # original id is back in the hashless free list, and the cached block
    # has one more reference.
    assert mutable.block_id == cached.block_id
    assert original_id in alloc._hashless_allocator._free_block_indices
    assert alloc._refcounter.get(cached.block_id) == 2
|
||||
|
||||
# Test case when eviction and allocation are mixed,
# make sure they work as expected
@staticmethod
@pytest.mark.parametrize("num_blocks", [3])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(10)))
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
    """Walk the allocator through alloc -> free -> re-alloc and check its
    internal bookkeeping (hashless free list, block tracker, cached blocks,
    evictor and refcounts) after every step.
    """
    random.seed(seed)

    all_blocks_list = list(range(num_blocks))
    zero_ref = dict.fromkeys(range(num_blocks), 0)
    one_ref = dict.fromkeys(range(num_blocks), 1)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(num_blocks * block_size))

    # Verify initial/pre-alloc state

    # Ensure all blocks are free inside hashless allocator
    assert list(allocator._hashless_allocator._free_block_indices
                ) == all_blocks_list
    # Ensure the tracker has an entry per block and none of them is active
    assert len(allocator._block_tracker.keys()) == num_blocks
    for block_id in range(num_blocks):
        assert not allocator._block_tracker[block_id].active
    # Ensure no cached blocks
    assert len(allocator._cached_blocks.values()) == 0
    # Ensure no evicted blocks
    assert len(allocator.evictor.free_table.keys()) == 0
    # Ensure 0s ref counts for all blocks
    assert allocator._refcounter._refcounts == zero_ref

    # Allocate num_blocks independent immutable chains of one block each
    new_block = []
    for i in range(num_blocks):
        block = allocator.allocate_immutable_block(
            prev_block=None,
            token_ids=token_ids[block_size * i:block_size * (i + 1)])
        new_block.append(block)

    # Verify post-alloc state

    # Ensure no blocks are free inside hashless allocator
    assert (len(allocator._hashless_allocator._free_block_indices) == 0)
    # Ensure all blocks are tracked as active
    assert len(allocator._block_tracker.keys()) == num_blocks
    for block_id in range(num_blocks):
        assert allocator._block_tracker[block_id].active
    # Ensure all blocks are cached (all promoted)
    assert len(allocator._cached_blocks.values()) == num_blocks
    # Ensure no evicted blocks
    assert len(allocator.evictor.free_table.keys()) == 0
    # Ensure 1s ref counts for all blocks
    assert allocator._refcounter._refcounts == one_ref

    # Free all blocks. Afterwards all blocks shall be in the evictor,
    # no block shall be active in _block_tracker,
    # all blocks shall still be present in _cached_blocks
    # and all blocks' refcounts shall be zero.
    for block in new_block:
        allocator.free(block)

    # Verify post-free state

    # Ensure the tracker still has every entry, but none is active
    assert len(allocator._block_tracker.keys()) == num_blocks
    for block_id in range(num_blocks):
        assert not allocator._block_tracker[block_id].active
    # Ensure no blocks in hashless allocator (all promoted)
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    # Ensure all blocks are cached
    assert list(allocator._cached_blocks.values()) == all_blocks_list
    # Ensure all blocks are inside the evictor
    assert list(allocator.evictor.free_table.keys()) == all_blocks_list
    # Ensure 0s refcounts
    assert allocator._refcounter._refcounts == zero_ref

    # Allocate a mutable block, and the first block shall be evicted
    # and set its content hash into None, ref to 1
    mutable = allocator.allocate_mutable_block(prev_block=None)

    assert mutable.block_id == 0
    assert mutable.content_hash is None
    assert allocator._block_tracker[0].active
    assert allocator._refcounter.get(0) == 1
    assert 0 not in allocator._cached_blocks
    assert 0 not in allocator.evictor

    # Since this mutable block has no hash yet, it shall be released into
    # hashless allocator
    allocator.free(mutable)

    assert not allocator._block_tracker[0].active
    assert allocator._refcounter._refcounts == zero_ref
    assert 0 not in allocator._cached_blocks
    assert 0 not in allocator.evictor
    assert 0 in allocator._hashless_allocator._free_block_indices

    # When allocating an immutable block with the first block_size tokens,
    # we shall get the free block from the hashless allocator, thus no
    # block is left in hashless
    block = allocator.allocate_immutable_block(
        prev_block=None, token_ids=token_ids[:block_size])

    assert block.block_id == 0
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert allocator._block_tracker[0].active
    assert 0 in allocator._cached_blocks.values()
    assert allocator._refcounter.get(0) == 1
    assert 0 not in allocator.evictor

    # allocate mutable block again, it shall be popped from evictor
    mutable = allocator.allocate_mutable_block(prev_block=None)
    assert len(allocator._hashless_allocator._free_block_indices) == 0
    assert mutable.block_id not in allocator.evictor.free_table
    assert allocator._refcounter.get(mutable.block_id) == 1
|
||||
|
||||
# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
    """Simulate two chains that are created and freed in order and that
    together exhaust the initially free blocks.

    The next block allocated after both chains are freed shall reuse a block
    from the first chain, since that chain has the older access time.
    The first chain has two blocks with the same access time; the last one is
    expected to be picked because it covers the larger number of tokens.
    """

    random.seed(seed)
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    # One block more than the allocator holds, so the final allocation
    # has to reuse a previously freed block.
    num_blocks_to_consume = num_blocks + 1

    token_ids = list(range(num_blocks_to_consume * block_size))

    num_blocks_in_first_chain = 2
    num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
    # First chain takes the first num_blocks_in_first_chain blocks
    first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids[:num_tokens_in_first_chain],
        allocator=allocator,
    )
    # Only the blocks of the first chain are allocated at this point
    assert allocator.get_num_free_blocks() == (num_blocks -
                                               num_blocks_in_first_chain)

    # Set the last accessed time of the first chain's blocks to 1
    blocks_ids = [block.block_id for block in first_chain]
    allocator.mark_blocks_as_accessed(blocks_ids, 1)

    # Second chain takes the rest of the blocks
    second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids[num_tokens_in_first_chain:-block_size],
        allocator=allocator,
    )

    # There shouldn't be any blocks left at this point
    assert allocator.get_num_free_blocks() == (0)

    assert len(first_chain) == num_blocks_in_first_chain
    last_block_id = first_chain[-1].block_id
    # Free each block in the first chain.
    for block in first_chain:
        allocator.free(block)

    # Set the last accessed time on all of the blocks in the second chain
    # to 2
    blocks_ids = [block.block_id for block in second_chain]
    allocator.mark_blocks_as_accessed(blocks_ids, 2)

    # Free each block in the second chain.
    for block in second_chain:
        allocator.free(block)

    # Allocate a new block and check that it's the least recently used block
    # from the first chain.
    new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids[-block_size:],
        allocator=allocator,
    )

    assert new_block[0].block_id == last_block_id
|
||||
|
||||
# Test case for cache metrics
@staticmethod
def test_metric():
    """Check the prefix cache hit rate reported by the allocator while the
    same full block is requested over and over.
    """
    block_size = 16
    alloc = PrefixCachingBlockAllocator(num_blocks=4, block_size=block_size)
    one_block_tokens = list(range(block_size))

    def request_same_block():
        # Every call asks for an immutable block with identical content.
        alloc.allocate_immutable_block(prev_block=None,
                                       token_ids=one_block_tokens)

    # No query issued yet (0/0)
    assert alloc.get_prefix_cache_hit_rate() == 0.0

    # First request: 0/1 hit rate
    request_same_block()
    assert alloc.get_prefix_cache_hit_rate() == 0.0

    # Second request: 1/2 hit rate
    request_same_block()
    assert alloc.get_prefix_cache_hit_rate() == 0.5

    # Many more requests for the same block
    for _ in range(2, 1005):
        request_same_block()
    assert alloc.get_prefix_cache_hit_rate() > 0.99
|
||||
|
||||
# Test case for marking cache hit blocks as computed right after
# a batch of prefill sequences are scheduled.
@staticmethod
def test_touch_block():
    """Allocate the same prefix chain three times and check that its blocks
    are reported as computed only after mark_blocks_as_computed is called.
    """
    block_size = 16
    common_blocks = 4
    alloc = PrefixCachingBlockAllocator(num_blocks=8, block_size=block_size)

    prefix_tokens = list(range(block_size * common_blocks))

    # Mimic the behavior of allocating the same block chain
    # (i.e., common prefix) for a batch of 3 different prefill sequences.
    for _ in range(3):
        chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
            block_size=block_size,
            token_ids=prefix_tokens,
            allocator=alloc,
        )
        block_ids = [blk.block_id for blk in chain]
        # The allocated blocks should be marked as touched
        # but not computed.
        computed = alloc.get_computed_block_ids([],
                                                block_ids,
                                                skip_last_block_id=False)
        assert len(computed) == 0

    alloc.mark_blocks_as_computed([])
    computed = alloc.get_computed_block_ids([],
                                            block_ids,
                                            skip_last_block_id=False)
    assert len(computed) == common_blocks
|
||||
|
||||
@staticmethod
def create_immutable_chain(
    block_size: int,
    token_ids: List[int],
    allocator: PrefixCachingBlockAllocator,
) -> List[PrefixCachingBlock]:
    """Helper which allocates a linked chain of immutable blocks.

    The token ids are cut into consecutive slices of block_size tokens (the
    last slice may be shorter) and one immutable block is allocated per
    slice, each one using the previously allocated block as prev_block.
    Returns the blocks in allocation order, or an empty list when there are
    no token ids.
    """
    chain: List[Block] = []
    chain_length = math.ceil(len(token_ids) / block_size)

    tail = None
    for index in range(chain_length):
        start = index * block_size
        stop = (index + 1) * block_size
        tail = allocator.allocate_immutable_block(
            prev_block=tail, token_ids=token_ids[start:stop])
        chain.append(tail)

    return chain
|
||||
509
vllm-v0.6.2/tests/core/test_chunked_prefill_scheduler.py
Normal file
509
vllm-v0.6.2/tests/core/test_chunked_prefill_scheduler.py
Normal file
@@ -0,0 +1,509 @@
|
||||
from typing import List
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import Logprob, SequenceGroup
|
||||
|
||||
from .utils import create_dummy_prompt
|
||||
|
||||
|
||||
def get_sequence_groups(scheduler_output):
    """Return the seq_group of every scheduled entry, in scheduling order."""
    seq_groups = []
    for scheduled in scheduler_output.scheduled_seq_groups:
        seq_groups.append(scheduled.seq_group)
    return seq_groups
|
||||
|
||||
|
||||
def append_new_token(seq_group, token_id: int):
    """Append token_id to every sequence returned by seq_group.get_seqs().

    Each append is given a one-entry logprobs mapping for that token.
    """
    for sequence in seq_group.get_seqs():
        logprobs = {token_id: Logprob(token_id)}
        sequence.append_token_id(token_id, logprobs)
|
||||
|
||||
|
||||
def schedule_and_update_computed_tokens(scheduler):
    """Run one scheduling step and record the scheduled chunk sizes.

    Calls scheduler.schedule(), then for each scheduled entry updates its
    sequence group's computed-token count with the token_chunk_size of the
    matching metadata. Returns the metadata list and the scheduler output;
    the third value returned by schedule() is discarded.
    """
    seq_group_metadata, scheduler_outputs, _ = scheduler.schedule()
    pairs = zip(scheduler_outputs.scheduled_seq_groups, seq_group_metadata)
    for scheduled, metadata in pairs:
        scheduled.seq_group.update_num_computed_tokens(
            metadata.token_chunk_size)
    return seq_group_metadata, scheduler_outputs
|
||||
|
||||
|
||||
def test_simple():
    """Verify basic scheduling works."""
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig("generate",
                                max_num_batched_tokens,
                                num_seq_group,
                                max_model_len,
                                enable_chunked_prefill=True)
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    seq_groups: List[SequenceGroup] = []

    # Register one single-block prompt per sequence group.
    for request_idx in range(num_seq_group):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=block_size,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        seq_groups.append(group)

    # First step: every prompt is scheduled in full.
    expected_prompt_tokens = block_size * num_seq_group
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(outputs)) == set(seq_groups)
    assert outputs.num_batched_tokens == expected_prompt_tokens
    assert not (outputs.blocks_to_copy or outputs.blocks_to_swap_in
                or outputs.blocks_to_swap_out)
    assert len(metas) == num_seq_group
    for group in seq_groups:
        append_new_token(group, 1)

    # Second step: one generated token per sequence group.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(outputs)) == set(seq_groups)
    assert outputs.num_batched_tokens == num_seq_group
    assert not (outputs.blocks_to_copy or outputs.blocks_to_swap_in
                or outputs.blocks_to_swap_out)
    assert len(metas) == num_seq_group
|
||||
|
||||
|
||||
def test_chunk():
    """Verify prefills are chunked properly.

    Two 60-token prompts compete for a 64-token batch budget, so the second
    prompt is split across two scheduling steps.
    """
    block_size = 4
    max_seqs = 60
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 32
    cache_config.num_gpu_blocks = 32
    scheduler = Scheduler(scheduler_config, cache_config, None)
    running: List[SequenceGroup] = []

    # Add seq groups to scheduler.
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
                                           block_size=block_size)
        scheduler.add_seq_group(seq_group)
        running.append(seq_group)

    # Verify the second request is chunked.
    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(out)) == set(running)
    assert seq_group_meta[0].token_chunk_size == 60
    # Verify it is chunked: only 64 - 60 = 4 tokens of budget are left.
    assert seq_group_meta[1].token_chunk_size == 4
    assert out.num_prefill_groups == 2
    assert out.num_batched_tokens == 64
    # Only the first seq group has a new token appended.
    append_new_token(running[0], 1)

    # One chunked prefill, and one decoding.
    seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(out)) == set(running)
    # The first one is prefill (the remaining 56 prompt tokens of the second
    # request). Scheduler guarantees ordering.
    assert seq_group_meta[0].token_chunk_size == 56
    # The second one is the single-token decode of the first request.
    assert seq_group_meta[1].token_chunk_size == 1
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == 57
|
||||
|
||||
|
||||
def test_complex():
    """Mix decoding, a running chunked prefill and newly added prompts under
    a 64-token batch budget and check the chunk sizes that get scheduled.
    """
    block_size = 4
    max_seqs = 60
    max_model_len = 80
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 64
    cache_cfg.num_gpu_blocks = 64
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    seq_groups: List[SequenceGroup] = []

    # Start with two 60-token prompts.
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=60,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        seq_groups.append(group)
        assert group.is_prefill()

    # Step 1: the first prompt fits, the second one is chunked.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)

    assert set(get_sequence_groups(outputs)) == set(seq_groups)
    assert metas[0].token_chunk_size == 60
    # Only 4 tokens of budget remain for the second prompt.
    assert metas[1].token_chunk_size == 4
    assert not seq_groups[0].is_prefill()
    assert seq_groups[1].is_prefill()
    assert outputs.num_prefill_groups == 2
    assert outputs.num_batched_tokens == 64
    # Only the first seq group has a new token appended.
    append_new_token(seq_groups[0], 1)

    # Queue two further 60-token prompts.
    for request_idx in range(2, 4):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=60,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        seq_groups.append(group)

    # Step 2: decoding, the rest of the running prefill and the first chunk
    # of the third request are scheduled together.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 3
    # First entry: a 7-token prefill chunk.
    assert metas[0].token_chunk_size == 7
    # Second entry: a 56-token prefill chunk.
    assert metas[1].token_chunk_size == 56
    # Last entry: the single-token decode.
    assert metas[2].token_chunk_size == 1
    # Two of the three entries are prefills.
    assert outputs.num_prefill_groups == 2
    assert outputs.num_batched_tokens == 64
    # The first 2 requests are now in decoding phase.
    append_new_token(seq_groups[0], 1)
    assert not seq_groups[0].is_prefill()
    append_new_token(seq_groups[1], 1)
    assert not seq_groups[1].is_prefill()
    # The third request is still in prefill stage.
    assert seq_groups[2].is_prefill()
|
||||
|
||||
|
||||
def test_maximal_decoding():
    """Verify decoding requests are prioritized."""
    block_size = 4
    max_seqs = 2
    max_model_len = 8
    max_num_batched_tokens = 2
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 8
    cache_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    seq_groups: List[SequenceGroup] = []

    # Start with two 2-token prompts.
    for request_idx in range(2):
        _, group = create_dummy_prompt(str(request_idx),
                                       prompt_length=2,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        seq_groups.append(group)
        assert group.is_prefill()

    # Step 1: the token budget only covers the first prefill.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 1
    assert metas[0].token_chunk_size == 2
    assert not seq_groups[0].is_prefill()
    assert seq_groups[1].is_prefill()
    assert outputs.num_prefill_groups == 1
    assert outputs.num_batched_tokens == 2
    # Only the first seq group has a new token appended.
    append_new_token(seq_groups[0], 1)

    # Queue a third request.
    _, group = create_dummy_prompt("3",
                                   prompt_length=2,
                                   block_size=block_size)
    scheduler.add_seq_group(group)
    seq_groups.append(group)
    assert group.is_prefill()
    # Step 2: the first decoding + one prefill chunk are scheduled.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 2
    assert metas[0].token_chunk_size == 1
    assert metas[1].token_chunk_size == 1
    assert not seq_groups[0].is_prefill()
    assert seq_groups[1].is_prefill()
    assert seq_groups[2].is_prefill()
    assert outputs.num_prefill_groups == 1
    assert outputs.num_batched_tokens == 2
    append_new_token(seq_groups[0], 1)

    # Step 3: decoding + the running prefill are prioritized.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 2
    assert metas[0].token_chunk_size == 1
    assert metas[1].token_chunk_size == 1
    assert not seq_groups[0].is_prefill()
    assert not seq_groups[1].is_prefill()
    assert outputs.num_prefill_groups == 1
    assert outputs.num_batched_tokens == 2
    append_new_token(seq_groups[0], 1)
    append_new_token(seq_groups[1], 1)

    # Step 4: only decoding is scheduled.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 2
    assert metas[0].token_chunk_size == 1
    assert metas[1].token_chunk_size == 1
    assert not seq_groups[0].is_prefill()
    assert not seq_groups[1].is_prefill()
    assert outputs.num_prefill_groups == 0
    assert outputs.num_batched_tokens == 2
    append_new_token(seq_groups[0], 1)
    append_new_token(seq_groups[1], 1)

    # After aborting the decoding request, the fcfs new prefill is prioritized.
    scheduler.abort_seq_group(seq_groups[0].request_id)
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 2
    assert metas[0].token_chunk_size == 1
    assert metas[1].token_chunk_size == 1
    assert not seq_groups[1].is_prefill()
    assert seq_groups[2].is_prefill()
    assert outputs.num_prefill_groups == 1
    assert outputs.num_batched_tokens == 2
|
||||
|
||||
|
||||
def test_prompt_limit():
    """Verify max_num_batched_tokens < max_model_len is possible."""
    block_size = 4
    max_seqs = 32
    max_model_len = 64
    max_num_batched_tokens = 32
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 16
    cache_cfg.num_gpu_blocks = 16
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    seq_groups: List[SequenceGroup] = []

    # A 48-token prompt: longer than the batch budget, shorter than the
    # model length.
    _, group = create_dummy_prompt("1",
                                   prompt_length=48,
                                   block_size=block_size)
    scheduler.add_seq_group(group)
    seq_groups.append(group)
    assert group.is_prefill()

    # The prompt length > max_num_batched_tokens should be still scheduled.
    metas, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(get_sequence_groups(outputs)) == 1
    assert metas[0].token_chunk_size == 32
    assert seq_groups[0].is_prefill()
    assert outputs.num_prefill_groups == 1
    assert outputs.num_batched_tokens == 32
|
||||
|
||||
|
||||
def test_prompt_limit_exceed():
    """Check that a 48-token prompt is reported in ignored_seq_groups when
    max_model_len is 32.
    """
    block_size = 4
    max_seqs = 64
    max_model_len = 32
    max_num_batched_tokens = 64
    sched_cfg = SchedulerConfig("generate",
                                max_num_batched_tokens,
                                max_seqs,
                                max_model_len,
                                enable_chunked_prefill=True)
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 16
    cache_cfg.num_gpu_blocks = 16
    scheduler = Scheduler(sched_cfg, cache_cfg, None)
    seq_groups: List[SequenceGroup] = []
    _, group = create_dummy_prompt("2",
                                   prompt_length=48,
                                   block_size=block_size)
    scheduler.add_seq_group(group)
    seq_groups.append(group)
    assert group.is_prefill()
    _metas, outputs = schedule_and_update_computed_tokens(scheduler)
    # The over-long request is the only ignored one.
    assert len(outputs.ignored_seq_groups) == 1
    assert outputs.ignored_seq_groups[0] == group
|
||||
|
||||
|
||||
def test_chunked_prefill_preempt():
    """Verify preempt works with chunked prefill requests"""
    block_size = 4
    max_seqs = 30
    max_model_len = 200
    max_num_batched_tokens = 30
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    cache_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    cache_cfg.num_cpu_blocks = 16
    cache_cfg.num_gpu_blocks = 16
    scheduler = Scheduler(sched_cfg, cache_cfg, None)

    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=60,
                                       block_size=block_size)
    scheduler.add_seq_group(seq_group)
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    # The 60-token prompt is chunked; its first chunk is scheduled now.
    assert len(outputs.scheduled_seq_groups) == 1
    assert outputs.num_prefill_groups == 1
    assert seq_group.is_prefill()
    assert outputs.num_batched_tokens == max_num_batched_tokens

    # Replace can_append_slots with a mock so the test decides its answer.
    scheduler.block_manager.can_append_slots = MagicMock()

    def cannot_append_second_group1(seq_group, num_lookahead_slots):
        # Refuse slots only for request "1".
        return seq_group.request_id != "1"

    def cannot_append_second_group2(seq_group, num_lookahead_slots):
        # Grant slots to every request.
        return True

    scheduler.block_manager.can_append_slots.side_effect = (
        cannot_append_second_group1)

    # The running prefill is now preempted.
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(outputs.scheduled_seq_groups) == 0
    assert outputs.num_batched_tokens == 0
    assert outputs.blocks_to_swap_out == []
    assert outputs.blocks_to_swap_in == []

    # Make sure we can reschedule preempted request.
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(outputs.scheduled_seq_groups) == 1
    assert outputs.num_prefill_groups == 1
    assert seq_group.is_prefill()
    assert outputs.num_batched_tokens == max_num_batched_tokens
    assert seq_group.get_num_uncomputed_tokens() == 30

    # We should be able to run prefill twice as it is chunked.
    scheduler.block_manager.can_append_slots.side_effect = (
        cannot_append_second_group2)
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert len(outputs.scheduled_seq_groups) == 1
    assert outputs.num_prefill_groups == 1
    assert not seq_group.is_prefill()
    assert outputs.num_batched_tokens == max_num_batched_tokens
|
||||
|
||||
|
||||
def test_chunked_prefill_max_seqs():
    """Check that chunked prefill never schedules more than max_seqs groups."""
    block_size = 4
    max_seqs = 2
    max_model_len = 80
    max_num_batched_tokens = 64

    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 128
    kv_cfg.num_gpu_blocks = 128
    scheduler = Scheduler(sched_cfg, kv_cfg, None)
    tracked: List[SequenceGroup] = []

    def enqueue(request_id):
        # Build a 65-token prompt, hand it to the scheduler and remember it.
        _, group = create_dummy_prompt(request_id,
                                       prompt_length=65,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        tracked.append(group)

    # A 65-token prompt does not fit the 64-token budget, so the first
    # prefill is chunked to exactly the budget.
    enqueue("1")
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert metadata[0].token_chunk_size == max_num_batched_tokens
    assert len(get_sequence_groups(outputs)) == 1

    # Queue four more requests.
    for idx in range(4):
        enqueue(str(idx))

    # Only two requests may be scheduled in this step.
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert outputs.num_batched_tokens == max_num_batched_tokens
    assert len(get_sequence_groups(outputs)) == 2
    assert not tracked[0].is_prefill()
    assert tracked[1].is_prefill()
    append_new_token(tracked[0], 1)

    # The token budget would allow more, but the batch is capped at max_seqs.
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert metadata[0].token_chunk_size == 2
    assert metadata[1].token_chunk_size == 1
    assert outputs.num_batched_tokens == 3
    assert len(get_sequence_groups(outputs)) == max_seqs
    assert not tracked[0].is_prefill()
    assert not tracked[1].is_prefill()
|
||||
|
||||
|
||||
def test_perfix_caching():
    """Check that only whole blocks are allocated when prefix caching is on."""
    block_size = 4
    max_seqs = 10
    max_model_len = 80
    max_num_batched_tokens = 64

    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
    )
    kv_cfg = CacheConfig(block_size,
                         1.0,
                         1,
                         "auto",
                         enable_prefix_caching=True)
    kv_cfg.num_cpu_blocks = 0
    kv_cfg.num_gpu_blocks = 32
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue two 50-token prompts.
    expected_groups: List[SequenceGroup] = []
    for idx in range(2):
        _, group = create_dummy_prompt(str(idx),
                                       block_size=block_size,
                                       prompt_length=50)
        scheduler.add_seq_group(group)
        expected_groups.append(group)

    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(outputs)) == set(expected_groups)
    assert metadata[0].token_chunk_size == 50
    # The second prompt is chunked. The leftover budget is 64 - 50 = 14
    # tokens, but with prefix caching only full blocks are allocated, so the
    # chunk is rounded down to 4 * (14 // 4) = 12 tokens.
    assert metadata[1].token_chunk_size == 12
    assert outputs.num_prefill_groups == 2
    assert outputs.num_batched_tokens == 62
|
||||
80
vllm-v0.6.2/tests/core/test_num_computed_tokens_update.py
Normal file
80
vllm-v0.6.2/tests/core/test_num_computed_tokens_update.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.core.utils import create_dummy_prompt
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
# Model name passed to VllmRunner (as model_name) by the test in this file.
MODEL = "JackFram/llama-160m"
|
||||
|
||||
|
||||
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
    """Queue ``seq_group`` on the first scheduler of ``engine``."""
    engine.scheduler[0].add_seq_group(seq_group)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_num_computed_tokens_update(num_scheduler_steps: int,
                                    enable_chunked_prefill: bool,
                                    enforce_eager: bool):
    """Track a sequence's num_computed_tokens through prompt and decode steps.

    For several output lengths, one request is added to the engine and the
    counter is checked before any step, after the prompt step(s), before
    every batch of decode steps, and once the sequence has finished.
    """
    multi_step_chunked_prefill = (num_scheduler_steps > 1
                                  and enable_chunked_prefill)

    if multi_step_chunked_prefill and current_platform.is_rocm():
        pytest.skip("Multi-step with Chunked-Prefill does not support "
                    "rocm_flash_attn backend")

    # Build the engine under test.
    runner = VllmRunner(model_name=MODEL,
                        gpu_memory_utilization=0.3,
                        num_scheduler_steps=num_scheduler_steps,
                        enable_chunked_prefill=enable_chunked_prefill,
                        enforce_eager=enforce_eager)
    engine: LLMEngine = runner.model.llm_engine

    # With multi-step + chunked-prefill there is no dedicated single prompt
    # step: whatever is scheduled always runs for num_scheduler_steps.
    if multi_step_chunked_prefill:
        prompt_steps = num_scheduler_steps
    else:
        prompt_steps = 1

    prompt_len = 10
    output_lengths = [4, 8, 12, 15, 16, 17]

    for req_idx, num_output_tokens in enumerate(output_lengths):
        # One request per output length; min == max pins the output length.
        seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
                                             prompt_length=prompt_len,
                                             min_tokens=num_output_tokens,
                                             max_tokens=num_output_tokens)
        add_seq_group_to_engine(engine, seq_group)

        assert seq.data.get_num_computed_tokens() == 0

        # Prompt step(s).
        for _ in range(prompt_steps):
            engine.step()

        if not seq.is_finished():
            # Counter value right after the prompt step(s).
            computed_after_prompt = seq.data.get_num_computed_tokens()
            assert computed_after_prompt == prompt_len + prompt_steps - 1

        decode_steps_done = 0
        while not seq.is_finished():
            # Counter value before each batch of decode steps.
            expected_computed = computed_after_prompt + decode_steps_done
            assert seq.data.get_num_computed_tokens() == expected_computed
            for _ in range(num_scheduler_steps):
                # Decode step.
                engine.step()
                decode_steps_done += 1

        # Counter value once the sequence has finished.
        final_computed = seq.data.get_num_computed_tokens()
        assert final_computed == prompt_len + num_output_tokens - 1
|
||||
802
vllm-v0.6.2/tests/core/test_scheduler.py
Normal file
802
vllm-v0.6.2/tests/core/test_scheduler.py
Normal file
@@ -0,0 +1,802 @@
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import List, Set, Tuple
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest # noqa
|
||||
from torch import Use # noqa
|
||||
|
||||
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.core.scheduler import Scheduler, SchedulingBudget
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
from .utils import (append_new_token, append_new_token_seq_group,
|
||||
create_dummy_prompt, get_sequence_groups,
|
||||
schedule_and_update_computed_tokens)
|
||||
|
||||
|
||||
def test_scheduler_add_seq_group():
    """Each added sequence group raises the unfinished-group count by one."""
    block_size = 4
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=100,
        max_num_seqs=64,
        max_model_len=1,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
    kv_cfg.num_cpu_blocks = 4
    kv_cfg.num_gpu_blocks = 4
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Add the groups one at a time and watch the counter grow.
    num_seq_group = 4
    for expected_count in range(1, num_seq_group + 1):
        request_id = str(expected_count - 1)
        _, group = create_dummy_prompt(request_id,
                                       block_size,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        assert scheduler.get_num_unfinished_seq_groups() == expected_count
|
||||
|
||||
|
||||
def test_scheduler_abort_seq_group():
    """Aborting every queued request leaves no unfinished sequence groups."""
    block_size = 4
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=100,
        max_num_seqs=64,
        max_model_len=1,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 4
    kv_cfg.num_gpu_blocks = 4
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue several groups, collecting their request ids as we go.
    num_seq_group = 4
    ordered_ids = [str(idx) for idx in range(num_seq_group)]
    for request_id in ordered_ids:
        _, group = create_dummy_prompt(request_id, block_size)
        scheduler.add_seq_group(group)
    request_ids: Set[str] = set(ordered_ids)

    # Abort everything that was queued.
    assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
    scheduler.abort_seq_group(request_ids)
    assert scheduler.get_num_unfinished_seq_groups() == 0
|
||||
|
||||
|
||||
def test_scheduler_schedule_simple():
    """Schedule one prompt step and one generation step for four groups."""
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=64,
        max_num_seqs=num_seq_group,
        max_model_len=max_model_len,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 8
    kv_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Queue the sequence groups.
    expected_groups: List[SequenceGroup] = []
    for idx in range(num_seq_group):
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=block_size,
                                       block_size=block_size)
        scheduler.add_seq_group(group)
        expected_groups.append(group)

    def step_and_check(expected_num_tokens):
        # Run one scheduling step: every group must be scheduled, with the
        # expected token count and no copy/swap work, then append one token.
        metadata, outputs = schedule_and_update_computed_tokens(scheduler)
        assert set(get_sequence_groups(outputs)) == set(expected_groups)
        assert outputs.num_batched_tokens == expected_num_tokens
        assert not (outputs.blocks_to_copy or outputs.blocks_to_swap_in
                    or outputs.blocks_to_swap_out)
        assert len(metadata) == num_seq_group
        append_new_token(outputs, 1)

    # Prompt step: every group contributes its whole block_size-token prompt.
    step_and_check(block_size * num_seq_group)

    # Generation step: every group contributes a single token.
    step_and_check(num_seq_group)
|
||||
|
||||
|
||||
def test_scheduler_prefill_prioritized():
    """A newly added prefill is scheduled ahead of the running request."""
    block_size = 4
    max_model_len = 30
    max_batched_num_tokens = 30
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=max_batched_num_tokens,
        max_num_seqs=2,
        max_model_len=max_model_len,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 16
    kv_cfg.num_gpu_blocks = 16
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Request A: queue it and schedule its prompt.
    _, group_a = create_dummy_prompt("1", 1, block_size=block_size)
    scheduler.add_seq_group(group_a)
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(outputs) == [group_a]

    # Request B: a new prefill whose size argument (30) equals the whole
    # max_batched_num_tokens budget.
    _, group_b = create_dummy_prompt("2", 30, block_size=block_size)
    scheduler.add_seq_group(group_b)

    # Prefill requests are prioritized: the next step schedules only B.
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(outputs) == [group_b]
|
||||
|
||||
|
||||
def test_scheduler_schedule_preempt_abort():
    """Preempt one of two groups, abort the other, then reschedule."""
    block_size = 4
    max_model_len = 16
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=64,
        max_num_seqs=2,
        max_model_len=max_model_len,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 2
    kv_cfg.num_gpu_blocks = 2
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    def assert_no_block_ops(outputs):
        # No copy or swap work is expected anywhere in this test.
        assert not (outputs.blocks_to_copy or outputs.blocks_to_swap_in
                    or outputs.blocks_to_swap_out)

    # Queue two groups, A ("1") and B ("2").
    _, group_a = create_dummy_prompt("1", block_size, block_size=block_size)
    _, group_b = create_dummy_prompt("2", block_size, block_size=block_size)
    scheduler.add_seq_group(group_a)
    scheduler.add_seq_group(group_b)

    # Prompt step: both groups are scheduled.
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(outputs) == [group_a, group_b]
    assert outputs.num_batched_tokens == block_size * 2  # A and B prompts
    assert_no_block_ops(outputs)
    assert len(metadata) == 2
    assert scheduler.get_num_unfinished_seq_groups() == 2

    # Append "generated" tokens so the sequences mark their prompt tokens as
    # processed.
    append_new_token(outputs, 1)

    # Generation step: only A is scheduled and B is preempted.
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(outputs) == [group_a]
    assert outputs.num_batched_tokens == 1
    assert_no_block_ops(outputs)
    assert len(metadata) == 1
    assert scheduler.get_num_unfinished_seq_groups() == 2
    assert outputs.preempted == 1

    # Abort A; B's prompt is then re-scheduled with recomputation.
    scheduler.abort_seq_group("1")
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert get_sequence_groups(outputs) == [group_b]
    assert outputs.num_batched_tokens == 5  # 4 prompt + 1 generation.
    assert_no_block_ops(outputs)
    assert len(metadata) == 1
    assert scheduler.get_num_unfinished_seq_groups() == 1
|
||||
|
||||
|
||||
def test_scheduler_max_seqs():
    """With one group already running, only one more fits under max seqs."""
    block_size = 4
    num_seq_group = 4
    max_seq_group = 2
    max_model_len = 16
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=64,
        max_num_seqs=max_seq_group,
        max_model_len=max_model_len,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 8
    kv_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # Create the groups up front; they are queued selectively below.
    all_seq_groups: List[SequenceGroup] = [
        create_dummy_prompt(str(idx),
                            prompt_length=block_size,
                            block_size=block_size)[1]
        for idx in range(num_seq_group)
    ]

    # Queue only the first group.
    scheduler.add_seq_group(all_seq_groups[0])

    # Prompt step for the first group.
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(outputs)) == {all_seq_groups[0]}
    append_new_token(outputs, 1)

    # Generation step for the first group.
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(outputs)) == {all_seq_groups[0]}
    append_new_token(outputs, 1)

    # Queue two more groups.
    for group in all_seq_groups[1:3]:
        scheduler.add_seq_group(group)

    # Only one of the new groups may be scheduled: max_seq_group is 2 and
    # the first group already takes one of the slots.
    _, outputs = schedule_and_update_computed_tokens(scheduler)
    assert set(get_sequence_groups(outputs)) == {all_seq_groups[1]}
|
||||
|
||||
|
||||
def test_scheduler_delay_factor():
    """A new prompt is held back at first when delay_factor is configured."""
    block_size = 4
    sched_cfg = SchedulerConfig(
        "generate",
        max_num_batched_tokens=100,
        max_num_seqs=64,
        max_model_len=16,
        delay_factor=0.5,
    )
    kv_cfg = CacheConfig(block_size, 1.0, 1, "auto")
    kv_cfg.num_cpu_blocks = 8
    kv_cfg.num_gpu_blocks = 8
    scheduler = Scheduler(sched_cfg, kv_cfg, None)

    # The first prompt is scheduled right away.
    _, first_group = create_dummy_prompt("0",
                                         prompt_length=block_size,
                                         block_size=block_size)
    scheduler.add_seq_group(first_group)
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert outputs.num_prefill_groups > 0
    assert metadata[0].request_id == '0'
    append_new_token(outputs, 1)

    # Wait a second before queuing the next prompt.
    time.sleep(1)
    _, second_group = create_dummy_prompt("1",
                                          prompt_length=block_size,
                                          block_size=block_size)
    scheduler.add_seq_group(second_group)

    # The second prompt must *not* be scheduled yet; request '0' keeps going.
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert outputs.num_prefill_groups == 0
    assert metadata[0].request_id == '0'
    append_new_token(outputs, 1)

    # After waiting more than 0.5 second the second prompt is scheduled.
    time.sleep(0.6)
    metadata, outputs = schedule_and_update_computed_tokens(scheduler)
    assert outputs.num_prefill_groups > 0
    assert metadata[0].request_id == '1'
    append_new_token(outputs, 1)
|
||||
|
||||
|
||||
def initialize_scheduler(
    *,
    max_num_seqs=1000,
    max_token_budget=1000,
    max_model_len=1000,
    lora_config=None,
    block_size=4,
    num_cpu_blocks=8,
    num_gpu_blocks=8,
):
    """Build a ``Scheduler`` for the "generate" task from keyword options.

    Args:
        max_num_seqs: Passed to ``SchedulerConfig`` as ``max_num_seqs``.
        max_token_budget: Passed to ``SchedulerConfig`` as
            ``max_num_batched_tokens``.
        max_model_len: Passed to ``SchedulerConfig`` as ``max_model_len``.
        lora_config: Third argument to ``Scheduler``; may be ``None``.
        block_size: First argument to ``CacheConfig``.
        num_cpu_blocks: Assigned to ``cache_config.num_cpu_blocks``.
        num_gpu_blocks: Assigned to ``cache_config.num_gpu_blocks``.

    Returns:
        The constructed ``Scheduler``.
    """
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens=max_token_budget,
        max_num_seqs=max_num_seqs,
        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    # The block counts are set on the config object after construction.
    cache_config.num_cpu_blocks = num_cpu_blocks
    cache_config.num_gpu_blocks = num_gpu_blocks
    return Scheduler(scheduler_config, cache_config, lora_config)
|
||||
|
||||
|
||||
def create_token_budget(token_budget: int = 10000,
                        max_num_seqs: int = 10000) -> SchedulingBudget:
    """Return a ``SchedulingBudget`` with the given token and sequence caps."""
    limits = {"token_budget": token_budget, "max_num_seqs": max_num_seqs}
    return SchedulingBudget(**limits)
|
||||
|
||||
|
||||
def add_token_budget(budget: SchedulingBudget,
                     num_batched_tokens: int = 0,
                     num_curr_seqs: int = 0):
    """Charge ``budget`` with tokens and sequences under a dummy request.

    A throwaway prompt with request id '10' supplies the request id under
    which both amounts are recorded.
    """
    _, placeholder_group = create_dummy_prompt('10', prompt_length=60)
    budget.add_num_batched_tokens(placeholder_group.request_id,
                                  num_batched_tokens)
    budget.add_num_seqs(placeholder_group.request_id, num_curr_seqs)
|
||||
|
||||
|
||||
def test_prefill_schedule_max_prompt_len():
    """
    A 60-token prompt with max_model_len=30 is ignored, not scheduled.
    """
    block_size = 4
    scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
    _, oversized_group = create_dummy_prompt("0",
                                             prompt_length=60,
                                             block_size=block_size)
    scheduler.add_seq_group(oversized_group)

    budget = create_token_budget()
    result = scheduler._schedule_prefills(budget, None)

    # The request lands in the ignored list and leaves the waiting queue
    # without consuming any budget.
    assert len(result.ignored_seq_groups) == 1
    assert len(result.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(scheduler.waiting) == 0
|
||||
|
||||
|
||||
def test_prefill_schedule_token_budget():
    """
    Test token budget respected.

    Covers three situations: a zero budget schedules nothing, a 60-token
    budget admits exactly one 60-token prompt, and tokens already charged
    to the budget count against what a new prompt may use.
    """
    block_size = 4
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(token_budget=0)
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
                                           block_size=block_size)
        scheduler.add_seq_group(seq_group)

    # 0 token budget == nothing is scheduled.
    output = scheduler._schedule_prefills(budget, None)
    remaining_waiting = scheduler.waiting
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(remaining_waiting) == 2

    # 60 token budget == 1 request scheduled.
    budget = create_token_budget(token_budget=60)
    output = scheduler._schedule_prefills(budget, None)
    remaining_waiting = scheduler.waiting
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 1
    assert budget.num_batched_tokens == 60
    assert budget.num_curr_seqs == 1
    assert len(remaining_waiting) == 1

    # Test when current_batched_tokens respected.
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
    budget = create_token_budget(token_budget=60)
    add_token_budget(budget, 30, 0)
    # Explicit request id: the original relied on the loop variable leaked
    # from the loop above (i == 1), so "1" keeps the same id.
    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=60,
                                       block_size=block_size)
    # Cannot schedule a prompt that doesn't fit the budget.
    scheduler.add_seq_group(seq_group)
    output = scheduler._schedule_prefills(budget, None)
    remaining_waiting = scheduler.waiting
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 30
    assert budget.num_curr_seqs == 0
    assert len(remaining_waiting) == 1

    # With a 90-token budget the 60-token prompt fits next to the 30 tokens
    # that are already charged.
    budget = create_token_budget(token_budget=90)
    add_token_budget(budget, 30, 0)
    output = scheduler._schedule_prefills(budget, None)
    remaining_waiting = scheduler.waiting
    assert len(output.seq_groups) == 1
    assert budget.num_batched_tokens == 90
    assert budget.num_curr_seqs == 1
    assert len(remaining_waiting) == 0
|
||||
|
||||
|
||||
def test_prefill_schedule_max_seqs():
    """
    Test max seq respected.

    First checks that only two of three waiting prompts are scheduled under
    max_num_seqs=2, then that sequences already charged to the budget block
    any further prefill.
    """
    block_size = 4
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    budget = create_token_budget(max_num_seqs=2)
    for i in range(3):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
                                           block_size=block_size)
        scheduler.add_seq_group(seq_group)
    output = scheduler._schedule_prefills(budget, None)
    remaining_waiting = scheduler.waiting
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 2
    assert budget.num_batched_tokens == 120
    assert budget.num_curr_seqs == 2
    assert len(remaining_waiting) == 1

    # Verify curr_num_seqs respected.
    scheduler.waiting = deque()
    budget = create_token_budget(max_num_seqs=2)
    add_token_budget(budget, 0, 2)
    # Explicit request id: the original relied on the loop variable leaked
    # from the loop above (i == 2), so "2" keeps the same id.
    _, seq_group = create_dummy_prompt("2",
                                       prompt_length=60,
                                       block_size=block_size)
    scheduler.add_seq_group(seq_group)
    output = scheduler._schedule_prefills(budget, None)
    remaining_waiting = scheduler.waiting
    assert len(output.ignored_seq_groups) == 0
    assert len(output.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 2
    assert len(remaining_waiting) == 1
|
||||
|
||||
|
||||
def test_prefill_schedule_max_lora():
    """
    With max_loras=1, only one LoRA request is scheduled per prefill pass.
    """
    block_size = 4
    scheduler = initialize_scheduler(
        lora_config=LoRAConfig(max_lora_rank=8, max_loras=1),
        block_size=block_size,
        num_cpu_blocks=64,
        num_gpu_blocks=64)
    budget = create_token_budget(token_budget=120)
    curr_loras: Set[int] = set()

    # Requests 0 and 1 each carry their own LoRA adapter.
    for idx in (0, 1):
        adapter = LoRARequest(lora_name=str(idx),
                              lora_int_id=idx + 1,
                              lora_path="abc")
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=60,
                                       block_size=block_size,
                                       lora_request=adapter)
        scheduler.add_seq_group(group)

    # Requests 2 and 3 are regular (no LoRA). The queue is now
    #   0: LoRA, 1: LoRA, 2: regular, 3: regular
    # and the first pass is expected to pick 0 and 2. A request skipped
    # because it hit the max-LoRA limit must be prioritized afterwards.
    for idx in (2, 3):
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=60,
                                       block_size=block_size)
        scheduler.add_seq_group(group)

    # First pass: two requests scheduled (0 and 2).
    result = scheduler._schedule_prefills(budget, curr_loras)
    assert len(result.ignored_seq_groups) == 0
    assert len(result.seq_groups) == 2
    assert budget.num_batched_tokens == 120
    assert budget.num_curr_seqs == 2
    assert len(scheduler.waiting) == 2
    assert len(curr_loras) == 1

    # Second pass: under FCFS the second LoRA request goes next. curr_loras
    # is reset so that it is allowed to be scheduled.
    curr_loras = set()
    budget = create_token_budget(token_budget=60)
    result = scheduler._schedule_prefills(budget, curr_loras)
    assert len(result.seq_groups) == 1
    assert result.seq_groups[0].seq_group.request_id == "1"
    assert len(scheduler.waiting) == 1
    assert len(curr_loras) == 1
    assert budget.num_batched_tokens == 60
|
||||
|
||||
|
||||
def test_prefill_schedule_no_block_manager_capacity():
    """
    Prefills are deferred or ignored when the block manager refuses them.
    """
    block_size = 4

    def enqueue_three(target):
        # Queue three 60-token prompts with ids "0", "1" and "2".
        for idx in range(3):
            _, group = create_dummy_prompt(str(idx),
                                           prompt_length=60,
                                           block_size=block_size)
            target.add_seq_group(group)

    # AllocStatus.LATER: nothing is scheduled and all requests keep waiting.
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_gpu_blocks=128,
                                     num_cpu_blocks=128)
    budget = create_token_budget()
    enqueue_three(scheduler)
    scheduler.block_manager.can_allocate = MagicMock(
        return_value=AllocStatus.LATER)
    result = scheduler._schedule_prefills(budget, None)
    assert len(result.ignored_seq_groups) == 0
    assert len(result.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(scheduler.waiting) == 3

    # AllocStatus.NEVER: every request is ignored and the queue is drained.
    scheduler = initialize_scheduler()
    budget = create_token_budget()
    enqueue_three(scheduler)
    scheduler.block_manager.can_allocate = MagicMock(
        return_value=AllocStatus.NEVER)
    result = scheduler._schedule_prefills(budget, None)
    assert len(result.ignored_seq_groups) == 3
    assert len(result.seq_groups) == 0
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(scheduler.waiting) == 0
|
||||
|
||||
|
||||
def test_decode_schedule_preempted():
    """
    Running decodes that cannot get new slots are preempted, not swapped.
    """
    block_size = 4
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
    curr_loras = None

    # Put three groups straight into the running state, each with one
    # appended token.
    for idx in range(3):
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=60,
                                       block_size=block_size)
        scheduler._allocate_and_set_running(group)
        append_new_token_seq_group(60, group, 1)
        scheduler._add_seq_group_to_running(group)

    def cannot_append_second_group(seq_group, num_lookahead_slots):
        # Only request "1" is refused new slots.
        return seq_group.request_id != "1"

    scheduler.block_manager.can_append_slots = MagicMock(
        side_effect=cannot_append_second_group)

    # Request 1 cannot be scheduled, so the lowest-priority request (2) is
    # preempted; request 1 ends up preempted as well.
    budget = create_token_budget()
    result = scheduler._schedule_running(budget, curr_loras)
    assert len(scheduler.running) == 0
    assert len(result.decode_seq_groups) == 1
    assert len(result.prefill_seq_groups) == 0
    assert result.decode_seq_groups[0].seq_group.request_id == "0"
    assert len(result.preempted) == 2
    # The budget reflects the single scheduled decode token.
    assert budget.num_batched_tokens == 1
    # NOTE: When enable_chunk is False, num_seqs budget is not updated.
    # assert budget.num_curr_seqs == 1
    # Both groups were preempted rather than swapped out.
    assert result.blocks_to_swap_out == []
    # No blocks are copied.
    assert result.blocks_to_copy == []
|
||||
|
||||
|
||||
def test_schedule_decode_blocks_to_copy_update():
    """
    Verify blocks_to_copy is updated.

    ``append_slots`` is mocked to return one (source, destination) pair and
    the test checks that the pair shows up in the scheduling output.
    """
    block_size = 4
    # Use the local block_size so the scheduler and the prompt cannot drift
    # apart (the original passed a hard-coded 4 here).
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=60,
                                       best_of=2,
                                       block_size=block_size)
    curr_loras = None
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    scheduler._add_seq_group_to_running(seq_group)

    # Make append_slots report a single block copy (2 -> 3).
    scheduler.block_manager.append_slots = MagicMock()
    scheduler.block_manager.append_slots.return_value = [(2, 3)]

    budget = create_token_budget()
    output = scheduler._schedule_running(budget, curr_loras)
    remaining_running = scheduler.running
    assert len(remaining_running) == 0
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0
    assert len(output.preempted) == 0
    assert len(output.swapped_out) == 0
    # Nothing is preempted.
    assert output.blocks_to_swap_out == []
    # Since append_slots returns the source -> dest mapping, it should be
    # applied.
    assert output.blocks_to_copy == [(2, 3)]
|
||||
|
||||
|
||||
def test_schedule_swapped_max_loras():
    """With max_loras=1, only one of two swapped LoRA groups is swapped in."""
    block_size = 4
    scheduler = initialize_scheduler(
        lora_config=LoRAConfig(max_lora_rank=8, max_loras=1),
        block_size=block_size,
        num_cpu_blocks=32,
        num_gpu_blocks=32)
    curr_loras: Set[int] = set()
    blocks_to_swap_out: List[Tuple[int, int]] = []

    # Create two groups with distinct LoRA adapters and move both to the
    # swapped state.
    for idx in range(2):
        adapter = LoRARequest(lora_name=str(idx),
                              lora_int_id=idx + 1,
                              lora_path="abc")
        _, group = create_dummy_prompt(str(idx),
                                       prompt_length=60,
                                       block_size=block_size,
                                       lora_request=adapter)
        scheduler._allocate_and_set_running(group)
        append_new_token_seq_group(60, group, 1)
        scheduler._swap_out(group, blocks_to_swap_out)
        scheduler._add_seq_group_to_swapped(group)

    budget = create_token_budget()
    result = scheduler._schedule_swapped(budget, curr_loras)

    # Exactly one group is scheduled (as a decode); the other stays swapped.
    assert len(scheduler.swapped) == 1
    assert budget.num_batched_tokens == 1
    assert budget.num_curr_seqs == 1
    assert len(result.decode_seq_groups) == 1
    assert len(result.prefill_seq_groups) == 0
    assert len(curr_loras) == 1
|
||||
|
||||
|
||||
def test_schedule_swapped_cannot_swap_in():
    """Nothing is swapped in while ``can_swap_in`` reports ``LATER``.

    Two requests are swapped out and queued; the block manager's
    ``can_swap_in`` is replaced by a mock returning ``AllocStatus.LATER``.
    Both requests are expected to stay in the swapped queue and the budget
    to remain untouched.
    """
    block_size = 4
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
    blocks_to_swap_out: List[Tuple[int, int]] = []
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
                                           best_of=2,
                                           block_size=block_size)
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        scheduler._add_seq_group_to_swapped(seq_group)

    # Force the block manager to report that swap-in must wait.
    scheduler.block_manager.can_swap_in = MagicMock()
    scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
    # Since we cannot swap in, none of the requests are swapped in.
    budget = create_token_budget()
    output = scheduler._schedule_swapped(budget, curr_loras)
    remaining_swapped = scheduler.swapped
    assert len(remaining_swapped) == 2
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
|
||||
|
||||
|
||||
def test_infeasible_swap():
    """Requests that can never be swapped in are reported as infeasible.

    Two requests are swapped out and queued; ``can_swap_in`` is mocked to
    return ``AllocStatus.NEVER``. The test expects the swapped queue to be
    emptied, both groups to appear in ``output.infeasible_seq_groups`` and
    the budget to remain untouched.
    """
    block_size = 4
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
    blocks_to_swap_out: List[Tuple[int, int]] = []
    for i in range(2):
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=60,
                                           best_of=2,
                                           block_size=block_size)
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        scheduler._add_seq_group_to_swapped(seq_group)

    # Force the block manager to report that swap-in is impossible.
    scheduler.block_manager.can_swap_in = MagicMock()
    scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
    # Since we cannot swap in, none of the requests are swapped in.
    budget = create_token_budget()
    output = scheduler._schedule_swapped(budget, curr_loras)
    remaining_swapped = scheduler.swapped
    assert len(remaining_swapped) == 0
    assert len(output.infeasible_seq_groups) == 2
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
|
||||
|
||||
|
||||
def test_schedule_swapped_blocks_to_copy():
    """The mapping returned by ``append_slots`` reaches ``blocks_to_copy``.

    A single request is swapped out and queued. ``append_slots`` on the
    block manager is mocked to return ``[(2, 3)]``; after
    ``_schedule_swapped`` that mapping is expected verbatim in
    ``output.blocks_to_copy`` and the request is scheduled for decode.
    """
    block_size = 4
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
    curr_loras = None
    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=60,
                                       best_of=2,
                                       block_size=block_size)
    scheduler._allocate_and_set_running(seq_group)
    append_new_token_seq_group(60, seq_group, 1)
    blocks_to_swap_out: List[Tuple[int, int]] = []
    scheduler._swap_out(seq_group, blocks_to_swap_out)
    scheduler._add_seq_group_to_swapped(seq_group)

    # Make append_slots report one (source, destination) block pair.
    scheduler.block_manager.append_slots = MagicMock()
    scheduler.block_manager.append_slots.return_value = [(2, 3)]

    budget = create_token_budget()
    output = scheduler._schedule_swapped(budget, curr_loras)
    remaining_swapped = scheduler.swapped
    assert len(remaining_swapped) == 0
    assert len(output.decode_seq_groups) == 1
    assert len(output.prefill_seq_groups) == 0
    assert output.blocks_to_copy == [(2, 3)]
|
||||
|
||||
|
||||
def test_scheduling_budget():
    """Exercise ``SchedulingBudget`` limits and its add/subtract bookkeeping.

    Checks ``can_schedule`` against the token and sequence limits, then
    verifies that adding or subtracting for a request id that was already
    added or subtracted leaves the counters unchanged.
    """
    TOKEN_BUDGET = 4
    MAX_SEQS = 4
    budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)
    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=1)
    assert budget.can_schedule(num_new_tokens=4, num_new_seqs=4)
    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=5)
    assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=1)
    assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=5)
    assert budget.remaining_token_budget() == TOKEN_BUDGET

    # Verify add/subtract num batched tokens.
    _, seq_group = create_dummy_prompt("1", 3)
    budget.add_num_batched_tokens(seq_group.request_id, 2)
    assert budget.remaining_token_budget() == 2
    assert budget.num_batched_tokens == 2
    assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
    assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
    # Verify adding the same request id a second time is a no-op.
    budget.add_num_batched_tokens(seq_group.request_id, 2)
    assert budget.remaining_token_budget() == 2
    assert budget.num_batched_tokens == 2
    budget.subtract_num_batched_tokens(seq_group.request_id, 2)
    assert budget.remaining_token_budget() == 4
    assert budget.num_batched_tokens == 0
    # Subtracting again for the same request id must not go below zero.
    budget.subtract_num_batched_tokens(seq_group.request_id, 2)
    assert budget.remaining_token_budget() == 4
    assert budget.num_batched_tokens == 0

    # Verify add/subtract max seqs.
    _, seq_group = create_dummy_prompt("1", 3)
    budget.add_num_seqs(seq_group.request_id, 2)
    assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
    assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
    assert budget.num_curr_seqs == 2
    # Verify adding the same request id a second time is a no-op.
    budget.add_num_seqs(seq_group.request_id, 2)
    assert budget.num_curr_seqs == 2
    budget.subtract_num_seqs(seq_group.request_id, 2)
    assert budget.num_curr_seqs == 0
    # Subtracting again for the same request id is also a no-op.
    budget.subtract_num_seqs(seq_group.request_id, 2)
    assert budget.num_curr_seqs == 0
|
||||
104
vllm-v0.6.2/tests/core/test_scheduler_encoder_decoder.py
Normal file
104
vllm-v0.6.2/tests/core/test_scheduler_encoder_decoder.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from typing import List
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
|
||||
get_sequence_groups, schedule_and_update_computed_tokens)
|
||||
|
||||
|
||||
def test_scheduler_schedule_simple_encoder_decoder():
    '''
    Test basic scheduler functionality in the context
    of an encoder/decoder model. Focus on testing
    enc/dec-specific functionality since tests already
    exist for decoder-only functionality

    Test behavior:
    * Construct Scheduler
    * Construct dummy encoder/decoder sequence groups
    * Add dummy seq groups to scheduler backlog
    * Schedule the next seq group & validate:
        * Cross-attn block tables
        * Updated states of seq groups
        * Number of batched tokens
        * Number of blocks to copy/swap-in/swap-out
        * Number of scheduled seq groups
    * Repeat for both prefill- and decode-phase
    * Abort scheduled seq groups
    * Assert that aborted seq groups no longer appear in
      cross-attention block table
    '''

    block_size = 4
    num_seq_group = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
        task="generate",
        max_num_batched_tokens=64,
        max_num_seqs=num_seq_group,
        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
    cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group
    scheduler = Scheduler(scheduler_config, cache_config, None)
    running: List[SequenceGroup] = []

    # Add seq groups to scheduler.
    req_id_list = []
    for i in range(num_seq_group):
        req_id = str(i)
        req_id_list.append(req_id)
        _, _, seq_group = create_dummy_prompt_encoder_decoder(
            req_id, block_size, block_size, block_size)
        scheduler.add_seq_group(seq_group)
        running.append(seq_group)

    # Schedule seq groups prefill.
    num_tokens = block_size * num_seq_group
    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
    # - Verify that sequence group cross-attention block tables are
    #   registered with the block manager
    assert all(req_id in scheduler.block_manager.cross_block_tables
               for req_id in req_id_list)
    # - Validate sequence-group status
    assert set(get_sequence_groups(out)) == set(running)
    # - Validate number of batched tokens
    assert out.num_batched_tokens == num_tokens
    # - Validate there are no remaining blocks to swap
    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
            and not out.blocks_to_swap_out)
    # - Validate all seq groups were scheduled
    assert len(seq_group_meta_list) == num_seq_group
    append_new_token(out, 1)

    # Schedule seq groups decode.
    seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
    # - Verify that sequence group metadata includes encoder attention
    #   and cross-attention metadata
    assert all(seq_group_meta.encoder_seq_data is not None
               and seq_group_meta.cross_block_table is not None
               for seq_group_meta in seq_group_meta_list)
    # - Validate sequence-group status
    assert set(get_sequence_groups(out)) == set(running)
    # - Validate there is one batched token per seq group
    assert out.num_batched_tokens == num_seq_group
    # - Validate there are no remaining blocks to swap
    assert (not out.blocks_to_copy and not out.blocks_to_swap_in
            and not out.blocks_to_swap_out)
    # - Validate that all seq groups were scheduled
    assert len(seq_group_meta_list) == num_seq_group
    append_new_token(out, 1)

    # Abort sequences
    for req_id in req_id_list:
        scheduler.abort_seq_group(req_id)
        # - Verify that sequence group cross-attention block tables are
        #   NO LONGER registered with the block manager
        assert req_id not in scheduler.block_manager.cross_block_tables
|
||||
33
vllm-v0.6.2/tests/core/test_serialization.py
Normal file
33
vllm-v0.6.2/tests/core/test_serialization.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import msgspec
|
||||
|
||||
from vllm.executor.msgspec_utils import decode_hook, encode_hook
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
|
||||
from ..spec_decode.utils import create_batch
|
||||
|
||||
|
||||
def test_msgspec_serialization():
    """Round-trip an ``ExecuteModelRequest`` through msgspec msgpack.

    Encodes the request with ``encode_hook`` and decodes it with
    ``decode_hook``, then compares the list length and selected fields of
    the first sequence-group metadata entry against the original.
    """
    num_lookahead_slots = 4
    seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
    execute_model_req = ExecuteModelRequest(
        seq_group_metadata_list=seq_group_metadata_list,
        num_lookahead_slots=num_lookahead_slots,
        running_queue_size=4)

    encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
    decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
                                      dec_hook=decode_hook)
    req = decoder.decode(encoder.encode(execute_model_req))
    expected = execute_model_req.seq_group_metadata_list
    actual = req.seq_group_metadata_list
    assert len(expected) == len(actual)
    # Only the first entry is compared field by field.
    expected = expected[0]
    actual = actual[0]

    assert expected.block_tables == actual.block_tables
    assert expected.is_prompt == actual.is_prompt
    assert expected.request_id == actual.request_id
    assert (expected.seq_data[0].prompt_token_ids ==
            actual.seq_data[0].prompt_token_ids)
    assert (expected.seq_data[0].output_token_ids ==
            actual.seq_data[0].output_token_ids)
|
||||
205
vllm-v0.6.2/tests/core/utils.py
Normal file
205
vllm-v0.6.2/tests/core/utils.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Tuple
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.inputs import EncoderDecoderInputs, token_inputs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import Logprob, Sequence, SequenceGroup
|
||||
|
||||
|
||||
def create_dummy_prompt(
    request_id: str,
    prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
    best_of: int = 1,
    prompt_tokens: Optional[List[int]] = None,
    min_tokens: int = 0,
    max_tokens: int = 16,
) -> Tuple[Sequence, SequenceGroup]:
    """Build a single-sequence ``SequenceGroup`` with a dummy prompt.

    Args:
        request_id: Request id; must be parseable by ``int()`` because it is
            also used as the sequence id.
        prompt_length: Number of dummy tokens generated when
            ``prompt_tokens`` is not supplied; also the fallback block size.
        block_size: Block size for the sequence. A falsy value (``None`` or
            ``0``) falls back to ``prompt_length``.
        lora_request: Optional LoRA request attached to the group.
        best_of: Forwarded to ``SamplingParams``.
        prompt_tokens: Explicit prompt token ids; overrides the generated
            ``0 .. prompt_length - 1`` range.
        min_tokens: Forwarded to ``SamplingParams``.
        max_tokens: Forwarded to ``SamplingParams``.

    Returns:
        The prompt ``Sequence`` and the ``SequenceGroup`` wrapping it.
    """
    if not block_size:
        block_size = prompt_length

    if prompt_tokens is None:
        # Create dummy prompt tokens 0 ... prompt_length - 1; the prompt
        # string below is the same numbers joined by spaces.
        prompt_tokens = list(range(prompt_length))
    prompt_str = " ".join(str(t) for t in prompt_tokens)
    prompt = Sequence(int(request_id),
                      inputs=token_inputs(prompt_tokens, prompt=prompt_str),
                      block_size=block_size)
    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[prompt],
                              arrival_time=time.time(),
                              sampling_params=SamplingParams(
                                  best_of=best_of,
                                  max_tokens=max_tokens,
                                  min_tokens=min_tokens),
                              lora_request=lora_request)

    return prompt, seq_group
|
||||
|
||||
|
||||
def create_dummy_prompt_encoder_decoder(
    request_id: str,
    decoder_prompt_length: int,
    encoder_prompt_length: int,
    block_size: Optional[int] = None,
    lora_request: Optional[LoRARequest] = None,
    best_of: int = 1,
) -> Tuple[Sequence, Sequence, SequenceGroup]:
    """Build an encoder/decoder ``SequenceGroup`` with dummy prompts.

    Args:
        request_id: Request id; must be parseable by ``int()`` because it is
            used as the sequence id of both sequences.
        decoder_prompt_length: Number of decoder prompt tokens; also the
            fallback block size.
        encoder_prompt_length: Number of encoder prompt tokens.
        block_size: Block size for both sequences. A falsy value falls back
            to ``decoder_prompt_length``.
        lora_request: Optional LoRA request attached to the group.
        best_of: Forwarded to ``SamplingParams``.

    Returns:
        The decoder prompt ``Sequence``, the encoder prompt ``Sequence`` and
        the ``SequenceGroup`` holding the decoder sequence with the encoder
        sequence attached as ``encoder_seq``.
    """
    if not block_size:
        block_size = decoder_prompt_length

    # Decoder tokens count up (0 ... decoder_prompt_length - 1) and encoder
    # tokens count down (encoder_prompt_length - 1 ... 0). Each prompt
    # string is its token ids joined by spaces.
    decoder_prompt_tokens = list(range(decoder_prompt_length))
    decoder_prompt_str = " ".join(str(t) for t in decoder_prompt_tokens)
    encoder_prompt_tokens = list(reversed(range(encoder_prompt_length)))
    encoder_prompt_str = " ".join(str(t) for t in encoder_prompt_tokens)

    inputs: EncoderDecoderInputs = {
        "decoder": token_inputs(decoder_prompt_tokens,
                                prompt=decoder_prompt_str),
        "encoder": token_inputs(encoder_prompt_tokens,
                                prompt=encoder_prompt_str),
    }

    decoder_prompt = Sequence(int(request_id),
                              inputs=inputs["decoder"],
                              block_size=block_size)

    encoder_prompt = Sequence(int(request_id),
                              inputs=inputs["encoder"],
                              block_size=block_size)

    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[decoder_prompt],
                              sampling_params=SamplingParams(best_of=best_of),
                              arrival_time=time.time(),
                              lora_request=lora_request,
                              encoder_seq=encoder_prompt)

    return decoder_prompt, encoder_prompt, seq_group
|
||||
|
||||
|
||||
def create_seq_group(
        seq_prompt_len: int = 1024,
        seq_output_lens: GenericSequence[int] = (128, ),
        request_id: str = '0',
        seq_id_start: int = 0,
        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
    """Build a ``SequenceGroup`` whose sequences already hold output tokens.

    Args:
        seq_prompt_len: Length of the all-zero prompt shared by every
            sequence.
        seq_output_lens: One entry per sequence giving how many output
            tokens to append; must be non-empty.
        request_id: Request id of the group.
        seq_id_start: Sequence id of the first sequence; later ones count up.
        sampling_params: Sampling parameters; defaults to
            ``SamplingParams()``.

    Returns:
        The populated ``SequenceGroup``.
    """
    assert len(seq_output_lens) > 0

    if sampling_params is None:
        sampling_params = SamplingParams()

    prompt_token_ids = [0] * seq_prompt_len

    seqs: List[Sequence] = []
    for seq_id_offset, output_len in enumerate(seq_output_lens):
        seq = Sequence(
            seq_id=seq_id_start + seq_id_offset,
            inputs=token_inputs(prompt_token_ids),
            block_size=16,
        )

        # Output token ids are simply 0 ... output_len - 1.
        for i in range(output_len):
            seq.append_token_id(
                token_id=i,
                logprobs={i: Logprob(0.0)},
            )
        seqs.append(seq)

    seq_group = SequenceGroup(
        request_id=request_id,
        seqs=seqs,
        sampling_params=sampling_params,
        arrival_time=time.time(),
    )

    return seq_group
|
||||
|
||||
|
||||
def create_seq_group_encoder_decoder(
        seq_prompt_len: int = 1024,
        seq_output_lens: GenericSequence[int] = (128, ),
        request_id: str = '0',
        seq_id_start: int = 0,
        sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
    """Build an encoder/decoder ``SequenceGroup`` with pre-filled outputs.

    Args:
        seq_prompt_len: Length of the all-zero prompt used for both the
            decoder and the encoder inputs.
        seq_output_lens: One entry per decoder sequence giving how many
            output tokens to append; must be non-empty.
        request_id: Request id of the group.
        seq_id_start: Sequence id of the first decoder sequence.
        sampling_params: Sampling parameters; defaults to
            ``SamplingParams()``.

    Returns:
        A ``SequenceGroup`` holding the decoder sequences plus one encoder
        sequence whose id follows the last decoder sequence id.
    """
    assert len(seq_output_lens) > 0

    if sampling_params is None:
        sampling_params = SamplingParams()

    prompt_token_ids = [0] * seq_prompt_len

    inputs: EncoderDecoderInputs = {
        "decoder": token_inputs(prompt_token_ids),
        "encoder": token_inputs(prompt_token_ids),
    }

    seqs: List[Sequence] = []
    for seq_id_offset, output_len in enumerate(seq_output_lens):
        # Construct decoder input sequences
        seq = Sequence(
            seq_id=seq_id_start + seq_id_offset,
            inputs=inputs["decoder"],
            block_size=16,
        )

        # Output token ids are simply 0 ... output_len - 1.
        for i in range(output_len):
            seq.append_token_id(
                token_id=i,
                logprobs={i: Logprob(0.0)},
            )
        seqs.append(seq)

    # Encoder input sequence
    encoder_seq = Sequence(
        seq_id=seq_id_start + len(seq_output_lens),
        inputs=inputs["encoder"],
        block_size=16,
    )

    return SequenceGroup(request_id=request_id,
                         seqs=seqs,
                         sampling_params=sampling_params,
                         arrival_time=time.time(),
                         encoder_seq=encoder_seq)
|
||||
|
||||
|
||||
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    """Return how many blocks of ``block_size`` are needed for ``seq_len``.

    This is the ceiling of ``seq_len / block_size`` computed with integer
    arithmetic. Despite the name, the result is a block count, not a token
    length rounded up to a block boundary.
    """
    return (seq_len + block_size - 1) // block_size
|
||||
|
||||
|
||||
# Helper functions for scheduler tests
|
||||
|
||||
|
||||
def get_sequence_groups(scheduler_output):
    """Return the ``seq_group`` of every scheduled entry, in order.

    ``scheduler_output`` only needs a ``scheduled_seq_groups`` iterable
    whose items expose a ``seq_group`` attribute.
    """
    return [
        scheduled.seq_group
        for scheduled in scheduler_output.scheduled_seq_groups
    ]
|
||||
|
||||
|
||||
def append_new_token(out, token_id: int):
    """Append ``token_id`` to every sequence of every scheduled seq group.

    The token is recorded with a single-entry logprobs dict that maps
    ``token_id`` to ``Logprob(token_id)``.
    """
    for seq_group in get_sequence_groups(out):
        for seq in seq_group.get_seqs():
            seq.append_token_id(token_id, {token_id: Logprob(token_id)})
|
||||
|
||||
|
||||
def schedule_and_update_computed_tokens(scheduler):
    """Run one scheduling step and mark the scheduled chunks as computed.

    Calls ``scheduler.schedule()``, then for each scheduled seq group
    advances its computed-token count by the ``token_chunk_size`` of the
    matching metadata entry.

    Returns:
        The ``(metas, out)`` pair from ``scheduler.schedule()``; the third
        value it returns is discarded.
    """
    metas, out, _ = scheduler.schedule()
    # Scheduled groups and metadata entries are paired by position.
    for scheduled, meta in zip(out.scheduled_seq_groups, metas):
        scheduled.seq_group.update_num_computed_tokens(meta.token_chunk_size)
    return metas, out
|
||||
|
||||
|
||||
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
    """Advance ``seq_group`` by one chunk and append ``token_id`` to it.

    First adds ``token_chunk_size`` to the group's computed-token count,
    then appends ``token_id`` to each sequence returned by ``get_seqs()``
    with a logprobs dict mapping ``token_id`` to ``Logprob(token_id)``.
    """
    seq_group.update_num_computed_tokens(token_chunk_size)
    for seq in seq_group.get_seqs():
        seq.append_token_id(token_id, {token_id: Logprob(token_id)})
|
||||
Reference in New Issue
Block a user