forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/v1/__init__.py
Normal file
0
vllm-v0.6.2/tests/v1/__init__.py
Normal file
219
vllm-v0.6.2/tests/v1/core/test_prefix_caching.py
Normal file
219
vllm-v0.6.2/tests/v1/core/test_prefix_caching.py
Normal file
@@ -0,0 +1,219 @@
|
||||
"""Compare the with and without prefix caching."""
|
||||
from vllm.inputs import token_inputs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
|
||||
from vllm.v1.core.kv_cache_utils import hash_block_tokens
|
||||
|
||||
|
||||
def make_request(request_id, prompt_token_ids):
    """Build a minimal Request with fixed sampling params for cache tests."""
    # max_tokens / eos are arbitrary but fixed so block hashes are stable.
    sampling_params = SamplingParams(max_tokens=17)
    return Request(
        request_id=request_id,
        inputs=token_inputs(prompt_token_ids=prompt_token_ids),
        sampling_params=sampling_params,
        eos_token_id=100,
        arrival_time=0,
        lora_request=None,
    )
|
||||
|
||||
|
||||
def test_prefill():
    """Exercise prefix-cache miss/hit paths and the eviction order on prefill.

    Covers: full miss, hit while the cached blocks are in use, hit after the
    cached blocks were freed (lazy removal), and finally a miss that must
    evict everything.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        sliding_window=False,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    # Complete 3 blocks (48 tokens).
    common_token_ids = [i for i in range(3) for _ in range(16)]

    # Fully cache miss.
    # Incomplete 1 block (7 tokens).
    unique_token_ids = [3] * 7
    req0 = make_request("0", common_token_ids + unique_token_ids)
    computed_blocks = manager.get_computed_blocks(req0)
    assert not computed_blocks
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]

    # Check full block metadata.
    parent_block_hash = None
    for block_id in (0, 1, 2):
        block_hash = hash_block_tokens(parent_block_hash,
                                       manager.block_pool[block_id].token_ids)
        assert manager.block_pool[block_id].block_hash == block_hash
        assert manager.block_pool[block_id].ref_cnt == 1
        assert manager.block_pool[block_id].num_hashed_tokens == 16 * (
            block_id + 1)
        assert manager.block_pool[block_id].token_ids == tuple([block_id] * 16)
        parent_block_hash = block_hash

    # Check partial/preallocated block metadata.
    for block_id in (3, 4):
        assert manager.block_pool[block_id].block_hash is None
        assert manager.block_pool[block_id].ref_cnt == 1
        assert manager.block_pool[block_id].num_hashed_tokens == 0
        if block_id == 3:
            assert manager.block_pool[block_id].token_ids == [3] * 7
        else:
            assert not manager.block_pool[block_id].token_ids

    # Cache hit in the common prefix when the original block is still in use.
    # Incomplete 1 block (5 tokens).
    unique_token_ids = [3] * 5
    req1 = make_request("1", common_token_ids + unique_token_ids)
    computed_blocks = manager.get_computed_blocks(req1)
    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
    assert [b.block_id for b in blocks] == [5, 6]
    for block in computed_blocks:
        assert block.ref_cnt == 2

    # At this point, we should have 3 free blocks left.
    assert manager.free_block_queue.num_free_blocks == 3

    manager.free(req0)
    manager.free(req1)

    # All blocks should be available.
    assert manager.free_block_queue.num_free_blocks == 10
    # The order should be
    # [unallocated (7, 8, 9)]
    # [unique_req0 (4, 3)]
    # [unique_req1 (6, 5)]
    # [common (2, 1, 0)]
    assert [
        b.block_id for b in manager.free_block_queue.get_all_free_blocks()
    ] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]

    # Cache hit in the common prefix when the original block is already free.
    # Incomplete 1 block (6 tokens).
    unique_token_ids = [3] * 6
    req2 = make_request("2", common_token_ids + unique_token_ids)
    # BUG FIX: this was bound to `computed_block` while the stale
    # `computed_blocks` from req1 was passed to allocate_slots below.
    computed_blocks = manager.get_computed_blocks(req2)
    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
    assert [b.block_id for b in blocks] == [7, 8]

    # Although we only have 5 free blocks, we have 8 blocks in
    # the free block queue due to lazy removal.
    assert manager.free_block_queue.num_free_blocks == 5
    assert all(
        b.ref_cnt == 0 for b in manager.free_block_queue.get_all_free_blocks())
    assert len(list(manager.free_block_queue.get_all_free_blocks())) == 5

    manager.free(req2)

    # Cache miss and eviction.
    req3 = make_request("3", [99] * (16 * 9))
    computed_blocks = manager.get_computed_blocks(req3)
    assert not computed_blocks
    # BUG FIX: allocate for req3 (this previously allocated for the freed
    # req2, while the computed blocks came from req3).
    blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
    # This block ID order also checks the eviction order.
    assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
    assert manager.free_block_queue.num_free_blocks == 0
    assert manager.free_block_queue.free_list_head is None
    assert manager.free_block_queue.free_list_tail is None
|
||||
|
||||
|
||||
def test_decode():
    """Check append_slots across the partial, preallocated, and new blocks."""
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        sliding_window=False,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    # Prompt: 3 complete blocks (48 tokens) plus a 7-token partial block.
    common_token_ids = [i for i in range(3) for _ in range(16)]
    unique_token_ids = [3] * 7
    req0 = make_request("0", common_token_ids + unique_token_ids)
    computed_blocks = manager.get_computed_blocks(req0)
    assert not computed_blocks
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]

    # Append slots without allocating a new block.
    req0.num_computed_tokens = 55
    for _ in range(4):
        req0.append_output_token_ids(8)
    new_blocks = manager.append_slots(req0, 4)
    assert new_blocks is not None and len(new_blocks) == 0
    assert len(manager.block_pool[3].token_ids) == 11

    # Append slots without allocating a new block, but start using the
    # preallocated block: 5 tokens complete the partial block 3, the
    # remaining 10 land in preallocated block 4.
    req0.num_computed_tokens = 59
    for _ in range(5 + 10):
        req0.append_output_token_ids(7)
    new_blocks = manager.append_slots(req0, 15)
    assert new_blocks is not None and len(new_blocks) == 0
    assert len(manager.block_pool[3].token_ids) == 16
    assert len(manager.block_pool[4].token_ids) == 10

    # Append slots with allocating a new block: 6 tokens complete block 4,
    # the remaining 11 need a fresh block (plus one more preallocated).
    req0.num_computed_tokens = 74
    for _ in range(6 + 11):
        req0.append_output_token_ids(12)
    new_blocks = manager.append_slots(req0, 17)
    # One block for the overflow tokens plus one preallocated block.
    assert new_blocks is not None and len(new_blocks) == 2
    assert len(manager.block_pool[4].token_ids) == 16
    assert len(manager.block_pool[5].token_ids) == 11
    assert len(manager.block_pool[6].token_ids) == 0
|
||||
|
||||
|
||||
def test_evict():
    """Verify the LRU ordering of the free block queue after frees/hits."""
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        sliding_window=False,
        enable_caching=True,
        num_preallocate_tokens=16,
    )

    # First request: 5 full blocks plus a 7-token partial block.
    last_token_id = 5 * 16 + 7
    req0 = make_request("0", list(range(last_token_id)))
    cached = manager.get_computed_blocks(req0)
    assert not cached
    allocated = manager.allocate_slots(req0, 5 * 16 + 7, cached)
    assert len(allocated) == 7  # 5 full + 1 partial + 1 preallocated

    # Second request: 3 full blocks of fresh tokens.
    req1 = make_request("1", list(range(last_token_id,
                                        last_token_id + 3 * 16)))
    cached = manager.get_computed_blocks(req1)
    assert not cached
    allocated = manager.allocate_slots(req1, 3 * 16, cached)
    assert len(allocated) == 3  # 3 full blocks
    last_token_id += 3 * 16

    # Every GPU block is now in use.
    assert manager.free_block_queue.num_free_blocks == 0

    manager.free(req0)
    manager.free(req1)
    assert manager.free_block_queue.num_free_blocks == 10
    assert [
        b.block_id for b in manager.free_block_queue.get_all_free_blocks()
    ] == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]

    # Touch the first 2 blocks via a prefix hit.
    req2 = make_request("2", list(range(2 * 16 + 3)))
    cached = manager.get_computed_blocks(req2)
    assert [b.block_id for b in cached] == [0, 1]
    allocated = manager.allocate_slots(req2, 3, cached)
    assert [b.block_id for b in allocated] == [6, 5]
    assert manager.free_block_queue.num_free_blocks == 6
|
||||
0
vllm-v0.6.2/tests/v1/engine/__init__.py
Normal file
0
vllm-v0.6.2/tests/v1/engine/__init__.py
Normal file
66
vllm-v0.6.2/tests/v1/engine/test_async_llm.py
Normal file
66
vllm-v0.6.2/tests/v1/engine/test_async_llm.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import asyncio
|
||||
from typing import Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||
allow_module_level=True)
|
||||
|
||||
ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
|
||||
disable_log_requests=True)
|
||||
|
||||
|
||||
async def generate(engine: AsyncLLM, request_id: str,
                   max_tokens: int) -> Tuple[int, str]:
    """Drive one request to completion and count the streamed outputs."""
    num_outputs = 0
    sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0)
    async for _ in engine.generate(request_id=request_id,
                                   prompt="Hello my name is Robert and",
                                   sampling_params=sampling_params):
        num_outputs += 1
        # Yield control so the other request tasks can make progress.
        await asyncio.sleep(0.)

    return num_outputs, request_id
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_load(monkeypatch):
    """Fan out many concurrent requests; each must stream the full count."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)

        NUM_REQUESTS = 10000
        NUM_EXPECTED_TOKENS = 10

        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]

        # Launch all requests concurrently.
        tasks = [
            asyncio.create_task(
                generate(engine, request_id, NUM_EXPECTED_TOKENS))
            for request_id in request_ids
        ]

        # Confirm that we got all the EXPECTED tokens from the requests.
        # Only the first mismatch is remembered for the assert message.
        failed_request_id = None
        tokens = None
        for task in tasks:
            num_generated_tokens, request_id = await task
            if (num_generated_tokens != NUM_EXPECTED_TOKENS
                    and failed_request_id is None):
                failed_request_id = request_id
                tokens = num_generated_tokens

        assert failed_request_id is None, (
            f"{failed_request_id} generated {tokens} but "
            f"expected {NUM_EXPECTED_TOKENS}")

        engine.shutdown()
|
||||
205
vllm-v0.6.2/tests/v1/engine/test_detokenizer.py
Normal file
205
vllm-v0.6.2/tests/v1/engine/test_detokenizer.py
Normal file
@@ -0,0 +1,205 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
from vllm.v1.engine import EngineCoreOutput
|
||||
from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest
|
||||
|
||||
TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
|
||||
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
|
||||
|
||||
FULL_STRINGS = [
|
||||
"My name is Robert from Neural Magic and I love working on vLLM so much!",
|
||||
"Red Hat is the best open source company by far across Linux, K8s, and AI.",
|
||||
"Nick is the name of my brother in addition to my colleague from Red Hat.",
|
||||
]
|
||||
|
||||
STOP_STRINGS = ["I love working on", "company by far", "brother in"]
|
||||
|
||||
FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS]
|
||||
PROMPT_LEN = 5
|
||||
PROMPT_TOKENS = [
|
||||
tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS
|
||||
]
|
||||
GENERATION_TOKENS = [
|
||||
tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS
|
||||
]
|
||||
PROMPT_STRINGS = [
|
||||
tokenizer.decode(prompt_tokens, skip_special_tokens=True)
|
||||
for prompt_tokens in PROMPT_TOKENS
|
||||
]
|
||||
PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS]
|
||||
GENERATION_STRINGS = [
|
||||
text[prompt_len:]
|
||||
for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN)
|
||||
]
|
||||
|
||||
|
||||
class MockEngineCore:
    """Mock EngineCore that replays premade per-request token lists."""

    def __init__(self, tokens_list: List[List[int]]):
        # One token list per request; position i is emitted on call i.
        self.tokens_list = tokens_list
        self.current_idx = 0

    def get_outputs(self) -> List[EngineCoreOutput]:
        """Emit the next token for every request that still has one left."""
        token_idx = self.current_idx
        self.current_idx += 1

        outputs = []
        for req_idx, token_ids in enumerate(self.tokens_list):
            if token_idx < len(token_ids):
                output = EngineCoreOutput(request_id=f"request-{req_idx}",
                                          new_token_ids=[token_ids[token_idx]],
                                          finished=False)
                # Mark the final token of each request as finished.
                if token_idx == len(token_ids) - 1:
                    output.finished = True
                    output.finish_reason = "stopped"
                outputs.append(output)

        return outputs
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "request_output_kind",
    [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind):
    """Detokenizer must reproduce the reference text/tokens per request."""
    detokenizer = Detokenizer(TOKENIZER_NAME)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    # Make N requests, one per reference prompt.
    requests = [
        DetokenizerRequest(
            request_id=f"request-{idx}",
            prompt=prompt,
            prompt_token_ids=prompt_tokens,
            skip_special_tokens=False,
            spaces_between_special_tokens=False,
            output_kind=request_output_kind,
            stop=[],
            include_stop_str_in_output=False,
        ) for idx, (
            prompt,
            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Add requests to the detokenizer.
    for request in requests:
        detokenizer.add_request(request)

    gen_strings = {}
    gen_tokens = {}
    while True:
        # Mock output from the EngineCore.
        outputs = engine_core.get_outputs()
        if not outputs:
            break

        # Step the Detokenizer; nothing should be aborted here.
        request_outputs, requests_to_abort = detokenizer.step(outputs)
        assert not requests_to_abort

        # Accumulate text and tokens per request id.
        for request_output in request_outputs:
            request_id = request_output.request_id
            new_text = request_output.outputs[0].text
            new_tokens = request_output.outputs[0].token_ids
            if request_id in gen_strings:
                gen_strings[request_id] += new_text
                gen_tokens[request_id].extend(new_tokens)
            else:
                gen_strings[request_id] = new_text
                gen_tokens[request_id] = new_tokens

    # Confirm the tracked values match the references.
    for idx, (ref_gen_str, ref_gen_toks) in enumerate(
            zip(GENERATION_STRINGS, GENERATION_TOKENS)):
        gen_str = gen_strings[f"request-{idx}"]
        gen_toks = gen_tokens[f"request-{idx}"]

        assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
        assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"

    assert detokenizer.get_num_unfinished_requests() == 0
    assert not detokenizer.has_unfinished_requests()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
def test_stop_string(include_stop_str_in_output: bool):
    """Stop strings must truncate the output and abort the request."""
    detokenizer = Detokenizer(TOKENIZER_NAME)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    # Make N requests, each carrying the full stop-string list.
    requests = [
        DetokenizerRequest(
            request_id=f"request-{idx}",
            prompt=prompt,
            prompt_token_ids=prompt_tokens,
            skip_special_tokens=False,
            spaces_between_special_tokens=False,
            output_kind=RequestOutputKind.DELTA,
            stop=STOP_STRINGS,
            include_stop_str_in_output=include_stop_str_in_output,
        ) for idx, (
            prompt,
            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Add requests to the detokenizer.
    for request in requests:
        detokenizer.add_request(request)

    gen_strings = {}
    aborted = []
    while True:
        # Mock output from the EngineCore.
        outputs = engine_core.get_outputs()
        if not outputs:
            break

        # Step the Detokenizer.
        request_outputs, requests_to_abort = detokenizer.step(outputs)

        # An already-aborted request must not produce further outputs.
        for request_output in request_outputs:
            assert request_output.request_id not in aborted
        aborted.extend(requests_to_abort)

        # Accumulate text per request id.
        for request_output in request_outputs:
            if request_output.finished:
                assert request_output.outputs[0].finish_reason == "stop"

            request_id = request_output.request_id
            new_text = request_output.outputs[0].text
            if request_id in gen_strings:
                gen_strings[request_id] += new_text
            else:
                gen_strings[request_id] = new_text

    # Confirm the tracked values match the references.
    for idx, (ref_gen_str,
              stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)):

        # Request should be aborted.
        request_id = f"request-{idx}"
        assert request_id in aborted

        # Collected values that were generated.
        gen_str = gen_strings[request_id]

        # Build both reference variants around the stop string.
        stop_str_idx = ref_gen_str.find(stop_str)
        ref_str_exc_stop = ref_gen_str[:stop_str_idx]
        ref_str_inc_stop = ref_str_exc_stop + stop_str

        if include_stop_str_in_output:
            assert gen_str == ref_str_inc_stop, (
                f"{gen_str=}, {ref_str_inc_stop=}")
        else:
            assert gen_str == ref_str_exc_stop, (
                f"{gen_str=}, {ref_str_exc_stop=}")

    assert detokenizer.get_num_unfinished_requests() == 0
    assert not detokenizer.has_unfinished_requests()
|
||||
140
vllm-v0.6.2/tests/v1/engine/test_engine_core.py
Normal file
140
vllm-v0.6.2/tests/v1/engine/test_engine_core.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import time
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
from vllm.v1.engine.core import EngineCore
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||
allow_module_level=True)
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
PROMPT = "Hello my name is Robert and I love quantization kernels"
|
||||
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
||||
|
||||
|
||||
def make_request() -> EngineCoreRequest:
    """Build an EngineCoreRequest over the shared test prompt."""
    return EngineCoreRequest(
        # FIX: use a string id for consistency — the sibling client test
        # passes str(uuid.uuid4()) and requests are aborted by this exact
        # request_id value.
        request_id=str(uuid.uuid4()),
        prompt=PROMPT,
        prompt_token_ids=PROMPT_TOKENS,
        mm_data=None,
        mm_placeholders=None,
        mm_processor_kwargs=None,
        sampling_params=SamplingParams(),
        eos_token_id=None,
        arrival_time=time.time(),
        lora_request=None,
    )
|
||||
|
||||
|
||||
def test_engine_core(monkeypatch):
    """Exercise the EngineCore request lifecycle and abort handling."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        # Set up the EngineCore.
        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config()
        executor_class = AsyncLLM._get_executor_cls(vllm_config)

        engine_core = EngineCore(vllm_config=vllm_config,
                                 executor_class=executor_class,
                                 usage_context=UsageContext.UNKNOWN_CONTEXT)

        def assert_queues(num_waiting, num_running):
            # Single place to check both scheduler queues.
            assert len(engine_core.scheduler.waiting) == num_waiting
            assert len(engine_core.scheduler.running) == num_running

        # --- Basic request lifecycle. ---

        # First request.
        engine_core.add_request(make_request())
        assert_queues(1, 0)

        _ = engine_core.step()
        assert_queues(0, 1)

        # Second request.
        engine_core.add_request(make_request())
        assert_queues(1, 1)

        _ = engine_core.step()
        assert_queues(0, 2)

        # Add two requests in a row.
        engine_core.add_request(make_request())
        engine_core.add_request(make_request())
        assert_queues(2, 2)

        _ = engine_core.step()
        assert_queues(0, 4)

        # Loop through until they are all done.
        while len(engine_core.step()) > 0:
            pass

        assert_queues(0, 0)

        # --- Abort cycle. ---

        # Basic abort.
        req = make_request()
        request_id = req.request_id

        engine_core.add_request(req)
        assert_queues(1, 0)

        _ = engine_core.step()
        assert_queues(0, 1)

        engine_core.abort_requests([request_id])
        assert_queues(0, 0)

        # Add, step, abort 1 of the 3.
        req0 = make_request()
        req1 = make_request()
        req2 = make_request()

        engine_core.add_request(req0)
        engine_core.add_request(req1)
        assert_queues(2, 0)

        _ = engine_core.step()
        assert_queues(0, 2)

        engine_core.add_request(req2)
        assert_queues(1, 2)

        _ = engine_core.step()
        assert_queues(0, 3)

        # Abort just one.
        engine_core.abort_requests([req1.request_id])
        assert_queues(0, 2)

        _ = engine_core.step()
        assert_queues(0, 2)

        # Abort the other requests at the same time.
        engine_core.abort_requests([req2.request_id, req0.request_id])
        assert_queues(0, 0)
|
||||
205
vllm-v0.6.2/tests/v1/engine/test_engine_core_client.py
Normal file
205
vllm-v0.6.2/tests/v1/engine/test_engine_core_client.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import asyncio
|
||||
import time
|
||||
import uuid
|
||||
from typing import Dict, List
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
from vllm.v1.engine.core_client import EngineCoreClient
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||
allow_module_level=True)
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
PROMPT = "Hello my name is Robert and I love quantization kernels"
|
||||
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
|
||||
|
||||
|
||||
def make_request(params: SamplingParams) -> EngineCoreRequest:
    """Build an EngineCoreRequest over the shared prompt with *params*.

    A fresh string UUID is used as the request_id so that requests can be
    individually aborted by id later in the tests.
    """
    new_request_id = str(uuid.uuid4())
    return EngineCoreRequest(
        request_id=new_request_id,
        prompt=PROMPT,
        prompt_token_ids=PROMPT_TOKENS,
        mm_data=None,
        mm_placeholders=None,
        mm_processor_kwargs=None,
        sampling_params=params,
        eos_token_id=None,
        arrival_time=time.time(),
        lora_request=None,
    )
|
||||
|
||||
|
||||
def loop_until_done(client: EngineCoreClient, outputs: Dict):
    """Poll the client, collecting outputs per request id, until every
    in-flight request reports finished (or the client returns nothing)."""
    while True:
        engine_core_outputs = client.get_output()

        # Nothing in flight anymore.
        if not engine_core_outputs:
            break

        finished = True
        for engine_core_output in engine_core_outputs:
            outputs[engine_core_output.request_id].append(engine_core_output)
            finished = finished and engine_core_output.finished

        if finished:
            break
|
||||
|
||||
|
||||
async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
    """Async variant of loop_until_done: poll the client, collecting outputs
    per request id, until every in-flight request reports finished."""
    while True:
        engine_core_outputs = await client.get_output_async()

        # Nothing in flight anymore.
        if not engine_core_outputs:
            break

        finished = True
        for engine_core_output in engine_core_outputs:
            outputs[engine_core_output.request_id].append(engine_core_output)
            finished = finished and engine_core_output.finished

        if finished:
            break
|
||||
|
||||
|
||||
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
    """Sync client: normal cycle, mid-stream aborts, abort after finish."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config()
        executor_class = AsyncLLM._get_executor_cls(vllm_config)
        client = EngineCoreClient.make_client(
            vllm_config,
            executor_class,
            UsageContext.UNKNOWN_CONTEXT,
            multiprocess_mode=multiprocessing_mode,
            asyncio_mode=False,
        )

        MAX_TOKENS = 20
        params = SamplingParams(max_tokens=MAX_TOKENS)

        # --- Normal request cycle. ---
        requests = [make_request(params) for _ in range(10)]
        request_ids = [req.request_id for req in requests]

        # Add requests to the engine.
        for request in requests:
            client.add_request(request)
            time.sleep(0.01)

        outputs: Dict[str, List] = {req_id: [] for req_id in request_ids}
        loop_until_done(client, outputs)

        # Every request must have streamed exactly MAX_TOKENS outputs.
        for req_id in request_ids:
            assert len(outputs[req_id]) == MAX_TOKENS, (
                f"{outputs[req_id]=}, {MAX_TOKENS=}")

        # --- Abort request cycle. ---
        # Note: this code pathway will only work for multiprocessing
        # since we have to call get_output() explicitly.

        # Add requests, aborting every other one right after submission.
        for idx, request in enumerate(requests):
            client.add_request(request)
            time.sleep(0.01)
            if idx % 2 == 0:
                client.abort_requests([request.request_id])

        outputs = {req_id: [] for req_id in request_ids}
        loop_until_done(client, outputs)

        # Aborted requests stream fewer tokens; the rest stream them all.
        for idx, req_id in enumerate(request_ids):
            if idx % 2 == 0:
                assert len(outputs[req_id]) < MAX_TOKENS, (
                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
            else:
                assert len(outputs[req_id]) == MAX_TOKENS, (
                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")

        # --- Abort after the request is already finished. ---
        # Note: this code pathway will only work for multiprocessing
        # since we have to call get_output() explicitly.
        request = requests[0]
        client.add_request(request)
        time.sleep(10.)

        client.abort_requests([request.request_id])

        # Shutdown the client.
        client.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_engine_core_client_asyncio(monkeypatch):
    """Asyncio client: normal cycle and mid-stream aborts."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config()
        executor_class = AsyncLLM._get_executor_cls(vllm_config)
        client = EngineCoreClient.make_client(
            vllm_config,
            executor_class,
            UsageContext.UNKNOWN_CONTEXT,
            multiprocess_mode=True,
            asyncio_mode=True,
        )

        MAX_TOKENS = 20
        params = SamplingParams(max_tokens=MAX_TOKENS)

        # --- Normal request cycle. ---
        requests = [make_request(params) for _ in range(10)]
        request_ids = [req.request_id for req in requests]

        # Add requests to the engine.
        for request in requests:
            await client.add_request_async(request)
            await asyncio.sleep(0.01)

        outputs: Dict[str, List] = {req_id: [] for req_id in request_ids}
        await loop_until_done_async(client, outputs)

        # Every request must have streamed exactly MAX_TOKENS outputs.
        for req_id in request_ids:
            assert len(outputs[req_id]) == MAX_TOKENS, (
                f"{outputs[req_id]=}, {MAX_TOKENS=}")

        # --- Abort request cycle. ---

        # Add requests, aborting every other one right after submission.
        for idx, request in enumerate(requests):
            await client.add_request_async(request)
            await asyncio.sleep(0.01)
            if idx % 2 == 0:
                await client.abort_requests_async([request.request_id])

        outputs = {req_id: [] for req_id in request_ids}
        await loop_until_done_async(client, outputs)

        # Aborted requests stream fewer tokens; the rest stream them all.
        for idx, req_id in enumerate(request_ids):
            if idx % 2 == 0:
                assert len(outputs[req_id]) < MAX_TOKENS, (
                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
            else:
                assert len(outputs[req_id]) == MAX_TOKENS, (
                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")

        # Shutdown the client.
        client.shutdown()
|
||||
Reference in New Issue
Block a user