add qwen3
0    vllm-v0.6.2/tests/__init__.py (new file)
0    vllm-v0.6.2/tests/async_engine/__init__.py (new file)
51   vllm-v0.6.2/tests/async_engine/api_server_async_engine.py (new file)
@@ -0,0 +1,51 @@
"""vllm.entrypoints.api_server with some extra logging for testing."""
from typing import Any, Dict, Iterable

import uvicorn
from fastapi.responses import JSONResponse, Response

import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser

app = vllm.entrypoints.api_server.app


class AsyncLLMEngineWithStats(AsyncLLMEngine):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0

    async def _engine_abort(self, request_ids: Iterable[str]):
        ids = list(request_ids)
        self._num_aborts += len(ids)
        await super()._engine_abort(ids)

    def testing_stats(self) -> Dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}


@app.get("/stats")
def stats() -> Response:
    """Get the statistics of the engine."""
    return JSONResponse(engine.testing_stats())


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
    vllm.entrypoints.api_server.engine = engine
    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="debug",
        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
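As a quick usage sketch (not taken from this commit; the host, port, and prompt below are assumptions for a locally running instance of the script above), the extra /stats endpoint can be exercised like this:

import requests

BASE = "http://localhost:8000"  # assumed server address

# send one request through the stock /generate endpoint
requests.post(f"{BASE}/generate",
              json={"prompt": "hello", "max_tokens": 4}).raise_for_status()

# read the abort counter exposed by AsyncLLMEngineWithStats
print(requests.get(f"{BASE}/stats").json()["num_aborted_requests"])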
109  vllm-v0.6.2/tests/async_engine/test_api_server.py (new file)
@@ -0,0 +1,109 @@
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path

import pytest
import requests

from vllm.utils import get_open_port

port = get_open_port()


def _query_server(prompt: str, max_tokens: int = 5) -> dict:
    response = requests.post(f"http://localhost:{port}/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": max_tokens,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
    response.raise_for_status()
    return response.json()


def _query_server_long(prompt: str) -> dict:
    return _query_server(prompt, max_tokens=500)


@pytest.fixture
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    commands = [
        sys.executable, "-u",
        str(script_path), "--model", "facebook/opt-125m", "--host",
        "127.0.0.1", "--port", f"{port}", "--tokenizer-pool-size",
        str(tokenizer_pool_size)
    ]

    if worker_use_ray:
        commands.append("--worker-use-ray")
    uvicorn_process = subprocess.Popen(commands)
    yield
    uvicorn_process.terminate()
    time.sleep(10)


@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("worker_use_ray", [False, True])
def test_api_server(api_server, tokenizer_pool_size: int,
                    worker_use_ray: bool):
    """
    Run the API server and test it.

    We run both the server and requests in separate processes.

    We test that the server can handle incoming requests, including
    multiple requests at the same time, and that it can handle requests
    being cancelled without crashing.
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["warm up"] * 1
        result = None
        while not result:
            try:
                for r in pool.map(_query_server, prompts):
                    result = r
                    break
            except requests.exceptions.ConnectionError:
                time.sleep(1)

        # Actual tests start here
        # Try with 1 prompt
        for result in pool.map(_query_server, prompts):
            assert result

        num_aborted_requests = requests.get(
            f"http://localhost:{port}/stats").json()["num_aborted_requests"]
        assert num_aborted_requests == 0

        # Try with 100 prompts
        prompts = ["test prompt"] * 100
        for result in pool.map(_query_server, prompts):
            assert result

    with Pool(32) as pool:
        # Cancel requests
        prompts = ["canceled requests"] * 100
        pool.map_async(_query_server_long, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()

    # check cancellation stats
    # give it some time to update the stats
    time.sleep(1)

    num_aborted_requests = requests.get(
        f"http://localhost:{port}/stats").json()["num_aborted_requests"]
    assert num_aborted_requests > 0

    # check that the server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["test prompt after canceled"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
374  vllm-v0.6.2/tests/async_engine/test_async_llm_engine.py (new file)
@@ -0,0 +1,374 @@
import asyncio
import os
import uuid
from asyncio import CancelledError
from copy import copy
from dataclasses import dataclass
from typing import List, Optional

import pytest
import pytest_asyncio
import torch

from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput
from vllm.sampling_params import RequestOutputKind

from ..utils import wait_for_gpu_memory_to_clear


@dataclass
class RequestOutput:
    request_id: int
    finished: bool = False


@dataclass
class MockModelConfig:
    use_async_output_proc = True


class MockEngine:

    def __init__(self):
        self.step_calls = 0
        self.add_request_calls = 0
        self.abort_request_calls = 0
        self.request_id = None
        # Ugly, remove dependency when possible
        self.parallel_config = ParallelConfig(1, 1, False)
        self.model_config = MockModelConfig()

    async def step_async(self, virtual_engine):
        # PP size is 1, ignore virtual engine
        self.step_calls += 1
        return [RequestOutput(
            request_id=self.request_id)] if self.request_id else []

    async def process_model_inputs_async(self, *args, **kwargs):
        pass

    async def stop_remote_worker_execution_loop_async(self):
        pass

    def generate(self, request_id):
        self.request_id = request_id

    def stop_generating(self):
        self.request_id = None

    def add_request(self, **kwargs):
        del kwargs  # Unused
        self.add_request_calls += 1
        print(f'Request calls: {self.add_request_calls}')

    async def add_request_async(self, **kwargs):
        self.add_request_calls += 1
        return

    def abort_request(self, request_id):
        del request_id  # Unused
        self.abort_request_calls += 1

    def has_unfinished_requests(self):
        return self.request_id is not None

    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
        return self.request_id is not None


class MockAsyncLLMEngine(AsyncLLMEngine):
    _engine_class = MockEngine


@pytest.mark.asyncio
async def test_new_requests_event():
    params = SamplingParams()

    engine = MockAsyncLLMEngine()
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0

    await engine.add_request("1", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 1
    assert engine.engine.step_calls == 1

    await engine.add_request("2", "", params)
    engine.engine.generate("2")
    await asyncio.sleep(0)
    await asyncio.sleep(0)
    await asyncio.sleep(0)
    assert engine.engine.add_request_calls == 2
    assert engine.engine.step_calls >= 2
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls >= 3
    engine.engine.stop_generating()
    await asyncio.sleep(0.001)
    old_step_calls = engine.engine.step_calls
    await asyncio.sleep(0.001)
    assert engine.engine.step_calls == old_step_calls

    await engine.add_request("3", "", params)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1

    engine = MockAsyncLLMEngine()
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None


def start_engine():
    wait_for_gpu_memory_to_clear(
        devices=list(range(torch.cuda.device_count())),
        threshold_bytes=2 * 2**30,
        timeout_s=60,
    )

    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")

    return AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m",
                        enforce_eager=True,
                        num_scheduler_steps=num_scheduler_steps))


def uid() -> str:
    return str(uuid.uuid4())


@pytest_asyncio.fixture(scope="module")
async def async_engine():
    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
                                                            func=start_engine)
    try:
        yield engine
    finally:
        engine.shutdown_background_loop()
        del engine
        await asyncio.sleep(0.1)
        cleanup_dist_env_and_memory()


@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    # So we can share the async engine fixture between these tests
    return False


@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_asyncio_run(async_engine, stop):

    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps

    async def run(prompt: str):
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=32,
            min_tokens=32,
            stop=stop,
        )

        output_count = 0
        final_output = None
        async for output in async_engine.generate(prompt,
                                                  sampling_params,
                                                  request_id=uid()):
            output_count += 1
            final_output = output
        return final_output, output_count

    results = await asyncio.gather(
        run("test0"),
        run("test0"),
    )
    assert len(results) == 2
    first, second = results

    # remove nondeterministic fields for comparison
    first[0].metrics = None
    second[0].metrics = None
    first[0].request_id = None
    second[0].request_id = None

    assert str(first) == str(second)

    output_count = results[0][1]
    if num_scheduler_steps == 1:
        assert output_count == 32
    else:
        assert 1 < output_count < 32


@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_output_kinds(async_engine, stop):
    """Test that output_kind works as expected and that
    results are equivalent across different kinds."""

    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps

    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=32,
        min_tokens=32,
        stop=stop,
    )

    async def run(prompt: str, kind: RequestOutputKind):
        params = copy(sampling_params)
        params.output_kind = kind

        output_count = 0
        final_output = None
        async for output in async_engine.generate(prompt,
                                                  params,
                                                  request_id=uid()):
            output_count += 1
            final_output = output

        assert final_output is not None
        assert final_output.finished

        return (final_output.prompt_token_ids,
                final_output.outputs[0].token_ids,
                final_output.outputs[0].text, output_count)

    async def run_deltas(prompt: str):
        params = copy(sampling_params)
        params.output_kind = RequestOutputKind.DELTA

        prompt_tokens = None
        output_tokens: List[int] = []
        output_text = ""
        output_count = 0
        final_output = None
        async for output in async_engine.generate(prompt,
                                                  params,
                                                  request_id=uid()):
            token_ids = output.outputs[0].token_ids
            text = output.outputs[0].text
            final_output = output

            # Ensure we get prompt ids iff we haven't yet received output tokens
            if output_tokens:
                assert 1 <= len(token_ids) <= num_scheduler_steps
                assert stop or text
                assert not output.prompt_token_ids
            else:
                assert output.prompt_token_ids
                prompt_tokens = output.prompt_token_ids

            output_tokens.extend(token_ids)
            output_text += text

            output_count += 1

        assert final_output is not None
        assert final_output.finished

        return prompt_tokens, output_tokens, output_text, output_count

    results = await asyncio.gather(
        run("common input prompt", RequestOutputKind.CUMULATIVE),
        run("common input prompt", RequestOutputKind.FINAL_ONLY),
        run_deltas("common input prompt"))

    # Make sure outputs are the same
    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
    assert len(prompt_set) == 1

    text_set = set(text for _, _, text, _ in results)
    assert len(text_set) == 1

    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
    assert len(tokens_set) == 1

    cumulative, final, deltas = results

    # output message counts
    assert cumulative[3] == deltas[3]

    if num_scheduler_steps == 1:
        assert cumulative[3] == 32
    else:
        assert 1 < cumulative[3] < 32

    assert final[3] == 1


@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_cancellation(async_engine, stop):
    scheduler_config = await async_engine.get_scheduler_config()
    num_scheduler_steps = scheduler_config.num_scheduler_steps

    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=13,
        max_tokens=13,
        stop=stop,
    )

    stop_at = 5 if num_scheduler_steps == 1 else 1

    request_id = uid()

    i = 0
    with pytest.raises(CancelledError):
        async for output in async_engine.generate("test2",
                                                  sampling_params,
                                                  request_id=request_id):
            assert not output.finished
            i += 1
            if i == stop_at:
                await async_engine.abort(request_id)

    assert i == stop_at


@pytest.mark.asyncio(scope="module")
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
async def test_delayed_generator(async_engine, stop):
    scheduler_config = await async_engine.get_scheduler_config()

    if scheduler_config.num_scheduler_steps != 1:
        pytest.skip("no need to test this one with multistep")

    sampling_params = SamplingParams(
        temperature=0,
        min_tokens=10,
        max_tokens=10,
        stop=stop,
    )

    stream = async_engine.generate("test3", sampling_params, request_id=uid())
    i = 0
    final_output: Optional[RealRequestOutput] = None
    async for output in stream:
        final_output = output
        if i == 0:
            # wait for generation to complete before consuming
            # the remaining messages
            await asyncio.sleep(1)
        if i < 9:
            assert not output.finished
        i += 1

    assert i == 10
    assert final_output is not None
    assert len(final_output.outputs[0].token_ids) == 10
    assert final_output.finished
106  vllm-v0.6.2/tests/async_engine/test_openapi_server.py (new file)
@@ -0,0 +1,106 @@
import openai  # use the official client for correctness check
import pytest
import pytest_asyncio

from ..utils import VLLM_PATH, RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--chat-template",
        str(chatml_jinja_path),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    models = await client.models.list()
    models = models.data
    served_model = models[0]
    assert served_model.id == MODEL_NAME
    assert all(model.root == MODEL_NAME for model in models)


@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    completion = await client.completions.create(model=MODEL_NAME,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert len(completion.choices) == 1
    assert len(completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 5


@pytest.mark.asyncio
async def test_single_chat_session(client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "what is 1+1?"
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=55, total_tokens=65)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
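For context (not part of the commit), the same OpenAI-compatible server can also be exercised with the synchronous client; the base URL and API key below are illustrative placeholders for a locally running vLLM server, which typically does not validate the key unless one is configured:

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1",  # assumed local endpoint
                       api_key="EMPTY")
completion = client.completions.create(model="facebook/opt-125m",
                                       prompt="Hello, my name is",
                                       max_tokens=5,
                                       temperature=0.0)
print(completion.choices[0].text)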
68   vllm-v0.6.2/tests/async_engine/test_request_tracker.py (new file)
@@ -0,0 +1,68 @@
import pytest

from vllm.engine.async_llm_engine import RequestTracker
from vllm.outputs import RequestOutput


@pytest.mark.asyncio
async def test_request_tracker():
    tracker = RequestTracker()
    stream_1 = tracker.add_request("1")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 1
    assert new[0]["request_id"] == "1"
    assert not aborted
    assert not stream_1.finished

    stream_2 = tracker.add_request("2")
    stream_3 = tracker.add_request("3")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert len(new) == 2
    assert new[0]["request_id"] == "2"
    assert new[1]["request_id"] == "3"
    assert not aborted
    assert not stream_2.finished
    assert not stream_3.finished

    # request_ids must be unique
    with pytest.raises(KeyError):
        tracker.add_request("1")
    assert not tracker.new_requests_event.is_set()

    tracker.abort_request("1")
    new, aborted = tracker.get_new_and_aborted_requests()
    assert len(aborted) == 1
    assert "1" in aborted
    assert not new
    assert stream_1.finished

    stream_4 = tracker.add_request("4")
    tracker.abort_request("4")
    assert tracker.new_requests_event.is_set()
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    # aborted new requests will cancel each other out -
    # there's no need for them to propagate into the
    # engine
    assert not aborted
    assert not new
    assert stream_4.finished

    stream_5 = tracker.add_request("5")
    assert tracker.new_requests_event.is_set()
    tracker.process_request_output(
        RequestOutput("2", "output", [], [], [], finished=True))
    await tracker.wait_for_new_requests()
    new, aborted = tracker.get_new_and_aborted_requests()
    assert not tracker.new_requests_event.is_set()
    assert not aborted
    assert len(new) == 1
    assert new[0]["request_id"] == "5"
    assert stream_2.finished
    assert not stream_5.finished
0    vllm-v0.6.2/tests/basic_correctness/__init__.py (new file)
198  vllm-v0.6.2/tests/basic_correctness/test_basic_correctness.py (new file)
@@ -0,0 +1,198 @@
"""Compare the short outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import os
import pickle
import re
import weakref
from unittest.mock import patch

import pytest

from vllm import LLM
from vllm.platforms import current_platform
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata

from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-3.2-1B-Instruct",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
    llm = LLM("facebook/opt-125m")
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
    # because llm instance is not GC'ed.
    assert weak_llm() is None


'''
=============================
Modify by vllm_mlu
=============================
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
'''
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["MLU_FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    backend: str,
    dtype: str,
    max_tokens: int,
    enforce_eager: bool,
) -> None:

    if backend == "FLASHINFER" and current_platform.is_rocm():
        pytest.skip("Flashinfer does not support ROCm/HIP.")

    os.environ["VLLM_ATTENTION_BACKEND"] = backend

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model,
                     dtype=dtype,
                     enforce_eager=enforce_eager,
                     gpu_memory_utilization=0.7) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


'''
=============================
Modify by vllm_mlu
=============================
@brief(multi_gpu_test): torch_mlu does not support multi-process tests without 'spawn'
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
'''
# @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        # ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        # ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:

    # use MLU_FLASH_ATTN for MLU devices
    attention_backend = "MLU_FLASH_ATTN"

    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    dtype = "half"
    max_tokens = 5

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=2,
                     distributed_executor_backend=distributed_executor_backend
                     ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


def test_model_with_failure(vllm_runner) -> None:
    try:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
                   side_effect=ValueError()):
            with pytest.raises(ValueError) as exc_info:
                vllm_runner("facebook/opt-125m",
                            dtype="half",
                            enforce_eager=False,
                            gpu_memory_utilization=0.7)
            matches = re.search(r"input dumped to (.+).pkl",
                                str(exc_info.value))
            assert matches is not None
            filename = f"{matches.group(1)}.pkl"

        with open(filename, "rb") as filep:
            inputs = pickle.load(filep)

        if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
            raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
                                 f"{list(inputs.keys())}")
        assert isinstance(inputs["arg_1"],
                          ModelInputForGPUWithSamplingMetadata)
    finally:
        os.remove(filename)


def test_failure_with_async_out_proc(vllm_runner) -> None:

    filename = None
    try:
        with vllm_runner("facebook/opt-125m",
                         dtype="half",
                         enforce_eager=False,
                         gpu_memory_utilization=0.7) as vllm_model,\
                patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
                      side_effect=ValueError()):
            model_config = vllm_model.model.llm_engine.model_config
            assert model_config.use_async_output_proc
            with pytest.raises(ValueError) as exc_info:
                vllm_model.generate_greedy('how to make pizza?', 250)
            matches = re.search(r"input dumped to (.+).pkl",
                                str(exc_info.value))
            assert matches is not None

            filename = f"{matches.group(1)}.pkl"
    finally:
        # Clean up
        if filename is not None:
            os.remove(filename)
    pass
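The tests above select the attention backend purely through the VLLM_ATTENTION_BACKEND environment variable before the engine is constructed. A minimal standalone sketch of the same pattern (the model name and backend value are illustrative, mirroring the test, not a recommendation):

import os

os.environ["VLLM_ATTENTION_BACKEND"] = "MLU_FLASH_ATTN"  # must be set before the engine starts

from vllm import LLM, SamplingParams

llm = LLM("facebook/opt-125m", dtype="half", enforce_eager=True)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0, max_tokens=5))
print(outputs[0].outputs[0].text)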
298  vllm-v0.6.2/tests/basic_correctness/test_chunked_prefill.py (new file)
@@ -0,0 +1,298 @@
"""Compare the outputs of HF and vLLM when using greedy sampling.

It tests chunked prefill. Chunked prefill can be enabled by
enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
prefill requests are chunked.

Run `pytest tests/models/test_chunked_prefill.py`.
"""
import os
from contextlib import nullcontext

import pytest

from tests.kernels.utils import override_backend_env_variable

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-3.2-1B-Instruct",
]


'''
=============================
Modify by vllm_mlu
=============================
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
NOTES: chunked_prefill_token_size=1 has a known accuracy issue,
so we skip this case in the MLU unit tests.
TODO(VLLM-662): fix accuracy error
'''
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
# The original case is: @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["MLU_FLASH_ATTN"])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    attention_backend: str,
    monkeypatch,
) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.
    """
    override_backend_env_variable(monkeypatch, attention_backend)

    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    '''
    =============================
    Modify by vllm_mlu
    =============================
    NOTE: Since the KV cache is too big for small models, which would trigger
    a large-tensor problem in flash attention, we need to set num_gpu_blocks_override to 100
    '''
    with vllm_runner(
            model,
            dtype=dtype,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            num_gpu_blocks_override=100,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


'''
=============================
Modify by vllm_mlu
=============================
@brief(multi_gpu_test): torch_mlu does not support multi-process tests without 'spawn'
@brief(backend): MLU devices only support the MLU_FLASH_ATTN backend
'''
# @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["MLU_FLASH_ATTN"])
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    monkeypatch,
) -> None:

    if (model == "meta-llama/Llama-2-7b-hf"
            and distributed_executor_backend == "ray"):
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "0"
        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "0"

    # Set the attention backend environment variable
    os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    dtype = "half"
    max_tokens = 5
    chunked_prefill_token_size = 16

    # Add a chunked prefill config.
    max_num_seqs = min(chunked_prefill_token_size, 256)
    assert chunked_prefill_token_size != -1
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=2,
            max_num_seqs=max_num_seqs,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            distributed_executor_backend=distributed_executor_backend,
            gpu_memory_utilization=0.6,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",
      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache(
    vllm_runner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
) -> None:
    """
    Check output logprobs match between no_chunked_prefill and chunked_prefill
    with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
    so here we only check chunked prefill.
    """
    NUM_LOG_PROBS = 8

    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size

    with vllm_runner(
            model,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
    ) as vllm_model:
        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    with vllm_runner(
            model,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
    ) as vllm_model:
        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    check_logprobs_close(
        outputs_0_lst=no_chunked_prefill_outputs,
        outputs_1_lst=chunked_prefill_outputs,
        name_0="no_chunked_prefill",
        name_1="chunked_prefill",
    )


'''
=============================
Modify by vllm_mlu
=============================
NOTES: chunk_size=32 under VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has a known accuracy issue,
so we skip this case in the MLU unit tests.
TODO(VLLM-662): fix accuracy error
'''
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
# the original case is @pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("chunk_size", [30])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
def test_with_prefix_caching(
    vllm_runner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
    tensor_parallel_size: int,
) -> None:
    """
    Checks exact match decode with and without prefix caching
    with chunked prefill enabled.
    """
    model = "meta-llama/Llama-2-7b-chat-hf"
    # The common prompt has 142 tokens with Llama-2 tokenizer.
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [
        "Question",  # Warmup
        "Question",  # Fully cached
        "Another question",  # Partial cached
    ]
    full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]

    max_num_batched_tokens = max_num_seqs = chunk_size
    outputs = {}  # type: ignore
    check_result = True
    for enable in (True, False):
        with vllm_runner(
                model,
                dtype="half",
                max_num_batched_tokens=max_num_batched_tokens,
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as vllm_model:
            # It should fail when prefix caching is enabled and the chunk
            # size is not a multiple of block size (16).
            should_fail = chunk_size % 16 != 0 and enable
            check_result &= not should_fail
            outputs[enable] = []
            # Send the requests one by one to ensure the cache is populated.
            with pytest.raises(ValueError) if should_fail else nullcontext():
                for prompt in full_prompts:
                    outputs[enable] += vllm_model.generate_greedy([prompt],
                                                                  max_tokens)

        # Check results only if we did not expect a failure.
        if check_result:
            check_outputs_equal(
                outputs_0_lst=outputs[False],
                outputs_1_lst=outputs[True],
                name_0="w/o prefix caching",
                name_1="with prefix caching",
            )
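Outside the test harness, the knobs exercised above map onto constructor arguments of the offline LLM API; a minimal sketch of enabling chunked prefill (the model and the token budgets below are illustrative, not tuned values):

from vllm import LLM, SamplingParams

llm = LLM("facebook/opt-125m",
          enable_chunked_prefill=True,     # turn chunked prefill on
          max_num_batched_tokens=16,       # prefills longer than this get chunked
          max_num_seqs=16)
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)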
6    vllm-v0.6.2/tests/basic_correctness/test_cpu_offload.py (new file)
@@ -0,0 +1,6 @@
from ..utils import compare_two_settings


def test_cpu_offload():
    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
                         ["--cpu-offload-gb", "1"])
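compare_two_settings runs the model once with and once without the extra CLI flags; a sketch of the offline-API equivalent of the second setting (values illustrative) would be:

from vllm import LLM

# keep roughly 1 GiB of the weights in CPU memory instead of GPU memory
llm = LLM("meta-llama/Llama-3.2-1B-Instruct", cpu_offload_gb=1)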
186  vllm-v0.6.2/tests/basic_correctness/test_preemption.py (new file)
@@ -0,0 +1,186 @@
"""Compare the short outputs of HF and vLLM when using greedy sampling.

VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.

Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
"""
import pytest
from prometheus_client import REGISTRY

import vllm.envs as envs
from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
                                 ENABLE_ARTIFICIAL_PREEMPT)

from ..models.utils import check_outputs_equal

MODELS = [
    "facebook/opt-125m",
]


@pytest.fixture(scope="module", autouse=True)
def check_settings():
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
        "pytest tests/basic_correctness/test_preemption.py`")


@pytest.fixture
def worker_use_ray() -> bool:
    # When SPMD worker is used, use ray_use_worker=True
    # to test delta input optimization works with preemption.
    return envs.VLLM_USE_RAY_SPMD_WORKER


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_recompute(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    worker_use_ray: bool,
) -> None:
    """Ensure that chunked prefill works with preemption."""
    max_num_seqs = min(chunked_prefill_token_size, 256)
    enable_chunked_prefill = False
    max_num_batched_tokens = None
    if chunked_prefill_token_size != -1:
        enable_chunked_prefill = True
        max_num_batched_tokens = chunked_prefill_token_size

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    '''
    =============================
    Modify by vllm_mlu
    =============================
    NOTE: Since the KV cache is too big for small models, which would trigger
    a large-tensor problem in flash attention, we need to set num_gpu_blocks_override to 500
    '''
    with vllm_runner(
            model,
            dtype=dtype,
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_seqs=max_num_seqs,
            worker_use_ray=worker_use_ray,
            disable_log_stats=False,
            num_gpu_blocks_override=500,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption(
    caplog_vllm,
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    worker_use_ray: bool,
) -> None:
    """By default, recompute preemption is enabled"""

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(
            model,
            dtype=dtype,
            disable_log_stats=False,
            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )

    assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
            "is not enough KV cache space." in caplog_vllm.text)
    # Ensure the count bucket of request-level histogram metrics matches
    # the number of requests as a simple sanity check to ensure metrics are
    # generated
    preemption_metrics = None
    for m in REGISTRY.collect():
        if m.name == "vllm:num_preemptions":
            preemption_metrics = m
    assert preemption_metrics is not None
    total_recorded_preemption = 0
    for sample in preemption_metrics.samples:
        total_recorded_preemption += sample.value
    assert total_preemption == total_recorded_preemption


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption_infeasible(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    worker_use_ray: bool,
) -> None:
    """Verify infeasible preemption request will be ignored."""
    BLOCK_SIZE = 16
    prefill_blocks = 2
    decode_blocks = max_tokens // BLOCK_SIZE
    with vllm_runner(
            model,
            dtype=dtype,
            block_size=BLOCK_SIZE,
            # Not enough gpu blocks to complete a single sequence.
            # preemption should happen, and the sequence should be
            # ignored instead of hanging forever.
            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
            worker_use_ray=worker_use_ray,
    ) as vllm_model:
        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         ignore_eos=True)
        req_outputs = vllm_model.model.generate(
            example_prompts,
            sampling_params=sampling_params,
        )

        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    # Verify the request is ignored and does not hang.
    for req_output in req_outputs:
        outputs = req_output.outputs
        assert len(outputs) == 1
        assert outputs[0].finish_reason == "length"
59   vllm-v0.6.2/tests/benchmark/test_benchmark_latency.py (new file)
@@ -0,0 +1,59 @@
import numpy as np
from vllm import LLM, SamplingParams
import os
import pandas as pd


def test_generating_csv():
    '''
    test generating csv
    '''
    # the contents of this test are adapted from benchmark_latency.py

    csv_file = "output.csv"
    if os.path.isfile(csv_file):
        os.remove("output.csv")
    assert not os.path.isfile(csv_file)

    os.environ['VLLM_LATENCY_DEBUG'] = "1"
    model_path = "/data/vllm/sq_per_tensor_per_channel/Llama-2-7b-hf"
    tp = 1
    batch_size = 4
    input_len = 128
    output_len = 5
    quantization = "smoothquant"
    llm = LLM(model=model_path,
              tokenizer=model_path,
              quantization=quantization,
              tensor_parallel_size=tp,
              trust_remote_code=True,
              enforce_eager=True)
    sampling_params = SamplingParams(
        n=1,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=output_len,
    )
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(batch_size,
                                                     input_len))
    dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
    llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                 sampling_params=sampling_params,
                 use_tqdm=False)
    llm.get_metrics(0,  # args.num_iters_warmup,
                    False,  # args.only_average,
                    input_len,  # args.input_len,
                    output_len,  # args.output_len,
                    tp,  # args.tensor_parallel_size,
                    quantization,  # args.quantization
                    llm.dump_info)
    assert os.path.isfile(csv_file)
    df = pd.read_csv(csv_file)
    assert df['batch size'].item() == batch_size
    assert df['model'].item() == model_path
    assert df['input len'].item() == input_len
    assert df['output len'].item() == output_len
    assert df['tp'].item() == tp
    assert df['weight dtype'].item() == "SmoothQuant-int8"
    os.remove(csv_file)
0    vllm-v0.6.2/tests/compile/__init__.py (new file)
33   vllm-v0.6.2/tests/compile/backend.py (new file)
@@ -0,0 +1,33 @@
from copy import deepcopy
from typing import Callable

import torch


class TestBackend:
    """
    This class provides a simple Inductor backend that can be used for testing.
    It takes a list of custom passes and runs them after Inductor's passes.
    It also saves the graph before and after the custom passes for inspection.
    """

    def __init__(self, *args: Callable[[torch.fx.Graph], None]):
        self.custom_passes = args
        from torch._inductor import config
        self.current_config = config.shallow_copy_dict()
        self.current_config['post_grad_custom_post_pass'] = self.post_pass

    def __call__(self, graph: torch.fx.GraphModule, example_inputs):
        from torch._inductor.compile_fx import compile_fx
        return compile_fx(graph,
                          example_inputs,
                          config_patches=self.current_config)

    def post_pass(self, graph: torch.fx.Graph):
        self.graph_pre_pass = deepcopy(graph)
        for pass_ in self.custom_passes:
            pass_(graph)

        self.graph_post_pass = deepcopy(graph)
        # assign by reference, will reflect the final state of the graph
        self.final_graph = graph
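A typical way to use TestBackend is to hand it to torch.compile as the backend; in this sketch my_pass is a hypothetical custom FX pass (not defined in this commit) and the module being compiled is a placeholder:

import torch

def my_pass(graph: torch.fx.Graph) -> None:
    # hypothetical pass: inspect or rewrite the post-grad graph in place
    pass

backend = TestBackend(my_pass)
compiled = torch.compile(torch.nn.Linear(8, 8), backend=backend)
compiled(torch.randn(4, 8))
# backend.graph_pre_pass / backend.graph_post_pass now hold the graphs
# captured immediately before and after the custom pass ran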
0    vllm-v0.6.2/tests/compile/piecewise/__init__.py (new file)
5    vllm-v0.6.2/tests/compile/piecewise/piecewise_compilation_config.json (new file)
@@ -0,0 +1,5 @@
{
    "use_cudagraph": true,
    "non_cudagraph_ops": ["silly.attention"],
    "cudagraph_copy_inputs": true
}
112  vllm-v0.6.2/tests/compile/piecewise/test_simple.py (new file)
@@ -0,0 +1,112 @@
"""
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""
import os

import torch
from torch import nn
from torch.library import Library

from vllm.compilation.compile_context import set_compile_context
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.levels import CompilationLevel
from vllm.config import VllmConfig
from vllm.utils import direct_register_custom_op

global_counter = 0

# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT")  # noqa


def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    global global_counter
    global_counter += 1
    print(f"{global_counter=}")
    out.copy_(q)
    out[0] += 1


def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    return


direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    target_lib=silly_lib,
)


@support_torch_compile
class SillyModel(nn.Module):

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Overall effect:
        x += 1
        x[0] += 2
        global_counter += 2
        """
        x = x + 1
        x = x + 2
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x - 2
        x = x - 1
        out = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, out)
        x = out
        x = x + 1
        return x


def test_simple_piecewise_compile():

    directory = os.path.dirname(__file__)
    config = os.path.join(directory, "piecewise_compilation_config.json")
    os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)

    model = SillyModel(vllm_config=VllmConfig(), prefix='')

    inputs = torch.randn(100).cuda()

    with compilation_counter.expect(
            num_graphs_seen=1,  # one graph for the model
            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
            num_inductor_compilations=3,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_caputured=
            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):

        with set_compile_context([1, 2]):
            model(inputs)

            model(torch.randn(2).cuda())
            model(torch.randn(1).cuda())

        input = torch.zeros(2).cuda()
        global global_counter
        global_counter = 0
        output = model(input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))

    # clean up to avoid side effects for other tests
    del os.environ["VLLM_TORCH_COMPILE_CONFIG"]
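A short worked check of the counter values asserted above, using the relationships stated in the inline comments (SillyModel's two silly.attention calls play the role of num_layers, and two batch sizes are captured):

num_layers = 2            # silly.attention is called twice in forward()
num_cudagraph_sizes = 2   # set_compile_context([1, 2])

num_piecewise_graphs_seen = 2 * num_layers + 1                    # 5
num_piecewise_capturable_graphs_seen = 1 + num_layers             # 3
num_inductor_compilations = num_piecewise_capturable_graphs_seen  # 3
num_cudagraphs_captured = (num_cudagraph_sizes *
                           num_piecewise_capturable_graphs_seen)  # 6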
444
vllm-v0.6.2/tests/compile/piecewise/test_toy_llama.py
Normal file
444
vllm-v0.6.2/tests/compile/piecewise/test_toy_llama.py
Normal file
@@ -0,0 +1,444 @@
|
||||
"""
|
||||
Test the piecewise compilation with a simple model, comparing the output
|
||||
with and without the piecewise compilation.
|
||||
|
||||
This is a tractable model: the weights and computation are specially designed
|
||||
if the config `tractable_init` is set to True. Otherwise, the weights are
|
||||
initialized randomly with a fixed seed.
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.library import Library
|
||||
|
||||
from vllm.compilation.compile_context import set_compile_context
|
||||
from vllm.compilation.config import CompilationConfig
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.compilation.levels import CompilationLevel
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.plugins import set_compilation_config
|
||||
from vllm.utils import direct_register_custom_op
|
||||
|
||||
# create a library to hold the custom op
|
||||
silly_lib = Library("silly", "FRAGMENT") # noqa
|
||||
|
||||
|
||||
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
|
||||
out: torch.Tensor) -> None:
|
||||
out.copy_(q)
|
||||
out += k
|
||||
out += v
|
||||
|
||||
|
||||
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
|
||||
out: torch.Tensor) -> None:
|
||||
return
|
||||
|
||||
|
||||
direct_register_custom_op(
|
||||
op_name="attention",
|
||||
op_func=silly_attention,
|
||||
mutates_args=["out"],
|
||||
fake_impl=silly_attention_fake,
|
||||
target_lib=silly_lib,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LlamaConfig:
|
||||
hidden_size: int = 128
|
||||
mlp_size: int = 256
|
||||
vocab_size: int = 128
|
||||
num_layers: int = 2
|
||||
init_value: float = 1.0
|
||||
tractable_init: bool = False
|
||||
random_seed: int = 0
|
||||
|
||||
def __post_init__(self):
|
||||
assert self.mlp_size >= self.hidden_size
|
||||
|
||||
|
||||
class LlamaMLP(nn.Module):
|
||||
|
||||
def __init__(self, config: LlamaConfig) -> None:
|
||||
super().__init__()
|
||||
self.gate_up_projection = nn.Linear(
|
||||
in_features=config.hidden_size,
|
||||
out_features=config.mlp_size * 2,
|
||||
bias=False,
|
||||
)
|
||||
self.down_projection = nn.Linear(
|
||||
in_features=config.mlp_size,
|
||||
out_features=config.hidden_size,
|
||||
bias=False,
|
||||
)
|
||||
|
||||
if config.tractable_init:
|
||||
nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size])
|
||||
nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:])
|
||||
nn.init.eye_(self.down_projection.weight.data)
|
||||
else:
|
||||
nn.init.xavier_normal_(self.gate_up_projection.weight.data,
|
||||
generator=torch.Generator().manual_seed(
|
||||
config.random_seed),
|
||||
gain=0.001)
|
||||
nn.init.xavier_normal_(self.down_projection.weight.data,
|
||||
generator=torch.Generator().manual_seed(
|
||||
config.random_seed),
|
||||
gain=0.001)
|
||||
|
||||
def forward(self, x):
|
||||
# for tractable_init and positive input, this is
|
||||
# essentially an elementwise-square
|
||||
x = self.gate_up_projection(x)
|
||||
x = x[:, :x.size(1) // 2] * torch.nn.functional.relu(
|
||||
x[:, x.size(1) // 2:])
|
||||
x = self.down_projection(x)
|
||||
return x
|
||||
|
||||
|
||||
class LlamaAttention(nn.Module):
|
||||
|
||||
def __init__(self, config: LlamaConfig) -> None:
|
||||
super().__init__()
|
||||
self.qkv_projection = nn.Linear(
|
||||
in_features=config.hidden_size,
|
||||
out_features=config.hidden_size * 3,
|
||||
bias=False,
|
||||
)
|
||||
|
||||
self.output_projection = nn.Linear(
|
||||
in_features=config.hidden_size,
|
||||
out_features=config.hidden_size,
|
||||
bias=False,
|
||||
)
|
||||
|
||||
if config.tractable_init:
|
||||
nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size])
|
||||
nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 *
|
||||
config.hidden_size])
|
||||
nn.init.eye_(self.qkv_projection.weight.data[2 *
|
||||
config.hidden_size:])
|
||||
nn.init.eye_(self.output_projection.weight.data)
|
||||
else:
|
||||
nn.init.xavier_normal_(self.qkv_projection.weight.data,
|
||||
generator=torch.Generator().manual_seed(
|
||||
config.random_seed),
|
||||
gain=0.001)
|
||||
nn.init.xavier_normal_(self.output_projection.weight.data,
|
||||
generator=torch.Generator().manual_seed(
|
||||
config.random_seed),
|
||||
gain=0.001)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
# for tractable_init, this is:
|
||||
# output = (hidden_states * 3 + positions * 2)
|
||||
qkv = self.qkv_projection(hidden_states)
|
||||
hidden_size = qkv.size(-1) // 3
|
||||
q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1)
|
||||
|
||||
q = q + positions.unsqueeze(1)
|
||||
k = k + positions.unsqueeze(1)
|
||||
|
||||
attn_output = torch.empty_like(q)
|
||||
torch.ops.silly.attention(q, k, v, attn_output)
|
||||
|
||||
output = self.output_projection(attn_output)
|
||||
return output
|
||||
|
||||
|
||||
class LlamaDecoderLayer(nn.Module):
|
||||
|
||||
def __init__(self, config: LlamaConfig) -> None:
|
||||
super().__init__()
|
||||
self.self_attention = LlamaAttention(config)
|
||||
self.mlp = LlamaMLP(config)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
For tractable computation:
|
||||
- if residual is None, the outputs are:
|
||||
- residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3
|
||||
- hidden_states = (residual + 1) ** 2
|
||||
- if residual is not None, the outputs are:
|
||||
- residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3
|
||||
- hidden_states = (residual + 1) ** 2
|
||||
""" # noqa
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
hidden_states = hidden_states + 1
|
||||
else:
|
||||
hidden_states = hidden_states + residual
|
||||
residual = hidden_states
|
||||
hidden_states = hidden_states + 1
|
||||
|
||||
hidden_states = self.self_attention(positions=positions,
|
||||
hidden_states=hidden_states)
|
||||
|
||||
hidden_states = hidden_states + residual
|
||||
residual = hidden_states
|
||||
hidden_states = hidden_states + 1
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
|
||||
return hidden_states, residual
|
||||
|
||||
|
||||
@support_torch_compile
|
||||
class LlamaModel(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
config: LlamaConfig,
|
||||
prefix: str = '',
|
||||
**kwargs) -> None:
|
||||
super().__init__()
|
||||
self.embedding_tokens = nn.Embedding(
|
||||
num_embeddings=config.vocab_size,
|
||||
embedding_dim=config.hidden_size,
|
||||
)
|
||||
self.layers = nn.ModuleList(
|
||||
[LlamaDecoderLayer(config) for _ in range(config.num_layers)])
|
||||
|
||||
# this is the initial value of the hidden states
|
||||
self.embedding_tokens.weight.data.fill_(config.init_value)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor],
|
||||
positions: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
hidden_states = self.embedding_tokens(input_ids)
|
||||
residual = None
|
||||
for layer in self.layers:
|
||||
hidden_states, residual = layer(positions, hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
|
||||
def tractable_computation(input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
config: LlamaConfig,
|
||||
init_value: float = 1.0) -> torch.Tensor:
|
||||
hidden_states = torch.ones(input_ids.size(0),
|
||||
config.hidden_size,
|
||||
device=input_ids.device,
|
||||
dtype=input_ids.dtype) * init_value
|
||||
|
||||
# first layer
|
||||
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
|
||||
hidden_states = (residual + 1)**2
|
||||
|
||||
# following layers
|
||||
for _ in range(config.num_layers - 1):
|
||||
hidden_states = hidden_states + residual
|
||||
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
|
||||
hidden_states = (residual + 1)**2
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
@torch.inference_mode
|
||||
def run_model(llama_config,
|
||||
use_compile: bool,
|
||||
split_attn: bool = False) -> torch.Tensor:
|
||||
|
||||
if use_compile:
|
||||
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(
|
||||
CompilationLevel.PIECEWISE)
|
||||
|
||||
if split_attn:
|
||||
set_compilation_config(
|
||||
CompilationConfig(
|
||||
use_cudagraph=True,
|
||||
non_cudagraph_ops=["silly.attention"],
|
||||
))
|
||||
else:
|
||||
set_compilation_config(CompilationConfig(use_cudagraph=True, ))
|
||||
else:
|
||||
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(
|
||||
CompilationLevel.NO_COMPILATION)
|
||||
set_compilation_config(None)
|
||||
|
||||
model = LlamaModel(config=llama_config,
|
||||
vllm_config=VllmConfig(),
|
||||
prefix="").eval().cuda()
|
||||
|
||||
B = 16 # max batch size
|
||||
input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
|
||||
positions = torch.arange(B).cuda()
|
||||
|
||||
with set_compile_context([1, 2]):
|
||||
model(input_ids, positions)
|
||||
model(input_ids[:2], positions[:2])
|
||||
model(input_ids[:1], positions[:1])
|
||||
|
||||
input_ids[:2].zero_()
|
||||
output = model(input_ids[:2], positions[:2])
|
||||
|
||||
# manual cleanup
|
||||
del os.environ["VLLM_TORCH_COMPILE_LEVEL"]
|
||||
set_compilation_config(None)
|
||||
|
||||
output = output.cpu()
|
||||
|
||||
if llama_config.tractable_init:
|
||||
expected_output = tractable_computation(input_ids[:2], positions[:2],
|
||||
llama_config).cpu()
|
||||
|
||||
assert torch.allclose(output, expected_output)
|
||||
else:
|
||||
return output.cpu()
|
||||
|
||||
|
||||
def test_toy_llama():
|
||||
# compare output with and without piecewise compilation
|
||||
|
||||
llama_config = LlamaConfig(hidden_size=128,
|
||||
mlp_size=256,
|
||||
vocab_size=128,
|
||||
num_layers=12)
|
||||
|
||||
tractable_config = LlamaConfig(hidden_size=128,
|
||||
mlp_size=256,
|
||||
vocab_size=128,
|
||||
num_layers=2,
|
||||
tractable_init=True)
|
||||
|
||||
outputs = []
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=0,
|
||||
num_piecewise_graphs_seen=0,
|
||||
num_piecewise_capturable_graphs_seen=0,
|
||||
num_inductor_compilations=0,
|
||||
num_cudagraph_caputured=0,
|
||||
):
|
||||
outputs.append(run_model(llama_config, use_compile=False))
|
||||
run_model(tractable_config, use_compile=False)
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
num_piecewise_graphs_seen=1,
|
||||
num_piecewise_capturable_graphs_seen=1,
|
||||
num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_caputured=
|
||||
2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
):
|
||||
outputs.append(run_model(llama_config, use_compile=True))
|
||||
run_model(tractable_config, use_compile=True)
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
num_piecewise_graphs_seen=2 * llama_config.num_layers +
|
||||
1, # 2 * num_layers + 1
|
||||
num_piecewise_capturable_graphs_seen=1 +
|
||||
llama_config.num_layers, # 1 + num_layers
|
||||
num_inductor_compilations=1 +
|
||||
llama_config.num_layers, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_caputured=2 *
|
||||
(1 + llama_config.num_layers
|
||||
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
):
|
||||
outputs.append(
|
||||
run_model(llama_config, use_compile=True, split_attn=True))
|
||||
run_model(tractable_config, use_compile=True, split_attn=True)
|
||||
|
||||
for i in range(1, len(outputs)):
|
||||
assert torch.allclose(outputs[0], outputs[i])
|
||||
|
||||
|
||||
@torch.inference_mode
|
||||
def benchmark():
|
||||
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
|
||||
from triton.testing import do_bench
|
||||
|
||||
# similar to llama 3.1-8B
|
||||
llama_config = LlamaConfig(hidden_size=4096,
|
||||
mlp_size=14336,
|
||||
vocab_size=128 * 1024,
|
||||
num_layers=32)
|
||||
|
||||
# a tiny model to measure the overhead
|
||||
# of piecewise cudagraph
|
||||
llama_config = LlamaConfig(hidden_size=40,
|
||||
mlp_size=80,
|
||||
vocab_size=128,
|
||||
num_layers=2)
|
||||
|
||||
cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)]
|
||||
|
||||
eager_time = {}
|
||||
full_cudagraph_time = {}
|
||||
piecewise_cudagraph_time = {}
|
||||
|
||||
pool = torch.cuda.graph_pool_handle()
|
||||
|
||||
for piecewise in [False, True]:
|
||||
if piecewise:
|
||||
set_compilation_config(
|
||||
CompilationConfig(
|
||||
use_cudagraph=True,
|
||||
non_cudagraph_ops=["silly.attention"],
|
||||
))
|
||||
else:
|
||||
set_compilation_config(None)
|
||||
|
||||
model = LlamaModel(config=llama_config,
|
||||
vllm_config=VllmConfig(),
|
||||
prefix="").eval().cuda().to(torch.bfloat16)
|
||||
|
||||
B = 256 # max batch size
|
||||
input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
|
||||
positions = torch.arange(B).cuda().to(torch.bfloat16)
|
||||
|
||||
graphs = {}
|
||||
|
||||
with set_compile_context(cudagraph_sizes):
|
||||
model(input_ids, positions)
|
||||
for b in cudagraph_sizes[::-1]:
|
||||
if not piecewise:
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph, pool=pool):
|
||||
output = model(input_ids[:b], positions[:b])
|
||||
graphs[b] = (graph, output)
|
||||
else:
|
||||
output = model(input_ids[:b], positions[:b])
|
||||
graphs[b] = (model, output)
|
||||
for b in cudagraph_sizes:
|
||||
if piecewise:
|
||||
# noqa is for `Function definition does not bind loop variable`
|
||||
# it will be problematic if we save the created lambda function
|
||||
# and use it later, because it will look up the name `b` in the
|
||||
# enclosing scope, and the value of `b` will always be 256.
|
||||
# it is fine here, because we only use the lambda function once.
|
||||
runtime = do_bench(lambda: graphs[b][0] # noqa
|
||||
(input_ids[:b], positions[:b])) # noqa
|
||||
piecewise_cudagraph_time[b] = runtime
|
||||
else:
|
||||
runtime = do_bench(lambda: graphs[b][0].replay()) # noqa
|
||||
eager_runtime = do_bench(
|
||||
lambda: model(input_ids[:b], positions[:b])) # noqa
|
||||
full_cudagraph_time[b] = runtime
|
||||
eager_time[b] = eager_runtime
|
||||
|
||||
# print in tabular format
|
||||
print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
|
||||
for b in cudagraph_sizes:
|
||||
print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
|
||||
f"\t{piecewise_cudagraph_time[b]:.3f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
benchmark()
|
||||
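For reference, the closed form that tractable_computation() relies on (taken from the LlamaDecoderLayer docstring above) can be checked by hand for a couple of layers; this sketch assumes tractable_init=True and init_value=1.0, as in tractable_config:

import torch

hidden_size = 8
positions = torch.arange(4, dtype=torch.float32)
hidden_states = torch.ones(4, hidden_size)  # init_value = 1.0

# first decoder layer (residual is None):
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
hidden_states = (residual + 1) ** 2

# every following layer folds the previous hidden_states into the residual:
hidden_states = hidden_states + residual
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3
hidden_states = (residual + 1) ** 2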
126
vllm-v0.6.2/tests/compile/test_basic_correctness.py
Normal file
126
vllm-v0.6.2/tests/compile/test_basic_correctness.py
Normal file
@@ -0,0 +1,126 @@
import dataclasses
from typing import Dict, List, Optional

import pytest

from vllm.compilation.levels import CompilationLevel
from vllm.utils import cuda_device_count_stateless

from ..utils import compare_all_settings


@dataclasses.dataclass
class TestSetting:
    model: str
    model_args: List[str]
    pp_size: int
    tp_size: int
    attn_backend: str
    method: str
    fullgraph: bool


# representative settings for testing
test_settings = [
    # basic llama model
    TestSetting(
        model="meta-llama/Llama-3.2-1B",
        model_args=[],
        pp_size=2,
        tp_size=2,
        attn_backend="FLASHINFER",
        method="generate",
        fullgraph=True,
    ),
    # llama model with quantization
    TestSetting(
        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        model_args=["--quantization", "gptq"],
        pp_size=1,
        tp_size=1,
        attn_backend="FLASH_ATTN",
        method="generate",
        fullgraph=True,
    ),
    # MoE model
    TestSetting(
        model="ibm/PowerMoE-3b",
        model_args=[],
        pp_size=1,
        tp_size=2,
        attn_backend="FLASH_ATTN",
        method="generate",
        fullgraph=True,
    ),
    # embedding model
    TestSetting(
        model="BAAI/bge-multilingual-gemma2",
        model_args=["--task", "embedding"],
        pp_size=1,
        tp_size=1,
        attn_backend="FLASHINFER",
        method="encode",
        fullgraph=True,
    ),
    # vision language model
    TestSetting(
        model="microsoft/Phi-3.5-vision-instruct",
        model_args=["--trust-remote-code", "--max-model-len", "2048"],
        pp_size=2,
        tp_size=1,
        attn_backend="FLASH_ATTN",
        method="generate_with_image",
        fullgraph=False,
    ),
]


# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings)
def test_compile_correctness(test_setting: TestSetting):
    # this test is run under multiple test suites, with different GPUs.
    # make sure we only run the test with the correct CUDA devices.
    # don't use "<", as it will duplicate the tests.
    model = test_setting.model
    model_args = test_setting.model_args
    pp_size = test_setting.pp_size
    tp_size = test_setting.tp_size
    attn_backend = test_setting.attn_backend
    method = test_setting.method
    fullgraph = test_setting.fullgraph
    if cuda_device_count_stateless() != pp_size * tp_size:
        pytest.skip("Not correct CUDA devices for the test.")
    import os
    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
    final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
        ["-tp", str(tp_size)]

    all_envs: List[Optional[Dict[str, str]]] = []

    for level in [
            CompilationLevel.NO_COMPILATION,
            CompilationLevel.PIECEWISE,
    ]:
        all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})

    # inductor will change the output, so we only compare if the output
    # is close, not exactly the same.
    compare_all_settings(
        model, [final_args] * 2,
        all_envs,
        method=method if method != "generate" else "generate_close")
    all_envs.clear()

    for level in [
            CompilationLevel.NO_COMPILATION,
            CompilationLevel.DYNAMO_AS_IS,
            CompilationLevel.DYNAMO_ONCE,
    ]:
        all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)})
        if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
            # "DYNAMO_ONCE" will always use fullgraph
            all_envs[-1][
                "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

    compare_all_settings(model, [final_args] * 3, all_envs, method=method)
20
vllm-v0.6.2/tests/compile/test_full_graph.py
Normal file
20
vllm-v0.6.2/tests/compile/test_full_graph.py
Normal file
@@ -0,0 +1,20 @@
import pytest

from vllm.compilation.levels import CompilationLevel

from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support


@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize(
    "optimization_level",
    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
    model = model_info[0]
    model_kwargs = model_info[1]
    check_full_graph_support(model,
                             model_kwargs,
                             optimization_level,
                             tp_size=1)
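A hand-driven equivalent of one parametrized case, using the helpers from .utils shown further down in this diff; a sketch like this can be handy when debugging a single model outside pytest:

from vllm.compilation.levels import CompilationLevel

from .utils import TEST_MODELS, check_full_graph_support

model, model_kwargs = TEST_MODELS[0]  # ("facebook/opt-125m", {})
check_full_graph_support(model,
                         model_kwargs,
                         CompilationLevel.DYNAMO_ONCE,
                         tp_size=1)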
92
vllm-v0.6.2/tests/compile/test_fusion.py
Normal file
92
vllm-v0.6.2/tests/compile/test_fusion.py
Normal file
@@ -0,0 +1,92 @@
import pytest
import torch
from compressed_tensors.quantization import FP8_DTYPE

import vllm.envs as envs
from vllm.compilation.config import CompilationConfig
from vllm.compilation.fusion import (FusionPass, find_auto_fn,
                                     find_auto_fn_maybe)
from vllm.compilation.reshapes import RedundantReshapesPass
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    apply_fp8_linear)

from .backend import TestBackend


class TestModel(torch.nn.Module):

    def __init__(self, hidden_size: int, eps: float, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
        self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(4)]
        self.w = [
            torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
            for _ in range(2)
        ]

    def forward(self, x):
        resid = torch.relu(x)
        y = self.norm[0](x)

        x2 = apply_fp8_linear(y, self.w[0], self.scale[0], self.scale[1])
        # make sure resid is used for replacement to work
        y2, resid = self.norm[1](x2, resid)

        x3 = apply_fp8_linear(y2, self.w[1], self.scale[2], self.scale[3])
        y3, resid = self.norm[2](x3, resid)  # use resid here
        return y3


# Init does pattern registration, which can only happen once
config = CompilationConfig(enable_fusion=True)
reshape_pass = RedundantReshapesPass(config)
fusion_pass = FusionPass.instance(config)


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
                    reason="Only test on CUDA")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps):
    torch.set_default_device("cuda")
    torch.set_default_dtype(torch.float16)

    if eps != 1e-5:
        pytest.skip("Only test eps=1e-5 for now")

    # Reshape pass is needed for the fusion pass to work
    backend = TestBackend(reshape_pass, fusion_pass)
    model = TestModel(hidden_size, eps)

    # First dimension dynamic
    x = torch.rand(num_tokens, hidden_size)
    torch._dynamo.mark_dynamic(x, 0)

    result = model(x)

    model2 = torch.compile(model, backend=backend)
    result2 = model2(x)

    # Check that it gives the same answer
    torch.testing.assert_close(result, result2, atol=1e-3, rtol=1e-3)

    # Check substitution worked
    pre_nodes = backend.graph_pre_pass.nodes
    post_nodes = backend.graph_post_pass.nodes

    rms_quant = torch.ops._C.rms_norm_static_fp8_quant.default
    add_rms_quant = torch.ops._C.fused_add_rms_norm_static_fp8_quant.default
    fp8_quant = torch.ops._C.static_scaled_fp8_quant.default

    # In pre-nodes, fp8 quant should be present and fused kernels should not
    assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
    assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
    find_auto_fn(pre_nodes, fp8_quant)

    # In post-nodes, fused kernels should be present and fp8 quant should not
    find_auto_fn(post_nodes, rms_quant)
    find_auto_fn(post_nodes, add_rms_quant)
    assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
59
vllm-v0.6.2/tests/compile/test_wrapper.py
Normal file
59
vllm-v0.6.2/tests/compile/test_wrapper.py
Normal file
@@ -0,0 +1,59 @@
from typing import Optional

import torch

from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher


class MyMod(torch.nn.Module):

    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        if cache is not None:
            return x + cache
        return x * 2


class MyWrapper(TorchCompileWrapperWithCustomDispatcher):

    def __init__(self, model):
        self.model = model
        compiled_callable = torch.compile(self.forward, backend="eager")
        super().__init__(compiled_callable)

    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        # this is the function to be compiled
        return self.model(x, cache)

    def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
        # let torch.compile compile twice
        if len(self.compiled_codes) == 2:
            dispatch_id = 0 if cache is None else 1
            with self.dispatch_to_code(dispatch_id):
                return self.forward(x, cache)
        else:
            return self.compiled_callable(x, cache)


def test_torch_compile_wrapper():
    mod = MyMod()
    wrappers = []
    for i in range(3):
        torch._dynamo.reset()
        wrapper = MyWrapper(mod)
        wrappers.append(wrapper)
        x = torch.tensor([1])
        wrapper(x, None)  # profile run, compile
        # create a cache tensor
        cache = torch.tensor([2])
        wrapper(x, cache)  # warm up with cache, recompile

        # for new input, dispatch to the compiled code directly
        new_x = torch.tensor([3])
        assert wrapper(new_x,
                       None).item() == 6  # dispatch to the first compiled code
        assert wrapper(
            new_x, cache).item() == 5  # dispatch to the second compiled code

    for wrapper in wrappers:
        # make sure they have independent compiled codes
        assert len(wrapper.compiled_codes) == 2
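The two asserted values follow directly from MyMod.forward and from which compiled code path the wrapper dispatches to; a short worked check:

# cache is None  -> first compiled code path  -> x * 2
# cache is given -> second compiled code path -> x + cache
new_x = 3
cache = 2
assert new_x * 2 == 6       # matches wrapper(new_x, None).item()
assert new_x + cache == 5   # matches wrapper(new_x, cache).item()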
97
vllm-v0.6.2/tests/compile/utils.py
Normal file
97
vllm-v0.6.2/tests/compile/utils.py
Normal file
@@ -0,0 +1,97 @@
import os

import torch

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.compilation.levels import CompilationLevel
from vllm.platforms import current_platform

TEST_MODELS = [
    ("facebook/opt-125m", {}),
    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
        "dtype": torch.float16,
        "quantization": "compressed-tensors"
    }),
    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
        "dtype": torch.float16,
        "quantization": "fp8"
    }),
    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
        "quantization": "compressed-tensors"
    }),
    ("meta-llama/Meta-Llama-3-8B", {}),
]

if is_quant_method_supported("aqlm"):
    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
        "quantization": "aqlm"
    }))

# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"):  # noqa: SIM223
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
        "quantization": "gguf"
    }))

if is_quant_method_supported("gptq"):
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
        "quantization": "gptq"
    }))

if is_quant_method_supported("gptq_marlin"):
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
        "quantization": "gptq_marlin"
    }))

if is_quant_method_supported("gptq_marlin_24"):
    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
        "quantization": "gptq_marlin_24"
    }))

if is_quant_method_supported("marlin"):
    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
        "quantization": "marlin"
    }))

if not current_platform.is_rocm() and is_quant_method_supported("awq"):
    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
        "quantization": "AWQ"
    }))


def check_full_graph_support(model,
                             model_kwargs,
                             optimization_level,
                             tp_size=1):
    # make sure these models can be captured in full graph mode
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

    # The base meta llama uses too much memory.
    if (model == "meta-llama/Meta-Llama-3-8B"
            and optimization_level >= CompilationLevel.PIECEWISE):
        return

    print(f"MODEL={model}")

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0)
    llm = LLM(model=model,
              enforce_eager=True,
              tensor_parallel_size=tp_size,
              disable_custom_all_reduce=True,
              **model_kwargs)

    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
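TEST_MODELS entries are (model name, extra LLM kwargs) tuples, so extending the matrix is just another guarded append. A sketch reusing a quantization method and checkpoint that already appear above (it assumes the imports at the top of this file and that is_quant_method_supported accepts "fp8" like the other method names):

if is_quant_method_supported("fp8"):
    TEST_MODELS.append(("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
        "dtype": torch.float16,
        "quantization": "fp8"
    }))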
1039
vllm-v0.6.2/tests/conftest.py
Normal file
1039
vllm-v0.6.2/tests/conftest.py
Normal file
File diff suppressed because it is too large
Load Diff
0
vllm-v0.6.2/tests/core/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/block/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/block/__init__.py
Normal file
12
vllm-v0.6.2/tests/core/block/conftest.py
Normal file
12
vllm-v0.6.2/tests/core/block/conftest.py
Normal file
@@ -0,0 +1,12 @@
import pytest


@pytest.fixture()
def should_do_global_cleanup_after_test() -> bool:
    """Disable the global cleanup fixture for tests in this directory. This
    provides a ~10x speedup for unit tests that don't load a model to GPU.

    This requires that tests in this directory clean up after themselves if they
    use the GPU.
    """
    return False
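Because this fixture turns off the global cleanup, a test in this directory that does touch the GPU should release its state explicitly. A hypothetical sketch, using the helper the e2e conftest below relies on:

from vllm.distributed import cleanup_dist_env_and_memory


def test_uses_gpu_and_cleans_up():
    # ... build and exercise GPU-backed objects here ...
    cleanup_dist_env_and_memory()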
0
vllm-v0.6.2/tests/core/block/e2e/__init__.py
Normal file
0
vllm-v0.6.2/tests/core/block/e2e/__init__.py
Normal file
67
vllm-v0.6.2/tests/core/block/e2e/conftest.py
Normal file
67
vllm-v0.6.2/tests/core/block/e2e/conftest.py
Normal file
@@ -0,0 +1,67 @@
from typing import Callable, Iterable, Optional

import pytest

from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed


@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                baseline_llm_kwargs, seed)


@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                test_llm_kwargs, seed)


def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    def generator_inner():
        llm = LLM(**kwargs)

        set_random_seed(seed)

        yield llm
        del llm
        cleanup_dist_env_and_memory()

    for llm in generator_inner():
        yield llm
        del llm


def get_text_from_llm_generator(llm_generator: Iterable[LLM],
                                prompts,
                                sampling_params,
                                llm_cb: Optional[Callable[[LLM],
                                                          None]] = None):
    for llm in llm_generator:
        if llm_cb:
            llm_cb(llm)
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        text = [output.outputs[0].text for output in outputs]
        del llm

    return text


def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
        del llm

    return token_ids
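The three kwarg layers merge with plain dict unpacking, so later layers override earlier ones; a small sketch with values taken from the tests below:

common_llm_kwargs = {"model": "facebook/opt-125m", "enforce_eager": True}
per_test_common_llm_kwargs = {"block_size": 16}
test_llm_kwargs = {"enable_prefix_caching": True, "preemption_mode": "swap"}

kwargs = {
    **common_llm_kwargs,
    **per_test_common_llm_kwargs,
    **test_llm_kwargs,
}
# kwargs now holds the union, with test_llm_kwargs winning on any collision;
# this is exactly what create_llm_generator() passes to LLM(**kwargs).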
489
vllm-v0.6.2/tests/core/block/e2e/test_correctness.py
Normal file
489
vllm-v0.6.2/tests/core/block/e2e/test_correctness.py
Normal file
@@ -0,0 +1,489 @@
|
||||
from itertools import cycle
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from .conftest import get_token_ids_from_llm_generator
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"preemption_mode": "swap"
|
||||
}, {
|
||||
"preemption_mode": "recompute"
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_block_manager_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify block manager produces same outputs even when there is preemption.
|
||||
|
||||
This constructs two LLM, each with limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that the KV
|
||||
cache is not corrupted.
|
||||
|
||||
NOTE: We want a significant number of generated tokens so that any incorrect
|
||||
KV mapping has time to build up error.
|
||||
|
||||
NOTE(Kuntai): Though we have removed block manager v1, this test is still
|
||||
useful as it asserts the behavior of block manager v2 (now it is called
|
||||
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
|
||||
keep this test.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(block_size): MLU paged attention only supports block_size=16
|
||||
'''
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# Our prompts will generate 128 tokens; since the prompts themselves are
|
||||
# small, we don't need much KV space beyond 128.
|
||||
"max_model_len": 160,
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"per_test_common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
"block_size": 16,
|
||||
|
||||
# Allow only 2 sequences of ~128 tokens in worst case.
|
||||
# Note 8 = 128/block_size
|
||||
"num_gpu_blocks_override": 2 * (8 + 1),
|
||||
},
|
||||
{
|
||||
"block_size": 16,
|
||||
|
||||
# Allow only 2 sequences of ~128 tokens in worst case.
|
||||
# Note 16 = 128/block_size
|
||||
"num_gpu_blocks_override": 2 * (16 + 2),
|
||||
}
|
||||
])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"num_lookahead_slots": 0,
|
||||
}])
|
||||
@pytest.mark.parametrize(
|
||||
"test_llm_kwargs",
|
||||
[
|
||||
{
|
||||
# We run one test with block_size < lookahead_slots, one test with
|
||||
# block_size > lookahead_slots
|
||||
"num_lookahead_slots": 10,
|
||||
"preemption_mode": "swap",
|
||||
},
|
||||
{
|
||||
"num_lookahead_slots": 10,
|
||||
"preemption_mode": "recompute",
|
||||
}
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator,
|
||||
batch_size):
|
||||
"""Verify vLLM produces the same output with greedy sampling, when lookahead
|
||||
scheduling is used vs. not.
|
||||
|
||||
Lookahead scheduling is not expected to modify the output, as it simply
|
||||
allocates empty slots ahead of the known token ids in a sliding fashion.
|
||||
|
||||
This test constrains the total number of blocks to force preemption. It also
|
||||
varies the block size so that the lookahead size is less than and greater
|
||||
than the block size.
|
||||
"""
|
||||
output_len = 128
|
||||
temperature = 0.0
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids without lookahead scheduling')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with lookahead scheduling')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(block_size): Paged attention only supports block_size 16; block_size is changed from 8 to 16
|
||||
'''
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[
|
||||
{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
"enable_chunked_prefill": True,
|
||||
"gpu_memory_utilization": 0.6,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs",
|
||||
[{
|
||||
"block_size": 16,
|
||||
"max_num_batched_tokens": 2,
|
||||
"max_num_seqs": 2,
|
||||
}, {
|
||||
"block_size": 16,
|
||||
"max_num_batched_tokens": 3,
|
||||
"max_num_seqs": 2,
|
||||
}, {
|
||||
"block_size": 16,
|
||||
"max_num_batched_tokens": 256,
|
||||
"max_num_seqs": 10,
|
||||
}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [
|
||||
{},
|
||||
])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [
|
||||
{
|
||||
"num_lookahead_slots": 0,
|
||||
},
|
||||
{
|
||||
"num_lookahead_slots": 5,
|
||||
},
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_chunked_prefill_block_manager(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify that chunked prefill works with SelfAttnBlockSpaceManager,
|
||||
with and without lookahead scheduling.
|
||||
"""
|
||||
output_len = 32
|
||||
temperature = 0.0
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
("1 + " * 50) + " 1 = ", # Longer prompt.
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with BlockManager')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with BlockManager, with lookahead slots.')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
|
||||
# Enable prefill cache
|
||||
"enable_prefix_caching": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"preemption_mode": "swap"
|
||||
}, {
|
||||
"preemption_mode": "recompute"
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_block_manager_prefix_caching_enabled_with_preemption(
|
||||
baseline_llm_generator, test_llm_generator, batch_size):
|
||||
"""Verify block manager produces same outputs even when there is preemption.
|
||||
|
||||
This constructs two LLM, each with limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that the KV
|
||||
cache is not corrupted.
|
||||
|
||||
NOTE: We want a significant number of generated tokens so that any incorrect
|
||||
KV mapping has time to build up error.
|
||||
|
||||
NOTE(Kuntai): Though we have removed block manager v1, this test is still
|
||||
useful as it asserts the behavior of block manager v2 (now it is called
|
||||
SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we
|
||||
keep this test.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids from block manager')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids from block manager, with preemption')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# Allow only 5 sequences of ~1024 tokens in worst case.
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 5 * (64 + 1),
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"enable_prefix_caching": False
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"enable_prefix_caching": True,
|
||||
"preemption_mode": "swap"
|
||||
}, {
|
||||
"enable_prefix_caching": True,
|
||||
"preemption_mode": "recompute"
|
||||
}])
|
||||
@pytest.mark.parametrize("batch_size", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
|
||||
test_llm_generator, batch_size):
|
||||
"""Verify block manager v2 with auto prefix caching enabled produces same
|
||||
outputs as auto prefix caching disabled, even when there is preemption.
|
||||
|
||||
This constructs two LLM, each with limited number of GPU blocks. The limit
|
||||
is decided such that as the sequences in the batch grow, sequences must be
|
||||
preempted and removed from cache.
|
||||
|
||||
If the output token ids are equivalent, then we have confidence that auto
prefix caching itself does not cause result errors.
|
||||
"""
|
||||
output_len = 1024
|
||||
temperature = 0.0
|
||||
|
||||
# We want to ensure equality even with preemption.
|
||||
# We force the total block size to be 1 + cdiv(output_len, block_size)
|
||||
# so that only one sequence can fit at a time (once the sequences grow).
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with APC disabled')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with APC enabled')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
# Use a small model for a fast test.
|
||||
"model": "facebook/opt-125m",
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
|
||||
# we keep the number of blocks small, so that eviction is hit quickly
|
||||
"max_model_len": 48,
|
||||
"block_size": 16,
|
||||
"num_gpu_blocks_override": 3,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{
|
||||
"enable_prefix_caching": False
|
||||
}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{
|
||||
"enable_prefix_caching": True,
|
||||
}])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
|
||||
test_llm_generator):
|
||||
"""Verify block manager v2 with auto prefix caching could works normal
|
||||
even when eviction started.
|
||||
With APC enabled, all blocks are held by native block at the beginning.
|
||||
Then blocks are managed by evictor instead. If cache hit at the evitor's
|
||||
block, then it could be reused, or we need to recompute its kv cache.
|
||||
"""
|
||||
output_len = 10
|
||||
temperature = 0.0
|
||||
|
||||
prompts = [
|
||||
"You are a helpful assistant. Please answer truthfully and write "
|
||||
"out your thinking step by step to be sure you get the right answer. "
|
||||
"If you make a mistake, attempt to correct it. who are you?",
|
||||
"You are a helpful assistant. Please answer truthfully and write out "
|
||||
"your thinking step by step to be sure you get the right answer. You "
|
||||
"are helpful and harmless and you follow ethical guidelines. "
|
||||
"who are you?"
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
temperature=temperature,
|
||||
)
|
||||
|
||||
print('Getting token ids with APC disabled')
|
||||
baseline_token_ids = get_token_ids_from_llm_generator(
|
||||
baseline_llm_generator, prompts, sampling_params)
|
||||
|
||||
print('Getting token ids with APC enabled')
|
||||
test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
|
||||
prompts, sampling_params)
|
||||
|
||||
for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
|
||||
test_token_ids):
|
||||
assert expected_token_ids == actual_token_ids
|
||||
|
||||
assert baseline_token_ids == test_token_ids
|
||||
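The block budget used by the preemption tests above comes from simple arithmetic on the inline comments ("Allow only 5 sequences of ~1024 tokens in worst case"):

block_size = 16
output_len = 1024

blocks_per_seq = output_len // block_size + 1   # 64 + 1 = 65
num_gpu_blocks_override = 5 * blocks_per_seq    # 5 * (64 + 1) = 325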
@@ -0,0 +1,180 @@
|
||||
import random
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
from .conftest import get_text_from_llm_generator
|
||||
|
||||
# relatively small model with 4k sliding window.
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
Currently tmo.apply_rotary does not support offsets, so bigcode/starcoder2-3b cannot run.
Use mistralai/Mistral-7B-v0.1 instead, which also has a 4k sliding window.
|
||||
'''
|
||||
# The original model is: MODEL = "bigcode/starcoder2-3b"
|
||||
MODEL = "mistralai/Mistral-7B-v0.1"
|
||||
BLOCK_SIZE = 16
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
"model": MODEL,
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
"block_size": BLOCK_SIZE,
|
||||
# needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008
|
||||
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
|
||||
batch_size, seed):
|
||||
"""
|
||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||
asks for the value of one of them (which is outside the sliding window).
|
||||
If we tell it upfront which we are going to be looking for, then
|
||||
it answers correctly (mostly).
|
||||
|
||||
Additionally, we compare the results of the v1 and v2 managers.
|
||||
"""
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1024,
|
||||
ignore_eos=True,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
prompts, answer, indices = prep_prompts(batch_size)
|
||||
|
||||
baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
|
||||
prompts,
|
||||
sampling_params,
|
||||
llm_cb=check_window(prompts))
|
||||
|
||||
check_answers(indices, answer, baseline_texts)
|
||||
|
||||
print('Getting token ids from block manager v2')
|
||||
test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
|
||||
sampling_params)
|
||||
check_answers(indices, answer, test_texts)
|
||||
|
||||
cmp = [
|
||||
expected_text == actual_text
|
||||
for expected_text, actual_text in zip(baseline_texts, test_texts)
|
||||
]
|
||||
print(cmp)
|
||||
# make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768
|
||||
# however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290
|
||||
# states that xformers and flash_attn have different ideas about the window
|
||||
# size anyways
|
||||
assert sum(cmp) > 0.7 * len(cmp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"common_llm_kwargs",
|
||||
[{
|
||||
"model": MODEL,
|
||||
|
||||
# skip cuda graph creation for fast test.
|
||||
"enforce_eager": True,
|
||||
"block_size": BLOCK_SIZE,
|
||||
"num_gpu_blocks_override": 100000 // BLOCK_SIZE,
|
||||
}])
|
||||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
|
||||
"""
|
||||
This is similar to test_sliding_window_retrival, however, it doesn't
|
||||
compare against the v1 block manager since v1 doesn't support
|
||||
chunked prefill with sliding window.
|
||||
|
||||
The results with and without chunked prefill are not the same due to
|
||||
numerical instabilities.
|
||||
"""
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=10,
|
||||
ignore_eos=True,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
prompts, answer, indices = prep_prompts(batch_size)
|
||||
|
||||
# We don't compare with the baseline model here, since the results
|
||||
# are slightly different due to different tailing in attention.
|
||||
test_texts = get_text_from_llm_generator(test_llm_generator,
|
||||
prompts,
|
||||
sampling_params,
|
||||
llm_cb=check_window(prompts))
|
||||
check_answers(indices, answer, test_texts)
|
||||
|
||||
|
||||
def prep_prompts(batch_size: int):
|
||||
"""
|
||||
Generate prompts with a bunch of assignments,
then ask for the value of one of them.
The prompt is just under 10k tokens; the sliding window is 4k,
so the answer is outside the sliding window but should still be correct.
|
||||
"""
|
||||
prompts: List[str] = []
|
||||
answer: List[int] = []
|
||||
indices: List[int] = []
|
||||
random.seed(1)
|
||||
for _ in range(batch_size):
|
||||
idx = random.randint(30, 90)
|
||||
indices.append(idx)
|
||||
prompt = "```python\n# We set a number of variables, " + \
|
||||
f"x{idx} will be important later\n"
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
Since we have used a different model, the length of the
prompt needs to be reset to a proper value as well.
|
||||
'''
|
||||
# The original value is 800~1100
|
||||
ln = random.randint(400, 500)
|
||||
for k in range(30, ln):
|
||||
v = random.randint(10, 99)
|
||||
if k == idx:
|
||||
answer.append(v)
|
||||
prompt += f"x{k} = {v}\n"
|
||||
prompt += f"# Now, we check the value of x{idx}:\n"
|
||||
prompt += f"assert x{idx} == "
|
||||
prompts.append(prompt)
|
||||
return prompts, answer, indices
|
||||


def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
    answer2 = [int(text[0:2].strip()) for text in outputs]
    print(list(zip(indices, zip(answer, answer2))))
    numok = 0
    for a1, a2 in zip(answer, answer2):
        if a1 == a2:
            numok += 1
    frac_ok = numok / len(answer)
    print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
    # The original value is 0.7
    assert frac_ok >= 0.4


def check_window(prompts: List[str]):

    def inner(llm: LLM):
        sliding_window = llm.llm_engine.model_config.get_sliding_window()
        assert sliding_window and sliding_window > 0
        assert any(
            len(llm.get_tokenizer().tokenize(prompt)) > sliding_window
            for prompt in prompts)

    return inner
491
vllm-v0.6.2/tests/core/block/test_block_manager.py
Normal file
@@ -0,0 +1,491 @@
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
||||
STR_NOT_IMPL_ENC_DEC_SWA)
|
||||
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.sequence import Logprob, SequenceStatus
|
||||
from vllm.utils import chunk_list
|
||||
|
||||
from ..utils import (create_dummy_prompt, create_seq_group,
|
||||
create_seq_group_encoder_decoder)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
|
||||
num_gpu_blocks: int, watermark: float):
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
)
|
||||
num_watermark_blocks = int(watermark * num_gpu_blocks)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
|
||||
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
|
||||
# the current implementation assumes all seqs are new prompts / don't have
|
||||
# different output lens.
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
|
||||
for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
)
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
can_allocate_result = block_manager.can_allocate(seq_group)
|
||||
|
||||
num_required_blocks = num_prompt_blocks + num_output_blocks
|
||||
|
||||
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
|
||||
assert can_allocate_result == AllocStatus.NEVER
|
||||
elif num_gpu_blocks >= num_required_blocks:
|
||||
assert can_allocate_result == AllocStatus.OK
|
||||
else:
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
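# Hypothetical walk-through of the branches above (numbers taken from the
# parametrize lists, chosen for illustration): with num_gpu_blocks=8 and
# watermark=0.5, num_watermark_blocks = int(0.5 * 8) = 4. For
# num_prompt_blocks=3 the requirement is 3 + 1 = 4 blocks, and 8 - 4 = 4 is
# not below the watermark, so AllocStatus.OK is expected; for
# num_prompt_blocks=4 the requirement is 5 and 8 - 5 = 3 < 4, so
# AllocStatus.NEVER is expected.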
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_seq_group_encoder_decoder(block_size: int,
|
||||
num_seqs_per_group: int,
|
||||
num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
)
|
||||
num_watermark_blocks = int(watermark * num_gpu_blocks)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
|
||||
# NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
|
||||
# the current implementation assumes all seqs are new prompts / don't have
|
||||
# different output lens.
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
|
||||
for bdx, num_prompt_blocks in enumerate(
|
||||
range(1, num_gpu_blocks - num_output_blocks)):
|
||||
num_cross_blocks_per_seq = num_prompt_blocks
|
||||
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id=str(bdx))
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
can_allocate_result = block_manager.can_allocate(seq_group)
|
||||
|
||||
num_required_blocks = num_prompt_blocks + \
|
||||
num_output_blocks + \
|
||||
num_cross_blocks_per_seq
|
||||
|
||||
if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
|
||||
assert can_allocate_result == AllocStatus.NEVER
|
||||
elif num_gpu_blocks >= num_required_blocks:
|
||||
assert can_allocate_result == AllocStatus.OK
|
||||
else:
|
||||
assert can_allocate_result == AllocStatus.LATER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int,
|
||||
num_seqs_per_group: int,
|
||||
num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
'''
|
||||
SWA is short for Sliding Window Attention.
|
||||
|
||||
At time of writing block manager does not support SWA.
|
||||
|
||||
However even when SWA is implemented for block manager,
|
||||
there will still most likely be a separate workstream required
|
||||
to enable SWA for encoder/decoder models.
|
||||
|
||||
Therefore this test enforces that one of the following cases
|
||||
hold true:
|
||||
1. Block manager does not support SWA at all (true at time of writing)
|
||||
2. Block manager fails with NotImplementedError when SWA is enabled
|
||||
AND a SequenceGroup with an encoder sequence (i.e. in support of an
|
||||
encoder/decoder model) is passed into can_allocate() as an argument
|
||||
|
||||
The setup for this test is stripped down version of
|
||||
test_can_allocate_seq_group_encoder_decoder()
|
||||
'''
|
||||
|
||||
with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
sliding_window=5 # SWA
|
||||
)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
num_prompt_blocks = 1
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id="0")
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
block_manager.can_allocate(seq_group)
|
||||
|
||||
# Assert that either
|
||||
# 1. Block manager constructor fails with assertion that sliding window
|
||||
# is not yet supported (most likely near-term outcome at time of
|
||||
# writing), or
|
||||
# 2. can_allocate() fails with NotImplementedError due to combination of
|
||||
# encoder/decoder and sliding window attention
|
||||
if isinstance(exc_info.value, NotImplementedError):
|
||||
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA
|
||||
elif isinstance(exc_info.value, AssertionError):
|
||||
assert str(exc_info.value) == "Sliding window not yet supported"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [16])
|
||||
@pytest.mark.parametrize("num_seqs_per_group", [1])
|
||||
@pytest.mark.parametrize("watermark", [0.0, 0.5])
|
||||
def test_can_allocate_encoder_decoder_fails_with_prefix_cache(
|
||||
block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
|
||||
watermark: float):
|
||||
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
watermark=watermark,
|
||||
enable_caching=True # Prefix cache
|
||||
)
|
||||
|
||||
num_output_blocks_per_seq = 1
|
||||
num_prompt_blocks = 1
|
||||
num_output_blocks = num_output_blocks_per_seq
|
||||
seq_group = create_seq_group_encoder_decoder(
|
||||
seq_prompt_len=block_size * num_prompt_blocks,
|
||||
seq_output_lens=[
|
||||
block_size * num_output_blocks_per_seq
|
||||
for _ in range(num_seqs_per_group)
|
||||
],
|
||||
request_id="0")
|
||||
|
||||
assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
|
||||
|
||||
# Assert that can_allocate() fails with NotImplementedError
|
||||
# due to combination of encoder/decoder and prefix cache
|
||||
with pytest.raises(NotImplementedError) as exc_info:
|
||||
block_manager.can_allocate(seq_group)
|
||||
assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("prompt_len", [1, 7, 8])
|
||||
@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 10])
|
||||
def test_append_slots(block_size, prompt_len, num_slots_to_append,
|
||||
num_lookahead_slots):
|
||||
"""Verify append_slots consumes the correct number of blocks from the block
|
||||
table.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
watermark = 0.1
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
watermark=watermark,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=prompt_len,
|
||||
seq_output_lens=[0],
|
||||
)
|
||||
|
||||
# Allocate seq
|
||||
assert block_manager.can_allocate(seq_group)
|
||||
block_manager.allocate(seq_group)
|
||||
|
||||
# Set seq to RUNNING
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
# Append tokens to the sequence
|
||||
for token_id in range(num_slots_to_append):
|
||||
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
|
||||
# Append slots for new tokens and lookahead slots.
|
||||
free_blocks_before_append = block_manager.get_num_free_gpu_blocks()
|
||||
block_manager.append_slots(seq, num_lookahead_slots)
|
||||
num_consumed_blocks = (free_blocks_before_append -
|
||||
block_manager.get_num_free_gpu_blocks())
|
||||
|
||||
# Expect consumed blocks to be new blocks required to support the new slots.
|
||||
expected_consumed_blocks = len(
|
||||
list(
|
||||
chunk_list(
|
||||
list(
|
||||
range(prompt_len + num_slots_to_append +
|
||||
num_lookahead_slots)),
|
||||
block_size))) - len(
|
||||
list(chunk_list(list(range(prompt_len)), block_size)))
|
||||
assert num_consumed_blocks == expected_consumed_blocks
|
||||
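# Worked example of the expression above, using illustrative values from the
# parametrize lists: block_size=8, prompt_len=7, num_slots_to_append=8 and
# num_lookahead_slots=0 give ceil((7 + 8 + 0) / 8) - ceil(7 / 8) = 2 - 1 = 1
# newly consumed block.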
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [4])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [4])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
|
||||
@pytest.mark.parametrize("enable_caching", [False, True])
|
||||
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
|
||||
enable_caching):
|
||||
"""Verify blocks number on src/desc device is correct after swapping in/out
|
||||
sequence group (not missing or extra blocks).
|
||||
"""
|
||||
block_manager = SelfAttnBlockSpaceManager(block_size,
|
||||
num_cpu_blocks,
|
||||
num_gpu_blocks,
|
||||
watermark=0,
|
||||
enable_caching=enable_caching)
|
||||
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
|
||||
prompt.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group)
|
||||
|
||||
# Emulate a forward pass by appending a single token.
|
||||
# The block manager then knows how many unprocessed
|
||||
# tokens will be written in the next forward pass.
|
||||
token_id = 0
|
||||
prompt.status = SequenceStatus.RUNNING
|
||||
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
|
||||
# Swap seq group from GPU -> CPU.
|
||||
gpu_blocks = block_manager.get_block_table(prompt)
|
||||
assert block_manager.can_swap_out(seq_group)
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_out(seq_group)
|
||||
mapping_keys = [key for key, _ in mapping]
|
||||
assert mapping_keys == gpu_blocks
|
||||
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
|
||||
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
|
||||
prompt.status = SequenceStatus.SWAPPED
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_in(seq_group)
|
||||
cpu_blocks = block_manager.get_block_table(prompt)
|
||||
mapping_keys = [key for key, _ in mapping]
|
||||
assert mapping_keys == [cpu_blocks[0]]
|
||||
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [4])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10])
|
||||
@pytest.mark.parametrize("enable_caching", [True, False])
|
||||
def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
|
||||
enable_caching):
|
||||
""" Verify the block manager can correctly determine if a sequence group
|
||||
can be swapped in/out.
|
||||
"""
|
||||
num_cpu_blocks = num_gpu_blocks
|
||||
block_manager = SelfAttnBlockSpaceManager(block_size,
|
||||
num_cpu_blocks,
|
||||
num_gpu_blocks,
|
||||
watermark=0,
|
||||
enable_caching=enable_caching)
|
||||
prompt, seq_group = create_dummy_prompt(
|
||||
"1", prompt_length=(num_gpu_blocks - 1) * block_size - 1)
|
||||
prompt.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group)
|
||||
prompt.status = SequenceStatus.RUNNING
|
||||
|
||||
# Swap seq group from GPU -> CPU.
|
||||
gpu_blocks = block_manager.get_block_table(prompt)
|
||||
assert block_manager.can_swap_out(seq_group)
|
||||
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
mapping = block_manager.swap_out(seq_group)
|
||||
mapping_keys = [key for key, _ in mapping]
|
||||
assert mapping_keys == gpu_blocks
|
||||
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
|
||||
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
|
||||
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
|
||||
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
|
||||
prompt.status = SequenceStatus.SWAPPED
|
||||
|
||||
# At this moment, we still have enough free blocks to swap in the seq group.
|
||||
if num_lookahead_slots <= block_size:
|
||||
assert block_manager.can_swap_in(seq_group,
|
||||
num_lookahead_slots) == AllocStatus.OK
|
||||
else:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
# During the swap-out, 2 cached blocks were evicted from the GPU,
# so prompt1 can't be swapped back in.
|
||||
prompt2_len = 2 * block_size - 1
|
||||
prompt2, seq_group2 = create_dummy_prompt(
|
||||
"2",
|
||||
prompt_length=prompt2_len,
|
||||
prompt_tokens=[10000 + i for i in range(prompt2_len)])
|
||||
prompt2.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group2)
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
if num_lookahead_slots <= block_size:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.LATER
|
||||
else:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
|
||||
@pytest.mark.parametrize("enable_caching", [False, True])
|
||||
def test_swap_in_infeasible(num_lookahead_slots, enable_caching):
|
||||
"""Verifies that swapping fails if there is not enough free blocks
|
||||
to account for unseen tokens and lookahead_slots.
|
||||
"""
|
||||
block_size = 8
|
||||
num_cpu_blocks = 1
|
||||
num_gpu_blocks = 1
|
||||
block_manager = SelfAttnBlockSpaceManager(block_size,
|
||||
num_cpu_blocks,
|
||||
num_gpu_blocks,
|
||||
watermark=0,
|
||||
enable_caching=enable_caching)
|
||||
prompt_length = block_size - 3
|
||||
assert prompt_length > 0
|
||||
prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length)
|
||||
prompt.status = SequenceStatus.WAITING
|
||||
block_manager.allocate(seq_group)
|
||||
# Emulate a forward pass by appending a single token.
|
||||
# The block manager then knows how many unprocessed
|
||||
# tokens will be written in the next forward pass.
|
||||
token_id = 0
|
||||
prompt.status = SequenceStatus.RUNNING
|
||||
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
|
||||
# Swap seq group from GPU -> CPU.
|
||||
assert block_manager.can_swap_out(seq_group)
|
||||
block_manager.swap_out(seq_group)
|
||||
prompt.status = SequenceStatus.SWAPPED
|
||||
|
||||
# Swap seq group from CPU -> GPU.
|
||||
# The number of unseen tokens is 1. If the number of existing
|
||||
# tokens plus the unseen ones and number of lookahead slots exceeds
|
||||
# the total number of available GPU blocks then the swap
|
||||
# should fail.
|
||||
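# Illustrative numbers: with block_size=8 and num_gpu_blocks=1, prompt_length
# is 8 - 3 = 5, so the condition below is 5 + 1 + num_lookahead_slots <= 8.
# Lookahead of 0 or 2 should therefore yield AllocStatus.OK, while 10 should
# yield AllocStatus.NEVER.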
num_unseen_tokens = 1
|
||||
if (num_lookahead_slots + num_unseen_tokens +
|
||||
prompt_length) <= (block_size * num_gpu_blocks):
|
||||
assert block_manager.can_swap_in(seq_group,
|
||||
num_lookahead_slots) == AllocStatus.OK
|
||||
else:
|
||||
assert block_manager.can_swap_in(
|
||||
seq_group, num_lookahead_slots) == AllocStatus.NEVER
|
||||
|
||||
|
||||
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8, 16])
|
||||
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
|
||||
@pytest.mark.parametrize("num_slots_to_append", [50])
|
||||
@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512])
|
||||
def test_sliding_window(block_size, prompt_len, num_slots_to_append,
|
||||
sliding_window):
|
||||
"""Verify append_slots consumes the correct number of blocks from the block
|
||||
table.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
watermark = 0.1
|
||||
block_manager = SelfAttnBlockSpaceManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
watermark=watermark,
|
||||
sliding_window=sliding_window,
|
||||
)
|
||||
|
||||
def check_used(min_n, max_n=None):
|
||||
if max_n is None:
|
||||
max_n = min_n
|
||||
used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks()
|
||||
assert min_n <= used
|
||||
assert used <= max_n
|
||||
|
||||
def num_blocks(num_tokens):
|
||||
return (num_tokens + block_size - 1) // block_size
|
||||
|
||||
check_used(0)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=prompt_len,
|
||||
seq_output_lens=[0],
|
||||
)
|
||||
|
||||
check_used(0)
|
||||
|
||||
# Allocate seq
|
||||
assert block_manager.can_allocate(seq_group)
|
||||
block_manager.allocate(seq_group)
|
||||
|
||||
check_used(num_blocks(prompt_len))
|
||||
|
||||
# Set seq to RUNNING
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
seq.data.update_num_computed_tokens(prompt_len)
|
||||
check_used(num_blocks(prompt_len))
|
||||
|
||||
# this is how we compute it in SelfAttnBlockSpaceManager.__init__
|
||||
sliding_blocks = (sliding_window // block_size) + 2
|
||||
# plus one block for null block
|
||||
sliding_blocks += 1
|
||||
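# Illustrative arithmetic with values from the parametrization above:
# sliding_window=512 and block_size=16 give (512 // 16) + 2 + 1 = 35 sliding
# blocks, which bounds the usage checked by check_used() below.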
|
||||
# Append tokens to the sequence
|
||||
for token_id in range(num_slots_to_append):
|
||||
seq.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||
seq.data.update_num_computed_tokens(1)
|
||||
block_manager.append_slots(seq, num_lookahead_slots=0)
|
||||
if prompt_len < sliding_window + 10:
|
||||
check_used(0, sliding_blocks + 1)
|
||||
else:
|
||||
check_used(sliding_blocks, sliding_blocks + 1)
|
||||
576
vllm-v0.6.2/tests/core/block/test_block_table.py
Normal file
@@ -0,0 +1,576 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.block_table import BlockTable
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.utils import Device, cdiv, chunk_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
def test_allocate_naive(block_size: int, sequence_len: int):
|
||||
"""Test the allocation of blocks using the naive allocator.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size and
|
||||
number of blocks. It then allocates multiple BlockTables with varying
|
||||
sequence lengths and verifies that the number of free blocks decreases as
|
||||
expected after each allocation.
|
||||
"""
|
||||
assert block_size > 1
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type="naive",
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
|
||||
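# Illustrative: with sequence_len=129 and block_size=16 from the parametrize
# lists, chunk_list yields ceil(129 / 16) = 9 chunks, i.e. each allocation is
# expected to consume 9 blocks.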
|
||||
block_tables: List[BlockTable] = []
|
||||
for i in range(5):
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
|
||||
|
||||
block_tables.append(
|
||||
BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
))
|
||||
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
def test_allocate_prefix_caching(block_size: int, sequence_len: int):
|
||||
"""Test the allocation of blocks using the prefix caching allocator.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size and
|
||||
number of blocks, using the prefix caching allocator. It then allocates
|
||||
multiple BlockTables with varying sequence lengths and verifies that the
|
||||
number of free blocks decreases as expected after each allocation.
|
||||
|
||||
The test expects all sequences to share allocations, except for their last
|
||||
block, which may be mutable. It calculates the expected number of immutable
|
||||
and mutable blocks per allocation based on the sequence length and block
|
||||
size.
|
||||
"""
|
||||
assert block_size > 1
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type="prefix_caching",
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
chunked_tokens = list(chunk_list(token_ids, block_size))
|
||||
num_mutable_blocks_per_alloc = 0 if len(
|
||||
chunked_tokens[-1]) == block_size else 1
|
||||
num_immutable_blocks_per_alloc = len(
|
||||
chunked_tokens) - num_mutable_blocks_per_alloc
|
||||
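# Illustrative: with sequence_len=129 and block_size=16, the last chunk holds
# a single token, so each allocation is expected to add 1 new mutable block,
# while the 8 full (immutable) blocks are shared across all allocations.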
|
||||
block_tables: List[BlockTable] = []
|
||||
for alloc_i in range(1, 6):
|
||||
|
||||
block_tables.append(
|
||||
BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
))
|
||||
block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Expect all sequences to share allocations, except for their last block
|
||||
# (which may be mutable).
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_gpu_blocks - (
|
||||
num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc *
|
||||
(alloc_i))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
@pytest.mark.parametrize("device", ["cpu", "gpu"])
|
||||
def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
|
||||
device: str):
|
||||
"""Test the allocation and freeing of blocks using different allocators and
|
||||
devices.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size,
|
||||
number of blocks, allocator type, and device. It then allocates a BlockTable
|
||||
multiple times with the same sequence and verifies that the number of free
|
||||
blocks remains consistent after each allocation and freeing.
|
||||
"""
|
||||
device = Device[device.upper()]
|
||||
|
||||
num_device_blocks = 1024
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_device_blocks,
|
||||
num_cpu_blocks=num_device_blocks,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
for i in range(5):
|
||||
block_table.allocate(token_ids=token_ids, device=device)
|
||||
assert allocator.get_num_free_blocks(
|
||||
device) == num_device_blocks - num_blocks_per_alloc
|
||||
assert all(block_id is not None
|
||||
for block_id in block_table.physical_block_ids)
|
||||
|
||||
block_table.free()
|
||||
assert allocator.get_num_free_blocks(device) == num_device_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_append_token_ids_allocation(block_size: int, sequence_len: int,
|
||||
append_len: int, allocator_type: str):
|
||||
"""Test the allocation behavior when appending token IDs to a BlockTable.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size,
|
||||
number of blocks, and allocator type. It then allocates a BlockTable with an
|
||||
initial sequence and appends additional token IDs to it. The test verifies
|
||||
that the number of allocated blocks before and after appending matches the
|
||||
expected values.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
num_expected_blocks_before_append = len(
|
||||
list(chunk_list(token_ids, block_size)))
|
||||
num_expected_appended_blocks = len(
|
||||
list(chunk_list(token_ids + token_ids_to_append,
|
||||
block_size))) - num_expected_blocks_before_append
|
||||
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
assert len(
|
||||
block_table.physical_block_ids) == num_expected_blocks_before_append
|
||||
block_table.append_token_ids(token_ids_to_append)
|
||||
assert len(
|
||||
block_table.physical_block_ids
|
||||
) == num_expected_blocks_before_append + num_expected_appended_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("num_empty_slots", [1, 16, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
|
||||
num_empty_slots: int,
|
||||
allocator_type: str):
|
||||
"""Test the allocation behavior when ensuring a certain number of empty
|
||||
slots in a BlockTable.
|
||||
|
||||
This test creates a CpuGpuBlockAllocator with the specified block size,
|
||||
number of blocks, and allocator type. It then allocates a BlockTable with an
|
||||
initial sequence and ensures a certain number of empty slots. The test
|
||||
verifies that the number of allocated blocks before and after ensuring empty
|
||||
slots matches the expected values. It also checks that filling up the empty
|
||||
slots does not consume additional blocks.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
num_expected_blocks_before_append = len(
|
||||
list(chunk_list(token_ids, block_size)))
|
||||
num_expected_appended_blocks = len(
|
||||
list(chunk_list(token_ids + [-1] * num_empty_slots,
|
||||
block_size))) - num_expected_blocks_before_append
|
||||
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Assert that the empty slots consume the expected number of additional
|
||||
# blocks.
|
||||
assert len(
|
||||
block_table.physical_block_ids) == num_expected_blocks_before_append
|
||||
block_table.ensure_num_empty_slots(num_empty_slots)
|
||||
assert len(
|
||||
block_table.physical_block_ids
|
||||
) == num_expected_blocks_before_append + num_expected_appended_blocks
|
||||
|
||||
# Now, ensure no additional blocks consumed as we fill up the empty slots.
|
||||
num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU)
|
||||
block_table.append_token_ids(token_ids=list(range(num_empty_slots)))
|
||||
assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 9])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_size", [1, 4, 129])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
|
||||
append_len: int, allocator_type: str,
|
||||
append_size: int):
|
||||
"""Verify token ids are correctly appended. Appends various amounts of
|
||||
token ids in various append sizes, and verifies the final sequence is
|
||||
correct.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=1024,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
appended_so_far: List[int] = []
|
||||
for append in chunk_list(token_ids_to_append, append_size):
|
||||
block_table.append_token_ids(append)
|
||||
appended_so_far.extend(append)
|
||||
|
||||
assert block_table._get_all_token_ids() == token_ids + appended_so_far
|
||||
|
||||
assert block_table._get_all_token_ids() == token_ids + token_ids_to_append
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_len", [1, 9, 129])
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_fork(seq_len: int, block_size: int, allocator_type: str):
|
||||
"""Create a sequence using the specified allocator.
|
||||
1. Assert that after forking the sequence, the free block count is the
|
||||
same.
|
||||
2. Assert that the forked sequence has the same physical mappings.
|
||||
3. Then free the original sequence; verify that the free block count is
|
||||
the same.
|
||||
4. Finally, free the forked sequence and verify that the free block
|
||||
count drops to zero.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(seq_len))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
block_table.allocate(token_ids)
|
||||
|
||||
num_free_blocks_before_fork = allocator.get_num_free_blocks(
|
||||
device=Device.GPU)
|
||||
|
||||
forked_block_table = block_table.fork()
|
||||
|
||||
# Expect physical_block_ids and token_ids to match.
|
||||
assert (block_table.physical_block_ids ==
|
||||
forked_block_table.physical_block_ids)
|
||||
assert block_table._get_all_token_ids(
|
||||
) == forked_block_table._get_all_token_ids()
|
||||
|
||||
# Do not expect any additional allocations.
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_free_blocks_before_fork
|
||||
|
||||
# Free the original blocks. Assert num free blocks does not change, since
|
||||
# refcount is nonzero.
|
||||
block_table.free()
|
||||
assert allocator.get_num_free_blocks(
|
||||
device=Device.GPU) == num_free_blocks_before_fork
|
||||
|
||||
# Expect the forked block table to be unaffected by the free.
|
||||
assert all(block_id is not None
|
||||
for block_id in forked_block_table.physical_block_ids)
|
||||
|
||||
# Free the forked blocks. Assert num free blocks does change, since
|
||||
# refcount is now zero.
|
||||
forked_block_table.free()
|
||||
assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("appender", ["forked", "original"])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_cow(block_size: int, sequence_len: int, append_len: int,
|
||||
allocator_type: str, appender: str):
|
||||
"""Fork a sequence; append to the forked sequence; verify there's a CoW.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
original_block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
|
||||
num_expected_cow_blocks = cdiv(sequence_len + append_len,
|
||||
block_size) - (sequence_len // block_size)
|
||||
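# Worked example of the two expressions above, with illustrative values from
# the parametrize lists: block_size=8, sequence_len=16 and append_len=1 give
# cdiv(16, 8) = 2 non-CoW blocks and cdiv(16 + 1, 8) - (16 // 8) = 3 - 2 = 1
# CoW block.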
|
||||
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
original_block_ids = original_block_table.physical_block_ids[:]
|
||||
|
||||
print("original_block_ids = {}".format(original_block_ids))
|
||||
forked_block_table = original_block_table.fork()
|
||||
|
||||
# Expect no additional allocation (copy on _write_).
|
||||
assert allocator.get_num_free_blocks(
|
||||
Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks)
|
||||
|
||||
if appender == "forked":
|
||||
appender_block_table = forked_block_table
|
||||
static_block_table = original_block_table
|
||||
elif appender == "original":
|
||||
appender_block_table = original_block_table
|
||||
static_block_table = forked_block_table
|
||||
else:
|
||||
raise ValueError(f"unknown test config {appender=}")
|
||||
|
||||
# Write tokens.
|
||||
appender_block_table.append_token_ids(token_ids_to_append)
|
||||
|
||||
# Expect the non-appending block table to have no change.
|
||||
assert static_block_table.physical_block_ids == original_block_ids
|
||||
assert appender_block_table.physical_block_ids != original_block_ids
|
||||
|
||||
# Expect the blocks changed during append to have a CoW.
|
||||
assert allocator.get_num_free_blocks(
|
||||
Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks +
|
||||
num_expected_cow_blocks)
|
||||
|
||||
cows = allocator.clear_copy_on_writes()
|
||||
if sequence_len % block_size > 0:
|
||||
# If the last block in the sequence is not full, then when appending we
|
||||
# expect a CoW.
|
||||
assert cows
|
||||
|
||||
cow_block_id = sequence_len // block_size
|
||||
expected_src = static_block_table.physical_block_ids[cow_block_id]
|
||||
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
|
||||
|
||||
assert (expected_src, expected_dst) in cows
|
||||
else:
|
||||
# Otherwise, there should be no copy-on-write.
|
||||
assert not cows
|
||||
|
||||
static_block_table.free()
|
||||
appender_block_table.free()
|
||||
|
||||
# After free, expect all blocks to be freed.
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("append_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("lookahead_slots", [1, 16, 129])
|
||||
@pytest.mark.parametrize("appender", ["forked", "original"])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_cow_lookahead_simple(block_size: int, sequence_len: int,
|
||||
append_len: int, lookahead_slots: int,
|
||||
allocator_type: str, appender: str):
|
||||
"""Similar to test_cow, except with lookahead allocation. The assertions are
|
||||
less rigorous due to the complexity of the property under test.
|
||||
"""
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(append_len))
|
||||
|
||||
original_block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
original_block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Allocate lookahead slots.
|
||||
original_block_table.ensure_num_empty_slots(lookahead_slots)
|
||||
original_block_ids = original_block_table.physical_block_ids[:]
|
||||
|
||||
forked_block_table = original_block_table.fork()
|
||||
|
||||
if appender == "forked":
|
||||
appender_block_table = forked_block_table
|
||||
static_block_table = original_block_table
|
||||
elif appender == "original":
|
||||
appender_block_table = original_block_table
|
||||
static_block_table = forked_block_table
|
||||
else:
|
||||
raise ValueError(f"unknown test config {appender=}")
|
||||
|
||||
# Write tokens.
|
||||
appender_block_table.append_token_ids(token_ids_to_append)
|
||||
|
||||
# Expect the non-appending block table to have no change.
|
||||
assert static_block_table.physical_block_ids == original_block_ids
|
||||
assert appender_block_table.physical_block_ids != original_block_ids
|
||||
|
||||
cows = allocator.clear_copy_on_writes()
|
||||
|
||||
# Always expect copy-on-write
|
||||
assert cows
|
||||
|
||||
if sequence_len % block_size > 0:
|
||||
# If the last block in the sequence is not full, then when appending we
|
||||
# expect a CoW.
|
||||
assert cows
|
||||
|
||||
cow_block_id = sequence_len // block_size
|
||||
expected_src = static_block_table.physical_block_ids[cow_block_id]
|
||||
expected_dst = appender_block_table.physical_block_ids[cow_block_id]
|
||||
|
||||
assert (expected_src, expected_dst) in cows
|
||||
|
||||
static_block_table.free()
|
||||
appender_block_table.free()
|
||||
|
||||
# After free, expect all blocks to be freed.
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
|
||||
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
|
||||
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
|
||||
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
|
||||
num_new_tokens: int,
|
||||
num_lookahead_slots: int,
|
||||
allocator_type: str):
|
||||
"""Verify correct calculation of get_num_blocks_touched_by_append_slots.
|
||||
|
||||
This is done by using copy-on-write, which requires any modified block to
|
||||
be copied before write if the refcount > 1. We set the refcount > 1 by forking
|
||||
a sequence, then measure the free blocks before and after an append. If the
|
||||
number of consumed blocks equals what `get_num_blocks_touched_by_append_
|
||||
slots` returns, then the calculation is correct.
|
||||
"""
|
||||
|
||||
num_gpu_blocks = 1024
|
||||
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=0,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
token_ids = list(range(sequence_len))
|
||||
token_ids_to_append = list(range(num_new_tokens))
|
||||
|
||||
block_table = BlockTable(
|
||||
block_size=block_size,
|
||||
block_allocator=allocator,
|
||||
)
|
||||
|
||||
block_table.allocate(token_ids=token_ids, device=Device.GPU)
|
||||
|
||||
# Add lookahead before fork so both sequences have the same lookahead
|
||||
# blocks.
|
||||
block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)
|
||||
|
||||
# Fork sequence so that every block has refcount > 1.
|
||||
_ = block_table.fork()
|
||||
|
||||
# Determine how many blocks should be touched.
|
||||
expected_num_touched_blocks = (
|
||||
block_table.get_num_blocks_touched_by_append_slots(
|
||||
token_ids=token_ids_to_append,
|
||||
num_lookahead_slots=num_lookahead_slots))
|
||||
|
||||
# Measure how many blocks are touched by measuring num_free_blocks before
|
||||
# and after the append.
|
||||
#
|
||||
# We expect append_token_ids to CoW all mutated blocks that have refcount>1.
|
||||
num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU)
|
||||
block_table.append_token_ids(token_ids_to_append, num_lookahead_slots)
|
||||
num_consumed_blocks = (num_free_blocks_before_append -
|
||||
allocator.get_num_free_blocks(Device.GPU))
|
||||
|
||||
# TODO(cade) ensure equality when num_lookahead_slots > 0.
|
||||
# The reason we have <= is that lookahead blocks are not copied eagerly;
|
||||
# they are copied on first write. This will cause issues for beam search +
|
||||
# speculative decoding. This is acceptable for now as it is a large effort
|
||||
# to combine the two. To fix this, we can ensure single sequence ownership
|
||||
# of lookahead blocks by appending empty slots to each block, which will
|
||||
# trigger the CoW.
|
||||
#
|
||||
# Until then, we can accept that the consumed blocks are <= the expected
# blocks when appending with lookahead.
|
||||
if num_lookahead_slots > 0:
|
||||
assert num_consumed_blocks <= expected_num_touched_blocks
|
||||
else:
|
||||
assert num_consumed_blocks == expected_num_touched_blocks
|
||||
42
vllm-v0.6.2/tests/core/block/test_common.py
Normal file
@@ -0,0 +1,42 @@
import random

import pytest

from vllm.core.block.common import RefCounter


@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr(seed: int, num_incrs: int, num_blocks: int):
    random.seed(seed)

    all_block_indices = list(range(num_blocks))
    counter = RefCounter(all_block_indices=all_block_indices)

    block_id = random.randint(0, num_blocks - 1)
    for i in range(num_incrs):
        value = counter.incr(block_id)
        assert value == i + 1


@pytest.mark.parametrize("seed", list(range(20)))
@pytest.mark.parametrize("num_incrs", [1, 100])
@pytest.mark.parametrize("num_blocks", [1024])
def test_incr_decr(seed: int, num_incrs: int, num_blocks: int):
    random.seed(seed)

    all_block_indices = list(range(num_blocks))
    counter = RefCounter(all_block_indices=all_block_indices)

    block_id = random.randint(0, num_blocks - 1)
    for i in range(num_incrs):
        value = counter.incr(block_id)
        assert value == i + 1

    for i in range(num_incrs):
        value = counter.decr(block_id)
        assert value == num_incrs - (i + 1)

    with pytest.raises(AssertionError):
        counter.decr(block_id)
93
vllm-v0.6.2/tests/core/block/test_cpu_gpu_block_allocator.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.utils import Device, chunk_list
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
|
||||
block_size: int, allocator_type: str):
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
cpu_blocks = [
|
||||
allocator.allocate_mutable_block(prev_block=None, device=Device.CPU)
|
||||
for _ in range(num_cpu_blocks)
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
gpu_blocks = [
|
||||
allocator.allocate_mutable_block(prev_block=None, device=Device.GPU)
|
||||
for _ in range(num_gpu_blocks)
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in cpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in gpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_cpu_blocks", [0, 512])
|
||||
@pytest.mark.parametrize("num_gpu_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [2])
|
||||
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
|
||||
def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
|
||||
block_size: int, allocator_type: str):
|
||||
allocator = CpuGpuBlockAllocator.create(
|
||||
allocator_type=allocator_type,
|
||||
num_gpu_blocks=num_gpu_blocks,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
block_size=block_size,
|
||||
)
|
||||
|
||||
unique_token_ids = list(
|
||||
range((num_cpu_blocks + num_gpu_blocks) * block_size))
|
||||
gpu_token_ids = list(
|
||||
chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
|
||||
cpu_token_ids = list(
|
||||
chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
|
||||
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
cpu_blocks = [
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids,
|
||||
device=Device.CPU)
|
||||
for token_ids in cpu_token_ids
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
gpu_blocks = [
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids,
|
||||
device=Device.GPU)
|
||||
for token_ids in gpu_token_ids
|
||||
]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == 0
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in cpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == 0
|
||||
|
||||
_ = [allocator.free(block) for block in gpu_blocks]
|
||||
assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
145
vllm-v0.6.2/tests/core/block/test_naive_block.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
|
||||
|
||||
|
||||
class TestNaiveBlockAllocator:
|
||||
|
||||
@staticmethod
|
||||
def create_allocate_lambda(allocate_type: str,
|
||||
allocator: NaiveBlockAllocator,
|
||||
prev_block: Optional[Block],
|
||||
token_ids: List[int]):
|
||||
if allocate_type == "immutable":
|
||||
allocate_block = lambda: allocator.allocate_immutable_block(
|
||||
prev_block=prev_block, token_ids=token_ids)
|
||||
elif allocate_type == "mutable":
|
||||
allocate_block = lambda: allocator.allocate_mutable_block(
|
||||
prev_block=prev_block)
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
return allocate_block
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_allocate_ooms(allocate_type: str, num_blocks: int,
|
||||
block_size: int):
|
||||
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
allocate_type,
|
||||
allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
|
||||
[allocate_block() for _ in range(num_blocks)]
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_free_prevents_oom(allocate_type: str, num_blocks: int,
|
||||
block_size: int):
|
||||
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
allocate_type,
|
||||
allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
|
||||
blocks = [allocate_block() for _ in range(num_blocks)]
|
||||
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
block_to_free = blocks.pop()
|
||||
|
||||
for _ in range(100):
|
||||
block_id = block_to_free.block_id
|
||||
allocator.free(block_to_free)
|
||||
assert block_to_free.block_id is None
|
||||
|
||||
new_block = allocate_block()
|
||||
assert new_block.block_id == block_id
|
||||
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
block_to_free = new_block
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("allocate_type", ["immutable", "mutable"])
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
def test_get_num_free_blocks(allocate_type: str, num_blocks: int,
|
||||
block_size: int):
|
||||
allocator = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
allocate_type,
|
||||
allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
|
||||
assert allocator.get_num_free_blocks() == num_blocks
|
||||
|
||||
blocks = [allocate_block() for _ in range(num_blocks)]
|
||||
|
||||
for i, block in enumerate(blocks):
|
||||
assert allocator.get_num_free_blocks() == i
|
||||
allocator.free(block)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [4])
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size):
|
||||
""" Verify the allocator can correctly return the number of
|
||||
full blocks touched.
|
||||
"""
|
||||
allocator_src = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock,
|
||||
num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
|
||||
# Create a chain of cacheable blocks in the dst
|
||||
allocate_block = TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
"immutable",
|
||||
allocator_src,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)))
|
||||
src_blocks = [allocate_block() for _ in range(num_blocks - 1)]
|
||||
|
||||
# All blocks are cached
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
src_blocks) == num_blocks - 1
|
||||
|
||||
# Insert one non-full block in the src
|
||||
allocate_non_full_block = \
|
||||
TestNaiveBlockAllocator.create_allocate_lambda(
|
||||
"mutable", allocator_src,
|
||||
prev_block=src_blocks[-1],token_ids=[]
|
||||
)
|
||||
src_blocks.append(allocate_non_full_block())
|
||||
src_blocks[-1].append_token_ids([0])
|
||||
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
src_blocks) == num_blocks - 1
|
||||
# Fill up the last source block and then invoke
|
||||
# get_num_full_blocks_touched
|
||||
src_blocks[-1].append_token_ids([0] * (block_size - 1))
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
src_blocks) == num_blocks
|
||||
764
vllm-v0.6.2/tests/core/block/test_prefix_caching_block.py
Normal file
@@ -0,0 +1,764 @@
|
||||
import math
|
||||
import random
|
||||
from typing import List, Optional
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
from vllm.core.block.prefix_caching_block import (PrefixCachingBlock,
|
||||
PrefixCachingBlockAllocator)
|
||||
|
||||
|
||||
class TestPrefixCachingBlock:
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("seed", list(range(10)))
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
@pytest.mark.parametrize("is_curr_block_full", [True, False])
|
||||
def test_first_block_has_correct_content_hash(seed: int, block_size: int,
|
||||
is_curr_block_full: bool):
|
||||
"""Verify a block which is first in the sequence has the correct hash.
|
||||
"""
|
||||
random.seed(seed)
|
||||
num_to_fill = block_size if is_curr_block_full else random.randint(
|
||||
0, block_size - 1)
|
||||
token_ids = list(range(num_to_fill))
|
||||
mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
|
||||
|
||||
block_with_prev = PrefixCachingBlock(prev_block=None,
|
||||
token_ids=token_ids,
|
||||
block_size=block_size,
|
||||
allocator=mock_allocator)
|
||||
|
||||
if is_curr_block_full:
|
||||
# Expect hash since block is full.
|
||||
assert block_with_prev.content_hash == (
|
||||
PrefixCachingBlock.hash_block_tokens(
|
||||
is_first_block=True,
|
||||
prev_block_hash=None,
|
||||
cur_block_token_ids=token_ids))
|
||||
else:
|
||||
# Do not expect hash since block is not full.
|
||||
assert block_with_prev.content_hash is None
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("seed", list(range(10)))
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
@pytest.mark.parametrize("is_curr_block_full", [True, False])
|
||||
@pytest.mark.parametrize("prev_block_has_hash", [True, False])
|
||||
def test_nth_block_has_correct_content_hash(seed: int, block_size: int,
|
||||
is_curr_block_full: bool,
|
||||
prev_block_has_hash: bool):
|
||||
"""Verify a block which is not first in the sequence has the correct
|
||||
hash.
|
||||
"""
|
||||
|
||||
random.seed(seed)
|
||||
|
||||
previous_block = MagicMock(spec=PrefixCachingBlock)
|
||||
prev_block_hash = random.randint(0, 1000)
|
||||
previous_block.content_hash = (prev_block_hash
|
||||
if prev_block_has_hash else None)
|
||||
|
||||
num_to_fill = block_size if is_curr_block_full else random.randint(
|
||||
0, block_size - 1)
|
||||
token_ids = list(range(num_to_fill))
|
||||
mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator)
|
||||
|
||||
block_with_prev = PrefixCachingBlock(
|
||||
prev_block=previous_block,
|
||||
token_ids=token_ids,
|
||||
block_size=block_size,
|
||||
allocator=mock_allocator,
|
||||
)
|
||||
|
||||
if is_curr_block_full and prev_block_has_hash:
|
||||
# Expect hash since block is full and previous block has hash.
|
||||
assert (block_with_prev.content_hash ==
|
||||
PrefixCachingBlock.hash_block_tokens(
|
||||
is_first_block=False,
|
||||
prev_block_hash=prev_block_hash,
|
||||
cur_block_token_ids=token_ids))
|
||||
else:
|
||||
# Do not expect hash since block is not full or the previous block
|
||||
# does not have a hash.
|
||||
assert block_with_prev.content_hash is None
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("block_size", [1, 2, 16])
|
||||
@pytest.mark.parametrize("num_tokens", list(range(3)))
|
||||
@pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10])
|
||||
def test_blocks_have_correct_hash_in_chain(block_size: int,
|
||||
num_tokens: int,
|
||||
num_empty_trailing_blocks: int):
|
||||
"""Create two chains of logical blocks with the same contents.
|
||||
Assert the hashes are equal.
|
||||
"""
|
||||
random.seed(0)
|
||||
|
||||
token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
|
||||
|
||||
first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
num_empty_trailing_blocks=num_empty_trailing_blocks)
|
||||
for _ in range(2))
|
||||
|
||||
for first_chain_block, second_chain_block in zip(
|
||||
first_chain, second_chain):
|
||||
assert (first_chain_block.content_hash ==
|
||||
second_chain_block.content_hash)
|
||||
|
||||
if not first_chain or not second_chain:
|
||||
assert first_chain == second_chain
|
||||
assert num_tokens == 0
|
||||
|
||||
@staticmethod
|
||||
def create_chain(block_size: int,
|
||||
token_ids: List[int],
|
||||
num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]:
|
||||
"""Helper method which creates a chain of blocks.
|
||||
"""
|
||||
blocks: List[PrefixCachingBlock] = []
|
||||
num_blocks = math.ceil(
|
||||
len(token_ids) / block_size) + num_empty_trailing_blocks
|
||||
|
||||
if num_blocks == 0:
|
||||
return []
|
||||
|
||||
allocator = MagicMock(spec=PrefixCachingBlockAllocator)
|
||||
|
||||
prev_block = None
|
||||
for block_number in range(0, num_blocks):
|
||||
prev_block = PrefixCachingBlock(
|
||||
prev_block=prev_block,
|
||||
token_ids=[],
|
||||
block_size=block_size,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
tokens_to_append = token_ids[block_number *
|
||||
block_size:(block_number + 1) *
|
||||
block_size]
|
||||
if tokens_to_append:
|
||||
prev_block.append_token_ids(tokens_to_append)
|
||||
|
||||
blocks.append(prev_block)
|
||||
|
||||
return blocks
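# Aside: the hash-equality assertions above can be pictured with a tiny
# standalone sketch. toy_block_hash below is hypothetical (it is not
# PrefixCachingBlock.hash_block_tokens); it only mirrors the idea that a
# full block's hash chains the previous block's hash with its own tokens.
def toy_block_hash(prev_hash, token_ids, block_size):
    # A block only receives a content hash once it is full.
    if len(token_ids) < block_size:
        return None
    return hash((prev_hash, tuple(token_ids)))

# Identical chains yield identical hashes; a partially filled block has none.
assert toy_block_hash(None, [1, 2, 3, 4], 4) == toy_block_hash(None, [1, 2, 3, 4], 4)
assert toy_block_hash(None, [1, 2], 4) is None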
|
||||
|
||||
|
||||
class TestPrefixCachingBlockAllocator:
|
||||
|
||||
@staticmethod
|
||||
def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator,
|
||||
prev_block: Optional[Block],
|
||||
token_ids: List[int]):
|
||||
if allocate_type == "immutable":
|
||||
allocate_block = lambda: allocator.allocate_immutable_block(
|
||||
prev_block=prev_block, token_ids=token_ids)
|
||||
elif allocate_type == "mutable":
|
||||
allocate_block = lambda: allocator.allocate_mutable_block(
|
||||
prev_block=prev_block)
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
return allocate_block
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_allocate_mutable_ooms(num_blocks: int, block_size: int):
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
|
||||
allocate_type="mutable",
|
||||
allocator=allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)),
|
||||
)
|
||||
|
||||
[allocate_block() for _ in range(num_blocks)]
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocate_block()
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_allocate_immutable_does_not_oom_single_hash(
|
||||
num_blocks: int, block_size: int):
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda(
|
||||
allocate_type="immutable",
|
||||
allocator=allocator,
|
||||
prev_block=None,
|
||||
token_ids=list(range(block_size)),
|
||||
)
|
||||
|
||||
blocks = [allocate_block() for _ in range(num_blocks)]
|
||||
|
||||
# Expect no OOM. If these were mutable blocks, this would OOM.
|
||||
non_oom_block = allocate_block()
|
||||
|
||||
# Expect all blocks to have same physical block index.
|
||||
for block in blocks:
|
||||
assert (block.block_id == non_oom_block.block_id)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_allocate_immutable_ooms_many_hash(num_blocks: int,
|
||||
block_size: int):
|
||||
"""Consume all blocks using many different hashes/block content.
|
||||
|
||||
Do this by creating a sequence that is very long.
|
||||
Expect next block to OOM.
|
||||
"""
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
|
||||
# Create token ids that will exhaust all blocks.
|
||||
token_ids = list(range(num_blocks * block_size))
|
||||
|
||||
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# Expect allocation with unseen hash to fail.
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocator.allocate_immutable_block(prev_block=chain[-1],
|
||||
token_ids=list(
|
||||
range(block_size)))
|
||||
|
||||
# Expect mutable allocation to fail.
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocator.allocate_mutable_block(prev_block=chain[-1])
|
||||
|
||||
# Expect allocation of exact same chain to pass.
|
||||
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# Expect physical block indices to be the same in both chains.
|
||||
assert chain and second_chain
|
||||
for first_chain_block, second_chain_block in zip(chain, second_chain):
|
||||
assert (first_chain_block.block_id == second_chain_block.block_id)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1, 1024])
|
||||
@pytest.mark.parametrize("block_size", [1, 16])
|
||||
def test_free_prevents_oom(num_blocks: int, block_size: int):
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
|
||||
# Create token ids that will exhaust all blocks.
|
||||
token_ids = list(range(num_blocks * block_size))
|
||||
|
||||
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# Expect mutable allocation to fail.
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocator.allocate_mutable_block(prev_block=None)
|
||||
|
||||
block_to_free = chain[-1]
|
||||
|
||||
# Expect free/allocate loop to succeed many times.
|
||||
for i in range(100):
|
||||
block_id = block_to_free.block_id
|
||||
allocator.free(block_to_free)
|
||||
assert block_to_free.block_id is None, i
|
||||
|
||||
new_block = allocator.allocate_mutable_block(prev_block=None)
|
||||
assert new_block.block_id == block_id, i
|
||||
|
||||
with pytest.raises(BlockAllocator.NoFreeBlocksError):
|
||||
allocator.allocate_mutable_block(prev_block=None)
|
||||
|
||||
block_to_free = new_block
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int):
|
||||
random.seed(seed)
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
num_blocks_to_consume = random.randint(1, num_blocks - 1)
|
||||
|
||||
# Create token ids that will exhaust all blocks.
|
||||
token_ids = list(range(num_blocks_to_consume * block_size))
|
||||
|
||||
chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# Free each block in chain, assert num free blocks includes new free
|
||||
# block.
|
||||
for i, block in enumerate(chain):
|
||||
assert allocator.get_num_free_blocks() == (num_blocks -
|
||||
num_blocks_to_consume +
|
||||
i)
|
||||
allocator.free(block)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [4])
|
||||
@pytest.mark.parametrize("block_size", [8])
|
||||
def test_prefix_caching_block_get_num_full_blocks_touched(
|
||||
num_blocks, block_size):
|
||||
""" Verify the allocator can correctly return the number of
|
||||
blocks touched, when there are cached prefixes.
|
||||
"""
|
||||
allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
|
||||
# Create token ids that will exhaust all blocks except the last
|
||||
token_ids = list(range((num_blocks - 1) * block_size))
|
||||
|
||||
# Create a chain of cacheable blocks in the dst
|
||||
cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator_dst,
|
||||
)
|
||||
|
||||
# Create a chain of the same blocks in the src
|
||||
blocks_to_swap_in = \
|
||||
TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator_src,
|
||||
)
|
||||
# All blocks are already cached in the dst, so no blocks are touched.
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
blocks_to_swap_in) == 0
|
||||
|
||||
# Free the first block in the dst
|
||||
allocator_dst.free(cached_blocks[0])
|
||||
|
||||
# Now the first block becomes dangling, the swapped blocks need
|
||||
# to reclaim the first block in the dst
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
blocks_to_swap_in) == 1
|
||||
|
||||
# Insert one non-full block in the src
|
||||
non_full_block = allocator_src.allocate_mutable_block(
|
||||
blocks_to_swap_in[-1])
|
||||
non_full_block.append_token_ids([0])
|
||||
blocks_to_swap_in.append(non_full_block)
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
blocks_to_swap_in) == 1
|
||||
# Fill up the last mutable block and invoke get_num_full_blocks_touched.
|
||||
# Note: The last block is not cached so it will be touched.
|
||||
non_full_block.append_token_ids([0] * (block_size - 1))
|
||||
assert allocator_dst.get_num_full_blocks_touched(
|
||||
blocks_to_swap_in) == 2
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
def test_get_num_free_blocks_shared(num_blocks: int, block_size: int,
|
||||
seed: int):
|
||||
"""Verify sharing occurs by allocating two sequences that share prefixes
|
||||
and incrementally freeing blocks.
|
||||
"""
|
||||
random.seed(seed)
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
num_blocks_to_consume = random.randint(1, num_blocks - 1)
|
||||
|
||||
# Create token ids that will exhaust all blocks.
|
||||
token_ids = list(range(num_blocks_to_consume * block_size))
|
||||
|
||||
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# Free each block in the first chain. Since all blocks are shared, the
|
||||
# free count should stay constant.
|
||||
for i, block in enumerate(first_chain):
|
||||
assert allocator.get_num_free_blocks() == (num_blocks -
|
||||
num_blocks_to_consume)
|
||||
allocator.free(block)
|
||||
|
||||
# Free each block in the second chain. Since the refcount is now zero,
|
||||
# the free count should increment with each free.
|
||||
for i, block in enumerate(second_chain):
|
||||
assert allocator.get_num_free_blocks() == (num_blocks -
|
||||
num_blocks_to_consume +
|
||||
i)
|
||||
allocator.free(block)
|
||||
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
def test_get_common_computed_block_ids(num_blocks: int, block_size: int,
|
||||
seed: int):
|
||||
"""Verify get_common_computed_block_ids could get correct result
|
||||
by create two immutable chain sharing prefix at specified pos,
|
||||
and compare whether we also could get right result
|
||||
from get_common_computed_block_ids.
|
||||
"""
|
||||
random.seed(seed)
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2,
|
||||
block_size=block_size)
|
||||
num_blocks_to_consume = random.randint(1, num_blocks - 1)
|
||||
|
||||
# Create token ids that will exhaust all blocks.
|
||||
token_ids = list(range(num_blocks_to_consume * block_size))
|
||||
|
||||
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# After zero_point, second_chain's token_ids are set to -1, which
# makes them differ from first_chain's token_ids.
|
||||
zero_point = random.randint(1, len(token_ids) - 1)
|
||||
zero_point_blocks = zero_point // block_size
|
||||
token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point)
|
||||
|
||||
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
first_computed_ids = [
|
||||
first_chain[i].block_id for i in range(num_blocks_to_consume)
|
||||
]
|
||||
second_computed_ids = [
|
||||
second_chain[i].block_id for i in range(num_blocks_to_consume)
|
||||
]
|
||||
res = allocator.get_common_computed_block_ids(
|
||||
[first_computed_ids, second_computed_ids])
|
||||
|
||||
assert (len(res) == zero_point_blocks)
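# Aside: the expected length is just the shared prefix measured in whole
# blocks. Tokens diverge at zero_point, so only zero_point // block_size
# complete blocks are common. A standalone sketch with concrete numbers:
example_block_size = 16
example_zero_point = 40            # tokens identical up to index 40
common_full_blocks = example_zero_point // example_block_size
assert common_full_blocks == 2     # blocks 0 and 1 are shared; block 2 diverges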
|
||||
|
||||
# Test case assuming that a promoted block whose content matches an
# existing immutable block is released back into the hashless allocator,
# while the first immutable block's refcount is increased.
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [3])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(10)))
|
||||
def test_alloc_promotion(num_blocks: int, block_size: int, seed: int):
|
||||
random.seed(seed)
|
||||
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
token_ids = list(range(block_size))
|
||||
|
||||
block = allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids)
|
||||
|
||||
assert allocator._refcounter.get(block.block_id) == 1
|
||||
m = allocator.allocate_mutable_block(prev_block=None)
|
||||
|
||||
block_id = m.block_id
|
||||
for i in range(block_size):
|
||||
m.append_token_ids([i])
|
||||
|
||||
# After the mutable block is promoted to immutable, if a block with the
# same content hash already exists, the promoted block's id is released
# back into the hashless allocator and the first immutable block's
# refcount is increased by 1.
|
||||
assert m.block_id == block.block_id
|
||||
assert block_id in allocator._hashless_allocator._free_block_indices
|
||||
assert allocator._refcounter.get(block.block_id) == 2
|
||||
|
||||
# Test case when eviction and allocation are mixed,
|
||||
# make sure they work as expected
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [3])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(10)))
|
||||
def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int):
|
||||
random.seed(seed)
|
||||
|
||||
all_blocks_list = [i for i in range(num_blocks)]
|
||||
zero_ref = {i: 0 for i in range(num_blocks)}
|
||||
one_ref = {i: 1 for i in range(num_blocks)}
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
token_ids = list(range(num_blocks * block_size))
|
||||
|
||||
# Verify initial/pre-alloc state
|
||||
|
||||
# Ensure all blocks are free inside hashless allocator
|
||||
assert list(allocator._hashless_allocator._free_block_indices
|
||||
) == all_blocks_list
|
||||
# Ensure no tracked blocks
|
||||
assert len(allocator._block_tracker.keys()) == num_blocks
|
||||
for block_id in range(num_blocks):
|
||||
assert not allocator._block_tracker[block_id].active
|
||||
# Ensure no cached blocks
|
||||
assert len(allocator._cached_blocks.values()) == 0
|
||||
# Ensure no evicted blocks
|
||||
assert len(allocator.evictor.free_table.keys()) == 0
|
||||
# Ensure 0s ref counts for all blocks
|
||||
assert allocator._refcounter._refcounts == zero_ref
|
||||
|
||||
# Allocate immutable chains, each consisting of a single block
|
||||
new_block = []
|
||||
for i in range(num_blocks):
|
||||
block = allocator.allocate_immutable_block(
|
||||
prev_block=None,
|
||||
token_ids=token_ids[block_size * i:block_size * (i + 1)])
|
||||
new_block.append(block)
|
||||
|
||||
# Verify post-alloc state
|
||||
|
||||
# Ensure no blocks are free inside hashless allocator
|
||||
assert (len(allocator._hashless_allocator._free_block_indices) == 0)
|
||||
# Ensure all blocks are tracked
|
||||
assert len(allocator._block_tracker.keys()) == num_blocks
|
||||
for block_id in range(num_blocks):
|
||||
assert allocator._block_tracker[block_id].active
|
||||
# Ensure all blocks are cached (all promoted)
|
||||
assert len(allocator._cached_blocks.values()) == num_blocks
|
||||
# Ensure no evicted blocks
|
||||
assert len(allocator.evictor.free_table.keys()) == 0
|
||||
# Ensure 1s ref counts for all blocks
|
||||
assert allocator._refcounter._refcounts == one_ref
|
||||
|
||||
# Free all blocks. Afterwards every block should sit in the evictor,
# no block should be active in _block_tracker, all blocks should still
# be tracked in _cached_blocks, and every refcount should be zero.
|
||||
for block in new_block:
|
||||
allocator.free(block)
|
||||
|
||||
# Verify post-free state
|
||||
|
||||
# Ensure no tracked blocks
|
||||
assert len(allocator._block_tracker.keys()) == num_blocks
|
||||
for block_id in range(num_blocks):
|
||||
assert not allocator._block_tracker[block_id].active
|
||||
# Ensure no blocks in hashless allocator (all promoted)
|
||||
assert len(allocator._hashless_allocator._free_block_indices) == 0
|
||||
# Ensure all blocks are cached
|
||||
assert list(allocator._cached_blocks.values()) == all_blocks_list
|
||||
# Ensure all blocks are inside the evictor
|
||||
assert list(allocator.evictor.free_table.keys()) == all_blocks_list
|
||||
# Ensure 0s refcounts
|
||||
assert allocator._refcounter._refcounts == zero_ref
|
||||
|
||||
# Allocate a mutable block; the first block should be evicted, its
# content hash reset to None, and its refcount set to 1.
|
||||
mutable = allocator.allocate_mutable_block(prev_block=None)
|
||||
|
||||
assert mutable.block_id == 0
|
||||
assert mutable.content_hash is None
|
||||
assert allocator._block_tracker[0].active
|
||||
assert allocator._refcounter.get(0) == 1
|
||||
assert 0 not in allocator._cached_blocks
|
||||
assert 0 not in allocator.evictor
|
||||
|
||||
# Since this mutable block has no hash yet, it shall be released into
|
||||
# hashless allocator
|
||||
allocator.free(mutable)
|
||||
|
||||
assert not allocator._block_tracker[0].active
|
||||
assert allocator._refcounter._refcounts == zero_ref
|
||||
assert 0 not in allocator._cached_blocks
|
||||
assert 0 not in allocator.evictor
|
||||
assert 0 in allocator._hashless_allocator._free_block_indices
|
||||
|
||||
# When allocating an immutable block with the first block_size tokens,
# the free block comes from the hashless allocator, leaving no blocks
# there.
|
||||
block = allocator.allocate_immutable_block(
|
||||
prev_block=None, token_ids=token_ids[:block_size])
|
||||
|
||||
assert block.block_id == 0
|
||||
assert len(allocator._hashless_allocator._free_block_indices) == 0
|
||||
assert allocator._block_tracker[0].active
|
||||
assert 0 in allocator._cached_blocks.values()
|
||||
assert allocator._refcounter.get(0) == 1
|
||||
assert 0 not in allocator.evictor
|
||||
|
||||
# Allocate a mutable block again; it should be popped from the evictor.
|
||||
mutable = allocator.allocate_mutable_block(prev_block=None)
|
||||
assert len(allocator._hashless_allocator._free_block_indices) == 0
|
||||
assert mutable.block_id not in allocator.evictor.free_table
|
||||
assert allocator._refcounter.get(mutable.block_id) == 1
|
||||
|
||||
# Test case where two last accessed times are equal
|
||||
@staticmethod
|
||||
@pytest.mark.parametrize("num_blocks", [1024])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
@pytest.mark.parametrize("seed", list(range(20)))
|
||||
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
|
||||
"""This test case simulate the two chain created and free in order,
|
||||
and together they would exhaust the initial freed blocks.
|
||||
|
||||
So the next block created after those two chain shall use the block
|
||||
from the first chain as that block has long access time.
|
||||
While first chain has two blocks, it shall pick up the last one, as
|
||||
it has larger token number.
|
||||
"""
|
||||
|
||||
random.seed(seed)
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
|
||||
block_size=block_size)
|
||||
num_blocks_to_consume = num_blocks + 1
|
||||
|
||||
token_ids = list(range(num_blocks_to_consume * block_size))
|
||||
|
||||
num_blocks_in_first_chain = 2
|
||||
num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
|
||||
# First chain takes the first block
|
||||
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids[:num_tokens_in_first_chain],
|
||||
allocator=allocator,
|
||||
)
|
||||
# Only the first chain's blocks should be allocated at this point
|
||||
assert allocator.get_num_free_blocks() == (num_blocks -
|
||||
num_blocks_in_first_chain)
|
||||
|
||||
# Set the last accessed time of the first block to 1
|
||||
blocks_ids = [block.block_id for block in first_chain]
|
||||
allocator.mark_blocks_as_accessed(blocks_ids, 1)
|
||||
|
||||
# Second chain takes the rest of the blocks
|
||||
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids[num_tokens_in_first_chain:-block_size],
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
# There shouldn't be any blocks left at this point
|
||||
assert allocator.get_num_free_blocks() == (0)
|
||||
|
||||
assert len(first_chain) == num_blocks_in_first_chain
|
||||
last_block_id = first_chain[-1].block_id
|
||||
# Free each block in the first chain.
|
||||
for i, block in enumerate(first_chain):
|
||||
allocator.free(block)
|
||||
|
||||
# Set the last accessed time on all of the blocks in the second chain
|
||||
# to 2
|
||||
blocks_ids = [block.block_id for block in second_chain]
|
||||
allocator.mark_blocks_as_accessed(blocks_ids, 2)
|
||||
|
||||
# Free each block in the second chain.
|
||||
for i, block in enumerate(second_chain):
|
||||
allocator.free(block)
|
||||
|
||||
# Allocate a new block and check that it's the least recently used block
|
||||
# from the first chain.
|
||||
new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=token_ids[-block_size:],
|
||||
allocator=allocator,
|
||||
)
|
||||
|
||||
assert new_block[0].block_id == last_block_id
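# Aside: the expected victim can be reasoned about without the allocator.
# The sketch below is hypothetical (it is not the evictor's code); it only
# models "older access time evicts first, ties broken by preferring the
# block covering more tokens", which is what this test relies on.
candidates = [
    (1, 16, "first_chain_block_0"),
    (1, 32, "first_chain_block_1"),
    (2, 16, "second_chain_block"),
]  # (last_accessed, num_hashed_tokens, label)
victim = min(candidates, key=lambda c: (c[0], -c[1]))
assert victim[2] == "first_chain_block_1"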
|
||||
|
||||
# Test case for cache metrics
|
||||
@staticmethod
|
||||
def test_metric():
|
||||
block_size = 16
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=4,
|
||||
block_size=block_size)
|
||||
# Test when no query (0/0)
|
||||
assert allocator.get_prefix_cache_hit_rate() == 0.0
|
||||
|
||||
token_ids = list(range(block_size))
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids)
|
||||
# Test 0/1 hit rate
|
||||
assert allocator.get_prefix_cache_hit_rate() == 0.0
|
||||
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids)
|
||||
# Test 1/2 hit rate
|
||||
assert allocator.get_prefix_cache_hit_rate() == 0.5
|
||||
|
||||
# Test more than one block
|
||||
for _ in range(2, 1005):
|
||||
allocator.allocate_immutable_block(prev_block=None,
|
||||
token_ids=token_ids)
|
||||
assert allocator.get_prefix_cache_hit_rate() > 0.99
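# Aside: the 0.99 bound follows from simple counting. The very first
# allocation of the block is a miss and every later allocation of the same
# tokens is a hit, so after the 1005 queries issued above the rate is
# 1004/1005. A standalone sketch of that arithmetic:
example_queries = 1005
example_hits = example_queries - 1      # only the first query misses
assert example_hits / example_queries > 0.99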
|
||||
|
||||
# Test case for marking cache hit blocks as computed right after
|
||||
# a batch of prefill sequences are scheduled.
|
||||
@staticmethod
|
||||
def test_touch_block():
|
||||
block_size = 16
|
||||
common_blocks = 4
|
||||
allocator = PrefixCachingBlockAllocator(num_blocks=8,
|
||||
block_size=block_size)
|
||||
|
||||
common_token_ids = list(range(block_size * common_blocks))
|
||||
|
||||
# Mimic the behavior of allocating the same block chain
|
||||
# (i.e., common prefix) for a batch of 3 different prefill sequences.
|
||||
for _ in range(3):
|
||||
blocks = TestPrefixCachingBlockAllocator.create_immutable_chain(
|
||||
block_size=block_size,
|
||||
token_ids=common_token_ids,
|
||||
allocator=allocator,
|
||||
)
|
||||
block_ids = [block.block_id for block in blocks]
|
||||
# The allocated blocks should be marked as touched
|
||||
# but not computed.
|
||||
computed_block_ids = allocator.get_computed_block_ids(
|
||||
[], block_ids, skip_last_block_id=False)
|
||||
assert len(computed_block_ids) == 0
|
||||
|
||||
allocator.mark_blocks_as_computed([])
|
||||
computed_block_ids = allocator.get_computed_block_ids(
|
||||
[], block_ids, skip_last_block_id=False)
|
||||
assert len(computed_block_ids) == common_blocks
|
||||
|
||||
@staticmethod
|
||||
def create_immutable_chain(
|
||||
block_size: int,
|
||||
token_ids: List[int],
|
||||
allocator: PrefixCachingBlockAllocator,
|
||||
) -> List[PrefixCachingBlock]:
|
||||
"""Helper method which creates a chain of blocks.
|
||||
"""
|
||||
blocks: List[Block] = []
|
||||
num_blocks = math.ceil(len(token_ids) / block_size)
|
||||
|
||||
if num_blocks == 0:
|
||||
return []
|
||||
|
||||
prev_block = None
|
||||
for block_number in range(0, num_blocks):
|
||||
block_token_ids = token_ids[block_number *
|
||||
block_size:(block_number + 1) *
|
||||
block_size]
|
||||
prev_block = allocator.allocate_immutable_block(
|
||||
prev_block=prev_block, token_ids=block_token_ids)
|
||||
blocks.append(prev_block)
|
||||
|
||||
return blocks
|
||||
509
vllm-v0.6.2/tests/core/test_chunked_prefill_scheduler.py
Normal file
@@ -0,0 +1,509 @@
|
||||
from typing import List
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import Logprob, SequenceGroup
|
||||
|
||||
from .utils import create_dummy_prompt
|
||||
|
||||
|
||||
def get_sequence_groups(scheduler_output):
|
||||
return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
|
||||
|
||||
|
||||
def append_new_token(seq_group, token_id: int):
|
||||
for seq in seq_group.get_seqs():
|
||||
seq.append_token_id(token_id, {token_id: Logprob(token_id)})
|
||||
|
||||
|
||||
def schedule_and_update_computed_tokens(scheduler):
|
||||
metas, out, _ = scheduler.schedule()
|
||||
for s, meta in zip(out.scheduled_seq_groups, metas):
|
||||
s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
|
||||
return metas, out
|
||||
|
||||
|
||||
def test_simple():
|
||||
"""Verify basic scheduling works."""
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_model_len = 16
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig("generate",
|
||||
max_num_batched_tokens,
|
||||
num_seq_group,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(num_seq_group):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=block_size,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Schedule seq groups prompts.
|
||||
num_tokens = block_size * num_seq_group
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert out.num_batched_tokens == num_tokens
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == num_seq_group
|
||||
for s in running:
|
||||
append_new_token(s, 1)
|
||||
|
||||
# Schedule seq groups generation.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert out.num_batched_tokens == num_seq_group
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == num_seq_group
|
||||
|
||||
|
||||
def test_chunk():
|
||||
"""Verify prefills are chunked properly."""
|
||||
block_size = 4
|
||||
max_seqs = 60
|
||||
max_model_len = 80
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 32
|
||||
cache_config.num_gpu_blocks = 32
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Verify the second request is chunked.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
print()
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert seq_group_meta[0].token_chunk_size == 60
|
||||
# Verify it is chunked.
|
||||
assert seq_group_meta[1].token_chunk_size == 4
|
||||
assert out.num_prefill_groups == 2
|
||||
assert out.num_batched_tokens == 64
|
||||
# Only the first seq group has a new token appended.
|
||||
append_new_token(running[0], 1)
|
||||
|
||||
# One chunked prefill, and one decoding.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# The first one is prefill. Scheduler guarantees ordering.
|
||||
assert seq_group_meta[0].token_chunk_size == 56
|
||||
# The second one is a decode.
|
||||
assert seq_group_meta[1].token_chunk_size == 1
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == 57
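# Aside: the chunk sizes asserted above follow directly from the 64-token
# budget. A standalone sketch of that budgeting arithmetic:
budget = 64
prompt_a = prompt_b = 60
# Step 1: prompt A fits entirely; prompt B gets the leftover budget.
chunk_a = min(prompt_a, budget)
chunk_b = min(prompt_b, budget - chunk_a)
assert (chunk_a, chunk_b) == (60, 4)
# Step 2: B finishes its prefill (56 tokens) while A decodes one token.
assert (prompt_b - chunk_b) + 1 == 57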
|
||||
|
||||
|
||||
def test_complex():
|
||||
block_size = 4
|
||||
max_seqs = 60
|
||||
max_model_len = 80
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 64
|
||||
cache_config.num_gpu_blocks = 64
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
assert seq_group.is_prefill()
|
||||
|
||||
# Verify the second request is chunked.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert seq_group_meta[0].token_chunk_size == 60
|
||||
# Verify it is chunked.
|
||||
assert seq_group_meta[1].token_chunk_size == 4
|
||||
assert not running[0].is_prefill()
|
||||
assert running[1].is_prefill()
|
||||
assert out.num_prefill_groups == 2
|
||||
assert out.num_batched_tokens == 64
|
||||
# Only the first seq group has a new token appended.
|
||||
append_new_token(running[0], 1)
|
||||
|
||||
# Add 2 more requests.
|
||||
for i in range(2, 4):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Decoding & chunked prefill & first chunk of 3rd request is scheduled.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 3
|
||||
# The first one is the first chunked prefill.
|
||||
assert seq_group_meta[0].token_chunk_size == 7
|
||||
# The second one is the second new chunked prefill.
|
||||
assert seq_group_meta[1].token_chunk_size == 56
|
||||
# The last one is decode.
|
||||
assert seq_group_meta[2].token_chunk_size == 1
|
||||
# Two of them are in chunked prefill.
|
||||
assert out.num_prefill_groups == 2
|
||||
assert out.num_batched_tokens == 64
|
||||
# The first 2 requests are now in the decoding phase.
|
||||
append_new_token(running[0], 1)
|
||||
assert not running[0].is_prefill()
|
||||
append_new_token(running[1], 1)
|
||||
assert not running[1].is_prefill()
|
||||
# The third request is still in prefill stage.
|
||||
assert running[2].is_prefill()
|
||||
|
||||
|
||||
def test_maximal_decoding():
|
||||
"""Verify decoding requests are prioritized."""
|
||||
block_size = 4
|
||||
max_seqs = 2
|
||||
max_model_len = 8
|
||||
max_num_batched_tokens = 2
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=2,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
assert seq_group.is_prefill()
|
||||
|
||||
# The first prefill is scheduled.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 1
|
||||
assert seq_group_meta[0].token_chunk_size == 2
|
||||
assert not running[0].is_prefill()
|
||||
assert running[1].is_prefill()
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == 2
|
||||
# Only the first seq group has a new token appended.
|
||||
append_new_token(running[0], 1)
|
||||
|
||||
# Create one more seq_group.
|
||||
_, seq_group = create_dummy_prompt("3",
|
||||
prompt_length=2,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
assert seq_group.is_prefill()
|
||||
# The first decoding + second chunk is scheduled.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 2
|
||||
assert seq_group_meta[0].token_chunk_size == 1
|
||||
assert seq_group_meta[1].token_chunk_size == 1
|
||||
assert not running[0].is_prefill()
|
||||
assert running[1].is_prefill()
|
||||
assert running[2].is_prefill()
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == 2
|
||||
append_new_token(running[0], 1)
|
||||
|
||||
# Decoding + running prefill is prioritized.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 2
|
||||
assert seq_group_meta[0].token_chunk_size == 1
|
||||
assert seq_group_meta[1].token_chunk_size == 1
|
||||
assert not running[0].is_prefill()
|
||||
assert not running[1].is_prefill()
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == 2
|
||||
append_new_token(running[0], 1)
|
||||
append_new_token(running[1], 1)
|
||||
|
||||
# Only decoding is prioritized.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 2
|
||||
assert seq_group_meta[0].token_chunk_size == 1
|
||||
assert seq_group_meta[1].token_chunk_size == 1
|
||||
assert not running[0].is_prefill()
|
||||
assert not running[1].is_prefill()
|
||||
assert out.num_prefill_groups == 0
|
||||
assert out.num_batched_tokens == 2
|
||||
append_new_token(running[0], 1)
|
||||
append_new_token(running[1], 1)
|
||||
|
||||
# After aborting the decoding request, the fcfs new prefill is prioritized.
|
||||
scheduler.abort_seq_group(running[0].request_id)
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 2
|
||||
assert seq_group_meta[0].token_chunk_size == 1
|
||||
assert seq_group_meta[1].token_chunk_size == 1
|
||||
assert not running[1].is_prefill()
|
||||
assert running[2].is_prefill()
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == 2
|
||||
|
||||
|
||||
def test_prompt_limit():
|
||||
"""Verify max_num_batched_tokens < max_model_len is possible."""
|
||||
block_size = 4
|
||||
max_seqs = 32
|
||||
max_model_len = 64
|
||||
max_num_batched_tokens = 32
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
_, seq_group = create_dummy_prompt("1",
|
||||
prompt_length=48,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
assert seq_group.is_prefill()
|
||||
|
||||
# A prompt longer than max_num_batched_tokens should still be scheduled (chunked).
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(get_sequence_groups(out)) == 1
|
||||
assert seq_group_meta[0].token_chunk_size == 32
|
||||
assert running[0].is_prefill()
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == 32
|
||||
|
||||
|
||||
def test_prompt_limit_exceed():
|
||||
block_size = 4
|
||||
max_seqs = 64
|
||||
max_model_len = 32
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig("generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
_, seq_group = create_dummy_prompt("2",
|
||||
prompt_length=48,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
assert seq_group.is_prefill()
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(out.ignored_seq_groups) == 1
|
||||
assert out.ignored_seq_groups[0] == seq_group
|
||||
|
||||
|
||||
def test_chunked_prefill_preempt():
|
||||
"""Verify preempt works with chunked prefill requests"""
|
||||
block_size = 4
|
||||
max_seqs = 30
|
||||
max_model_len = 200
|
||||
max_num_batched_tokens = 30
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
_, seq_group = create_dummy_prompt("1",
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# The request is chunked; its first prefill chunk is scheduled now.
|
||||
assert len(out.scheduled_seq_groups) == 1
|
||||
assert out.num_prefill_groups == 1
|
||||
assert seq_group.is_prefill()
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
|
||||
# The request should be preempted.
|
||||
scheduler.block_manager.can_append_slots = MagicMock()
|
||||
|
||||
def cannot_append_second_group1(seq_group, num_lookahead_slots):
|
||||
return seq_group.request_id != "1"
|
||||
|
||||
scheduler.block_manager.can_append_slots.side_effect = (
|
||||
cannot_append_second_group1)
|
||||
|
||||
# The running prefill is now preempted.
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(out.scheduled_seq_groups) == 0
|
||||
assert out.num_batched_tokens == 0
|
||||
assert out.blocks_to_swap_out == []
|
||||
assert out.blocks_to_swap_in == []
|
||||
|
||||
# Make sure we can reschedule preempted request.
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(out.scheduled_seq_groups) == 1
|
||||
assert out.num_prefill_groups == 1
|
||||
assert seq_group.is_prefill()
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
assert seq_group.get_num_uncomputed_tokens() == 30
|
||||
|
||||
# We should be able to run prefill twice as it is chunked.
|
||||
def cannot_append_second_group2(seq_group, num_lookahead_slots):
|
||||
return True
|
||||
|
||||
scheduler.block_manager.can_append_slots.side_effect = (
|
||||
cannot_append_second_group2)
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert len(out.scheduled_seq_groups) == 1
|
||||
assert out.num_prefill_groups == 1
|
||||
assert not seq_group.is_prefill()
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
|
||||
|
||||
def test_chunked_prefill_max_seqs():
|
||||
block_size = 4
|
||||
max_seqs = 2
|
||||
max_model_len = 80
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 128
|
||||
cache_config.num_gpu_blocks = 128
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
_, seq_group = create_dummy_prompt("1",
|
||||
prompt_length=65,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
# The first prefill is chunked.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert seq_group_meta[0].token_chunk_size == max_num_batched_tokens
|
||||
assert len(get_sequence_groups(out)) == 1
|
||||
|
||||
# Add new requests.
|
||||
for i in range(4):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=65,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Make sure only 2 requests are scheduled.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
assert len(get_sequence_groups(out)) == 2
|
||||
assert not running[0].is_prefill()
|
||||
assert running[1].is_prefill()
|
||||
append_new_token(running[0], 1)
|
||||
|
||||
# Although we have enough token budget, we can only schedule max_seqs.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert seq_group_meta[0].token_chunk_size == 2
|
||||
assert seq_group_meta[1].token_chunk_size == 1
|
||||
assert out.num_batched_tokens == 3
|
||||
assert len(get_sequence_groups(out)) == max_seqs
|
||||
assert not running[0].is_prefill()
|
||||
assert not running[1].is_prefill()
|
||||
|
||||
|
||||
def test_prefix_caching():
|
||||
"""Verify allocating full blocks when prefix caching is enabled."""
|
||||
block_size = 4
|
||||
max_seqs = 10
|
||||
max_model_len = 80
|
||||
max_num_batched_tokens = 64
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens,
|
||||
max_seqs,
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
cache_config = CacheConfig(block_size,
|
||||
1.0,
|
||||
1,
|
||||
"auto",
|
||||
enable_prefix_caching=True)
|
||||
cache_config.num_cpu_blocks = 0
|
||||
cache_config.num_gpu_blocks = 32
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
block_size=block_size,
|
||||
prompt_length=50)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert seq_group_meta[0].token_chunk_size == 50
|
||||
# Verify it is chunked. Note that although the budget is 64-50=14,
|
||||
# we only allocate full blocks for prefix caching, so only 4*(14//4)=12
|
||||
# tokens are allocated.
|
||||
assert seq_group_meta[1].token_chunk_size == 12
|
||||
assert out.num_prefill_groups == 2
|
||||
assert out.num_batched_tokens == 62
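# Aside: the 62 batched tokens come from rounding the second request's
# chunk down to whole blocks, as noted in the comment above. Using only
# the numbers from this test:
example_block_size = 4
example_budget = 64
first_chunk = 50
leftover = example_budget - first_chunk                                # 14
second_chunk = example_block_size * (leftover // example_block_size)  # 12
assert first_chunk + second_chunk == 62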
|
||||
80
vllm-v0.6.2/tests/core/test_num_computed_tokens_update.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import pytest
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.core.utils import create_dummy_prompt
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
MODEL = "JackFram/llama-160m"
|
||||
|
||||
|
||||
def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
|
||||
scheduler = engine.scheduler[0]
|
||||
scheduler.add_seq_group(seq_group)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
|
||||
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
|
||||
@pytest.mark.parametrize("enforce_eager", [False, True])
|
||||
def test_num_computed_tokens_update(num_scheduler_steps: int,
|
||||
enable_chunked_prefill: bool,
|
||||
enforce_eager: bool):
|
||||
|
||||
is_multi_step = num_scheduler_steps > 1
|
||||
is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill
|
||||
|
||||
if is_multi_step_chunked_prefill and current_platform.is_rocm():
|
||||
pytest.skip("Multi-step with Chunked-Prefill does not support "
|
||||
"rocm_flash_attn backend")
|
||||
|
||||
# Make a vllm engine
|
||||
runner = VllmRunner(model_name=MODEL,
|
||||
gpu_memory_utilization=0.3,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
enforce_eager=enforce_eager)
|
||||
engine: LLMEngine = runner.model.llm_engine
|
||||
|
||||
# In multi-step + chunked-prefill there is no separate single prompt step.
|
||||
# What is scheduled will run for num_scheduler_steps always.
|
||||
num_prompt_steps = num_scheduler_steps \
|
||||
if is_multi_step_chunked_prefill else 1
|
||||
|
||||
num_output_tokens_list = [4, 8, 12, 15, 16, 17]
|
||||
|
||||
# Create sequence and add to engine
|
||||
prompt_len = 10
|
||||
|
||||
for req_idx, num_output_tokens in enumerate(num_output_tokens_list):
|
||||
seq, seq_group = create_dummy_prompt(request_id=str(req_idx),
|
||||
prompt_length=prompt_len,
|
||||
min_tokens=num_output_tokens,
|
||||
max_tokens=num_output_tokens)
|
||||
add_seq_group_to_engine(engine, seq_group)
|
||||
|
||||
assert seq.data.get_num_computed_tokens() == 0
|
||||
|
||||
for _ in range(num_prompt_steps):
|
||||
# prompt steps
|
||||
engine.step()
|
||||
|
||||
if not seq.is_finished():
|
||||
prompt_num_computed_tokens = seq.data.get_num_computed_tokens()
|
||||
# Test correctness of num_computed_tokens after the prompt steps
|
||||
assert prompt_num_computed_tokens == \
|
||||
prompt_len + num_prompt_steps - 1
|
||||
|
||||
decode_step_counter = 0
|
||||
while not seq.is_finished():
|
||||
# Test correctness of num_computed_tokens after the decode steps
|
||||
assert seq.data.get_num_computed_tokens(
|
||||
) == prompt_num_computed_tokens + decode_step_counter
|
||||
for _ in range(num_scheduler_steps):
|
||||
# decode step
|
||||
engine.step()
|
||||
decode_step_counter += 1
|
||||
|
||||
# Test correctness of num_computed_tokens after the sequence finish.
|
||||
assert seq.data.get_num_computed_tokens(
|
||||
) == prompt_len + num_output_tokens - 1
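# Aside: the closing assertion encodes the invariant that the final
# sampled token is never fed back through the model, so it is never
# counted as computed. A standalone sketch with this test's numbers:
example_prompt_len = 10
example_output_tokens = 4
computed = example_prompt_len               # the whole prompt is computed
for _ in range(example_output_tokens - 1):  # each decode step computes the
    computed += 1                           # previously appended token
assert computed == example_prompt_len + example_output_tokens - 1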
|
||||
802
vllm-v0.6.2/tests/core/test_scheduler.py
Normal file
@@ -0,0 +1,802 @@
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import List, Set, Tuple
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest # noqa
|
||||
from torch import Use # noqa
|
||||
|
||||
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
||||
from vllm.core.interfaces import AllocStatus
|
||||
from vllm.core.scheduler import Scheduler, SchedulingBudget
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
from .utils import (append_new_token, append_new_token_seq_group,
|
||||
create_dummy_prompt, get_sequence_groups,
|
||||
schedule_and_update_computed_tokens)
|
||||
|
||||
|
||||
def test_scheduler_add_seq_group():
|
||||
block_size = 4
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=100,
|
||||
max_num_seqs=64,
|
||||
max_model_len=1,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
|
||||
cache_config.num_cpu_blocks = 4
|
||||
cache_config.num_gpu_blocks = 4
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
# Add seq group to scheduler.
|
||||
num_seq_group = 4
|
||||
for i in range(num_seq_group):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
block_size,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
assert scheduler.get_num_unfinished_seq_groups() == i + 1
|
||||
|
||||
|
||||
def test_scheduler_abort_seq_group():
|
||||
block_size = 4
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=100,
|
||||
max_num_seqs=64,
|
||||
max_model_len=1,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 4
|
||||
cache_config.num_gpu_blocks = 4
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
# Add multiple seq groups to scheduler.
|
||||
num_seq_group = 4
|
||||
request_ids: Set[str] = set()
|
||||
for i in range(num_seq_group):
|
||||
_, seq_group = create_dummy_prompt(str(i), block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
request_ids.add(str(i))
|
||||
|
||||
# Abort all added seq groups.
|
||||
assert scheduler.get_num_unfinished_seq_groups() == num_seq_group
|
||||
scheduler.abort_seq_group(request_ids)
|
||||
assert scheduler.get_num_unfinished_seq_groups() == 0
|
||||
|
||||
|
||||
def test_scheduler_schedule_simple():
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_model_len = 16
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=num_seq_group,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(num_seq_group):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=block_size,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Schedule seq groups prompts.
|
||||
num_tokens = block_size * num_seq_group
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert out.num_batched_tokens == num_tokens
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Schedule seq groups generation.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
assert out.num_batched_tokens == num_seq_group
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
|
||||
def test_scheduler_prefill_prioritized():
|
||||
"""Verify running batched tokens are not applied to prefill requests."""
|
||||
block_size = 4
|
||||
max_model_len = 30
|
||||
max_batched_num_tokens = 30
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=max_batched_num_tokens,
|
||||
max_num_seqs=2,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
cache_config.num_gpu_blocks = 16
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
_, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group_a)
|
||||
|
||||
# Schedule seq groups prompts.
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert get_sequence_groups(out) == [seq_group_a]
|
||||
|
||||
# Add a new prefill request B.
|
||||
_, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group_b)
|
||||
|
||||
# Verify prefill requests are prioritized. Since max_batched_num_tokens
|
||||
# is 30, the new 30-token prefill request has to be scheduled first.
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert get_sequence_groups(out) == [seq_group_b]
|
||||
|
||||
|
||||
def test_scheduler_schedule_preempt_abort():
|
||||
block_size = 4
|
||||
max_model_len = 16
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=2,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 2
|
||||
cache_config.num_gpu_blocks = 2
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
seq_a, seq_group_a = create_dummy_prompt("1",
|
||||
block_size,
|
||||
block_size=block_size)
|
||||
seq_b, seq_group_b = create_dummy_prompt("2",
|
||||
block_size,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group_a)
|
||||
scheduler.add_seq_group(seq_group_b)
|
||||
|
||||
# Schedule seq groups prompts.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
|
||||
assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == 2
|
||||
assert scheduler.get_num_unfinished_seq_groups() == 2
|
||||
|
||||
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
|
||||
# processed.
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Schedule seq groups generation and preempt seq group b.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert get_sequence_groups(out) == [seq_group_a]
|
||||
assert out.num_batched_tokens == 1
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == 1
|
||||
assert scheduler.get_num_unfinished_seq_groups() == 2
|
||||
assert out.preempted == 1
|
||||
|
||||
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
|
||||
scheduler.abort_seq_group("1")
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert get_sequence_groups(out) == [seq_group_b]
|
||||
assert out.num_batched_tokens == 5 # 4 prompt + 1 generation.
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
assert len(seq_group_meta) == 1
|
||||
assert scheduler.get_num_unfinished_seq_groups() == 1
|
||||
|
||||
|
||||
def test_scheduler_max_seqs():
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_seq_group = 2
|
||||
max_model_len = 16
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=max_seq_group,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
all_seq_groups: List[SequenceGroup] = []
|
||||
# Add seq groups to scheduler.
|
||||
for i in range(num_seq_group):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=block_size,
|
||||
block_size=block_size)
|
||||
all_seq_groups.append(seq_group)
|
||||
|
||||
# Append 1 seq group
|
||||
scheduler.add_seq_group(all_seq_groups[0])
|
||||
|
||||
# Schedule seq groups prompts.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Schedule seq groups generation.
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Append 2 more seq group
|
||||
scheduler.add_seq_group(all_seq_groups[1])
|
||||
scheduler.add_seq_group(all_seq_groups[2])
|
||||
|
||||
# Schedule seq groups prompts.
|
||||
# Only 1 new seq group should be scheduled since max_seq_group is 2
|
||||
# and one group is already running.
|
||||
_, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])
|
||||
|
||||
|
||||
def test_scheduler_delay_factor():
|
||||
block_size = 4
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=100,
|
||||
max_num_seqs=64,
|
||||
max_model_len=16,
|
||||
delay_factor=0.5,  # delay the next prompt by 0.5x the previous prompt latency
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 8
|
||||
cache_config.num_gpu_blocks = 8
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
|
||||
# schedule first prompt
|
||||
seq_group_meta, seq_group = create_dummy_prompt("0",
|
||||
prompt_length=block_size,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert out.num_prefill_groups > 0
|
||||
assert seq_group_meta[0].request_id == '0'
|
||||
append_new_token(out, 1)
|
||||
|
||||
# wait for a second before scheduling next prompt
|
||||
time.sleep(1)
|
||||
seq_group_meta, seq_group = create_dummy_prompt("1",
|
||||
prompt_length=block_size,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
|
||||
# second prompt should *not* be scheduled
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert out.num_prefill_groups == 0
|
||||
assert seq_group_meta[0].request_id == '0'
|
||||
append_new_token(out, 1)
|
||||
|
||||
# wait for more than 0.5 second and try again
|
||||
time.sleep(0.6)
|
||||
seq_group_meta, out = schedule_and_update_computed_tokens(scheduler)
|
||||
assert out.num_prefill_groups > 0
|
||||
assert seq_group_meta[0].request_id == '1'
|
||||
append_new_token(out, 1)
|
||||
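# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test file): a rough model of
# the delay_factor behaviour exercised by test_scheduler_delay_factor above.
# Assumption (hedged): a waiting prefill is admitted once it has waited longer
# than delay_factor times the latency of the previously scheduled prompt; the
# helper below is made up for illustration and is not vLLM's implementation.
def _passed_prompt_delay_sketch(now: float, waiting_arrival_time: float,
                                last_prompt_latency: float,
                                delay_factor: float) -> bool:
    # Admit the waiting prefill only after it has waited long enough.
    return (now - waiting_arrival_time) > delay_factor * last_prompt_latency


# With delay_factor=0.5 and a previous prompt latency of ~1s, a prompt that
# has waited 0.2s is held back, while one that has waited 0.6s is admitted.
assert not _passed_prompt_delay_sketch(0.2, 0.0, 1.0, 0.5)
assert _passed_prompt_delay_sketch(0.6, 0.0, 1.0, 0.5)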
|
||||
|
||||
def initialize_scheduler(
|
||||
*,
|
||||
max_num_seqs=1000,
|
||||
max_token_budget=1000,
|
||||
max_model_len=1000,
|
||||
lora_config=None,
|
||||
block_size=4,
|
||||
num_cpu_blocks=8,
|
||||
num_gpu_blocks=8,
|
||||
):
|
||||
block_size = block_size
|
||||
scheduler_config = SchedulerConfig(
|
||||
"generate",
|
||||
max_num_batched_tokens=max_token_budget,
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
cache_config.num_gpu_blocks = num_gpu_blocks
|
||||
scheduler = Scheduler(scheduler_config, cache_config, lora_config)
|
||||
return scheduler
|
||||
|
||||
|
||||
def create_token_budget(token_budget: int = 10000,
|
||||
max_num_seqs: int = 10000) -> SchedulingBudget:
|
||||
return SchedulingBudget(
|
||||
token_budget=token_budget,
|
||||
max_num_seqs=max_num_seqs,
|
||||
)
|
||||
|
||||
|
||||
def add_token_budget(budget: SchedulingBudget,
|
||||
num_batched_tokens: int = 0,
|
||||
num_curr_seqs: int = 0):
|
||||
mock_seq_group = create_dummy_prompt('10', prompt_length=60)[1]
|
||||
budget.add_num_batched_tokens(mock_seq_group.request_id,
|
||||
num_batched_tokens)
|
||||
budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs)
|
||||
|
||||
|
||||
def test_prefill_schedule_max_prompt_len():
|
||||
"""
|
||||
Test that a prompt longer than max_model_len is aborted.
|
||||
"""
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
|
||||
_, seq_group = create_dummy_prompt("0",
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 1
|
||||
assert len(output.seq_groups) == 0
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(remaining_waiting) == 0
|
||||
|
||||
|
||||
def test_prefill_schedule_token_budget():
|
||||
"""
|
||||
Test that the token budget is respected.
|
||||
"""
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=64,
|
||||
num_gpu_blocks=64)
|
||||
budget = create_token_budget(token_budget=0)
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
|
||||
# 0 token budget == nothing is scheduled.
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 0
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(remaining_waiting) == 2
|
||||
|
||||
# 60 token budget == 1 request scheduled.
|
||||
budget = create_token_budget(token_budget=60)
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 1
|
||||
assert budget.num_batched_tokens == 60
|
||||
assert budget.num_curr_seqs == 1
|
||||
assert len(remaining_waiting) == 1
|
||||
|
||||
# Test when current_batched_tokens respected.
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=16,
|
||||
num_gpu_blocks=16)
|
||||
budget = create_token_budget(token_budget=60)
|
||||
add_token_budget(budget, 30, 0)
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
# Cannot schedule a prompt that doesn't fit the budget.
|
||||
scheduler.add_seq_group(seq_group)
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 0
|
||||
assert budget.num_batched_tokens == 30
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(remaining_waiting) == 1
|
||||
budget = create_token_budget(token_budget=90)
|
||||
add_token_budget(budget, 30, 0)
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.seq_groups) == 1
|
||||
assert budget.num_batched_tokens == 90
|
||||
assert budget.num_curr_seqs == 1
|
||||
assert len(remaining_waiting) == 0
|
||||
|
||||
|
||||
def test_prefill_schedule_max_seqs():
|
||||
"""
|
||||
Test that the max_num_seqs limit is respected.
|
||||
"""
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=64,
|
||||
num_gpu_blocks=64)
|
||||
budget = create_token_budget(max_num_seqs=2)
|
||||
for i in range(3):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 2
|
||||
assert budget.num_batched_tokens == 120
|
||||
assert budget.num_curr_seqs == 2
|
||||
assert len(remaining_waiting) == 1
|
||||
|
||||
# Verify curr_num_seqs respected.
|
||||
scheduler.waiting = deque()
|
||||
budget = create_token_budget(max_num_seqs=2)
|
||||
add_token_budget(budget, 0, 2)
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 0
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 2
|
||||
assert len(remaining_waiting) == 1
|
||||
|
||||
|
||||
def test_prefill_schedule_max_lora():
|
||||
"""
|
||||
Test that max_loras is respected and LoRA requests are prioritized.
|
||||
"""
|
||||
block_size = 4
|
||||
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
|
||||
scheduler = initialize_scheduler(lora_config=lora_config,
|
||||
block_size=block_size,
|
||||
num_cpu_blocks=64,
|
||||
num_gpu_blocks=64)
|
||||
budget = create_token_budget(token_budget=120)
|
||||
curr_loras: Set[int] = set()
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size,
|
||||
lora_request=LoRARequest(
|
||||
lora_name=str(i),
|
||||
lora_int_id=i + 1,
|
||||
lora_path="abc"))
|
||||
scheduler.add_seq_group(seq_group)
|
||||
# Add two more requests to verify lora is prioritized.
|
||||
# 0: Lora, 1: Lora, 2: regular, 3: regular
|
||||
# In the first iteration, requests 0 and 2 are scheduled.
|
||||
# If a request is not scheduled because it hits max lora, it is
|
||||
# prioritized. Verify that.
|
||||
for i in range(2, 4):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
# Schedule 2 requests (0 and 2)
|
||||
output = scheduler._schedule_prefills(budget, curr_loras)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 2
|
||||
assert budget.num_batched_tokens == 120
|
||||
assert budget.num_curr_seqs == 2
|
||||
assert len(remaining_waiting) == 2
|
||||
assert len(curr_loras) == 1
|
||||
# The second lora request is scheduled next as FCFS policy.
|
||||
# Reset curr_loras so that it can be scheduled.
|
||||
curr_loras = set()
|
||||
budget = create_token_budget(token_budget=60)
|
||||
output = scheduler._schedule_prefills(budget, curr_loras)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.seq_groups) == 1
|
||||
assert output.seq_groups[0].seq_group.request_id == "1"
|
||||
assert len(remaining_waiting) == 1
|
||||
assert len(curr_loras) == 1
|
||||
assert budget.num_batched_tokens == 60
|
||||
|
||||
|
||||
def test_prefill_schedule_no_block_manager_capacity():
|
||||
"""
|
||||
Test that sequences cannot be scheduled when the block manager has no capacity.
|
||||
"""
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_gpu_blocks=128,
|
||||
num_cpu_blocks=128)
|
||||
budget = create_token_budget()
|
||||
for i in range(3):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
scheduler.block_manager.can_allocate = MagicMock()
|
||||
scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 0
|
||||
assert len(output.seq_groups) == 0
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(remaining_waiting) == 3
|
||||
|
||||
scheduler = initialize_scheduler()
|
||||
budget = create_token_budget()
|
||||
for i in range(3):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
scheduler.block_manager.can_allocate = MagicMock()
|
||||
scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER
|
||||
output = scheduler._schedule_prefills(budget, None)
|
||||
remaining_waiting = scheduler.waiting
|
||||
assert len(output.ignored_seq_groups) == 3
|
||||
assert len(output.seq_groups) == 0
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(remaining_waiting) == 0
|
||||
|
||||
|
||||
def test_decode_schedule_preempted():
|
||||
"""
|
||||
Test that decodes which cannot be scheduled are preempted.
|
||||
"""
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=64,
|
||||
num_gpu_blocks=64)
|
||||
curr_loras = None
|
||||
for i in range(3):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size)
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
scheduler._add_seq_group_to_running(seq_group)
|
||||
scheduler.block_manager.can_append_slots = MagicMock()
|
||||
|
||||
def cannot_append_second_group(seq_group, num_lookahead_slots):
|
||||
return seq_group.request_id != "1"
|
||||
|
||||
scheduler.block_manager.can_append_slots.side_effect = (
|
||||
cannot_append_second_group)
|
||||
|
||||
# 1 cannot be scheduled, and the lowest priority (request 2)
|
||||
# should be preempted. 1 will also be preempted.
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_running(budget, curr_loras)
|
||||
remaining_running = scheduler.running
|
||||
assert len(remaining_running) == 0
|
||||
assert len(output.decode_seq_groups) == 1
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
assert output.decode_seq_groups[0].seq_group.request_id == "0"
|
||||
assert len(output.preempted) == 2
|
||||
# Verify budgets are updated.
|
||||
assert budget.num_batched_tokens == 1
|
||||
# NOTE: When enable_chunking is False, the num_seqs budget is not updated.
|
||||
# assert budget.num_curr_seqs == 1
|
||||
# Both should be preempted, not swapped.
|
||||
assert output.blocks_to_swap_out == []
|
||||
# Nothing is copied.
|
||||
assert output.blocks_to_copy == []
|
||||
|
||||
|
||||
def test_schedule_decode_blocks_to_copy_update():
|
||||
"""
|
||||
Verify blocks_to_copy is updated.
|
||||
"""
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=4,
|
||||
num_cpu_blocks=16,
|
||||
num_gpu_blocks=16)
|
||||
_, seq_group = create_dummy_prompt("1",
|
||||
prompt_length=60,
|
||||
best_of=2,
|
||||
block_size=block_size)
|
||||
curr_loras = None
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
scheduler._add_seq_group_to_running(seq_group)
|
||||
|
||||
# Mock append_slots so it reports a block that must be copied (2 -> 3).
|
||||
scheduler.block_manager.append_slots = MagicMock()
|
||||
scheduler.block_manager.append_slots.return_value = [(2, 3)]
|
||||
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_running(budget, curr_loras)
|
||||
remaining_running = scheduler.running
|
||||
assert len(remaining_running) == 0
|
||||
assert len(output.decode_seq_groups) == 1
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
assert len(output.preempted) == 0
|
||||
assert len(output.swapped_out) == 0
|
||||
# Nothing is preempted.
|
||||
assert output.blocks_to_swap_out == []
|
||||
# Since append_slots returns the source -> dest mapping, it should be
|
||||
# applied.
|
||||
assert output.blocks_to_copy == [(2, 3)]
|
||||
|
||||
|
||||
def test_schedule_swapped_max_loras():
|
||||
block_size = 4
|
||||
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
|
||||
scheduler = initialize_scheduler(lora_config=lora_config,
|
||||
block_size=block_size,
|
||||
num_cpu_blocks=32,
|
||||
num_gpu_blocks=32)
|
||||
curr_loras: Set[int] = set()
|
||||
blocks_to_swap_out: List[Tuple[int, int]] = []
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
block_size=block_size,
|
||||
lora_request=LoRARequest(
|
||||
lora_name=str(i),
|
||||
lora_int_id=i + 1,
|
||||
lora_path="abc"))
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
scheduler._swap_out(seq_group, blocks_to_swap_out)
|
||||
scheduler._add_seq_group_to_swapped(seq_group)
|
||||
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_swapped(budget, curr_loras)
|
||||
remaining_swapped = scheduler.swapped
|
||||
assert len(remaining_swapped) == 1
|
||||
assert budget.num_batched_tokens == 1
|
||||
assert budget.num_curr_seqs == 1
|
||||
assert len(output.decode_seq_groups) == 1
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
assert len(curr_loras) == 1
|
||||
|
||||
|
||||
def test_schedule_swapped_cannot_swap_in():
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=32,
|
||||
num_gpu_blocks=32)
|
||||
curr_loras = None
|
||||
blocks_to_swap_out: List[Tuple[int, int]] = []
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
best_of=2,
|
||||
block_size=block_size)
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
scheduler._swap_out(seq_group, blocks_to_swap_out)
|
||||
scheduler._add_seq_group_to_swapped(seq_group)
|
||||
|
||||
# Mock the block manager so that no request can currently be swapped in.
|
||||
scheduler.block_manager.can_swap_in = MagicMock()
|
||||
scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER
|
||||
# Since we cannot swap in, none of the requests are swapped in.
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_swapped(budget, curr_loras)
|
||||
remaining_swapped = scheduler.swapped
|
||||
assert len(remaining_swapped) == 2
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(output.decode_seq_groups) == 0
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
|
||||
|
||||
def test_infeasible_swap():
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=32,
|
||||
num_gpu_blocks=32)
|
||||
curr_loras = None
|
||||
blocks_to_swap_out: List[Tuple[int, int]] = []
|
||||
for i in range(2):
|
||||
_, seq_group = create_dummy_prompt(str(i),
|
||||
prompt_length=60,
|
||||
best_of=2,
|
||||
block_size=block_size)
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
scheduler._swap_out(seq_group, blocks_to_swap_out)
|
||||
scheduler._add_seq_group_to_swapped(seq_group)
|
||||
|
||||
# Mock the block manager so that swapping in is never possible.
|
||||
scheduler.block_manager.can_swap_in = MagicMock()
|
||||
scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER
|
||||
# Since we cannot swap in, none of the requests are swapped in.
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_swapped(budget, curr_loras)
|
||||
remaining_swapped = scheduler.swapped
|
||||
assert len(remaining_swapped) == 0
|
||||
assert len(output.infeasible_seq_groups) == 2
|
||||
assert budget.num_batched_tokens == 0
|
||||
assert budget.num_curr_seqs == 0
|
||||
assert len(output.decode_seq_groups) == 0
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
|
||||
|
||||
def test_schedule_swapped_blocks_to_copy():
|
||||
block_size = 4
|
||||
scheduler = initialize_scheduler(block_size=block_size,
|
||||
num_cpu_blocks=32,
|
||||
num_gpu_blocks=32)
|
||||
curr_loras = None
|
||||
_, seq_group = create_dummy_prompt("1",
|
||||
prompt_length=60,
|
||||
best_of=2,
|
||||
block_size=block_size)
|
||||
scheduler._allocate_and_set_running(seq_group)
|
||||
append_new_token_seq_group(60, seq_group, 1)
|
||||
blocks_to_swap_out: List[Tuple[int, int]] = []
|
||||
scheduler._swap_out(seq_group, blocks_to_swap_out)
|
||||
scheduler._add_seq_group_to_swapped(seq_group)
|
||||
|
||||
# Mock append_slots so it reports a block that must be copied (2 -> 3).
|
||||
scheduler.block_manager.append_slots = MagicMock()
|
||||
scheduler.block_manager.append_slots.return_value = [(2, 3)]
|
||||
|
||||
budget = create_token_budget()
|
||||
output = scheduler._schedule_swapped(budget, curr_loras)
|
||||
remaining_swapped = scheduler.swapped
|
||||
assert len(remaining_swapped) == 0
|
||||
assert len(output.decode_seq_groups) == 1
|
||||
assert len(output.prefill_seq_groups) == 0
|
||||
assert output.blocks_to_copy == [(2, 3)]
|
||||
|
||||
|
||||
def test_scheduling_budget():
|
||||
TOKEN_BUDGET = 4
|
||||
MAX_SEQS = 4
|
||||
budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS)
|
||||
assert budget.can_schedule(num_new_tokens=1, num_new_seqs=1)
|
||||
assert budget.can_schedule(num_new_tokens=4, num_new_seqs=4)
|
||||
assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=5)
|
||||
assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=1)
|
||||
assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=5)
|
||||
assert budget.remaining_token_budget() == TOKEN_BUDGET
|
||||
|
||||
# Verify add/subtract num batched tokens.
|
||||
_, seq_group = create_dummy_prompt("1", 3)
|
||||
budget.add_num_batched_tokens(seq_group.request_id, 2)
|
||||
assert budget.remaining_token_budget() == 2
|
||||
assert budget.num_batched_tokens == 2
|
||||
assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1)
|
||||
assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1)
|
||||
# Verify adding another seq group is no-op.
|
||||
budget.add_num_batched_tokens(seq_group.request_id, 2)
|
||||
assert budget.remaining_token_budget() == 2
|
||||
assert budget.num_batched_tokens == 2
|
||||
budget.subtract_num_batched_tokens(seq_group.request_id, 2)
|
||||
assert budget.remaining_token_budget() == 4
|
||||
assert budget.num_batched_tokens == 0
|
||||
budget.subtract_num_batched_tokens(seq_group.request_id, 2)
|
||||
assert budget.remaining_token_budget() == 4
|
||||
assert budget.num_batched_tokens == 0
|
||||
|
||||
# Verify add/subtract max seqs.
|
||||
_, seq_group = create_dummy_prompt("1", 3)
|
||||
budget.add_num_seqs(seq_group.request_id, 2)
|
||||
assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2)
|
||||
assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3)
|
||||
assert budget.num_curr_seqs == 2
|
||||
# Verify adding another seq group is no-op.
|
||||
budget.add_num_seqs(seq_group.request_id, 2)
|
||||
assert budget.num_curr_seqs == 2
|
||||
budget.subtract_num_seqs(seq_group.request_id, 2)
|
||||
assert budget.num_curr_seqs == 0
|
||||
budget.subtract_num_seqs(seq_group.request_id, 2)
|
||||
assert budget.num_curr_seqs == 0
|
||||
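# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test file): a minimal toy
# re-implementation of the idempotent, per-request bookkeeping that the
# SchedulingBudget assertions above rely on. The _ToyBudget class is made up
# for illustration; it is not vLLM's SchedulingBudget.
class _ToyBudget:

    def __init__(self, token_budget: int):
        self.token_budget = token_budget
        self._tokens_per_request: dict = {}

    def add_num_batched_tokens(self, request_id: str, num_tokens: int):
        # Adding the same request twice is a no-op.
        self._tokens_per_request.setdefault(request_id, num_tokens)

    def subtract_num_batched_tokens(self, request_id: str, num_tokens: int):
        # Subtracting a request that is not tracked is a no-op.
        self._tokens_per_request.pop(request_id, None)

    @property
    def num_batched_tokens(self) -> int:
        return sum(self._tokens_per_request.values())

    def remaining_token_budget(self) -> int:
        return self.token_budget - self.num_batched_tokens


_toy = _ToyBudget(token_budget=4)
_toy.add_num_batched_tokens("1", 2)
_toy.add_num_batched_tokens("1", 2)  # duplicate add: no-op
assert _toy.num_batched_tokens == 2
_toy.subtract_num_batched_tokens("1", 2)
_toy.subtract_num_batched_tokens("1", 2)  # duplicate subtract: no-op
assert _toy.remaining_token_budget() == 4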
104
vllm-v0.6.2/tests/core/test_scheduler_encoder_decoder.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from typing import List
|
||||
|
||||
import pytest # noqa
|
||||
|
||||
from vllm.config import CacheConfig, SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
|
||||
get_sequence_groups, schedule_and_update_computed_tokens)
|
||||
|
||||
|
||||
def test_scheduler_schedule_simple_encoder_decoder():
|
||||
'''
|
||||
Test basic scheduler functionality in the context
|
||||
of an encoder/decoder model. Focus on testing
|
||||
enc/dec-specific functionality, since tests already
|
||||
exist for decoder-only functionality
|
||||
|
||||
Test behavior:
|
||||
* Construct Scheduler
|
||||
* Construct dummy encoder/decoder sequence groups
|
||||
* Add dummy seq groups to scheduler backlog
|
||||
* Schedule the next seq group & validate:
|
||||
* Cross-attn block tables
|
||||
* Updated states of seq groups
|
||||
* Number of batched tokens
|
||||
* Number of blocks to copy/swap-in/swap-out
|
||||
* Number of scheduled seq groups
|
||||
* Repeat for both prefill- and decode-phase
|
||||
* Abort scheduled seq groups
|
||||
* Assert that aborted seq groups no longer appear in
|
||||
cross-attention block table
|
||||
'''
|
||||
|
||||
block_size = 4
|
||||
num_seq_group = 4
|
||||
max_model_len = 16
|
||||
scheduler_config = SchedulerConfig(
|
||||
task="generate",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=num_seq_group,
|
||||
max_model_len=max_model_len,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
|
||||
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
|
||||
scheduler = Scheduler(scheduler_config, cache_config, None)
|
||||
running: List[SequenceGroup] = []
|
||||
|
||||
# Add seq groups to scheduler.
|
||||
req_id_list = []
|
||||
for i in range(num_seq_group):
|
||||
req_id = str(i)
|
||||
req_id_list.append(req_id)
|
||||
_, _, seq_group = create_dummy_prompt_encoder_decoder(
|
||||
req_id, block_size, block_size, block_size)
|
||||
scheduler.add_seq_group(seq_group)
|
||||
running.append(seq_group)
|
||||
|
||||
# Schedule seq groups prefill.
|
||||
num_tokens = block_size * num_seq_group
|
||||
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# - Verify that sequence group cross-attention block tables are
|
||||
# registered with the block manager
|
||||
assert all([(req_id in scheduler.block_manager.cross_block_tables)
|
||||
for req_id in req_id_list])
|
||||
# - Validate sequence-group status
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# - Validate number of batched tokens
|
||||
assert out.num_batched_tokens == num_tokens
|
||||
# - Validate there are no remaining blocks to swap
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
# - Validate all seq groups were scheduled
|
||||
assert len(seq_group_meta_list) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Schedule seq groups decode.
|
||||
seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler)
|
||||
# - Verify that sequence group metadata includes encoder attention
|
||||
# and cross-attention metadata
|
||||
assert all([
|
||||
not ((seq_group_meta.encoder_seq_data is None) or
|
||||
(seq_group_meta.cross_block_table is None))
|
||||
for seq_group_meta in seq_group_meta_list
|
||||
])
|
||||
# - Validate sequence-group status
|
||||
assert set(get_sequence_groups(out)) == set(running)
|
||||
# - Validate there is one batched token per seq group
|
||||
assert out.num_batched_tokens == num_seq_group
|
||||
# - Validate there are no remaining blocks to swap
|
||||
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
|
||||
and not out.blocks_to_swap_out)
|
||||
# - Validate that all seq groups were scheduled
|
||||
assert len(seq_group_meta_list) == num_seq_group
|
||||
append_new_token(out, 1)
|
||||
|
||||
# Abort sequences
|
||||
for req_id in req_id_list:
|
||||
scheduler.abort_seq_group(req_id)
|
||||
# - Verify that sequence group cross-attention block tables are
|
||||
# NO LONGER registered with the block manager
|
||||
assert req_id not in scheduler.block_manager.cross_block_tables
|
||||
33
vllm-v0.6.2/tests/core/test_serialization.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import msgspec
|
||||
|
||||
from vllm.executor.msgspec_utils import decode_hook, encode_hook
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
|
||||
from ..spec_decode.utils import create_batch
|
||||
|
||||
|
||||
def test_msgspec_serialization():
|
||||
num_lookahead_slots = 4
|
||||
seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots)
|
||||
execute_model_req = ExecuteModelRequest(
|
||||
seq_group_metadata_list=seq_group_metadata_list,
|
||||
num_lookahead_slots=num_lookahead_slots,
|
||||
running_queue_size=4)
|
||||
|
||||
encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
|
||||
decoder = msgspec.msgpack.Decoder(ExecuteModelRequest,
|
||||
dec_hook=decode_hook)
|
||||
req = decoder.decode(encoder.encode(execute_model_req))
|
||||
expected = execute_model_req.seq_group_metadata_list
|
||||
actual = req.seq_group_metadata_list
|
||||
assert (len(expected) == len(actual))
|
||||
expected = expected[0]
|
||||
actual = actual[0]
|
||||
|
||||
assert expected.block_tables == actual.block_tables
|
||||
assert expected.is_prompt == actual.is_prompt
|
||||
assert expected.request_id == actual.request_id
|
||||
assert (expected.seq_data[0].prompt_token_ids ==
|
||||
actual.seq_data[0].prompt_token_ids)
|
||||
assert (expected.seq_data[0].output_token_ids ==
|
||||
actual.seq_data[0].output_token_ids)
|
||||
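# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test file): the generic
# msgspec enc_hook/dec_hook round-trip pattern that the test above applies to
# ExecuteModelRequest, shown with a made-up _Point class instead of vLLM types.
class _Point:

    def __init__(self, x: int, y: int):
        self.x = x
        self.y = y


def _enc_hook(obj):
    # Called by msgspec for any type it does not natively support.
    if isinstance(obj, _Point):
        return (obj.x, obj.y)
    raise NotImplementedError(f"cannot encode {type(obj)}")


def _dec_hook(typ, obj):
    # Called by msgspec when decoding into an unsupported annotated type.
    if typ is _Point:
        return _Point(*obj)
    raise NotImplementedError(f"cannot decode {typ}")


_encoder = msgspec.msgpack.Encoder(enc_hook=_enc_hook)
_decoder = msgspec.msgpack.Decoder(_Point, dec_hook=_dec_hook)
_point = _decoder.decode(_encoder.encode(_Point(1, 2)))
assert (_point.x, _point.y) == (1, 2)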
205
vllm-v0.6.2/tests/core/utils.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Tuple
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.inputs import EncoderDecoderInputs, token_inputs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import Logprob, Sequence, SequenceGroup
|
||||
|
||||
|
||||
def create_dummy_prompt(
|
||||
request_id: str,
|
||||
prompt_length: int,
|
||||
block_size: Optional[int] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
best_of: int = 1,
|
||||
prompt_tokens: Optional[List[int]] = None,
|
||||
min_tokens: int = 0,
|
||||
max_tokens: int = 16,
|
||||
) -> Tuple[Sequence, SequenceGroup]:
|
||||
if not block_size:
|
||||
block_size = prompt_length
|
||||
|
||||
if prompt_tokens is None:
|
||||
# Create dummy prompt sequence with tokens 0...prompt_length-1
|
||||
# and prompt "0 ... prompt_length-1".
|
||||
prompt_tokens = list(range(prompt_length))
|
||||
prompt_str = " ".join([str(t) for t in prompt_tokens])
|
||||
prompt = Sequence(int(request_id),
|
||||
inputs=token_inputs(prompt_tokens, prompt=prompt_str),
|
||||
block_size=block_size)
|
||||
seq_group = SequenceGroup(request_id=request_id,
|
||||
seqs=[prompt],
|
||||
arrival_time=time.time(),
|
||||
sampling_params=SamplingParams(
|
||||
best_of=best_of,
|
||||
max_tokens=max_tokens,
|
||||
min_tokens=min_tokens),
|
||||
lora_request=lora_request)
|
||||
|
||||
return prompt, seq_group
|
||||
|
||||
|
||||
def create_dummy_prompt_encoder_decoder(
|
||||
request_id: str,
|
||||
decoder_prompt_length: int,
|
||||
encoder_prompt_length: int,
|
||||
block_size: Optional[int] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
best_of: int = 1,
|
||||
) -> Tuple[Sequence, Sequence, SequenceGroup]:
|
||||
if not block_size:
|
||||
block_size = decoder_prompt_length
|
||||
|
||||
# Create a dummy decoder prompt with tokens 0...decoder_prompt_length-1
|
||||
# and an encoder prompt with tokens encoder_prompt_length-1...0. Note that
|
||||
# the prompt strings are not real detokenizations of the tokens.
|
||||
decoder_prompt_tokens = list(range(decoder_prompt_length))
|
||||
decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens])
|
||||
encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length))))
|
||||
encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens])
|
||||
|
||||
inputs: EncoderDecoderInputs = {
|
||||
"decoder": token_inputs(decoder_prompt_tokens,
|
||||
prompt=decoder_prompt_str),
|
||||
"encoder": token_inputs(encoder_prompt_tokens,
|
||||
prompt=encoder_prompt_str),
|
||||
}
|
||||
|
||||
decoder_prompt = Sequence(int(request_id),
|
||||
inputs=inputs["decoder"],
|
||||
block_size=block_size)
|
||||
|
||||
encoder_prompt = Sequence(int(request_id),
|
||||
inputs=inputs["encoder"],
|
||||
block_size=block_size)
|
||||
|
||||
seq_group = SequenceGroup(request_id=request_id,
|
||||
seqs=[decoder_prompt],
|
||||
sampling_params=SamplingParams(best_of=best_of),
|
||||
arrival_time=time.time(),
|
||||
lora_request=lora_request,
|
||||
encoder_seq=encoder_prompt)
|
||||
|
||||
return decoder_prompt, encoder_prompt, seq_group
|
||||
|
||||
|
||||
def create_seq_group(
|
||||
seq_prompt_len: int = 1024,
|
||||
seq_output_lens: GenericSequence[int] = (128, ),
|
||||
request_id: str = '0',
|
||||
seq_id_start: int = 0,
|
||||
sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
|
||||
|
||||
assert len(seq_output_lens) > 0
|
||||
|
||||
if sampling_params is None:
|
||||
sampling_params = SamplingParams()
|
||||
|
||||
prompt_token_ids = [0] * seq_prompt_len
|
||||
|
||||
seqs: List[Sequence] = []
|
||||
for seq_id_offset, output_len in enumerate(seq_output_lens):
|
||||
seq = Sequence(
|
||||
seq_id=seq_id_start + seq_id_offset,
|
||||
inputs=token_inputs(prompt_token_ids),
|
||||
block_size=16,
|
||||
)
|
||||
|
||||
for i in range(output_len):
|
||||
seq.append_token_id(
|
||||
token_id=i,
|
||||
logprobs={i: Logprob(0.0)},
|
||||
)
|
||||
seqs.append(seq)
|
||||
|
||||
seq_group = SequenceGroup(
|
||||
request_id=request_id,
|
||||
seqs=seqs,
|
||||
sampling_params=sampling_params,
|
||||
arrival_time=time.time(),
|
||||
)
|
||||
|
||||
return seq_group
|
||||
|
||||
|
||||
def create_seq_group_encoder_decoder(
|
||||
seq_prompt_len: int = 1024,
|
||||
seq_output_lens: GenericSequence[int] = (128, ),
|
||||
request_id: str = '0',
|
||||
seq_id_start: int = 0,
|
||||
sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
|
||||
|
||||
assert len(seq_output_lens) > 0
|
||||
|
||||
if sampling_params is None:
|
||||
sampling_params = SamplingParams()
|
||||
|
||||
prompt_token_ids = [0] * seq_prompt_len
|
||||
|
||||
inputs: EncoderDecoderInputs = {
|
||||
"decoder": token_inputs(prompt_token_ids),
|
||||
"encoder": token_inputs(prompt_token_ids),
|
||||
}
|
||||
|
||||
seqs = []
|
||||
for seq_id_offset, output_len in enumerate(seq_output_lens):
|
||||
# Construct decoder input sequences
|
||||
seq = Sequence(
|
||||
seq_id=seq_id_start + seq_id_offset,
|
||||
inputs=inputs["decoder"],
|
||||
block_size=16,
|
||||
)
|
||||
|
||||
for i in range(output_len):
|
||||
seq.append_token_id(
|
||||
token_id=i,
|
||||
logprobs={i: Logprob(0.0)},
|
||||
)
|
||||
seqs.append(seq)
|
||||
|
||||
# Encoder input sequence
|
||||
encoder_seq = Sequence(
|
||||
seq_id=seq_id_start + len(seq_output_lens),
|
||||
inputs=inputs["encoder"],
|
||||
block_size=16,
|
||||
)
|
||||
|
||||
return SequenceGroup(request_id=request_id,
|
||||
seqs=seqs,
|
||||
sampling_params=sampling_params,
|
||||
arrival_time=time.time(),
|
||||
encoder_seq=encoder_seq)
|
||||
|
||||
|
||||
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
|
||||
# Number of blocks needed to hold seq_len tokens (ceiling division).
return (seq_len + block_size - 1) // block_size
|
||||
|
||||
|
||||
# Helper functions for scheduler tests
|
||||
|
||||
|
||||
def get_sequence_groups(scheduler_output):
|
||||
return [s.seq_group for s in scheduler_output.scheduled_seq_groups]
|
||||
|
||||
|
||||
def append_new_token(out, token_id: int):
|
||||
seq_groups = get_sequence_groups(out)
|
||||
for seq_group in seq_groups:
|
||||
for seq in seq_group.get_seqs():
|
||||
seq.append_token_id(token_id, {token_id: Logprob(token_id)})
|
||||
|
||||
|
||||
def schedule_and_update_computed_tokens(scheduler):
|
||||
metas, out, _ = scheduler.schedule()
|
||||
for s, meta in zip(out.scheduled_seq_groups, metas):
|
||||
s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
|
||||
return metas, out
|
||||
|
||||
|
||||
def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int):
|
||||
seq_group.update_num_computed_tokens(token_chunk_size)
|
||||
for seq in seq_group.get_seqs():
|
||||
seq.append_token_id(token_id, {token_id: Logprob(token_id)})
|
||||
5
vllm-v0.6.2/tests/data/test_config.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
port: 12312
|
||||
served_model_name: mymodel
|
||||
tensor_parallel_size: 2
|
||||
trust_remote_code: true
|
||||
multi_step_stream_outputs: false
|
||||
0
vllm-v0.6.2/tests/distributed/__init__.py
Normal file
59
vllm-v0.6.2/tests/distributed/test_ca_buffer_sharing.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# can only run on machines with p2p access across GPUs
|
||||
# can only run with torchrun:
|
||||
# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py
|
||||
|
||||
import ctypes
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
||||
from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa
|
||||
CustomAllreduce)
|
||||
|
||||
# create a cpu process group for communicating metadata (ipc handle)
|
||||
dist.init_process_group(backend="gloo")
|
||||
rank = local_rank = dist.get_rank()
|
||||
world_size = dist.get_world_size()
|
||||
|
||||
# every process sets its own device (differently)
|
||||
lib = CudaRTLibrary()
|
||||
lib.cudaSetDevice(rank)
|
||||
|
||||
buffer_size_in_bytes = 1024
|
||||
byte_value = 2 # the value we write to the buffer for verification
|
||||
|
||||
pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes)
|
||||
|
||||
print(f"Rank {rank} has pointers {pointers}")
|
||||
|
||||
dist.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
if rank == 0:
|
||||
# the first rank tries to write to all buffers
|
||||
for p in pointers:
|
||||
pointer = ctypes.c_void_p(p)
|
||||
lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes)
|
||||
|
||||
dist.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
host_data = (ctypes.c_char * buffer_size_in_bytes)()
|
||||
|
||||
# all ranks read from all buffers, and check if the data is correct
|
||||
for p in pointers:
|
||||
pointer = ctypes.c_void_p(p)
|
||||
lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes)
|
||||
for i in range(buffer_size_in_bytes):
|
||||
assert ord(host_data[i]) == byte_value, (
|
||||
f"Rank {rank} failed"
|
||||
f" to verify buffer {p}. Expected {byte_value}, "
|
||||
f"got {ord(host_data[i])}")
|
||||
|
||||
print(f"Rank {rank} verified all buffers")
|
||||
|
||||
dist.barrier()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
CustomAllreduce.free_shared_buffer(pointers)
|
||||
200
vllm-v0.6.2/tests/distributed/test_comm_ops.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Test the communication operators.
|
||||
|
||||
Run `pytest tests/distributed/test_comm_ops.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce)
|
||||
|
||||
from ..utils import init_test_distributed_environment, multi_process_parallel
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
# it is important to delete the MLU_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the devices;
|
||||
# each worker can then set itself to the correct device
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
|
||||
(r + 1) for r in range(tp_size)
|
||||
]
|
||||
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
t = all_tensors[rank % tp_size]
|
||||
t = tensor_model_parallel_all_reduce(t)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
# it is important to delete the MLU_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the devices;
|
||||
# each worker can then set itself to the correct device
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
num_dimensions = 3
|
||||
tensor_size = list(range(2, num_dimensions + 2))
|
||||
total_size = 1
|
||||
for s in tensor_size:
|
||||
total_size *= s
|
||||
for all_gather_dimension in range(num_dimensions):
|
||||
all_tensors = [
|
||||
torch.arange(total_size, dtype=torch.float32,
|
||||
device="cuda").reshape(tensor_size) * (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
expected = torch.cat(all_tensors, dim=all_gather_dimension)
|
||||
t = all_tensors[rank % tp_size]
|
||||
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
# it is important to delete the MLU_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the devices;
|
||||
# each worker can then set itself to the correct device
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
test_dict = {
|
||||
# device tensor
|
||||
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
|
||||
# CPU tensor
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
|
||||
if (rank % tp_size) == 0:
|
||||
broadcast_tensor_dict(test_dict, src=0)
|
||||
else:
|
||||
recv_dict = broadcast_tensor_dict(src=0)
|
||||
assert len(recv_dict) == len(test_dict)
|
||||
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
|
||||
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
|
||||
assert recv_dict["c"] == test_dict["c"]
|
||||
assert recv_dict["d"] == test_dict["d"]
|
||||
assert recv_dict["e"] == test_dict["e"]
|
||||
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
test_dict = {
|
||||
# device tensor
|
||||
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
|
||||
# CPU tensor
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
recv_dict = get_pp_group().recv_tensor_dict()
|
||||
|
||||
if not get_pp_group().is_last_rank:
|
||||
get_pp_group().send_tensor_dict(test_dict)
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
assert len(recv_dict) == len(test_dict)
|
||||
torch.testing.assert_close(recv_dict["a"], test_dict["a"])
|
||||
torch.testing.assert_close(recv_dict["b"], test_dict["b"])
|
||||
assert recv_dict["c"] == test_dict["c"]
|
||||
assert recv_dict["d"] == test_dict["d"]
|
||||
assert recv_dict["e"] == test_dict["e"]
|
||||
torch.testing.assert_close(recv_dict["f"], test_dict["f"])
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
del os.environ["MLU_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
size = 64
|
||||
test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
|
||||
|
||||
if not get_pp_group().is_last_rank:
|
||||
get_pp_group().send(test_tensor)
|
||||
|
||||
if not get_pp_group().is_first_rank:
|
||||
torch.testing.assert_close(test_tensor, recv_tensor)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel(tp_size, test_target):
|
||||
multi_process_parallel(tp_size, 1, test_target)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize(
|
||||
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
|
||||
def test_multi_process_pipeline_parallel(pp_size, test_target):
|
||||
multi_process_parallel(1, pp_size, test_target)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
send_recv_test_worker, send_recv_tensor_dict_test_worker,
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel_pipeline_parallel(
|
||||
tp_size, pp_size, test_target):
|
||||
multi_process_parallel(tp_size, pp_size, test_target)
|
||||
115
vllm-v0.6.2/tests/distributed/test_custom_all_reduce.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
|
||||
get_tp_group, graph_capture)
|
||||
|
||||
from ..utils import (ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment, multi_process_parallel)
|
||||
|
||||
random.seed(42)
|
||||
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
|
||||
for i, v in enumerate(test_sizes):
|
||||
test_sizes[i] -= v % 8
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tensor_model_parallel_group().device_group
|
||||
|
||||
# A small all_reduce for warmup.
|
||||
# this is needed because device communicators might be created lazily
|
||||
# (e.g. NCCL). This will ensure that the communicator is initialized
|
||||
# before any communication happens, so that this group can be used for
|
||||
# graph capture immediately.
|
||||
data = torch.zeros(1)
|
||||
data = data.to(device=device)
|
||||
torch.distributed.all_reduce(data, group=group)
|
||||
torch.cuda.synchronize()
|
||||
del data
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with graph_capture() as graph_capture_context:
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph,
|
||||
stream=graph_capture_context.stream):
|
||||
for i in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
# synchronization
|
||||
dist.all_reduce(inp1, group=group)
|
||||
out2 = tensor_model_parallel_all_reduce(inp2)
|
||||
dist.all_reduce(inp2, group=group)
|
||||
graph.replay()
|
||||
torch.testing.assert_close(out1, inp1)
|
||||
torch.testing.assert_close(out2, inp2)
|
||||
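# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test file): the bare PyTorch
# CUDA-graph capture/replay pattern that graph_allreduce above builds on,
# without any vLLM or distributed pieces. The helper name is made up, and a
# CUDA device is assumed to be available when it is called.
def _cuda_graph_capture_sketch():
    device = torch.device("cuda:0")
    static_inp = torch.ones(8, device=device)
    static_out = torch.empty_like(static_inp)

    # Warm up on a side stream so lazily created resources exist before
    # capture (the same reason graph_allreduce issues a warmup all_reduce).
    side_stream = torch.cuda.Stream()
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        static_out.copy_(static_inp * 2)
    torch.cuda.current_stream().wait_stream(side_stream)

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_out.copy_(static_inp * 2)

    # Replaying re-runs the captured kernels against the static buffers.
    static_inp.fill_(3.0)
    graph.replay()
    torch.cuda.synchronize()
    assert torch.allclose(static_out, torch.full_like(static_out, 6.0))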
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
|
||||
del os.environ["CUDA_VISIBLE_DEVICES"]
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
sz = 1024
|
||||
fa = get_tp_group().ca_comm
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
|
||||
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
|
||||
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
|
||||
6
vllm-v0.6.2/tests/distributed/test_distributed_oot.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from ..entrypoints.openai.test_oot_registration import (
|
||||
run_and_test_dummy_opt_api_server)
|
||||
|
||||
|
||||
def test_distributed_oot(dummy_opt_path: str):
|
||||
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
|
||||
64
vllm-v0.6.2/tests/distributed/test_multi_node_assignment.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Make sure ray assigns GPU workers to the correct node.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_multi_node_assignment.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
||||
|
||||
from vllm import initialize_ray_cluster
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.executor.ray_utils import _wait_until_pg_removed
|
||||
from vllm.utils import get_ip
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not VLLM_MULTI_NODE,
|
||||
reason="Need at least 2 nodes to run the test.")
|
||||
def test_multi_node_assignment() -> None:
|
||||
|
||||
# NOTE: important to keep this class definition here
|
||||
# to let ray use cloudpickle to serialize it.
|
||||
class Actor:
|
||||
|
||||
def get_ip(self):
|
||||
return get_ip()
|
||||
|
||||
for _ in range(10):
|
||||
config = ParallelConfig(1, 2)
|
||||
initialize_ray_cluster(config)
|
||||
|
||||
current_ip = get_ip()
|
||||
workers = []
|
||||
for bundle_id, bundle in enumerate(
|
||||
config.placement_group.bundle_specs):
|
||||
if not bundle.get("GPU", 0):
|
||||
continue
|
||||
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
||||
placement_group=config.placement_group,
|
||||
placement_group_capture_child_tasks=True,
|
||||
placement_group_bundle_index=bundle_id,
|
||||
)
|
            worker = ray.remote(
                num_cpus=0,
                num_gpus=1,
                scheduling_strategy=scheduling_strategy,
            )(Actor).remote()
            worker_ip = ray.get(worker.get_ip.remote())
            assert worker_ip == current_ip
            workers.append(worker)

        for worker in workers:
            ray.kill(worker)

        _wait_until_pg_removed(config.placement_group)
414
vllm-v0.6.2/tests/distributed/test_pipeline_parallel.py
Normal file
414
vllm-v0.6.2/tests/distributed/test_pipeline_parallel.py
Normal file
@@ -0,0 +1,414 @@
|
||||
"""
|
||||
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
||||
important to set the distributed backend to "mp" to avoid Ray scheduling
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_pipeline_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
|
||||
tp_size: int
|
||||
pp_size: int
|
||||
eager_mode: bool
|
||||
chunked_prefill: bool
|
||||
|
||||
|
||||
class PPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
trust_remote_code: bool
|
||||
tokenizer_mode: Optional[str]
|
||||
load_format: Optional[str] = None
|
||||
hf_overrides: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class PPTestSettings:
|
||||
parallel_setups: List[ParallelSetup]
|
||||
distributed_backends: List[str]
|
||||
task: TaskOption
|
||||
test_options: PPTestOptions
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
*,
|
||||
tp_base: int = 1,
|
||||
pp_base: int = 2,
|
||||
multi_node_only: bool = False,
|
||||
task: TaskOption = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=2 * pp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=2 * pp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
task=task,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def fast(
|
||||
*,
|
||||
tp_base: int = 1,
|
||||
pp_base: int = 2,
|
||||
task: TaskOption = "auto",
|
||||
multi_node_only: bool = False,
|
||||
trust_remote_code: bool = False,
|
||||
tokenizer_mode: Optional[str] = None,
|
||||
load_format: Optional[str] = None,
|
||||
hf_overrides: Optional[str] = None,
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
task=task,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
)
|
||||
|
||||
def iter_params(self, model_name: str):
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for distributed_backend in self.distributed_backends:
|
||||
yield (model_name, parallel_setup, distributed_backend,
|
||||
self.task, opts)
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
|
||||
# The values displayed here are only a rough indicator of the size of the model
|
||||
|
||||
# yapf: disable
|
||||
TEXT_GENERATION_MODELS = {
|
||||
# [Decoder-only]
|
||||
# Uses Llama
|
||||
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
|
||||
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
|
||||
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
|
||||
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"bigscience/bloomz-1b1": PPTestSettings.fast(),
|
||||
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
|
||||
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
|
||||
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
|
||||
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
|
||||
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
|
||||
"tiiuae/falcon-7b": PPTestSettings.fast(),
|
||||
"google/gemma-2b": PPTestSettings.fast(),
|
||||
"google/gemma-2-9b": PPTestSettings.fast(),
|
||||
"gpt2": PPTestSettings.fast(),
|
||||
"bigcode/starcoder": PPTestSettings.fast(),
|
||||
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
|
||||
"EleutherAI/pythia-12b": PPTestSettings.fast(),
|
||||
"ibm/PowerLM-3b": PPTestSettings.fast(),
|
||||
"ibm/PowerMoE-3b": PPTestSettings.fast(),
|
||||
# Uses Llama
|
||||
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
|
||||
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
|
||||
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
|
||||
# TODO: Implement PP
|
||||
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
|
||||
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
|
||||
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
|
||||
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
|
||||
# Uses Llama
|
||||
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
|
||||
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
|
||||
"mosaicml/mpt-7b": PPTestSettings.fast(),
|
||||
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
|
||||
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
|
||||
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
|
||||
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
|
||||
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
"adept/persimmon-8b-chat": PPTestSettings.fast(),
|
||||
"microsoft/phi-2": PPTestSettings.fast(),
|
||||
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501
|
||||
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
"Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
|
||||
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
|
||||
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
|
||||
"bigcode/starcoder2-3b": PPTestSettings.fast(),
|
||||
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
|
||||
# FIXME: Cannot load tokenizer in latest transformers version.
|
||||
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
|
||||
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
# [Encoder-only]
|
||||
# TODO: Implement PP
|
||||
# "facebook/bart-base": PPTestSettings.fast(),
|
||||
}
|
||||
|
||||
EMBEDDING_MODELS = { # type: ignore[var-annotated]
|
||||
# [Text-only]
|
||||
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
|
||||
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
|
||||
}
|
||||
|
||||
MULTIMODAL_MODELS = {
|
||||
# [Decoder-only]
|
||||
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
|
||||
"facebook/chameleon-7b": PPTestSettings.fast(),
|
||||
"adept/fuyu-8b": PPTestSettings.fast(),
|
||||
"THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
|
||||
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
|
||||
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
|
||||
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
|
||||
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
|
||||
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
|
||||
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
|
||||
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
|
||||
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
|
||||
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
|
||||
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
|
||||
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
|
||||
# [Encoder-decoder]
|
||||
# TODO: Implement PP
|
||||
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
# NOTE: You can update this on your local machine to run specific tests
|
||||
TEST_MODELS = [
|
||||
# [LANGUAGE GENERATION]
|
||||
# "microsoft/Phi-3.5-MoE-instruct",
|
||||
"meta-llama/Meta-Llama-3-8B",
|
||||
# "ibm/PowerLM-3b",
|
||||
# [LANGUAGE EMBEDDING]
|
||||
# "intfloat/e5-mistral-7b-instruct",
|
||||
# "BAAI/bge-multilingual-gemma2",
|
||||
# [MULTIMODAL GENERATION]
|
||||
# "OpenGVLab/InternVL2-1B",
|
||||
# "microsoft/Phi-3-vision-128k-instruct",
|
||||
# "fixie-ai/ultravox-v0_3",
|
||||
]
|
||||
|
||||
|
||||
def _compare_tp(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
method: Literal["generate", "encode"],
|
||||
):
|
||||
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
|
||||
multi_node_only, trust_remote_code, tokenizer_mode, \
|
||||
load_format, hf_overrides = test_options
|
||||
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend")
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"8",
|
||||
]
|
||||
if chunked_prefill:
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if task != "auto":
|
||||
common_args.extend(["--task", task])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
common_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if load_format:
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", hf_overrides])
|
||||
|
||||
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
|
||||
and chunked_prefill):
|
||||
# Test Ray ADAG for a subset of the tests
|
||||
pp_env = {
|
||||
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
}
|
||||
# Temporary. Currently when zeromq + SPMD is used, it does not properly
|
||||
# terminate because of aDAG issue.
|
||||
common_args.append("--disable-frontend-multiprocessing")
|
||||
else:
|
||||
pp_env = None
|
||||
|
||||
pp_args = [
|
||||
*common_args,
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
# compare without pipeline parallelism
|
||||
# NOTE: use mp backend for TP
|
||||
# PP tests might involve multiple nodes, and ray might
|
||||
# schedule all workers in a node other than the head node,
|
||||
# which can cause the test to fail.
|
||||
tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_name,
|
||||
pp_args,
|
||||
tp_args,
|
||||
pp_env,
|
||||
method=method)
|
||||
except Exception:
|
||||
if pp_env is None:
|
||||
raise
|
||||
else:
|
||||
# Ray ADAG tests are flaky, so we don't want to fail the test
|
||||
logger.exception("Ray ADAG tests failed")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
if model_name in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_tp_language_generation(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
if model_name in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_tp_language_embedding(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="encode")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
[
|
||||
params for model_name, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
if model_name in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
def test_tp_multimodal_generation(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate")
|
||||
34
vllm-v0.6.2/tests/distributed/test_pipeline_partition.py
Normal file
34
vllm-v0.6.2/tests/distributed/test_pipeline_partition.py
Normal file
@@ -0,0 +1,34 @@
import os

import pytest

from vllm.distributed.utils import get_pp_indices


def test_custom_layer_partition():

    def _verify(partition_str, num_layers, pp_size, goldens):
        bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
        os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
        for pp_rank, golden in enumerate(goldens):
            assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
        if bak is not None:
            os.environ["VLLM_PP_LAYER_PARTITION"] = bak
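
    # Reading of the goldens below: with VLLM_PP_LAYER_PARTITION="5,6,5,6" and
    # 22 layers, get_pp_indices returns the half-open layer range owned by each
    # PP rank, e.g. rank 1 owns layers [5, 11).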
    # Even partition
    _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
    # Balanced partition
    _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
    # Put remainder somewhere
    _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
    # Invalid partition strings
    with pytest.raises(ValueError):
        _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
    with pytest.raises(ValueError):
        _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
    # Wrong number of partitions
    with pytest.raises(ValueError):
        _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
    # Wrong number of layers
    with pytest.raises(ValueError):
        _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
30
vllm-v0.6.2/tests/distributed/test_pp_cudagraph.py
Normal file
30
vllm-v0.6.2/tests/distributed/test_pp_cudagraph.py
Normal file
@@ -0,0 +1,30 @@
import os

import pytest

from ..utils import compare_two_settings, fork_new_process_for_each_test


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    cudagraph_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
    ]
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
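
    # compare_two_settings (from ..utils) is assumed to run the model once with
    # each argument list and check that the outputs agree; the only difference
    # between the two runs below is --enforce-eager (no CUDA graphs) vs. the
    # default CUDA-graph execution path.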
    eager_args = cudagraph_args + ["--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
241
vllm-v0.6.2/tests/distributed/test_pynccl.py
Normal file
241
vllm-v0.6.2/tests/distributed/test_pynccl.py
Normal file
@@ -0,0 +1,241 @@
|
||||
import multiprocessing
|
||||
import os
|
||||
from typing import Dict, List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
|
||||
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
|
||||
get_world_group, graph_capture,
|
||||
init_distributed_environment)
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
number_of_processes = world_size
|
||||
processes: List[multiprocessing.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env: Dict[str, str] = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
for p in processes:
|
||||
assert p.exitcode == 0
|
||||
|
||||
|
||||
def worker_fn_wrapper(fn):
|
||||
# `multiprocessing.Process` cannot accept environment variables directly
|
||||
# so we need to pass the environment variables as arguments
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
local_rank = os.environ['LOCAL_RANK']
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
fn()
|
||||
|
||||
return wrapped_fn
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
tensor = torch.ones(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == pynccl_comm.world_size
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
def test_pynccl():
|
||||
distributed_run(worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_allreduce_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
# two groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
else:
|
||||
pynccl_comm.all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_allreduce():
|
||||
# this tests pynccl for multiple tp groups, in a standalone way
|
||||
# i.e. call `pynccl_comm.all_reduce` directly
|
||||
distributed_run(multiple_allreduce_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_allreduce_with_vllm_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
ensure_model_parallel_initialized(2, 2)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with graph_capture():
|
||||
# two tp groups can communicate independently
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 4
|
||||
else:
|
||||
tensor = tensor_model_parallel_all_reduce(tensor)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_allreduce_with_vllm():
|
||||
# this tests pynccl for multiple tp groups, together with vllm
|
||||
# i.e. call `tensor_model_parallel_all_reduce`
|
||||
distributed_run(multiple_allreduce_with_vllm_worker_fn, 4)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn_with_cudagraph():
|
||||
with torch.no_grad():
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
# run something in the default stream to initialize torch engine
|
||||
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
|
||||
torch.cuda.synchronize()
|
||||
with torch.cuda.graph(
|
||||
graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
|
||||
enable=True):
|
||||
# operation during the graph capture is recorded but not executed
|
||||
# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa
|
||||
pynccl_comm.all_reduce(a)
|
||||
pynccl_comm.stream.synchronize()
|
||||
assert a.mean().cpu().item() == pynccl_comm.world_size**0
|
||||
graph.replay()
|
||||
pynccl_comm.stream.synchronize()
|
||||
assert a.mean().cpu().item() == pynccl_comm.world_size**1
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
def test_pynccl_with_cudagraph():
|
||||
distributed_run(worker_fn_with_cudagraph, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def send_recv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
if pynccl_comm.rank == 0:
|
||||
tensor = torch.ones(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
else:
|
||||
tensor = torch.empty(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
if pynccl_comm.rank == 0:
|
||||
pynccl_comm.send(tensor,
|
||||
dst=(pynccl_comm.rank + 1) %
|
||||
pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor,
|
||||
src=(pynccl_comm.rank - 1) %
|
||||
pynccl_comm.world_size)
|
||||
result = tensor.mean().cpu().item()
|
||||
assert result == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
def test_pynccl_send_recv():
|
||||
distributed_run(send_recv_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def multiple_send_recv_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[1, 3], backend="gloo")
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
elif torch.distributed.get_rank() == 1:
|
||||
tensor = 2 * torch.ones(
|
||||
16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
else:
|
||||
tensor = torch.empty(16,
|
||||
1024,
|
||||
1024,
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
with pynccl_comm.change_state(enable=True):
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
pynccl_comm.send(tensor,
|
||||
dst=(pynccl_comm.rank + 1) %
|
||||
pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor,
|
||||
src=(pynccl_comm.rank - 1) %
|
||||
pynccl_comm.world_size)
|
||||
result = tensor.mean().cpu().item()
|
||||
if torch.distributed.get_rank() in [0, 2]:
|
||||
assert result == 1
|
||||
else:
|
||||
assert result == 2
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
def test_pynccl_multiple_send_recv():
|
||||
distributed_run(multiple_send_recv_worker_fn, 4)
|
||||
|
||||
|
||||
def test_ncclGetUniqueId():
|
||||
lib = NCCLLibrary()
|
||||
unique_id = lib.ncclGetUniqueId()
|
||||
# `list(unique_id.internal)` is something like this:
|
||||
# [34, -16, 23, 83, 109, -19, 59, 95, 2, 0, -86, 55, 10, -128, 0, 29, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
# 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
|
||||
# as long as the function doesn't raise an exception, we're good
|
||||
assert unique_id is not None
|
||||
13
vllm-v0.6.2/tests/distributed/test_same_node.py
Normal file
13
vllm-v0.6.2/tests/distributed/test_same_node.py
Normal file
@@ -0,0 +1,13 @@
|
import os

import torch.distributed as dist

from vllm.distributed.parallel_state import in_the_same_node_as

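# This is a standalone script rather than a pytest test. It presumably gets
# launched under torch.distributed (e.g. `torchrun --nproc-per-node=2
# test_same_node.py`), with VLLM_TEST_SAME_HOST set to "1" when all ranks share
# a host and "0" otherwise; the exact launcher used by CI is an assumption here.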
if __name__ == "__main__":
|
||||
dist.init_process_group(backend="gloo")
|
||||
test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
|
||||
|
||||
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
|
||||
assert test_result == expected, f"Expected {expected}, got {test_result}"
|
||||
print("Same node test passed!")
|
||||
88
vllm-v0.6.2/tests/distributed/test_shm_broadcast.py
Normal file
88
vllm-v0.6.2/tests/distributed/test_shm_broadcast.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import multiprocessing
|
||||
import random
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]:
|
||||
np.random.seed(seed)
|
||||
sizes = np.random.randint(1, 10_000, n)
|
||||
# on average, each array will have 5k elements
|
||||
# with int64, each array will have 40kb
|
||||
return [np.random.randint(1, 100, i) for i in sizes]
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
number_of_processes = world_size
|
||||
processes = []
|
||||
for i in range(number_of_processes):
|
||||
env = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
for p in processes:
|
||||
assert p.exitcode == 0
|
||||
|
||||
|
||||
def worker_fn_wrapper(fn):
|
||||
# `multiprocessing.Process` cannot accept environment variables directly
|
||||
# so we need to pass the environment variables as arguments
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
dist.init_process_group(backend="gloo")
|
||||
fn()
|
||||
|
||||
return wrapped_fn
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
writer_rank = 2
|
||||
broadcaster = MessageQueue.create_from_process_group(
|
||||
dist.group.WORLD, 40 * 1024, 2, writer_rank)
|
||||
if dist.get_rank() == writer_rank:
|
||||
seed = random.randint(0, 1000)
|
||||
dist.broadcast_object_list([seed], writer_rank)
|
||||
else:
|
||||
recv = [None]
|
||||
dist.broadcast_object_list(recv, writer_rank)
|
||||
seed = recv[0] # type: ignore
|
||||
dist.barrier()
|
||||
# in case we find a race condition
|
||||
# print the seed so that we can reproduce the error
|
||||
print(f"Rank {dist.get_rank()} got seed {seed}")
|
||||
# test broadcasting with about 400MB of data
|
||||
N = 10_000
|
||||
if dist.get_rank() == writer_rank:
|
||||
arrs = get_arrays(N, seed)
|
||||
for x in arrs:
|
||||
broadcaster.broadcast_object(x)
|
||||
time.sleep(random.random() / 1000)
|
||||
else:
|
||||
arrs = get_arrays(N, seed)
|
||||
for x in arrs:
|
||||
y = broadcaster.broadcast_object(None)
|
||||
assert np.array_equal(x, y)
|
||||
time.sleep(random.random() / 1000)
|
||||
dist.barrier()
|
||||
|
||||
|
||||
def test_shm_broadcast():
|
||||
distributed_run(worker_fn, 4)
|
||||
143
vllm-v0.6.2/tests/distributed/test_utils.py
Normal file
143
vllm-v0.6.2/tests/distributed/test_utils.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import socket
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils import (mlu_device_count_stateless, get_open_port,
|
||||
update_environment_variables)
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
|
||||
@ray.remote
|
||||
class _CUDADeviceCountStatelessTestActor:
|
||||
|
||||
def get_count(self):
|
||||
return mlu_device_count_stateless()
|
||||
|
||||
def set_cuda_visible_devices(self, cuda_visible_devices: str):
|
||||
update_environment_variables(
|
||||
{"MLU_VISIBLE_DEVICES": cuda_visible_devices})
|
||||
|
||||
def get_cuda_visible_devices(self):
|
||||
return envs.MLU_VISIBLE_DEVICES
|
||||
|
||||
|
||||
def test_cuda_device_count_stateless():
|
||||
"""Test that cuda_device_count_stateless changes return value if
|
||||
CUDA_VISIBLE_DEVICES is changed."""
|
||||
actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore
|
||||
num_gpus=2).remote()
|
||||
assert len(
|
||||
sorted(ray.get(
|
||||
actor.get_cuda_visible_devices.remote()).split(","))) == 2
|
||||
assert ray.get(actor.get_count.remote()) == 2
|
||||
ray.get(actor.set_cuda_visible_devices.remote("0"))
|
||||
assert ray.get(actor.get_count.remote()) == 1
|
||||
ray.get(actor.set_cuda_visible_devices.remote(""))
|
||||
assert ray.get(actor.get_count.remote()) == 0
|
||||
|
||||
|
||||
def cpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
if rank <= 2:
|
||||
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port2,
|
||||
rank=rank,
|
||||
world_size=3)
|
||||
data = torch.tensor([rank])
|
||||
data = pg1.broadcast_obj(data, src=2)
|
||||
assert data.item() == 2
|
||||
if rank <= 2:
|
||||
data = torch.tensor([rank + 1])
|
||||
data = pg2.broadcast_obj(data, src=2)
|
||||
assert data.item() == 3
|
||||
pg2.barrier()
|
||||
pg1.barrier()
|
||||
|
||||
|
||||
def gpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
torch.cuda.set_device(rank)
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
pynccl1 = PyNcclCommunicator(pg1, device=rank)
|
||||
pynccl1.disabled = False
|
||||
if rank <= 2:
|
||||
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port2,
|
||||
rank=rank,
|
||||
world_size=3)
|
||||
pynccl2 = PyNcclCommunicator(pg2, device=rank)
|
||||
pynccl2.disabled = False
|
||||
data = torch.tensor([rank]).cuda()
|
||||
pynccl1.all_reduce(data)
|
||||
pg1.barrier()
|
||||
torch.cuda.synchronize()
|
||||
if rank <= 2:
|
||||
pynccl2.all_reduce(data)
|
||||
pg2.barrier()
|
||||
torch.cuda.synchronize()
|
||||
item = data[0].item()
|
||||
print(f"rank: {rank}, item: {item}")
|
||||
if rank == 3:
|
||||
assert item == 6
|
||||
else:
|
||||
assert item == 18
|
||||
|
||||
|
||||
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
if rank == 2:
|
||||
pg1.broadcast_obj("secret", src=2)
|
||||
else:
|
||||
obj = pg1.broadcast_obj(None, src=2)
|
||||
assert obj == "secret"
|
||||
pg1.barrier()
|
||||
|
||||
|
||||
def allgather_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
data = pg1.all_gather_obj(rank)
|
||||
assert data == list(range(WORLD_SIZE))
|
||||
pg1.barrier()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
|
||||
# @multi_gpu_test(num_gpus=4)
|
||||
@pytest.mark.parametrize(
|
||||
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
|
||||
def test_stateless_process_group(worker):
|
||||
port1 = get_open_port()
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(("", port1))
|
||||
port2 = get_open_port()
|
||||
WORLD_SIZE = 4
|
||||
from multiprocessing import get_context
|
||||
ctx = get_context("fork")
|
||||
processes = []
|
||||
for i in range(WORLD_SIZE):
|
||||
rank = i
|
||||
processes.append(
|
||||
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
|
||||
for p in processes:
|
||||
p.start()
|
||||
for p in processes:
|
||||
p.join()
|
||||
for p in processes:
|
||||
assert not p.exitcode
|
||||
print("All processes finished.")
|
||||
0
vllm-v0.6.2/tests/encoder_decoder/__init__.py
Normal file
0
vllm-v0.6.2/tests/encoder_decoder/__init__.py
Normal file
119
vllm-v0.6.2/tests/encoder_decoder/test_e2e_correctness.py
Normal file
119
vllm-v0.6.2/tests/encoder_decoder/test_e2e_correctness.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""E2E tests to verify the correctness of the encoder-decoder framework
|
||||
|
||||
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
|
||||
"""
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||
global_force_attn_backend_context_manager)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from ..models.utils import check_logprobs_close
|
||||
|
||||
LIST_ENC_DEC_SUPPORTED_BACKENDS = [
|
||||
_Backend.XFORMERS, _Backend.FLASH_ATTN, None
|
||||
]
|
||||
|
||||
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "</s>"
|
||||
if decoder_prompt_type == DecoderPromptType.NONE:
|
||||
hf_output_str = "<s>" + hf_output_str
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache():
|
||||
"""Fixture to clear backend cache before each test."""
|
||||
_cached_get_attn_backend.cache_clear() # Clear the cache
|
||||
yield # This allows the test to run
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cpu(),
|
||||
reason="CPU backend is not currently supported with encoder/decoder models"
|
||||
)
|
||||
def test_encoder_decoder_e2e(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
enforce_eager: bool,
|
||||
attn_backend: _Backend,
|
||||
) -> None:
|
||||
'''
|
||||
End-to-End (E2E) test for the encoder-decoder framework.
|
||||
This test evaluates the encoder-decoder functionality using the BART
|
||||
model. We compare the outputs of the Hugging Face and vLLM
|
||||
implementations to ensure that both implementations produce consistent
|
||||
and correct results.
|
||||
'''
|
||||
with global_force_attn_backend_context_manager(attn_backend):
|
||||
if attn_backend == _Backend.FLASH_ATTN:
|
||||
# Flash Attention works only with bfloat16 data-type
|
||||
dtype = 'bfloat16'
|
||||
test_case_prompts = example_encoder_decoder_prompts[
|
||||
decoder_prompt_type]
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (
|
||||
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
test_case_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
with vllm_runner(model, dtype=dtype,
|
||||
enforce_eager=enforce_eager) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
test_case_prompts, max_tokens, num_logprobs)
|
||||
|
||||
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
|
||||
else 0)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, decoder_prompt_type)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=hf_skip_tokens,
|
||||
)
|
||||
0
vllm-v0.6.2/tests/engine/__init__.py
Normal file
0
vllm-v0.6.2/tests/engine/__init__.py
Normal file
271
vllm-v0.6.2/tests/engine/output_processor/test_multi_step.py
Normal file
271
vllm-v0.6.2/tests/engine/output_processor/test_multi_step.py
Normal file
@@ -0,0 +1,271 @@
|
||||
import random
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
|
||||
SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
from ...core.utils import create_seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [1, 12])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
|
||||
"""Verify multi-step decoding appends token ids correctly.
|
||||
|
||||
We append token ids and verify all the token ids were appended correctly.
|
||||
Note that ignore_eos=True.
|
||||
"""
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=1024,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(max_tokens=seq_output_len +
|
||||
num_new_tokens,
|
||||
ignore_eos=True),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_prompt_len", [1024])
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
|
||||
@pytest.mark.parametrize("max_tokens", [128 + 3])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
|
||||
seq_output_len: int, max_tokens: int):
|
||||
"""Verify tokens after max_tokens are dropped and not appended to the
|
||||
sequence.
|
||||
"""
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=seq_prompt_len,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(max_tokens=max_tokens, ),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
|
||||
# Expect the processed sequence to not go over max tokens in len.
|
||||
assert seq.get_len() == seq_prompt_len + max_tokens
|
||||
|
||||
# Expect the correct tokens were appended.
|
||||
expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
|
||||
assert seq.get_token_ids(
|
||||
)[-len(expected_appended_tokens):] == expected_appended_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_prompt_len", [1024])
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [12])
|
||||
@pytest.mark.parametrize("seed", list(range(6)))
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
|
||||
seq_output_len: int, seed: int):
|
||||
"""Verify the eos token id is included in the sequence, but subsequent
|
||||
tokens are dropped (not appended to sequence).
|
||||
"""
|
||||
random.seed(seed)
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
eos_token_id = 100
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=seq_prompt_len,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(
|
||||
# Ensure enough space.
|
||||
max_tokens=seq_output_len + num_new_tokens, ),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
assert eos_token_id not in new_token_ids
|
||||
eos_index = random.randint(0, len(new_token_ids) - 1)
|
||||
new_token_ids[eos_index] = eos_token_id
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
|
||||
# Expect the processed sequence to not go beyond provided eos.
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
|
||||
|
||||
# Expect the correct tokens were appended.
|
||||
expected_appended_tokens = new_token_ids[:eos_index + 1]
|
||||
assert seq.get_token_ids(
|
||||
)[-len(expected_appended_tokens):] == expected_appended_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_prompt_len", [1024])
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [12])
|
||||
@pytest.mark.parametrize("seed", list(range(6)))
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
|
||||
seq_output_len: int, seed: int):
|
||||
"""When sampling parameters dictate that we should ignore the eos token id,
|
||||
ensure all token ids are appended even if the eos token id is emitted.
|
||||
"""
|
||||
random.seed(seed)
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
eos_token_id = 100
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=seq_prompt_len,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(
|
||||
# Ensure enough space.
|
||||
max_tokens=seq_output_len + num_new_tokens,
|
||||
ignore_eos=True,
|
||||
),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
assert eos_token_id not in new_token_ids
|
||||
eos_index = random.randint(0, len(new_token_ids) - 1)
|
||||
new_token_ids[eos_index] = eos_token_id
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
|
||||
# Expect the processed sequence to go beyond eos.
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
|
||||
|
||||
# Expect the correct tokens were appended.
|
||||
expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens -
|
||||
seq_output_len]
|
||||
assert seq.get_token_ids(
|
||||
)[-len(expected_appended_tokens):] == expected_appended_tokens
|
||||
|
||||
|
||||
def mock_tokenizer(eos_token_id=1000):
|
||||
tokenizer = MagicMock(spec=PreTrainedTokenizer)
|
||||
tokenizer.eos_token_id = eos_token_id
|
||||
return tokenizer
|
||||
@@ -0,0 +1,86 @@
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.inputs import token_inputs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import Logprob, Sequence, SequenceStatus
|
||||
|
||||
|
||||
def sequence_with_eos(text: str, eos_token: str,
|
||||
eos_token_id: int) -> Sequence:
|
||||
"""
|
||||
Create a Sequence that ends with an EOS token.
|
||||
"""
|
||||
seq = Sequence(
|
||||
seq_id=0,
|
||||
inputs=token_inputs([]),
|
||||
block_size=16,
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
seq.output_text = text + eos_token
|
||||
|
||||
offset = eos_token_id + 1
|
||||
for i in range(offset, len(text) + offset):
|
||||
seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)})
|
||||
seq.append_token_id(token_id=eos_token_id,
|
||||
logprobs={eos_token_id: Logprob(0.0)})
|
||||
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
|
||||
("This text ends with EOS token", "</s>", 2),
|
||||
])
|
||||
@pytest.mark.parametrize("ignore_eos", [True, False])
|
||||
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
|
||||
ignore_eos: bool, include_stop_str_in_output: bool):
|
||||
"""
|
||||
Test the behavior of the StopChecker's maybe_stop_sequence method
|
||||
when an EOS token is encountered.
|
||||
|
||||
This test covers:
|
||||
- When the EOS token should stop the sequence and be removed from the output
|
||||
- When the EOS token should stop the sequence and be included in the output
|
||||
- When the EOS token should be ignored, and the sequence continues
|
||||
"""
|
||||
|
||||
tokenizer = MagicMock(spec=PreTrainedTokenizer)
|
||||
get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
|
||||
stop_checker = StopChecker(max_model_len=1024,
|
||||
get_tokenizer_for_seq=get_tokenizer_for_seq)
|
||||
|
||||
seq = sequence_with_eos(
|
||||
text=text_wo_eos,
|
||||
eos_token=eos_token,
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
new_char_count = len(eos_token)
|
||||
|
||||
# Note that `stop` and `stop_token_ids` are not specified
|
||||
sampling_params = SamplingParams(
|
||||
min_tokens=1,
|
||||
ignore_eos=ignore_eos,
|
||||
include_stop_str_in_output=include_stop_str_in_output)
|
||||
|
||||
stop_checker.maybe_stop_sequence(
|
||||
seq=seq,
|
||||
new_char_count=new_char_count,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
if ignore_eos:
|
||||
assert seq.status == SequenceStatus.RUNNING
|
||||
assert seq.output_text == text_wo_eos + eos_token
|
||||
elif include_stop_str_in_output:
|
||||
assert seq.status == SequenceStatus.FINISHED_STOPPED
|
||||
assert seq.output_text == text_wo_eos + eos_token
|
||||
else:
|
||||
assert seq.status == SequenceStatus.FINISHED_STOPPED
|
||||
assert seq.output_text == text_wo_eos
|
||||
95
vllm-v0.6.2/tests/engine/test_arg_utils.py
Normal file
95
vllm-v0.6.2/tests/engine/test_arg_utils.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from argparse import ArgumentTypeError
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import PoolerConfig
|
||||
from vllm.engine.arg_utils import EngineArgs, nullable_kvs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("arg", "expected"), [
|
||||
(None, None),
|
||||
("image=16", {
|
||||
"image": 16
|
||||
}),
|
||||
("image=16,video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
("Image=16, Video=2", {
|
||||
"image": 16,
|
||||
"video": 2
|
||||
}),
|
||||
])
|
||||
def test_limit_mm_per_prompt_parser(arg, expected):
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
if arg is None:
|
||||
args = parser.parse_args([])
|
||||
else:
|
||||
args = parser.parse_args(["--limit-mm-per-prompt", arg])
|
||||
|
||||
assert args.limit_mm_per_prompt == expected
|
||||
|
||||
|
||||
def test_valid_pooling_config():
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
args = parser.parse_args([
|
||||
'--override-pooler-config',
|
||||
'{"pooling_type": "MEAN"}',
|
||||
])
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert engine_args.override_pooler_config == PoolerConfig(
|
||||
pooling_type="MEAN", )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("arg"),
|
||||
[
|
||||
"image", # Missing =
|
||||
"image=4,image=5", # Conflicting values
|
||||
"image=video=4" # Too many = in tokenized arg
|
||||
])
|
||||
def test_bad_nullable_kvs(arg):
|
||||
with pytest.raises(ArgumentTypeError):
|
||||
nullable_kvs(arg)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("arg", "expected", "option"), [
|
||||
(None, None, "mm-processor-kwargs"),
|
||||
("{}", {}, "mm-processor-kwargs"),
|
||||
(
|
||||
'{"num_crops": 4}',
|
||||
{
|
||||
"num_crops": 4
|
||||
},
|
||||
"mm-processor-kwargs"
|
||||
),
|
||||
(
|
||||
'{"foo": {"bar": "baz"}}',
|
||||
{
|
||||
"foo":
|
||||
{
|
||||
"bar": "baz"
|
||||
}
|
||||
},
|
||||
"mm-processor-kwargs"
|
||||
),
|
||||
(
|
||||
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
|
||||
{
|
||||
"cast_logits_dtype": "bfloat16",
|
||||
"sequence_parallel_norm": True,
|
||||
"sequence_parallel_norm_threshold": 2048,
|
||||
},
|
||||
"override-neuron-config"
|
||||
),
|
||||
])
|
||||
# yapf: enable
|
||||
def test_composite_arg_parser(arg, expected, option):
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
if arg is None:
|
||||
args = parser.parse_args([])
|
||||
else:
|
||||
args = parser.parse_args([f"--{option}", arg])
|
||||
assert getattr(args, option.replace("-", "_")) == expected
|
||||
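# The composite options above take a JSON object as their value. Outside of
# EngineArgs, the same effect can be had with nothing but the standard library;
# this is a sketch for illustration, not how vLLM wires the option internally.
import argparse
import json

_p = argparse.ArgumentParser()
_p.add_argument("--mm-processor-kwargs", type=json.loads, default=None)
_ns = _p.parse_args(["--mm-processor-kwargs", '{"num_crops": 4}'])
assert _ns.mm_processor_kwargs == {"num_crops": 4}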
40
vllm-v0.6.2/tests/engine/test_computed_prefix_blocks.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(enable_prefix_caching): Prefix caching is not supported yet; will be fixed in VLLM-342.
|
||||
'''
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
def test_computed_prefix_blocks(model: str, block_size: int):
|
||||
# This test checks if we are able to run the engine to completion
|
||||
# without triggering asserts.
|
||||
# We are in a scenario where all blocks from the second request's prompt
|
||||
# are full and already computed when the second request arrives.
|
||||
prompt = (
|
||||
"You are a helpful assistant. How do I build a car from cardboard and "
|
||||
"paper clips? Is there an easy to follow video tutorial available "
|
||||
"online for free?")
|
||||
prompt2 = (
|
||||
" Please recommend to me some resources where I can learn not only to "
|
||||
"handle technical difficulties of building a car, but also "
|
||||
"decoration.")
|
||||
|
||||
engine_args = EngineArgs(model=model,
|
||||
block_size=block_size,
|
||||
enable_prefix_caching=False)
|
||||
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
sampling_params = SamplingParams()
|
||||
|
||||
engine.add_request("0", prompt + prompt2, sampling_params)
|
||||
engine.step()
|
||||
engine.add_request("1", prompt, sampling_params)
|
||||
engine.step()
|
||||
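# Note: upstream vLLM also exercises this scenario with prefix caching enabled;
# the fork above pins enable_prefix_caching=False (see the vllm_mlu note). Below
# is a sketch of that upstream configuration, for comparison only -- it is not
# used by this test.
upstream_engine_args = EngineArgs(model="facebook/opt-125m",
                                  block_size=16,
                                  enable_prefix_caching=True)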
116
vllm-v0.6.2/tests/engine/test_custom_executor.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.executor.mlu_executor import MLUExecutor, MLUExecutorAsync
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
||||
class Mock:
|
||||
...
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(GPUExecutor): Use the MLU executor on MLU devices.
|
||||
'''
|
||||
class CustomGPUExecutor(MLUExecutor):
|
||||
|
||||
def execute_model(self, *args, **kwargs):
|
||||
# Drop marker to show that this was run
|
||||
with open(".marker", "w"):
|
||||
...
|
||||
return super().execute_model(*args, **kwargs)
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(GPUExecutor): Use the MLU executor on MLU devices.
|
||||
'''
|
||||
class CustomGPUExecutorAsync(MLUExecutorAsync):
|
||||
|
||||
async def execute_model_async(self, *args, **kwargs):
|
||||
with open(".marker", "w"):
|
||||
...
|
||||
return await super().execute_model_async(*args, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
def test_custom_executor_type_checking(model):
|
||||
with pytest.raises(ValueError):
|
||||
engine_args = EngineArgs(model=model,
|
||||
distributed_executor_backend=Mock)
|
||||
LLMEngine.from_engine_args(engine_args)
|
||||
with pytest.raises(ValueError):
|
||||
engine_args = AsyncEngineArgs(model=model,
|
||||
distributed_executor_backend=Mock)
|
||||
AsyncLLMEngine.from_engine_args(engine_args)
|
||||
with pytest.raises(TypeError):
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model, distributed_executor_backend=CustomGPUExecutor)
|
||||
AsyncLLMEngine.from_engine_args(engine_args)
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(tmpdir): All test models are soft-linked into the tests dir, so do not change the working directory.
|
||||
'''
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
def test_custom_executor(model, tmp_path):
|
||||
cwd = os.path.abspath(".")
|
||||
# os.chdir(tmp_path)
|
||||
try:
|
||||
assert not os.path.exists(".marker")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model, distributed_executor_backend=CustomGPUExecutor)
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
|
||||
engine.add_request("0", "foo", sampling_params)
|
||||
engine.step()
|
||||
|
||||
assert os.path.exists(".marker")
|
||||
os.remove(".marker")
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
@brief(tmpdir): All test models are soft-linked into the tests dir, so do not change the working directory.
|
||||
'''
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
def test_custom_executor_async(model, tmp_path):
|
||||
cwd = os.path.abspath(".")
|
||||
# os.chdir(tmp_path)
|
||||
try:
|
||||
assert not os.path.exists(".marker")
|
||||
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model, distributed_executor_backend=CustomGPUExecutorAsync)
|
||||
engine = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
|
||||
async def t():
|
||||
stream = await engine.add_request("0", "foo", sampling_params)
|
||||
async for x in stream:
|
||||
...
|
||||
|
||||
asyncio.run(t())
|
||||
|
||||
assert os.path.exists(".marker")
|
||||
os.remove(".marker")
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
32
vllm-v0.6.2/tests/engine/test_detokenization.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
def test_detokenization(model: str):
|
||||
# This test checks if the engine generates completions both with and
|
||||
# without optional detokenization, that detokenization includes text
|
||||
# and no-detokenization doesn't, and that both completions have the same
|
||||
# token_ids.
|
||||
prompt = (
|
||||
"You are a helpful assistant. How do I build a car from cardboard and "
|
||||
"paper clips? Is there an easy to follow video tutorial available "
|
||||
"online for free?")
|
||||
|
||||
llm = LLM(model=model)
|
||||
sampling_params = SamplingParams(max_tokens=10,
|
||||
temperature=0.0,
|
||||
detokenize=False)
|
||||
|
||||
outputs_no_detokenization = llm.generate(prompt,
|
||||
sampling_params)[0].outputs[0]
|
||||
sampling_params.detokenize = True
|
||||
outputs_with_detokenization = llm.generate(prompt,
|
||||
sampling_params)[0].outputs[0]
|
||||
|
||||
assert outputs_no_detokenization.text == ''
|
||||
assert outputs_with_detokenization.text != ''
|
||||
assert outputs_no_detokenization.token_ids == \
|
||||
outputs_with_detokenization.token_ids
|
||||
176
vllm-v0.6.2/tests/engine/test_multiproc_workers.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from time import sleep
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
|
||||
ResultHandler, WorkerMonitor)
|
||||
|
||||
|
||||
class DummyWorker:
|
||||
"""Dummy version of vllm.worker.worker.Worker"""
|
||||
|
||||
def __init__(self, rank: int):
|
||||
self.rank = rank
|
||||
|
||||
def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
|
||||
sleep(0.05)
|
||||
|
||||
if isinstance(worker_input, Exception):
|
||||
# simulate error case
|
||||
raise worker_input
|
||||
|
||||
return self.rank, worker_input
|
||||
|
||||
|
||||
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
|
||||
result_handler = ResultHandler()
|
||||
workers = [
|
||||
ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank))
|
||||
for rank in range(8)
|
||||
]
|
||||
|
||||
worker_monitor = WorkerMonitor(workers, result_handler)
|
||||
assert not worker_monitor.is_alive()
|
||||
|
||||
result_handler.start()
|
||||
worker_monitor.start()
|
||||
assert worker_monitor.is_alive()
|
||||
|
||||
return workers, worker_monitor
|
||||
|
||||
|
||||
def test_local_workers() -> None:
|
||||
"""Test workers with sync task submission"""
|
||||
|
||||
workers, worker_monitor = _start_workers()
|
||||
|
||||
def execute_workers(worker_input: str) -> None:
|
||||
worker_outputs = [
|
||||
worker.execute_method("worker_method", worker_input)
|
||||
for worker in workers
|
||||
]
|
||||
|
||||
for rank, output in enumerate(worker_outputs):
|
||||
assert output.get() == (rank, worker_input)
|
||||
|
||||
executor = ThreadPoolExecutor(max_workers=4)
|
||||
|
||||
# Test concurrent submission from different threads
|
||||
futures = [
|
||||
executor.submit(partial(execute_workers, f"thread {thread_num}"))
|
||||
for thread_num in range(4)
|
||||
]
|
||||
|
||||
for future in futures:
|
||||
future.result()
|
||||
|
||||
# Test error case
|
||||
exception = ValueError("fake error")
|
||||
result = workers[0].execute_method("worker_method", exception)
|
||||
try:
|
||||
result.get()
|
||||
pytest.fail("task should have failed")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ValueError)
|
||||
assert str(e) == "fake error"
|
||||
|
||||
# Test cleanup when a worker fails
|
||||
assert worker_monitor.is_alive()
|
||||
workers[3].process.kill()
|
||||
|
||||
# Other workers should get shut down here
|
||||
worker_monitor.join(20)
|
||||
|
||||
# Ensure everything is stopped
|
||||
assert not worker_monitor.is_alive()
|
||||
assert all(not worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Further attempts to submit tasks should fail
|
||||
try:
|
||||
_result = workers[0].execute_method("worker_method", "test")
|
||||
pytest.fail("task should fail once workers have been shut down")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ChildProcessError)
|
||||
|
||||
|
||||
def test_local_workers_clean_shutdown() -> None:
|
||||
"""Test clean shutdown"""
|
||||
|
||||
workers, worker_monitor = _start_workers()
|
||||
|
||||
assert worker_monitor.is_alive()
|
||||
assert all(worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Clean shutdown
|
||||
worker_monitor.close()
|
||||
|
||||
worker_monitor.join(20)
|
||||
|
||||
# Ensure everything is stopped
|
||||
assert not worker_monitor.is_alive()
|
||||
assert all(not worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Further attempts to submit tasks should fail
|
||||
try:
|
||||
_result = workers[0].execute_method("worker_method", "test")
|
||||
pytest.fail("task should fail once workers have been shut down")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ChildProcessError)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_local_workers_async() -> None:
|
||||
"""Test local workers with async task submission"""
|
||||
|
||||
workers, worker_monitor = _start_workers()
|
||||
|
||||
async def execute_workers(worker_input: str) -> None:
|
||||
worker_coros = [
|
||||
worker.execute_method_async("worker_method", worker_input)
|
||||
for worker in workers
|
||||
]
|
||||
|
||||
results = await asyncio.gather(*worker_coros)
|
||||
for rank, result in enumerate(results):
|
||||
assert result == (rank, worker_input)
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(execute_workers(f"task {task_num}"))
|
||||
for task_num in range(4)
|
||||
]
|
||||
|
||||
for task in tasks:
|
||||
await task
|
||||
|
||||
# Test error case
|
||||
exception = ValueError("fake error")
|
||||
try:
|
||||
_result = await workers[0].execute_method_async(
|
||||
"worker_method", exception)
|
||||
pytest.fail("task should have failed")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ValueError)
|
||||
assert str(e) == "fake error"
|
||||
|
||||
# Test cleanup when a worker fails
|
||||
assert worker_monitor.is_alive()
|
||||
workers[3].process.kill()
|
||||
|
||||
# Other workers should get shut down here
|
||||
worker_monitor.join(20)
|
||||
|
||||
# Ensure everything is stopped
|
||||
assert not worker_monitor.is_alive()
|
||||
assert all(not worker.process.is_alive() for worker in workers)
|
||||
|
||||
# Further attempts to submit tasks should fail
|
||||
try:
|
||||
_result = await workers[0].execute_method_async(
|
||||
"worker_method", "test")
|
||||
pytest.fail("task should fail once workers have been shut down")
|
||||
except Exception as e:
|
||||
assert isinstance(e, ChildProcessError)
|
||||
29
vllm-v0.6.2/tests/engine/test_short_mm_context.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import pytest
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
|
||||
"cherry_blossom":
|
||||
"USER: <image>\nWhat is the season?\nASSISTANT:",
|
||||
})
|
||||
|
||||
models = ["llava-hf/llava-1.5-7b-hf"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
def test_context_length_too_short(vllm_runner, image_assets, model):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
with pytest.raises(ValueError, match="too long to fit into the model"):
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
max_model_len=128, # LLaVA has a feature size of 576
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
with vllm_model:
|
||||
vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
|
||||
max_tokens=1,
|
||||
images=[images[0]])
|
||||
24
vllm-v0.6.2/tests/engine/test_skip_tokenizer_init.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
def test_skip_tokenizer_initialization(model: str):
|
||||
# This test checks if the flag skip_tokenizer_init skips the initialization
|
||||
# of tokenizer and detokenizer. The generated output is expected to contain
|
||||
# token ids.
|
||||
llm = LLM(model=model, skip_tokenizer_init=True)
|
||||
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
|
||||
|
||||
with pytest.raises(ValueError, match="cannot pass text prompts when"):
|
||||
llm.generate("abc", sampling_params)
|
||||
|
||||
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
|
||||
sampling_params=sampling_params)
|
||||
assert len(outputs) > 0
|
||||
completions = outputs[0].outputs
|
||||
assert len(completions) > 0
|
||||
assert completions[0].text == ""
|
||||
assert completions[0].token_ids
|
||||
62
vllm-v0.6.2/tests/engine/test_stop_reason.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Test the different finish_reason="stop" situations during generation:
|
||||
1. One of the provided stop strings
|
||||
2. One of the provided stop tokens
|
||||
3. The EOS token
|
||||
|
||||
Run `pytest tests/engine/test_stop_reason.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import transformers
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
MODEL = "facebook/opt-350m"
|
||||
STOP_STR = "."
|
||||
SEED = 42
|
||||
MAX_TOKENS = 1024
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vllm_model(vllm_runner):
|
||||
with vllm_runner(MODEL) as vllm_model:
|
||||
yield vllm_model
|
||||
|
||||
|
||||
def test_stop_reason(vllm_model, example_prompts):
|
||||
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
|
||||
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
|
||||
llm = vllm_model.model
|
||||
|
||||
# test stop token
|
||||
outputs = llm.generate(example_prompts,
|
||||
sampling_params=SamplingParams(
|
||||
ignore_eos=True,
|
||||
seed=SEED,
|
||||
max_tokens=MAX_TOKENS,
|
||||
stop_token_ids=[stop_token_id]))
|
||||
for output in outputs:
|
||||
output = output.outputs[0]
|
||||
assert output.finish_reason == "stop"
|
||||
assert output.stop_reason == stop_token_id
|
||||
|
||||
# test stop string
|
||||
outputs = llm.generate(example_prompts,
|
||||
sampling_params=SamplingParams(
|
||||
ignore_eos=True,
|
||||
seed=SEED,
|
||||
max_tokens=MAX_TOKENS,
|
||||
stop="."))
|
||||
for output in outputs:
|
||||
output = output.outputs[0]
|
||||
assert output.finish_reason == "stop"
|
||||
assert output.stop_reason == STOP_STR
|
||||
|
||||
# test EOS token
|
||||
outputs = llm.generate(example_prompts,
|
||||
sampling_params=SamplingParams(
|
||||
seed=SEED, max_tokens=MAX_TOKENS))
|
||||
for output in outputs:
|
||||
output = output.outputs[0]
|
||||
assert output.finish_reason == "length" or (
|
||||
output.finish_reason == "stop" and output.stop_reason is None)
|
||||
163
vllm-v0.6.2/tests/engine/test_stop_strings.py
Normal file
@@ -0,0 +1,163 @@
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import CompletionOutput, LLMEngine, SamplingParams
|
||||
|
||||
MODEL = "meta-llama/llama-2-7b-hf"
|
||||
MAX_TOKENS = 200
|
||||
|
||||
IS_ASYNC = False
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def vllm_model(vllm_runner):
|
||||
with vllm_runner(MODEL) as vllm_model:
|
||||
yield vllm_model
|
||||
|
||||
|
||||
def _test_stopping(llm_engine: LLMEngine,
|
||||
expected_output: str,
|
||||
expected_reason: Any,
|
||||
stop: Optional[List[str]] = None,
|
||||
stop_token_ids: Optional[List[int]] = None,
|
||||
include_in_output: bool = False,
|
||||
use_async_output_proc: bool = False) -> None:
|
||||
llm_engine.add_request(
|
||||
"id", "A story about vLLM:\n",
|
||||
SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=MAX_TOKENS,
|
||||
stop=stop,
|
||||
stop_token_ids=stop_token_ids,
|
||||
include_stop_str_in_output=include_in_output,
|
||||
), None)
|
||||
|
||||
output: Optional[CompletionOutput] = None
|
||||
output_text = ""
|
||||
stop_reason = None
|
||||
|
||||
if use_async_output_proc:
|
||||
llm_engine.step()
|
||||
|
||||
while llm_engine.has_unfinished_requests():
|
||||
(request_output, ) = llm_engine.step()
|
||||
(output, ) = request_output.outputs
|
||||
|
||||
# Ensure we don't backtrack
|
||||
assert output.text.startswith(output_text)
|
||||
output_text = output.text
|
||||
stop_reason = output.stop_reason
|
||||
|
||||
assert output is not None
|
||||
assert output_text == expected_output
|
||||
assert stop_reason == expected_reason
|
||||
|
||||
|
||||
def _set_async_mode(llm_engine, is_async):
|
||||
llm_engine.scheduler[0].use_async_output_proc = is_async
|
||||
|
||||
|
||||
def _stop_basic(llm_engine, is_async):
|
||||
_test_stopping(llm_engine,
|
||||
stop=["."],
|
||||
include_in_output=False,
|
||||
expected_output="VLLM is a 100% volunteer organization",
|
||||
expected_reason=".",
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
_test_stopping(llm_engine,
|
||||
stop=["."],
|
||||
include_in_output=True,
|
||||
expected_output="VLLM is a 100% volunteer organization.",
|
||||
expected_reason=".",
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
|
||||
def _stop_multi_tokens(llm_engine, is_async):
|
||||
_test_stopping(
|
||||
llm_engine,
|
||||
stop=["group of peo", "short"],
|
||||
include_in_output=False,
|
||||
expected_output="VLLM is a 100% volunteer organization. We are a ",
|
||||
expected_reason="group of peo",
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
_test_stopping(
|
||||
llm_engine,
|
||||
stop=["group of peo", "short"],
|
||||
include_in_output=True,
|
||||
expected_output=
|
||||
"VLLM is a 100% volunteer organization. We are a group of peo",
|
||||
expected_reason="group of peo",
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
|
||||
def _stop_partial_token(llm_engine, is_async):
|
||||
_test_stopping(llm_engine,
|
||||
stop=["gani"],
|
||||
include_in_output=False,
|
||||
expected_output="VLLM is a 100% volunteer or",
|
||||
expected_reason="gani",
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
_test_stopping(llm_engine,
|
||||
stop=["gani"],
|
||||
include_in_output=True,
|
||||
expected_output="VLLM is a 100% volunteer organi",
|
||||
expected_reason="gani",
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
|
||||
def _stop_token_id(llm_engine, is_async):
|
||||
# token id 13013 => " organization"
|
||||
|
||||
_test_stopping(llm_engine,
|
||||
stop_token_ids=[13013],
|
||||
include_in_output=False,
|
||||
expected_output="VLLM is a 100% volunteer",
|
||||
expected_reason=13013,
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
_test_stopping(llm_engine,
|
||||
stop_token_ids=[13013],
|
||||
include_in_output=True,
|
||||
expected_output="VLLM is a 100% volunteer organization",
|
||||
expected_reason=13013,
|
||||
use_async_output_proc=is_async)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_stop_basic(vllm_model):
|
||||
_set_async_mode(vllm_model.model.llm_engine, True)
|
||||
_stop_basic(vllm_model.model.llm_engine, is_async=True)
|
||||
|
||||
_set_async_mode(vllm_model.model.llm_engine, False)
|
||||
_stop_basic(vllm_model.model.llm_engine, is_async=False)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_stop_multi_tokens(vllm_model):
|
||||
_set_async_mode(vllm_model.model.llm_engine, True)
|
||||
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
|
||||
|
||||
_set_async_mode(vllm_model.model.llm_engine, False)
|
||||
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_stop_partial_token(vllm_model):
|
||||
_set_async_mode(vllm_model.model.llm_engine, True)
|
||||
_stop_partial_token(vllm_model.model.llm_engine, is_async=True)
|
||||
|
||||
_set_async_mode(vllm_model.model.llm_engine, False)
|
||||
_stop_partial_token(vllm_model.model.llm_engine, is_async=False)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_stop_token_id(vllm_model):
|
||||
_set_async_mode(vllm_model.model.llm_engine, True)
|
||||
_stop_token_id(vllm_model.model.llm_engine, is_async=True)
|
||||
|
||||
_set_async_mode(vllm_model.model.llm_engine, False)
|
||||
_stop_token_id(vllm_model.model.llm_engine, is_async=False)
|
||||
0
vllm-v0.6.2/tests/entrypoints/__init__.py
Normal file
89
vllm-v0.6.2/tests/entrypoints/conftest.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_prompts():
|
||||
return [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_token_ids():
|
||||
return [
|
||||
[0],
|
||||
[0, 1],
|
||||
[0, 2, 1],
|
||||
[0, 3, 1, 2],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_regex():
|
||||
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
|
||||
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
|
||||
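# A quick, standard-library sanity check of the IPv4 regex above (illustrative
# only): octets must be in 0-255, so dotted quads pass and out-of-range octets
# are rejected.
import re

_IPV4_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
               r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
assert re.fullmatch(_IPV4_REGEX, "192.168.0.1") is not None
assert re.fullmatch(_IPV4_REGEX, "999.1.1.1") is None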
|
||||
|
||||
@pytest.fixture
|
||||
def sample_json_schema():
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"age": {
|
||||
"type": "integer"
|
||||
},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"maxLength": 10
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"work_history": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string"
|
||||
},
|
||||
"duration": {
|
||||
"type": "number"
|
||||
},
|
||||
"position": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["company", "position"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["name", "age", "skills", "work_history"]
|
||||
}
|
||||
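# An example document that satisfies the schema above, checked with jsonschema
# (already a dependency of the guided-decoding tests). The values are made up;
# the point is just to make the fixture's shape concrete.
import jsonschema

_example_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {"type": "string", "maxLength": 10},
            "minItems": 3,
        },
        "work_history": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "duration": {"type": "number"},
                    "position": {"type": "string"},
                },
                "required": ["company", "position"],
            },
        },
    },
    "required": ["name", "age", "skills", "work_history"],
}
_example_profile = {
    "name": "Alice",
    "age": 30,
    "skills": ["python", "c++", "sql"],
    "work_history": [{"company": "Acme", "duration": 2.5, "position": "dev"}],
}
jsonschema.validate(instance=_example_profile, schema=_example_schema)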
|
||||
|
||||
@pytest.fixture
|
||||
def sample_guided_choice():
|
||||
return [
|
||||
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
|
||||
"Ruby", "Swift", "Kotlin"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_sql_statements():
|
||||
return ("""
|
||||
start: select_statement
|
||||
select_statement: "SELECT" column "from" table "where" condition
|
||||
column: "col_1" | "col_2"
|
||||
table: "table_1" | "table_2"
|
||||
condition: column "=" number
|
||||
number: "1" | "2"
|
||||
""")
|
||||
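# The grammar above declares no whitespace handling, so it matches the
# space-stripped form of a query (test_guided_grammar relies on exactly that).
# A quick check with Lark, which that test already imports:
from lark import Lark

_sql_grammar = """
    start: select_statement
    select_statement: "SELECT" column "from" table "where" condition
    column: "col_1" | "col_2"
    table: "table_1" | "table_2"
    condition: column "=" number
    number: "1" | "2"
"""
Lark(_sql_grammar).parse("SELECTcol_1fromtable_1wherecol_1=1")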
0
vllm-v0.6.2/tests/entrypoints/llm/__init__.py
Normal file
56
vllm-v0.6.2/tests/entrypoints/llm/test_accuracy.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""
|
||||
This file tests the accuracy of the vLLM server via LMEval.
|
||||
It uses local-completions, which interacts with vLLM
|
||||
through the OAI API with N concurrent connections.
|
||||
This simulates real world usage of the API and makes
|
||||
sure that the zmq frontend mp RPC message passing and
|
||||
AsyncLLMEngine are working correctly.
|
||||
"""
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
NUM_CONCURRENT = 500
|
||||
TASK = "gsm8k"
|
||||
FILTER = "exact_match,strict-match"
|
||||
RTOL = 0.03
|
||||
EXPECTED_VALUE = 0.58
|
||||
|
||||
|
||||
def run_test():
|
||||
"""Run the end to end accuracy test."""
|
||||
|
||||
model_args = f"pretrained={MODEL_NAME},max_model_len=2048"
|
||||
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model_args=model_args,
|
||||
tasks="gsm8k",
|
||||
batch_size="auto",
|
||||
)
|
||||
|
||||
measured_value = results["results"][TASK][FILTER]
|
||||
assert (measured_value - RTOL < EXPECTED_VALUE
|
||||
and measured_value + RTOL > EXPECTED_VALUE
|
||||
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
||||
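# The two-sided bound above is just an absolute-tolerance check,
# abs(measured_value - EXPECTED_VALUE) < RTOL. The same idea with the standard
# library (the numbers here are made up for illustration):
import math

assert math.isclose(0.59, 0.58, abs_tol=0.03)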
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(),
|
||||
reason="V1 is currently only supported on CUDA.")
|
||||
def test_lm_eval_accuracy_v1_engine(monkeypatch):
|
||||
"""Run with the V1 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
run_test()
|
||||
|
||||
|
||||
def test_lm_eval_accuracy_v0_engine(monkeypatch):
|
||||
"""Run with the V0 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
run_test()
|
||||
93
vllm-v0.6.2/tests/entrypoints/llm/test_chat.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ..openai.test_vision import TEST_IMAGE_URLS
|
||||
|
||||
|
||||
def test_chat():
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
|
||||
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt1
|
||||
},
|
||||
]
|
||||
outputs = llm.chat(messages)
|
||||
assert len(outputs) == 1
|
||||
|
||||
|
||||
def test_multi_chat():
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
|
||||
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
prompt2 = "Explain what among us is."
|
||||
|
||||
conversation1 = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt1
|
||||
},
|
||||
]
|
||||
|
||||
conversation2 = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt2
|
||||
},
|
||||
]
|
||||
|
||||
messages = [conversation1, conversation2]
|
||||
|
||||
outputs = llm.chat(messages)
|
||||
assert len(outputs) == 2
|
||||
|
||||
|
||||
@pytest.mark.skip("Not support Phi vision model yet.")
|
||||
@pytest.mark.parametrize("image_urls",
|
||||
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
|
||||
def test_chat_multi_image(image_urls: List[str]):
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
dtype="bfloat16",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
)
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
} for image_url in image_urls),
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
outputs = llm.chat(messages)
|
||||
assert len(outputs) >= 0
|
||||
107
vllm-v0.6.2/tests/entrypoints/llm/test_encode.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import weakref
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, EmbeddingRequestOutput, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
TOKEN_IDS = [
|
||||
# Using ID={0, 1, 2, 3} results in NaN values,
|
||||
# so we add this offset of 1000
|
||||
[1000],
|
||||
[1000, 1001],
|
||||
[1000, 1002, 1001],
|
||||
[1000, 1003, 1001, 1002],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
max_num_batched_tokens=32768,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.75,
|
||||
enforce_eager=True)
|
||||
|
||||
with llm.deprecate_legacy_api():
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
|
||||
o2: List[EmbeddingRequestOutput]):
|
||||
assert [o.outputs for o in o1] == [o.outputs for o in o2]
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
|
||||
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
|
||||
prompt_token_ids):
|
||||
pooling_params = PoolingParams()
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
|
||||
v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
|
||||
pooling_params=pooling_params)
|
||||
|
||||
v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
|
||||
pooling_params=pooling_params)
|
||||
assert_outputs_equal(v1_output, v2_output)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
|
||||
pooling_params = PoolingParams()
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
|
||||
v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
|
||||
pooling_params=pooling_params)
|
||||
|
||||
v2_output = llm.encode(
|
||||
[{
|
||||
"prompt_token_ids": p
|
||||
} for p in TOKEN_IDS],
|
||||
pooling_params=pooling_params,
|
||||
)
|
||||
assert_outputs_equal(v1_output, v2_output)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_multiple_pooling_params(llm: LLM):
|
||||
pooling_params = [
|
||||
PoolingParams(),
|
||||
PoolingParams(),
|
||||
PoolingParams(),
|
||||
PoolingParams(),
|
||||
]
|
||||
|
||||
# Multiple PoolingParams should be matched with each prompt
|
||||
outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# An exception is raised if the number of params does not match the number of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
|
||||
|
||||
# Single PoolingParams should be applied to every prompt
|
||||
single_pooling_params = PoolingParams()
|
||||
outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# pooling_params is None, default params should be applied
|
||||
outputs = llm.encode(PROMPTS, pooling_params=None)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
104
vllm-v0.6.2/tests/entrypoints/llm/test_generate.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import weakref
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, RequestOutput, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
TOKEN_IDS = [
|
||||
[0],
|
||||
[0, 1],
|
||||
[0, 2, 1],
|
||||
[0, 3, 1, 2],
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
max_num_batched_tokens=4096,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.10,
|
||||
enforce_eager=True)
|
||||
|
||||
with llm.deprecate_legacy_api():
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
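# Why weakref.proxy above: the fixture is cached for the whole module, and
# yielding a proxy means the cached value is not a strong reference, so
# `del llm` plus cleanup can actually release the engine. A standard-library
# illustration of the same mechanism (not part of the test itself):
import gc
import weakref


class _Dummy:
    pass


_obj = _Dummy()
_proxy = weakref.proxy(_obj)  # does not keep _obj alive
del _obj
gc.collect()
try:
    _ = _proxy.__class__  # any attribute access on a dead proxy raises
except ReferenceError:
    pass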
|
||||
|
||||
def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
|
||||
assert [o.outputs for o in o1] == [o.outputs for o in o2]
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
|
||||
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
|
||||
prompt_token_ids):
|
||||
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
|
||||
v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
|
||||
sampling_params=sampling_params)
|
||||
|
||||
v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
|
||||
sampling_params=sampling_params)
|
||||
assert_outputs_equal(v1_output, v2_output)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
|
||||
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
|
||||
v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
|
||||
sampling_params=sampling_params)
|
||||
|
||||
v2_output = llm.generate(
|
||||
[{
|
||||
"prompt_token_ids": p
|
||||
} for p in TOKEN_IDS],
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
assert_outputs_equal(v1_output, v2_output)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_multiple_sampling_params(llm: LLM):
|
||||
sampling_params = [
|
||||
SamplingParams(temperature=0.01, top_p=0.95),
|
||||
SamplingParams(temperature=0.3, top_p=0.95),
|
||||
SamplingParams(temperature=0.7, top_p=0.95),
|
||||
SamplingParams(temperature=0.99, top_p=0.95),
|
||||
]
|
||||
|
||||
# Multiple SamplingParams should be matched with each prompt
|
||||
outputs = llm.generate(PROMPTS, sampling_params=sampling_params)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# An exception is raised if the number of params does not match the number of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3])
|
||||
|
||||
# Single SamplingParams should be applied to every prompt
|
||||
single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
|
||||
outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# sampling_params is None, default params should be applied
|
||||
outputs = llm.generate(PROMPTS, sampling_params=None)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
@@ -0,0 +1,66 @@
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
tensor_parallel_size=1,
|
||||
max_model_len=8192,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
max_num_seqs=128,
|
||||
enforce_eager=True)
|
||||
|
||||
with llm.deprecate_legacy_api():
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
del llm
|
||||
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
|
||||
lora_request = [
|
||||
LoRARequest(LORA_NAME + str(idx), idx + 1, zephyr_lora_files)
|
||||
for idx in range(len(PROMPTS))
|
||||
]
|
||||
# Multiple LoRA requests should be matched with each prompt
|
||||
outputs = llm.generate(PROMPTS, lora_request=lora_request)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# An exception is raised if the number of LoRA requests does not match the number of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
|
||||
|
||||
# Single LoRARequest should be applied to every prompt
|
||||
single_lora_request = lora_request[0]
|
||||
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
161
vllm-v0.6.2/tests/entrypoints/llm/test_guided_generate.py
Normal file
@@ -0,0 +1,161 @@
|
||||
import json
|
||||
import re
|
||||
import weakref
|
||||
|
||||
import jsonschema
|
||||
import pytest
|
||||
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME, max_model_len=1024)
|
||||
|
||||
with llm.deprecate_legacy_api():
|
||||
yield weakref.proxy(llm)
|
||||
del llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_guided_regex(sample_regex, llm):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
outputs = llm.generate(prompts=[
|
||||
f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(generated_text)
|
||||
assert generated_text is not None
|
||||
assert re.fullmatch(sample_regex, generated_text) is not None
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_guided_json_completion(sample_json_schema, llm):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
outputs = llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json, schema=sample_json_schema)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_guided_choice_completion(sample_guided_choice, llm):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
|
||||
outputs = llm.generate(
|
||||
prompts="The best language for type-safe systems programming is ",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(generated_text)
|
||||
assert generated_text is not None
|
||||
assert generated_text in sample_guided_choice
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_guided_grammar(sample_sql_statements, llm):
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
|
||||
outputs = llm.generate(
|
||||
prompts=("Generate a sql state that select col_1 from "
|
||||
"table_1 where it is equals to 1"),
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True,
|
||||
)
|
||||
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
# use Lark to parse the output, and make sure it's a valid parse tree
|
||||
from lark import Lark
|
||||
parser = Lark(sample_sql_statements)
|
||||
parser.parse(generated_text)
|
||||
|
||||
# remove spaces for comparison b/c we removed them in the grammar
|
||||
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(
|
||||
" ", "")
|
||||
|
||||
assert generated_text.strip() == ground_truth
|
||||
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_guided_options_request_deprecation_warning(sample_regex, llm):
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="guided_options_request"):
|
||||
llm.generate(prompts="This should fail",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True,
|
||||
guided_options_request=dict(guided_regex=sample_regex))
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
|
||||
with pytest.raises(ValueError, match="Cannot set both"):
|
||||
llm.generate(prompts="This should fail",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True,
|
||||
guided_options_request=dict(guided_regex=sample_regex))
|
||||
22
vllm-v0.6.2/tests/entrypoints/llm/test_init.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ...utils import error_on_warning
|
||||
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
|
||||
|
||||
def test_pos_args_deprecated():
|
||||
with error_on_warning(DeprecationWarning):
|
||||
LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
|
||||
|
||||
with error_on_warning(DeprecationWarning):
|
||||
LLM(MODEL_NAME, tokenizer=MODEL_NAME)
|
||||
|
||||
with pytest.warns(DeprecationWarning, match="'tokenizer'"):
|
||||
LLM(MODEL_NAME, MODEL_NAME)
|
||||
|
||||
with pytest.warns(DeprecationWarning,
|
||||
match="'tokenizer', 'tokenizer_mode'"):
|
||||
LLM(MODEL_NAME, MODEL_NAME, "auto")
|
||||
55
vllm-v0.6.2/tests/entrypoints/llm/test_lazy_outlines.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import sys
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
|
||||
def test_lazy_outlines(sample_regex):
|
||||
"""If users don't use guided decoding, outlines should not be imported.
|
||||
"""
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM without guided decoding as a baseline.
|
||||
llm = LLM(model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.3)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
# make sure outlines is not imported
|
||||
assert 'outlines' not in sys.modules
|
||||
|
||||
# Destroy the LLM object and free up the GPU memory.
|
||||
del llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
# Create an LLM with guided decoding enabled.
|
||||
llm = LLM(model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
guided_decoding_backend="lm-format-enforcer",
|
||||
gpu_memory_utilization=0.6)
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
outputs = llm.generate(
|
||||
prompts=[
|
||||
f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True,
|
||||
guided_options_request=dict(guided_regex=sample_regex))
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
# make sure outlines is not imported
|
||||
assert 'outlines' not in sys.modules
|
||||
25
vllm-v0.6.2/tests/entrypoints/llm/test_prompt_validation.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='Prompt cannot be empty'):
|
||||
llm.generate([""])
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
def test_out_of_vocab_token():
|
||||
llm = LLM(model="gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='out of vocabulary'):
|
||||
llm.generate({"prompt_token_ids": [999999]})
|
||||
@@ -0,0 +1,82 @@
|
||||
"""Tests for HF_HUB_OFFLINE mode"""
|
||||
import importlib
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_CONFIGS = [
|
||||
{
|
||||
"model": "facebook/opt-125m",
|
||||
"enforce_eager": True,
|
||||
"gpu_memory_utilization": 0.20,
|
||||
"max_model_len": 64,
|
||||
"max_num_batched_tokens": 64,
|
||||
"max_num_seqs": 64,
|
||||
"tensor_parallel_size": 1,
|
||||
},
|
||||
# {
|
||||
# "model": "mistralai/Mistral-7B-Instruct-v0.1",
|
||||
# "enforce_eager": True,
|
||||
# "gpu_memory_utilization": 0.95,
|
||||
# "max_model_len": 64,
|
||||
# "max_num_batched_tokens": 64,
|
||||
# "max_num_seqs": 64,
|
||||
# "tensor_parallel_size": 1,
|
||||
# "tokenizer_mode": "mistral",
|
||||
# },
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def cache_models():
|
||||
# Cache model files first
|
||||
for model_config in MODEL_CONFIGS:
|
||||
LLM(**model_config)
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.usefixtures("cache_models")
|
||||
def test_offline_mode(monkeypatch):
|
||||
# Set HF to offline mode and ensure we can still construct an LLM
|
||||
try:
|
||||
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
|
||||
# Need to re-import huggingface_hub and friends to setup offline mode
|
||||
_re_import_modules()
|
||||
# Cached model files should be used in offline mode
|
||||
for model_config in MODEL_CONFIGS:
|
||||
LLM(**model_config)
|
||||
finally:
|
||||
# Reset the environment after the test
|
||||
# NB: Assuming tests are run in online mode
|
||||
monkeypatch.delenv("HF_HUB_OFFLINE")
|
||||
_re_import_modules()
|
||||
pass
|
||||
|
||||
|
||||
def _re_import_modules():
|
||||
hf_hub_module_names = [
|
||||
k for k in sys.modules if k.startswith("huggingface_hub")
|
||||
]
|
||||
transformers_module_names = [
|
||||
k for k in sys.modules if k.startswith("transformers")
|
||||
and not k.startswith("transformers_modules")
|
||||
]
|
||||
|
||||
reload_exception = None
|
||||
for module_name in hf_hub_module_names + transformers_module_names:
|
||||
try:
|
||||
importlib.reload(sys.modules[module_name])
|
||||
except Exception as e:
|
||||
reload_exception = e
|
||||
# Try to continue clean up so that other tests are less likely to
|
||||
# be affected
|
||||
|
||||
# Fail this test if reloading a module failed
|
||||
if reload_exception is not None:
|
||||
raise reload_exception
|
||||
0
vllm-v0.6.2/tests/entrypoints/openai/__init__.py
Normal file
92
vllm-v0.6.2/tests/entrypoints/openai/test_accuracy.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
This file tests the accuracy of the vLLM server via LMEval.
|
||||
It uses local-completions, which interacts with vLLM
|
||||
through the OAI API with N concurrent connections.
|
||||
This simulates real world usage of the API and makes
|
||||
sure that the zmq frontend mp RPC message passing and
|
||||
AsyncLLMEngine are working correctly.
|
||||
"""
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
'''
|
||||
=============================
|
||||
Modify by vllm_mlu
|
||||
=============================
|
||||
We do not have Qwen2-1.5B-Instruct locally, so we use Qwen2-7B-Instruct instead.
|
||||
'''
|
||||
# The original model is: MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
|
||||
NUM_CONCURRENT = 500
|
||||
TASK = "gsm8k"
|
||||
FILTER = "exact_match,strict-match"
|
||||
RTOL = 0.03
|
||||
EXPECTED_VALUE = 0.67
|
||||
DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
|
||||
MORE_ARGS_LIST = [
|
||||
[], # Default
|
||||
["--enable-chunked-prefill"], # Chunked
|
||||
["--num-scheduler-steps", "8"], # MS
|
||||
["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream
|
||||
]
|
||||
MAX_WAIT_SECONDS = None
|
||||
|
||||
if current_platform.is_tpu():
|
||||
MORE_ARGS_LIST = [
|
||||
[], # Default
|
||||
# ["--num-scheduler-steps", "8"], # Multi-step << currently fails
|
||||
]
|
||||
MAX_WAIT_SECONDS = 600
|
||||
|
||||
|
||||
def run_test(more_args):
|
||||
"""Run the end to end accuracy test."""
|
||||
|
||||
args = list(DEFAULT_ARGS)
|
||||
args.extend(more_args)
|
||||
print(f"Running with: {args}")
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args,
|
||||
max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
|
||||
url = f"{remote_server.url_for('v1')}/completions"
|
||||
|
||||
model_args = (
|
||||
f"model={MODEL_NAME},"
|
||||
f"base_url={url},"
|
||||
f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
|
||||
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="local-completions",
|
||||
model_args=model_args,
|
||||
tasks=TASK,
|
||||
)
|
||||
|
||||
measured_value = results["results"][TASK][FILTER]
|
||||
assert (measured_value - RTOL < EXPECTED_VALUE
|
||||
and measured_value + RTOL > EXPECTED_VALUE
|
||||
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(),
|
||||
reason="V1 currently only supported on CUDA")
|
||||
def test_lm_eval_accuracy_v1_engine(monkeypatch):
|
||||
"""Run with the V1 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
run_test([])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
|
||||
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
|
||||
"""Run with the V0 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
run_test(more_args)
|
||||
259
vllm-v0.6.2/tests/entrypoints/openai/test_audio.py
Normal file
@@ -0,0 +1,259 @@
|
||||
from typing import Dict, List
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_3"
|
||||
TEST_AUDIO_URLS = [
|
||||
AudioAsset("winning_call").url,
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"5",
|
||||
"--enforce-eager",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def base64_encoded_audio() -> Dict[str, str]:
|
||||
return {
|
||||
audio_url: encode_audio_base64(*fetch_audio(audio_url))
|
||||
for audio_url in TEST_AUDIO_URLS
|
||||
}
|
||||
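# The base64-encoded test below embeds the audio as a data URL. A minimal
# standard-library sketch of that step (the bytes here are a placeholder, and
# encode_audio_base64 is assumed to produce an equivalent payload from the
# decoded audio):
import base64

_fake_wav = b"RIFF\x00\x00\x00\x00WAVEfmt "  # placeholder, not a real WAV file
_payload = base64.b64encode(_fake_wav).decode("utf-8")
_data_url = f"data:audio/wav;base64,{_payload}"
assert _data_url.startswith("data:audio/wav;base64,")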
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
|
||||
async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
|
||||
model_name: str, audio_url: str):
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's happening in this audio?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5)
|
||||
assert len(chat_completion.choices) == 1
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=202, total_tokens=212)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
|
||||
async def test_single_chat_session_audio_base64encoded(
|
||||
client: openai.AsyncOpenAI, model_name: str, audio_url: str,
|
||||
base64_encoded_audio: Dict[str, str]):
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url":
|
||||
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's happening in this audio?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5)
|
||||
assert len(chat_completion.choices) == 1
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=202, total_tokens=212)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
|
||||
async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
|
||||
model_name: str, audio_url: str):
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's happening in this audio?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
)
|
||||
output = chat_completion.choices[0].message.content
|
||||
stop_reason = chat_completion.choices[0].finish_reason
|
||||
|
||||
# test streaming
|
||||
stream = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
)
|
||||
chunks: List[str] = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.role:
|
||||
assert delta.role == "assistant"
|
||||
if delta.content:
|
||||
chunks.append(delta.content)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
assert chunk.choices[0].finish_reason == stop_reason
|
||||
assert delta.content
|
||||
assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
|
||||
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
|
||||
audio_url: str):
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's happening in this audio?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
with pytest.raises(openai.BadRequestError): # test multi-audio input
|
||||
await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
# the server should still work afterwards
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
completion = completion.choices[0].text
|
||||
assert completion is not None and len(completion) >= 0
|
||||
105
vllm-v0.6.2/tests/entrypoints/openai/test_basic.py
Normal file
@@ -0,0 +1,105 @@
from http import HTTPStatus
from typing import List

import pytest
import pytest_asyncio
import requests

from vllm.version import __version__ as VLLM_VERSION

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """ Provide extra arguments to the server via indirect parametrization

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.

    """
    if not hasattr(request, "param"):
        return []

    val = request.param

    if isinstance(val, str):
        return [val]

    return request.param


@pytest.fixture(scope="module")
def server(server_args):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        *server_args,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    response = requests.get(server.url_for("version"))
    response.raise_for_status()

    assert response.json() == {"version": VLLM_VERSION}


@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
    response = requests.get(server.url_for("health"))

    assert response.status_code == HTTPStatus.OK
985
vllm-v0.6.2/tests/entrypoints/openai/test_chat.py
Normal file
@@ -0,0 +1,985 @@
|
||||
# imports for guided decoding tests
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch
|
||||
from openai import BadRequestError
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
|
||||
from .test_completion import zephyr_lora_files # noqa: F401
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--enforce-eager",
|
||||
# lora config below
|
||||
"--enable-lora",
|
||||
"--lora-modules",
|
||||
f"zephyr-lora={zephyr_lora_files}",
|
||||
f"zephyr-lora2={zephyr_lora_added_tokens_files}",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--max-cpu-loras",
|
||||
"2",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# first test base model, then test loras
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
|
||||
)
|
||||
async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=5,
|
||||
temperature=0.0,
|
||||
logprobs=False)
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.logprobs is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora hereafter
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=5,
|
||||
temperature=0.0,
|
||||
logprobs=True,
|
||||
top_logprobs=0)
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.logprobs is not None
|
||||
assert choice.logprobs.content is not None
|
||||
assert len(choice.logprobs.content[0].top_logprobs) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=5,
|
||||
temperature=0.0,
|
||||
logprobs=True,
|
||||
top_logprobs=5)
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.logprobs is not None
|
||||
assert choice.logprobs.content is not None
|
||||
assert len(choice.logprobs.content[0].top_logprobs) == 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
# Default max_logprobs is 20, so this should raise an error
|
||||
with pytest.raises((openai.BadRequestError, openai.APIError)):
|
||||
stream = await client.chat.completions.create(model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=21,
|
||||
stream=True)
|
||||
async for chunk in stream:
|
||||
...
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=30,
|
||||
stream=False)
|
||||
|
||||
# the server should still work afterwards
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
stream=False)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, prompt_logprobs",
|
||||
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
|
||||
)
|
||||
async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
prompt_logprobs: Optional[int]):
|
||||
params: Dict = {
|
||||
"messages": [{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Who won the world series in 2020?"
|
||||
}, {
|
||||
"role":
|
||||
"assistant",
|
||||
"content":
|
||||
"The Los Angeles Dodgers won the World Series in 2020."
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Where was it played?"
|
||||
}],
|
||||
"model":
|
||||
model_name
|
||||
}
|
||||
|
||||
if prompt_logprobs is not None:
|
||||
params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
|
||||
|
||||
if prompt_logprobs is not None and prompt_logprobs < 0:
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.chat.completions.create(**params)
|
||||
else:
|
||||
completion = await client.chat.completions.create(**params)
|
||||
if prompt_logprobs is not None:
|
||||
assert completion.prompt_logprobs is not None
|
||||
assert len(completion.prompt_logprobs) > 0
|
||||
else:
|
||||
assert completion.prompt_logprobs is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME],
|
||||
)
|
||||
async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
params: Dict = {
|
||||
"messages": [{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Who won the world series in 2020?"
|
||||
}, {
|
||||
"role":
|
||||
"assistant",
|
||||
"content":
|
||||
"The Los Angeles Dodgers won the World Series in 2020."
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Where was it played?"
|
||||
}],
|
||||
"model":
|
||||
model_name,
|
||||
"extra_body": {
|
||||
"prompt_logprobs": 1
|
||||
}
|
||||
}
|
||||
|
||||
completion_1 = await client.chat.completions.create(**params)
|
||||
|
||||
params["extra_body"] = {"prompt_logprobs": 2}
|
||||
completion_2 = await client.chat.completions.create(**params)
|
||||
|
||||
assert len(completion_1.prompt_logprobs[3]) == 1
|
||||
assert len(completion_2.prompt_logprobs[3]) == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_single_chat_session(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5)
|
||||
assert chat_completion.id is not None
|
||||
assert len(chat_completion.choices) == 1
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=10, prompt_tokens=37, total_tokens=47)
|
||||
|
||||
message = choice.message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora hereafter
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}]
|
||||
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
)
|
||||
output = chat_completion.choices[0].message.content
|
||||
stop_reason = chat_completion.choices[0].finish_reason
|
||||
|
||||
# test streaming
|
||||
stream = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
)
|
||||
chunks: List[str] = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.role:
|
||||
assert delta.role == "assistant"
|
||||
if delta.content:
|
||||
chunks.append(delta.content)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
assert chunk.choices[0].finish_reason == stop_reason
|
||||
assert delta.content
|
||||
assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
|
||||
)
|
||||
async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "What is the capital of France?"
|
||||
}]
|
||||
|
||||
# Test stream=True, stream_options={"include_usage": False}
|
||||
stream = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={"include_usage": False})
|
||||
async for chunk in stream:
|
||||
assert chunk.usage is None
|
||||
|
||||
# Test stream=True, stream_options={"include_usage": True,
|
||||
# "continuous_usage_stats": False}}
|
||||
stream = await client.chat.completions.create(model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={
|
||||
"include_usage":
|
||||
True,
|
||||
"continuous_usage_stats":
|
||||
False
|
||||
})
|
||||
|
||||
async for chunk in stream:
|
||||
if chunk.choices[0].finish_reason is None:
|
||||
assert chunk.usage is None
|
||||
else:
|
||||
assert chunk.usage is None
|
||||
final_chunk = await stream.__anext__()
|
||||
assert final_chunk.usage is not None
|
||||
assert final_chunk.usage.prompt_tokens > 0
|
||||
assert final_chunk.usage.completion_tokens > 0
|
||||
assert final_chunk.usage.total_tokens == (
|
||||
final_chunk.usage.prompt_tokens +
|
||||
final_chunk.usage.completion_tokens)
|
||||
assert final_chunk.choices == []
|
||||
|
||||
# Test stream=False, stream_options={"include_usage": None}
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
stream_options={"include_usage": None})
|
||||
|
||||
# Test stream=False, stream_options={"include_usage": True}
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
stream_options={"include_usage": True})
|
||||
|
||||
# Test stream=True, stream_options={"include_usage": True,
|
||||
# "continuous_usage_stats": True}
|
||||
stream = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
extra_body=dict(min_tokens=10),
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={
|
||||
"include_usage": True,
|
||||
"continuous_usage_stats": True,
|
||||
},
|
||||
)
|
||||
last_completion_tokens = 0
|
||||
async for chunk in stream:
|
||||
assert chunk.usage.prompt_tokens >= 0
|
||||
assert last_completion_tokens == 0 or \
|
||||
chunk.usage.completion_tokens > last_completion_tokens or \
|
||||
(
|
||||
not chunk.choices and
|
||||
chunk.usage.completion_tokens == last_completion_tokens
|
||||
)
|
||||
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
|
||||
chunk.usage.completion_tokens)
|
||||
last_completion_tokens = chunk.usage.completion_tokens
|
||||
|
||||
assert last_completion_tokens == 10
|
||||
|
||||
|
||||
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
|
||||
# (i.e. using the same ordering as in the Completions API tests), the test
|
||||
# will fail on the second `guided_decoding_backend` even when I swap their order
|
||||
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"The best language for type-safe systems programming is "
|
||||
}]
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
choice1 = chat_completion.choices[0].message.content
|
||||
assert choice1 in sample_guided_choice
|
||||
|
||||
messages.append({"role": "assistant", "content": choice1})
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": "I disagree, pick another one"
|
||||
})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
choice2 = chat_completion.choices[0].message.content
|
||||
assert choice2 in sample_guided_choice
|
||||
assert choice1 != choice2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
}]
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json1 = json.loads(message.content)
|
||||
jsonschema.validate(instance=json1, schema=sample_json_schema)
|
||||
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
messages.append({
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Give me another one with a different name and age"
|
||||
})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json2 = json.loads(message.content)
|
||||
jsonschema.validate(instance=json2, schema=sample_json_schema)
|
||||
assert json1["name"] != json2["name"]
|
||||
assert json1["age"] != json2["age"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str, sample_regex):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example IP address with this regex: {sample_regex}"
|
||||
}]
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
ip1 = chat_completion.choices[0].message.content
|
||||
assert ip1 is not None
|
||||
assert re.fullmatch(sample_regex, ip1) is not None
|
||||
|
||||
messages.append({"role": "assistant", "content": ip1})
|
||||
messages.append({"role": "user", "content": "Give me a different one"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
ip2 = chat_completion.choices[0].message.content
|
||||
assert ip2 is not None
|
||||
assert re.fullmatch(sample_regex, ip2) is not None
|
||||
assert ip1 != ip2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"The best language for type-safe systems programming is "
|
||||
}]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_body=dict(guided_regex={
|
||||
1: "Python",
|
||||
2: "C++"
|
||||
}))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"The best language for type-safe systems programming is "
|
||||
}]
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
|
||||
assert chat_completion.choices[0].logprobs is not None
|
||||
assert chat_completion.choices[0].logprobs.content is not None
|
||||
top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
|
||||
|
||||
# -9999.0 is the minimum logprob returned by OpenAI
|
||||
for item in top_logprobs:
|
||||
assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["outlines", "lm-format-enforcer"])
|
||||
async def test_named_tool_use(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
}]
|
||||
|
||||
# non-streaming
|
||||
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
})
|
||||
message = chat_completion.choices[0].message
|
||||
assert len(message.content) == 0
|
||||
json_string = message.tool_calls[0].function.arguments
|
||||
json1 = json.loads(json_string)
|
||||
jsonschema.validate(instance=json1, schema=sample_json_schema)
|
||||
|
||||
messages.append({"role": "assistant", "content": json_string})
|
||||
messages.append({
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Give me another one with a different name and age"
|
||||
})
|
||||
|
||||
# streaming
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
stream=True)
|
||||
|
||||
output = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.role:
|
||||
assert delta.role == "assistant"
|
||||
assert delta.content is None or len(delta.content) == 0
|
||||
if delta.tool_calls:
|
||||
output.append(delta.tool_calls[0].function.arguments)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
json2 = json.loads("".join(output))
|
||||
jsonschema.validate(instance=json2, schema=sample_json_schema)
|
||||
assert json1["name"] != json2["name"]
|
||||
assert json1["age"] != json2["age"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
|
||||
async def test_required_tool_use_not_yet_supported(
|
||||
client: openai.AsyncOpenAI, guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
}]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice="required")
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice="auto")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
|
||||
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
}]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name":
|
||||
"dummy_function_name"
|
||||
}
|
||||
})
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "nondefined_function_name"
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_response_format_json_object(client: openai.AsyncOpenAI):
|
||||
for _ in range(2):
|
||||
resp = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role":
|
||||
"user",
|
||||
"content": ('what is 1+1? please respond with a JSON object, '
|
||||
'the format is {"result": 2}')
|
||||
}],
|
||||
response_format={"type": "json_object"})
|
||||
|
||||
content = resp.choices[0].message.content
|
||||
assert content is not None
|
||||
|
||||
loaded = json.loads(content)
|
||||
assert loaded == {"result": 2}, loaded
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
|
||||
prompt = 'what is 1+1? The format is "result": 2'
|
||||
# Check that this prompt cannot lead to a valid JSON without json_schema
|
||||
for _ in range(2):
|
||||
resp = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
)
|
||||
content = resp.choices[0].message.content
|
||||
assert content is not None
|
||||
with pytest.raises((json.JSONDecodeError, AssertionError)):
|
||||
loaded = json.loads(content)
|
||||
assert loaded == {"result": 2}, loaded
|
||||
|
||||
for _ in range(2):
|
||||
resp = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "foo_test",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"result": {
|
||||
"type": "integer"
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
content = resp.choices[0].message.content
|
||||
assert content is not None
|
||||
|
||||
loaded = json.loads(content)
|
||||
assert loaded == {"result": 2}, loaded
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extra_fields(client: openai.AsyncOpenAI):
|
||||
with pytest.raises(BadRequestError) as exc_info:
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
"extra_field": "0",
|
||||
}], # type: ignore
|
||||
temperature=0,
|
||||
seed=0)
|
||||
|
||||
assert "extra_forbidden" in exc_info.value.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_complex_message_content(client: openai.AsyncOpenAI):
|
||||
resp = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
"what is 1+1? please provide the result without any other text."
|
||||
}]
|
||||
}],
|
||||
temperature=0,
|
||||
seed=0)
|
||||
content = resp.choices[0].message.content
|
||||
assert content == "2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_custom_role(client: openai.AsyncOpenAI):
|
||||
# Not sure how the model handles custom roles so we just check that
|
||||
# both string and complex message content are handled in the same way
|
||||
|
||||
resp1 = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "my-custom-role",
|
||||
"content": "what is 1+1?",
|
||||
}], # type: ignore
|
||||
temperature=0,
|
||||
seed=0)
|
||||
|
||||
resp2 = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "my-custom-role",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "what is 1+1?"
|
||||
}]
|
||||
}], # type: ignore
|
||||
temperature=0,
|
||||
seed=0)
|
||||
|
||||
content1 = resp1.choices[0].message.content
|
||||
content2 = resp2.choices[0].message.content
|
||||
assert content1 == content2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_long_seed(client: openai.AsyncOpenAI):
|
||||
for seed in [
|
||||
torch.iinfo(torch.long).min - 1,
|
||||
torch.iinfo(torch.long).max + 1
|
||||
]:
|
||||
with pytest.raises(BadRequestError) as exc_info:
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
}],
|
||||
temperature=0,
|
||||
seed=seed)
|
||||
|
||||
assert ("greater_than_equal" in exc_info.value.message
|
||||
or "less_than_equal" in exc_info.value.message)
|
||||
117
vllm-v0.6.2/tests/entrypoints/openai/test_chat_template.py
Normal file
@@ -0,0 +1,117 @@
import pytest

from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                         load_chat_template)
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import VLLM_PATH

chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
    ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
    ("facebook/opt-125m", chatml_jinja_path, False, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of"""),
    ("facebook/opt-125m", chatml_jinja_path, False, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""),
]

TEST_MESSAGES = [
    {
        'role': 'user',
        'content': 'Hello'
    },
    {
        'role': 'assistant',
        'content': 'Hi there!'
    },
    {
        'role': 'user',
        'content': 'What is the capital of'
    },
]
ASSISTANT_MESSAGE_TO_CONTINUE = {
    'role': 'assistant',
    'content': 'The capital of'
}


def test_load_chat_template():
    # Testing chatml template
    template_content = load_chat_template(chat_template=chatml_jinja_path)

    # Test assertions
    assert template_content is not None
    # Hard coded value for template_chatml.jinja
    assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501


def test_no_load_chat_template_filelike():
    # Testing chatml template
    template = "../../examples/does_not_exist"

    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=template)


def test_no_load_chat_template_literallike():
    # Testing chatml template
    template = "{{ messages }}"

    template_content = load_chat_template(chat_template=template)

    assert template_content == template


@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATON_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt,
                        continue_final_message, expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)
    template_content = load_chat_template(chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(
        model=model,
        messages=TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
        if continue_final_message else TEST_MESSAGES,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    result = apply_hf_chat_template(
        tokenizer,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
        add_generation_prompt=mock_request.add_generation_prompt,
        continue_final_message=mock_request.continue_final_message,
    )

    # Test assertion
    assert result == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}")
126
vllm-v0.6.2/tests/entrypoints/openai/test_chunked_prompt.py
Normal file
@@ -0,0 +1,126 @@
import openai  # use the official client for correctness check
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
        # large prompts create a lot of output
        "--disable-log-requests",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    # Test stream with long prompt
    prompt = "What is the capital of France?" * 400

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )

    tokens_received = 0
    finished = False
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if not finished:
            tokens_received += 1
            assert chunk.choices[0].text

        if chunk.choices[0].finish_reason is not None:
            finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received


@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    # Test stream with long prompt
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role": "user",
        "content": "What is the capital of France?" * 400
    }]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )

    tokens_received = 0
    empty_chunks_received = 0
    finished = False
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)

        if not finished:
            if chunk.choices[0].delta.content == "":
                # when there is no tokens generated
                assert chunk.usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_received += 1
            else:
                tokens_received += 1

        if chunk.choices[0].finish_reason is not None:
            finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received

    assert empty_chunks_received <= 1
131
vllm-v0.6.2/tests/entrypoints/openai/test_cli_args.py
Normal file
@@ -0,0 +1,131 @@
import json

import pytest

from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                              validate_parsed_serve_args)
from vllm.entrypoints.openai.serving_engine import LoRAModulePath
from vllm.utils import FlexibleArgumentParser

from ...utils import VLLM_PATH

LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama"
}
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()


@pytest.fixture
def serve_parser():
    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
    return make_arg_parser(parser)


### Tests for Lora module parsing
def test_valid_key_value_format(serve_parser):
    # Test old format: name=path
    args = serve_parser.parse_args([
        '--lora-modules',
        'module1=/path/to/module1',
    ])
    expected = [LoRAModulePath(name='module1', path='/path/to/module1')]
    assert args.lora_modules == expected


def test_valid_json_format(serve_parser):
    # Test valid JSON format input
    args = serve_parser.parse_args([
        '--lora-modules',
        json.dumps(LORA_MODULE),
    ])
    expected = [
        LoRAModulePath(name='module2',
                       path='/path/to/module2',
                       base_model_name='llama')
    ]
    assert args.lora_modules == expected


def test_invalid_json_format(serve_parser):
    # Test invalid JSON format input, missing closing brace
    with pytest.raises(SystemExit):
        serve_parser.parse_args([
            '--lora-modules', '{"name": "module3", "path": "/path/to/module3"'
        ])


def test_invalid_type_error(serve_parser):
    # Test type error when values are not JSON or key=value
    with pytest.raises(SystemExit):
        serve_parser.parse_args([
            '--lora-modules',
            'invalid_format'  # This is not JSON or key=value format
        ])


def test_invalid_json_field(serve_parser):
    # Test valid JSON format but missing required fields
    with pytest.raises(SystemExit):
        serve_parser.parse_args([
            '--lora-modules',
            '{"name": "module4"}'  # Missing required 'path' field
        ])


def test_empty_values(serve_parser):
    # Test when no LoRA modules are provided
    args = serve_parser.parse_args(['--lora-modules', ''])
    assert args.lora_modules == []


def test_multiple_valid_inputs(serve_parser):
    # Test multiple valid inputs (both old and JSON format)
    args = serve_parser.parse_args([
        '--lora-modules',
        'module1=/path/to/module1',
        json.dumps(LORA_MODULE),
    ])
    expected = [
        LoRAModulePath(name='module1', path='/path/to/module1'),
        LoRAModulePath(name='module2',
                       path='/path/to/module2',
                       base_model_name='llama')
    ]
    assert args.lora_modules == expected


### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser"""
    # If we enable-auto-tool-choice, explode with no tool-call-parser
    args = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(args)


def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser"""
    args = serve_parser.parse_args(args=[
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "mistral",
    ])
    validate_parsed_serve_args(args)


def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists"""
    args = serve_parser.parse_args(
        args=["--chat-template",
              CHATML_JINJA_PATH.absolute().as_posix()])
    validate_parsed_serve_args(args)


def test_chat_template_validation_for_sad_paths(serve_parser):
    """Ensure validation fails if the chat template doesn't exist"""
    args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(args)
781
vllm-v0.6.2/tests/entrypoints/openai/test_completion.py
Normal file
@@ -0,0 +1,781 @@
# imports for guided decoding tests
import json
import re
import shutil
from tempfile import TemporaryDirectory
from typing import Dict, List, Optional

import jsonschema
import openai  # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError
from transformers import AutoTokenizer

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
PA_NAME = "swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS = 8


@pytest.fixture(scope="module")
def zephyr_lora_files():
    return snapshot_download(repo_id=LORA_NAME)


@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    tmp_dir = TemporaryDirectory()
    tmp_model_dir = f"{tmp_dir.name}/zephyr"
    shutil.copytree(zephyr_lora_files, tmp_model_dir)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Copy tokenizer to adapter and add some unique tokens
    # 32000, 32001, 32002
    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                 special_tokens=True)
    assert added == 3
    tokenizer.save_pretrained(tmp_model_dir)
    yield tmp_model_dir
    tmp_dir.cleanup()


@pytest.fixture(scope="module")
def zephyr_pa_files():
    return snapshot_download(repo_id=PA_NAME)


@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
                        zephyr_pa_files):
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        # lora config
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        # pa config
        "--enable-prompt-adapter",
        "--prompt-adapters",
        f"zephyr-pa={zephyr_pa_files}",
        f"zephyr-pa2={zephyr_pa_files}",
        "--max-prompt-adapters",
        "2",
        "--max-prompt-adapter-token",
        "128",
    ]


@pytest.fixture(scope="module",
                params=["", "--disable-frontend-multiprocessing"])
def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name,num_virtual_tokens",
    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
                                 num_virtual_tokens: int):
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1

    choice = completion.choices[0]
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5,
        prompt_tokens=6 + num_virtual_tokens,
        total_tokens=11 + num_virtual_tokens)

    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 1
    assert completion.choices[0].prompt_logprobs is None


@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
    # test using token IDs
    completion = await client.completions.create(
        model="zephyr-lora2",
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    # Added tokens should appear in tokenized prompt
    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")


@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    # test using token IDs
    with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
        # Added tokens should be rejected by the base model
        await client.completions.create(
            model=MODEL_NAME,
            prompt=[0, 0, 32000, 32001, 32002],
            echo=True,
            max_tokens=5,
            temperature=0.0,
        )


@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    choice = completion.choices[0]
    assert choice.logprobs is None


@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora and 1 pa hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    assert len(choice.logprobs.top_logprobs[0]) == 1


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):

    with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )
        ...
    with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        async for chunk in stream:
            ...

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
                                                         (MODEL_NAME, 0),
                                                         (MODEL_NAME, 1),
                                                         (MODEL_NAME, None)])
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
                                          model_name: str,
                                          prompt_logprobs: Optional[int]):
    params: Dict = {
        "prompt": ["A robot may not injure another robot", "My name is"],
        "model": model_name,
    }
    if prompt_logprobs is not None:
        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}

    if prompt_logprobs is not None and prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.completions.create(**params)
    else:
        completion = await client.completions.create(**params)
        if prompt_logprobs is not None:
            assert completion.choices[0].prompt_logprobs is not None
            assert len(completion.choices[0].prompt_logprobs) > 0

            assert completion.choices[1].prompt_logprobs is not None
            assert len(completion.choices[1].prompt_logprobs) > 0

        else:
            assert completion.choices[0].prompt_logprobs is None


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    prompt = "What is an LLM?"

    single_completion = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    single_output = single_completion.choices[0].text
    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    chunks: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.choices[0].text
    assert "".join(chunks) == single_output


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Streaming for parallel sampling.
    The tokens from multiple samples, are flattened into a single stream,
    with an index to indicate which sample the token belongs to.
    """

    prompt = "What is an LLM?"
    n = 3
    max_tokens = 5

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=max_tokens,
                                             n=n,
                                             stream=True)
    chunks: List[List[str]] = [[] for i in range(n)]
    finish_reason_count = 0
    async for chunk in stream:
        index = chunk.choices[0].index
        text = chunk.choices[0].text
        chunks[index].append(text)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    assert finish_reason_count == n
    for chunk in chunks:
        assert len(chunk) == max_tokens
        print("".join(chunk))


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    prompt = "What is the capital of France?"

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": False,
                                                 "continuous_usage_stats":
                                                 False,
                                             })

    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": True}
    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": False,
                                                 "continuous_usage_stats":
                                                 True,
                                             })
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": True,
                                                 "continuous_usage_stats":
                                                 False,
                                             })
    async for chunk in stream:
        if chunk.choices[0].finish_reason is None:
            assert chunk.usage is None
        else:
            assert chunk.usage is None
            final_chunk = await stream.__anext__()
            assert final_chunk.usage is not None
            assert final_chunk.usage.prompt_tokens > 0
            assert final_chunk.usage.completion_tokens > 0
            assert final_chunk.usage.total_tokens == (
                final_chunk.usage.prompt_tokens +
                final_chunk.usage.completion_tokens)
            assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": True,
                                                 "continuous_usage_stats":
                                                 True,
                                             })
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if chunk.choices[0].finish_reason is not None:
            final_chunk = await stream.__anext__()
            assert final_chunk.usage is not None
            assert final_chunk.usage.prompt_tokens > 0
            assert final_chunk.usage.completion_tokens > 0
            assert final_chunk.usage.total_tokens == (
                final_chunk.usage.prompt_tokens +
                final_chunk.usage.completion_tokens)
            assert final_chunk.choices == []

    # Test stream=False, stream_options=
    #     {"include_usage": None}
    with pytest.raises(BadRequestError):
        await client.completions.create(model=model_name,
                                        prompt=prompt,
                                        max_tokens=5,
                                        temperature=0.0,
                                        stream=False,
                                        stream_options={"include_usage": None})

    # Test stream=False, stream_options=
    #     {"include_usage": True}
    with pytest.raises(BadRequestError):
        await client.completions.create(model=model_name,
                                        prompt=prompt,
                                        max_tokens=5,
                                        temperature=0.0,
                                        stream=False,
                                        stream_options={"include_usage": True})

    # Test stream=False, stream_options=
    #     {"continuous_usage_stats": None}
    with pytest.raises(BadRequestError):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=False,
            stream_options={"continuous_usage_stats": None})

    # Test stream=False, stream_options=
    #     {"continuous_usage_stats": True}
    with pytest.raises(BadRequestError):
        await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=False,
            stream_options={"continuous_usage_stats": True})
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    # test both text and token IDs
    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
        # test simple list
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(batch.choices) == 2
        assert batch.choices[0].text == batch.choices[1].text

        # test n = 2
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body=dict(
                # NOTE: this has to be true for n > 1 in vLLM, but
                # not necessary for official client.
                use_beam_search=True),
        )
        assert len(batch.choices) == 4
        assert batch.choices[0].text != batch.choices[
            1].text, "beam search should be different"
        assert batch.choices[0].text == batch.choices[
            2].text, "two copies of the same prompt should be the same"
        assert batch.choices[1].text == batch.choices[
            3].text, "two copies of the same prompt should be the same"

        # test streaming
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        texts = [""] * 2
        async for chunk in batch:
            assert len(chunk.choices) == 1
            choice = chunk.choices[0]
            texts[choice.index] += choice.text
        assert texts[0] == texts[1]


@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    token_id = 1000
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    assert len(completion.choices[0].text) >= 5
    response_tokens = tokenizer(completion.choices[0].text,
                                add_special_tokens=False)["input_ids"]
    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
                                add_special_tokens=False)["input_ids"]
    assert all([
        response == expected
        for response, expected in zip(response_tokens, expected_tokens)
    ])

    # Test ban
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    response_tokens = tokenizer(completion.choices[0].text,
                                add_special_tokens=False)["input_ids"]
    first_response = completion.choices[0].text
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token): -100
                    for token in response_tokens},
    )
    assert first_response != completion.choices[0].text


@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
    prompt = "Hello, my name is"
    max_tokens = 1
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    allowed_ids = [21555, 21557, 21558]
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        seed=42,
        extra_body=dict(allowed_token_ids=allowed_ids),
        logprobs=1,
    )
    response_tokens = completion.choices[0].logprobs.tokens
    assert len(response_tokens) == 1
    assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids


@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_json=sample_json_schema,
                        guided_decoding_backend=guided_decoding_backend))

    assert completion.id is not None
    assert len(completion.choices) == 3
    for i in range(3):
        output_json = json.loads(completion.choices[i].text)
        jsonschema.validate(instance=output_json, schema=sample_json_schema)


@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=dict(guided_regex=sample_regex,
                        guided_decoding_backend=guided_decoding_backend))

    assert completion.id is not None
    assert len(completion.choices) == 3
    for i in range(3):
        assert re.fullmatch(sample_regex,
                            completion.choices[i].text) is not None


@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=dict(guided_choice=sample_guided_choice,
                        guided_decoding_backend=guided_decoding_backend))

    assert completion.id is not None
    assert len(completion.choices) == 2
    for i in range(2):
        assert completion.choices[i].text in sample_guided_choice


@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):

    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=("Generate a sql state that select col_1 from "
                "table_1 where it is equals to 1"),
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_grammar=sample_sql_statements))

    content = completion.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    from lark import Lark
    parser = Lark(sample_sql_statements)
    parser.parse(content)

    # remove spaces for comparison b/c we removed them in the grammar
    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

    assert content.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                      prompt=prompt,
                                                      max_tokens=5,
                                                      temperature=0.0,
                                                      echo=True,
                                                      logprobs=logprobs_arg)

        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
        assert re.search(r"^" + prompt_text, completion.choices[0].text)
        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5


@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=dict(guided_json=42,
                            guided_decoding_backend=guided_decoding_backend))

    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=dict(guided_regex=sample_regex,
                            guided_json=sample_json_schema))
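
Note on the request shape exercised above: the vLLM-specific sampling fields (guided_json, guided_regex, allowed_token_ids, prompt_logprobs, use_beam_search) travel through the OpenAI client's extra_body and arrive as top-level fields of the JSON body. A rough sketch of the same guided-JSON request over plain HTTP, assuming a server already listening on localhost:8000 (the tests themselves pick a free port via RemoteOpenAIServer, and take their schema from the sample_json_schema fixture):

import requests

# Hypothetical schema, used only for illustration.
schema = {"type": "object", "properties": {"name": {"type": "string"}}}
payload = {
    "model": "HuggingFaceH4/zephyr-7b-beta",
    "prompt": "Give an example JSON for an employee profile",
    "max_tokens": 500,
    "temperature": 1.0,
    # extra_body=dict(...) on the client side produces these top-level fields.
    "guided_json": schema,
    "guided_decoding_backend": "outlines",
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])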
250
vllm-v0.6.2/tests/entrypoints/openai/test_embedding.py
Normal file
250
vllm-v0.6.2/tests/entrypoints/openai/test_embedding.py
Normal file
@@ -0,0 +1,250 @@
import base64

import numpy as np
import openai
import pytest
import pytest_asyncio
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer

MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        "8192",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
    input_texts = [
        "The chef prepared a delicious meal.",
    ]

    # test single embedding
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 9
    assert embeddings.usage.total_tokens == 9

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
    # test List[str]
    input_texts = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 32
    assert embeddings.usage.total_tokens == 32

    # test List[List[int]]
    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                    [25, 32, 64, 77]]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    messages = [{
        "role": "user",
        "content": "The cat sat on the mat.",
    }, {
        "role": "assistant",
        "content": "A feline was resting on a rug.",
    }, {
        "role": "user",
        "content": "Stars twinkle brightly in the night sky.",
    }]

    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json={
                                      "model": model_name,
                                      "messages": messages,
                                      "encoding_format": "float",
                                  })
    chat_response.raise_for_status()
    chat_embeddings = chat_response.json()

    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = completion_response.model_dump(mode="json")

    assert chat_embeddings.pop("id") is not None
    assert completion_embeddings.pop("id") is not None
    assert chat_embeddings.pop("created") <= completion_embeddings.pop(
        "created")
    assert chat_embeddings == completion_embeddings


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
                                      model_name: str):
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    responses_float = await client.embeddings.create(input=input_texts,
                                                      model=model_name,
                                                      encoding_format="float")

    responses_base64 = await client.embeddings.create(input=input_texts,
                                                       model=model_name,
                                                       encoding_format="base64")

    decoded_responses_base64_data = []
    for data in responses_base64.data:
        decoded_responses_base64_data.append(
            np.frombuffer(base64.b64decode(data.embedding),
                          dtype="float32").tolist())

    assert responses_float.data[0].embedding == decoded_responses_base64_data[
        0]
    assert responses_float.data[1].embedding == decoded_responses_base64_data[
        1]

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(input=input_texts,
                                                        model=model_name)

    assert responses_float.data[0].embedding == responses_default.data[
        0].embedding
    assert responses_float.data[1].embedding == responses_default.data[
        1].embedding


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # test single embedding
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        extra_body={"truncate_prompt_tokens": 10})
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10

    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    embeddings = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        extra_body={"truncate_prompt_tokens": 10})

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                    model_name: str):
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    with pytest.raises(openai.BadRequestError):
        embeddings = await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})
        assert "error" in embeddings.object
        assert "truncate_prompt_tokens value is greater than max_model_len. "\
            "Please, select a smaller truncation size." in embeddings.message
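
For reference, the base64 round-trip that test_batch_base64_embedding verifies can be reproduced in a few lines outside the test harness; the localhost address is an assumption, and float32 matches the dtype the test decodes with:

import base64

import numpy as np
import openai

# Assumed local server address; the tests use RemoteOpenAIServer instead.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.embeddings.create(model="intfloat/e5-mistral-7b-instruct",
                                input=["Hello my name is"],
                                encoding_format="base64")
# The server returns raw float32 bytes encoded as base64.
vec = np.frombuffer(base64.b64decode(resp.data[0].embedding), dtype="float32")
print(vec.shape)  # expected (4096,) for e5-mistral-7b-instruct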
52
vllm-v0.6.2/tests/entrypoints/openai/test_encoder_decoder.py
Normal file
52
vllm-v0.6.2/tests/entrypoints/openai/test_encoder_decoder.py
Normal file
@@ -0,0 +1,52 @@
import openai
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

MODEL_NAME = "facebook/bart-base"


@pytest.fixture(scope="module")
def server():
    args = [
        "--dtype",
        "bfloat16",
        "--enforce-eager",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1

    choice = completion.choices[0]
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=2, total_tokens=7)

    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 1
83
vllm-v0.6.2/tests/entrypoints/openai/test_lora_lineage.py
Normal file
83
vllm-v0.6.2/tests/entrypoints/openai/test_lora_lineage.py
Normal file
@@ -0,0 +1,83 @@
import json

import openai  # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"


@pytest.fixture(scope="module")
def zephyr_lora_files():
    return snapshot_download(repo_id=LORA_NAME)


@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "zephyr-lora",
        "path": zephyr_lora_files,
        "base_model_name": MODEL_NAME
    }

    lora_module_2 = {
        "name": "zephyr-lora2",
        "path": zephyr_lora_files,
        "base_model_name": MODEL_NAME
    }

    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        json.dumps(lora_module_1),
        json.dumps(lora_module_2),
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client_for_lora_lineage(server_with_lora_modules_json):
    async with server_with_lora_modules_json.get_async_client(
    ) as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
                                  zephyr_lora_files):
    models = await client_for_lora_lineage.models.list()
    models = models.data
    served_model = models[0]
    lora_models = models[1:]
    assert served_model.id == MODEL_NAME
    assert served_model.root == MODEL_NAME
    assert served_model.parent is None
    assert all(lora_model.root == zephyr_lora_files
               for lora_model in lora_models)
    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
    assert lora_models[0].id == "zephyr-lora"
    assert lora_models[1].id == "zephyr-lora2"
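
The lineage fields asserted above (id, root, parent) are plain members of the /v1/models response, so the same check can be done without the OpenAI client; the address below is assumed:

import requests

models = requests.get("http://localhost:8000/v1/models").json()["data"]
for m in models:
    # "root" points at the underlying weights; "parent" links a LoRA adapter
    # back to the base model it was served with.
    print(m["id"], m.get("root"), m.get("parent"))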
236
vllm-v0.6.2/tests/entrypoints/openai/test_metrics.py
Normal file
236
vllm-v0.6.2/tests/entrypoints/openai/test_metrics.py
Normal file
@@ -0,0 +1,236 @@
import subprocess
import sys
import tempfile
import time
from http import HTTPStatus

import openai
import pytest
import pytest_asyncio
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


@pytest.fixture(scope="module")
def default_server_args():
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]


@pytest.fixture(scope="module",
                params=[
                    "",
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as cl:
        yield cl


_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
    "vllm:time_per_output_token_seconds":
    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
    "vllm:request_prompt_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_generation_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
    "vllm:request_params_max_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:prompt_tokens": [("_total",
                            _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
    "vllm:generation_tokens": [
        ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
    ],
    "vllm:request_success": [("_total", _NUM_REQUESTS)],
}


@pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer,
                              client: openai.AsyncClient):
    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
        await client.completions.create(
            model=MODEL_NAME,
            prompt=_TOKENIZED_PROMPT,
            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)

    response = requests.get(server.url_for("metrics"))
    print(response.text)
    assert response.status_code == HTTPStatus.OK

    # Loop over all expected metric_families
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
        found_metric = False

        # Check to see if the metric_family is found in the prom endpoint.
        for family in text_string_to_metric_families(response.text):
            if family.name == metric_family:
                found_metric = True

                # Check that each suffix is found in the prom endpoint.
                for suffix, expected_value in suffix_values_list:
                    metric_name_w_suffix = f"{metric_family}{suffix}"
                    found_suffix = False

                    for sample in family.samples:
                        if sample.name == metric_name_w_suffix:
                            found_suffix = True

                            # For each suffix, make sure the value matches
                            # what we expect.
                            assert sample.value == expected_value, (
                                f"{metric_name_w_suffix} expected value of "
                                f"{expected_value} did not match found value "
                                f"{sample.value}")
                            break
                    assert found_suffix, (
                        f"Did not find {metric_name_w_suffix} in prom endpoint"
                    )
                break

        assert found_metric, (f"Did not find {metric_family} in prom endpoint")


EXPECTED_METRICS = [
    "vllm:num_requests_running",
    "vllm:num_requests_swapped",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
    "vllm:cpu_cache_usage_perc",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
    "vllm:request_params_n_sum",
    "vllm:request_params_n_bucket",
    "vllm:request_params_n_count",
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:request_success_total",
    "vllm:cache_config_info",
    # labels in cache_config_info
    "block_size",
    "cache_dtype",
    "cpu_offload_gb",
    "enable_prefix_caching",
    "gpu_memory_utilization",
    "num_cpu_blocks",
    "num_gpu_blocks",
    "num_gpu_blocks_override",
    "sliding_window",
    "swap_space_bytes",
]


@pytest.mark.asyncio
async def test_metrics_exist(server: RemoteOpenAIServer,
                             client: openai.AsyncClient):
    # sending a request triggers the metrics to be logged.
    await client.completions.create(model=MODEL_NAME,
                                    prompt="Hello, my name is",
                                    max_tokens=5,
                                    temperature=0.0)

    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    for metric in EXPECTED_METRICS:
        assert metric in response.text


def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    with tempfile.NamedTemporaryFile(
            "w") as input_file, tempfile.NamedTemporaryFile(
                "r") as output_file:
        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen([
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/e5-mistral-7b-instruct",
            "--enable-metrics",
            "--url",
            base_url,
            "--port",
            port,
        ], )

        def is_server_up(url):
            try:
                response = requests.get(url)
                return response.status_code == 200
            except requests.ConnectionError:
                return False

        while not is_server_up(server_url):
            time.sleep(1)

        response = requests.get(server_url + "/metrics")
        assert response.status_code == HTTPStatus.OK

        proc.wait()
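
Outside the test harness, the same Prometheus scrape that test_metrics_counts performs can be done directly; the address below is assumed, and the metric family name is taken from EXPECTED_VALUES above:

import requests
from prometheus_client.parser import text_string_to_metric_families

text = requests.get("http://localhost:8000/metrics").text  # assumed address
for family in text_string_to_metric_families(text):
    if family.name == "vllm:request_success":
        for sample in family.samples:
            print(sample.name, sample.labels, sample.value)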