Sync from v0.13
This commit is contained in:
@@ -1,136 +1,164 @@
|
||||
import contextlib
|
||||
import gc
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.distributed import destroy_model_parallel, initialize_model_parallel
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.distributed import (
|
||||
cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.models.interfaces import SupportsLoRA
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def cleanup():
|
||||
destroy_model_parallel()
|
||||
with contextlib.suppress(AssertionError):
|
||||
torch.distributed.destroy_process_group()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
ray.shutdown()
|
||||
@pytest.fixture()
|
||||
def should_do_global_cleanup_after_test(request) -> bool:
|
||||
"""Allow subdirectories to skip global cleanup by overriding this fixture.
|
||||
This can provide a ~10x speedup for non-GPU unit tests since they don't need
|
||||
to initialize torch.
|
||||
"""
|
||||
|
||||
return not request.node.get_closest_marker("skip_global_cleanup")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def cleanup_fixture():
|
||||
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
|
||||
yield
|
||||
cleanup()
|
||||
if should_do_global_cleanup_after_test:
|
||||
cleanup_dist_env_and_memory(shutdown_ray=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dist_init():
|
||||
if not torch.distributed.is_initialized():
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl",
|
||||
world_size=1,
|
||||
rank=0,
|
||||
init_method=f"file://{temp_file}",
|
||||
)
|
||||
torch.distributed.all_reduce(torch.zeros(1).cuda())
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
|
||||
backend = "nccl"
|
||||
if current_platform.is_cpu() or current_platform.is_tpu():
|
||||
backend = "gloo"
|
||||
|
||||
init_distributed_environment(
|
||||
world_size=1,
|
||||
rank=0,
|
||||
distributed_init_method=f"file://{temp_file}",
|
||||
local_rank=0,
|
||||
backend=backend,
|
||||
)
|
||||
initialize_model_parallel(1, 1)
|
||||
yield
|
||||
cleanup()
|
||||
cleanup_dist_env_and_memory(shutdown_ray=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dist_init_torch_only():
|
||||
if torch.distributed.is_initialized():
|
||||
return
|
||||
backend = "nccl"
|
||||
if current_platform.is_cpu():
|
||||
backend = "gloo"
|
||||
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl",
|
||||
world_size=1,
|
||||
rank=0,
|
||||
init_method=f"file://{temp_file}",
|
||||
world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
|
||||
)
|
||||
|
||||
|
||||
class DummyLoRAModel(nn.Sequential, SupportsLoRA):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_model() -> nn.Module:
|
||||
model = nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", ColumnParallelLinear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model = DummyLoRAModel(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", ColumnParallelLinear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]
|
||||
)
|
||||
)
|
||||
model.config = MagicMock()
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
model.unpadded_vocab_size = 32000
|
||||
return model
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_model_gate_up() -> nn.Module:
|
||||
model = nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model = DummyLoRAModel(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]
|
||||
)
|
||||
)
|
||||
model.config = MagicMock()
|
||||
model.packed_modules_mapping = {
|
||||
"gate_up_proj": [
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
],
|
||||
}
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
model.unpadded_vocab_size = 32000
|
||||
|
||||
return model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sql_lora_files():
|
||||
return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def mixtral_lora_files():
|
||||
return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def gemma_lora_files():
|
||||
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
|
||||
# Note: this module has incorrect adapter_config.json to test
|
||||
# https://github.com/vllm-project/vllm/pull/5909/files.
|
||||
return snapshot_download(repo_id="SangBinCho/mixtral-lora")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@@ -149,31 +177,85 @@ def baichuan_zero_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def baichuan_regex_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/baichuan-7b-lora-zero-regex")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ilama_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def minicpmv_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen2vl_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen25vl_base_huggingface_id():
|
||||
# used as a base model for testing with qwen25vl lora adapter
|
||||
return "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen25vl_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tinyllama_lora_files():
|
||||
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
|
||||
cleanup()
|
||||
get_model_old = get_model
|
||||
@pytest.fixture(scope="session")
|
||||
def deepseekv2_lora_files():
|
||||
return snapshot_download(repo_id="wuchen01/DeepSeek-V2-Lite-Chat-All-LoRA")
|
||||
|
||||
def get_model_patched(*, model_config, device_config, **kwargs):
|
||||
kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
|
||||
return get_model_old(model_config=model_config,
|
||||
device_config=device_config,
|
||||
**kwargs)
|
||||
|
||||
with patch("vllm.worker.model_runner.get_model", get_model_patched):
|
||||
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
|
||||
yield engine.llm_engine
|
||||
del engine
|
||||
cleanup()
|
||||
@pytest.fixture(scope="session")
|
||||
def gptoss20b_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/gpt-oss-20b-lora-adapter-text2sql")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3moe_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/qwen3-moe-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def olmoe_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_lora_files():
|
||||
return snapshot_download(repo_id="charent/self_cognition_Alice")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def llama32_lora_huggingface_id():
|
||||
# huggingface repo id is used to test lora runtime downloading.
|
||||
return "jeeejeee/llama32-3b-text2sql-spider"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def llama32_lora_files(llama32_lora_huggingface_id):
|
||||
return snapshot_download(repo_id=llama32_lora_huggingface_id)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def llama_2_7b_model_extra_embeddings(
|
||||
llama_2_7b_engine_extra_embeddings) -> nn.Module:
|
||||
yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
|
||||
model_runner.model)
|
||||
def reset_default_device():
|
||||
"""
|
||||
Some tests, such as `test_punica_ops.py`, explicitly set the
|
||||
default device, which can affect subsequent tests. Adding this fixture
|
||||
helps avoid this problem.
|
||||
"""
|
||||
original_device = torch.get_default_device()
|
||||
yield
|
||||
torch.set_default_device(original_device)
|
||||
|
||||
113
tests/lora/test_add_lora.py
Normal file
113
tests/lora/test_add_lora.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args,
|
||||
)
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils.async_utils import merge_async_iterators
|
||||
|
||||
MODEL_PATH = "zai-org/chatglm3-6b"
|
||||
LORA_RANK = 64
|
||||
DEFAULT_MAX_LORAS = 4 * 3
|
||||
|
||||
|
||||
def get_lora_requests(lora_path) -> list[LoRARequest]:
|
||||
lora_requests: list[LoRARequest] = [
|
||||
LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
|
||||
for i in range(1, DEFAULT_MAX_LORAS + 1)
|
||||
]
|
||||
return lora_requests
|
||||
|
||||
|
||||
async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
|
||||
sampling_params = SamplingParams(
|
||||
n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
|
||||
)
|
||||
|
||||
generators = []
|
||||
start = time.perf_counter()
|
||||
|
||||
for lora_request in lora_requests:
|
||||
lora_int_id = lora_request.lora_int_id
|
||||
generator = llm.generate(
|
||||
prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None), # type: ignore
|
||||
sampling_params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
request_id=f"test{lora_int_id}",
|
||||
)
|
||||
generators.append(generator)
|
||||
|
||||
all_gens = merge_async_iterators(*generators)
|
||||
async for i, res in all_gens:
|
||||
pass
|
||||
|
||||
end = time.perf_counter()
|
||||
return end - start
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_lora(chatglm3_lora_files):
|
||||
"""
|
||||
The add_lora function is used to preload some LoRA adapters into the
|
||||
engine in anticipation of future requests using these adapters. To test
|
||||
this functionality, we use the async engine to process some requests - We
|
||||
do it twice, once with add_lora() preloading and once without.
|
||||
|
||||
We measure the request processing time in both cases and expect the time
|
||||
to be lesser in the case with add_lora() calls.
|
||||
"""
|
||||
lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
|
||||
|
||||
max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
|
||||
# Create engine in eager-mode. Due to high max_loras, the CI can
|
||||
# OOM during cuda-graph capture.
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8, # avoid OOM
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
# split lora_requests into 3 parts
|
||||
part_size = len(lora_requests) // 3
|
||||
dummy_run_requests = lora_requests[:part_size]
|
||||
warmup_run_requests = lora_requests[part_size : part_size * 2]
|
||||
cold_run_requests = lora_requests[part_size * 2 :]
|
||||
|
||||
async with build_async_engine_client_from_engine_args(engine_args) as llm:
|
||||
# Dummy run - So any 1-time functionality like triton kernel compilation
|
||||
# is complete here.
|
||||
await requests_processing_time(llm, dummy_run_requests)
|
||||
|
||||
# Run with warmup
|
||||
add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
|
||||
add_lora_results = await asyncio.gather(*add_lora_tasks)
|
||||
|
||||
# Test that all all_lora calls are successful.
|
||||
assert all(add_lora_results)
|
||||
|
||||
time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)
|
||||
|
||||
# Run without any warmup
|
||||
time_cold_start = await requests_processing_time(llm, cold_run_requests)
|
||||
|
||||
print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")
|
||||
|
||||
assert time_with_add_lora < time_cold_start, (
|
||||
f"time_with_add_lora={time_with_add_lora}, "
|
||||
f"time_cold_start={time_cold_start}"
|
||||
"The engine request processing time with LoRA pre-loading "
|
||||
"must be less than the version that does on-demand LoRA loading."
|
||||
)
|
||||
@@ -1,108 +0,0 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "baichuan-inc/Baichuan-7B"
|
||||
|
||||
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int) -> str:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
|
||||
),
|
||||
]
|
||||
print(prompts)
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
def test_baichuan_lora(baichuan_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
trust_remote_code=True)
|
||||
|
||||
expected_lora_output = [
|
||||
"SELECT count(*) FROM singer",
|
||||
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'", # noqa: E501
|
||||
"SELECT name , country , age FROM singer ORDER BY age ASC",
|
||||
]
|
||||
|
||||
output1 = do_sample(llm, baichuan_lora_files, lora_id=1)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output1[i] == expected_lora_output[i]
|
||||
output2 = do_sample(llm, baichuan_lora_files, lora_id=2)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output2[i] == expected_lora_output[i]
|
||||
|
||||
|
||||
@pytest.mark.skip("Requires multiple GPUs")
|
||||
def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < 4:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
|
||||
|
||||
llm_tp1 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=1,
|
||||
trust_remote_code=True)
|
||||
output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp1
|
||||
cleanup()
|
||||
|
||||
llm_tp2 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=2,
|
||||
trust_remote_code=True)
|
||||
output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
|
||||
|
||||
del llm_tp2
|
||||
cleanup()
|
||||
|
||||
assert output_tp1 == output_tp2
|
||||
|
||||
llm_tp4 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True)
|
||||
output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
|
||||
|
||||
del llm_tp4
|
||||
cleanup()
|
||||
|
||||
assert output_tp1 == output_tp4
|
||||
@@ -1,57 +0,0 @@
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||
|
||||
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int) -> str:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
|
||||
),
|
||||
]
|
||||
print(prompts)
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
trust_remote_code=True)
|
||||
|
||||
expected_lora_output = [
|
||||
"SELECT count(*) FROM singer",
|
||||
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
|
||||
"SELECT name , country , age FROM singer ORDER BY age",
|
||||
]
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output1[i] == expected_lora_output[i]
|
||||
output2 = do_sample(llm, chatglm3_lora_files, lora_id=2)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output2[i] == expected_lora_output[i]
|
||||
122
tests/lora/test_chatglm3_tp.py
Normal file
122
tests/lora/test_chatglm3_tp.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import vllm
|
||||
import vllm.config
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "zai-org/chatglm3-6b"
|
||||
|
||||
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
"SELECT count(*) FROM singer",
|
||||
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
|
||||
"SELECT name , country , age FROM singer ORDER BY age",
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=(
|
||||
"What is the average, minimum, and maximum "
|
||||
"age of all singers from France?"
|
||||
)
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=(
|
||||
"Show name, country, age for all singers ordered "
|
||||
"by age from the oldest to the youngest."
|
||||
)
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=512,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_num_seqs=16,
|
||||
max_lora_rank=64,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
output2 = do_sample(llm, chatglm3_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=512,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=64,
|
||||
max_num_seqs=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
),
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
output2 = do_sample(llm, chatglm3_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
|
||||
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
|
||||
# more GPU memory causing vLLM to OOM
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=512,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
),
|
||||
)
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
output2 = do_sample(llm, chatglm3_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
101
tests/lora/test_deepseekv2_tp.py
Normal file
101
tests/lora/test_deepseekv2_tp.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# NOTE To avoid overloading the CI pipeline, this test script will
|
||||
# not be triggered on CI and is primarily intended for local testing
|
||||
# and verification.
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODEL_PATH = "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
|
||||
PROMPT_TEMPLATE = "<|begin▁of▁sentence|>You are a helpful assistant.\n\nUser: {context}\n\nAssistant:" # noqa: E501
|
||||
|
||||
|
||||
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int):
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(context="Who are you?"),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
# return generated_texts
|
||||
expected_lora_output = [
|
||||
"I am \u5f20\u5b50\u8c6a, an AI assistant developed by \u9648\u58eb\u680b.", # noqa: E501
|
||||
]
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert generated_texts[i].startswith(expected_lora_output[i])
|
||||
|
||||
|
||||
def test_deepseekv2_lora(deepseekv2_lora_files):
|
||||
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
|
||||
# Otherwise, the lora-test will fail due to CUDA OOM.
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
generate_and_test(llm, deepseekv2_lora_files, 1)
|
||||
|
||||
|
||||
def test_deepseekv2(deepseekv2_lora_files):
|
||||
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
|
||||
# Otherwise, the lora-test will fail due to CUDA OOM.
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
generate_and_test(llm, deepseekv2_lora_files, 1)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_deepseekv2_tp2(deepseekv2_lora_files):
    """DeepSeek-V2 + LoRA smoke test with tensor parallelism of 2."""
    # enforce_eager=True reduces VRAM usage for the lora-test CI;
    # otherwise the test can fail with CUDA OOM.
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        tensor_parallel_size=2,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, deepseekv2_lora_files, 2)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
def test_deepseekv2_tp4(deepseekv2_lora_files):
    """DeepSeek-V2 + LoRA smoke test with tensor parallelism of 4."""
    # enforce_eager=True reduces VRAM usage for the lora-test CI;
    # otherwise the test can fail with CUDA OOM.
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        tensor_parallel_size=4,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(llm, deepseekv2_lora_files, 2)
|
||||
157
tests/lora/test_default_mm_loras.py
Normal file
157
tests/lora/test_default_mm_loras.py
Normal file
@@ -0,0 +1,157 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for applying default registered multimodal loras.
|
||||
"""
|
||||
|
||||
import os
|
||||
import unittest.mock as mock
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..conftest import AudioTestAssets, VllmRunner
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
# The Phi-4-multimodal repo ships its audio/vision LoRA adapters alongside
# the base weights, so download once and point at the subfolders.
MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
IMAGE_LORA_PATH = os.path.join(MODEL_PATH, "vision-lora")

AUDIO_PROMPT = "<|user|><|audio_1|>Can you transcribe this audio?<|end|><|assistant|>"  # noqa: E501

# Responses are greedy decoded; we just check the end of
# the generated text. If the lora is inactive, this model
# generates commentary on the transcription.
RESPONSE_SUFFIX_WITH_LORA = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
RESPONSE_SUFFIX_WITHOUT_LORA = "Certainly! Here is the transcription of the audio you provided:\n\nThe first words I spoke in the original phonograph record: A little piece of practical poetry. Mary had a little lamb; its fleece was white as snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501

# Base engine arguments shared by every test; individual tests override via
# run_test(**kwargs).
VLLM_RUNNER_BASE_KWARGS = {
    "model_name": MODEL_PATH,
    "dtype": "half",
    # Fix: use a real bool here (this was previously the string "True",
    # which only behaved correctly because any non-empty string is truthy).
    "enable_lora": True,
    "max_num_seqs": 2,
    "max_lora_rank": 320,
    # Keep these LoRA tests on short-RoPE for determinism post-LongRoPE change.
    "max_model_len": 4096,
    "gpu_memory_utilization": 0.8,
    "limit_mm_per_prompt": {"audio": 1},
    "enforce_eager": True,
}
|
||||
|
||||
|
||||
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
    """Generate greedily for one audio prompt and check the response suffix.

    Any extra keyword arguments override VLLM_RUNNER_BASE_KWARGS so tests can
    toggle engine options such as default_mm_loras.
    """
    audio = audio_assets[0].audio_and_sample_rate[0]
    inputs = [([AUDIO_PROMPT], [audio])]

    # Apply any additional kwargs as overrides to the base kwargs
    runner_kwargs = dict(VLLM_RUNNER_BASE_KWARGS)
    runner_kwargs.update(kwargs)

    with vllm_runner(**runner_kwargs) as vllm_model:
        all_outputs = []
        for prompts, audios in inputs:
            all_outputs.append(
                vllm_model.generate_greedy(
                    prompts,
                    max_tokens=128,
                    audios=audios,
                    lora_request=lora_request,
                )
            )

    # Only the generated tail matters; the suffix distinguishes the
    # with-LoRA transcription from the without-LoRA commentary.
    assert all_outputs[-1][-1][-1].endswith(expected_suffix)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
def test_active_default_mm_lora(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """The registered default audio LoRA should activate for audio prompts."""
    overrides = {
        "lora_request": None,
        "default_mm_loras": {"audio": AUDIO_LORA_PATH},
        "expected_suffix": RESPONSE_SUFFIX_WITH_LORA,
    }
    run_test(vllm_runner, audio_assets, **overrides)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
def test_inactive_default_mm_lora(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """A default LoRA for an unused modality must be filtered out."""
    overrides = {
        "lora_request": None,
        # Only audio is passed, so the image default should stay inactive.
        "default_mm_loras": {"image": IMAGE_LORA_PATH},
        "expected_suffix": RESPONSE_SUFFIX_WITHOUT_LORA,
    }
    run_test(vllm_runner, audio_assets, **overrides)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
def test_default_mm_lora_succeeds_with_redundant_lora_request(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Explicitly requesting the same adapter as the default still works."""
    overrides = {
        "lora_request": LoRARequest("audio", 1, AUDIO_LORA_PATH),
        "default_mm_loras": {"audio": AUDIO_LORA_PATH},
        "expected_suffix": RESPONSE_SUFFIX_WITH_LORA,
    }
    run_test(vllm_runner, audio_assets, **overrides)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
def test_default_mm_lora_fails_with_overridden_lora_request(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """When lora_request conflicts with default_mm_loras, the explicit
    lora_request wins."""
    overrides = {
        # The explicit speech adapter should override the (wrong) default.
        "lora_request": LoRARequest("speech", 2, AUDIO_LORA_PATH),
        "default_mm_loras": {"audio": IMAGE_LORA_PATH},
        "expected_suffix": RESPONSE_SUFFIX_WITH_LORA,
    }
    run_test(vllm_runner, audio_assets, **overrides)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner):
    """Regression test: default multimodal LoRA resolution must not expand
    the lora request when the prompt type is a plain string."""

    class MockEngineException(Exception):
        pass

    runner_kwargs = {
        **VLLM_RUNNER_BASE_KWARGS,
        "default_mm_loras": {"audio": AUDIO_LORA_PATH},
    }

    # Avoid the full generation call since these tests are expensive;
    # just check what lora request is actually submitted to the engine.
    mock_err = "Engine is mocked for this test"
    patch_target = "vllm.v1.engine.llm_engine.LLMEngine.add_request"
    with mock.patch(
        patch_target,
        side_effect=MockEngineException(mock_err),
    ) as mock_add_request:
        with vllm_runner(**runner_kwargs) as vllm_model:
            # Die once we actually submit the request to the engine.
            with pytest.raises(MockEngineException):
                vllm_model.llm.generate(prompts=AUDIO_PROMPT)

            # The string prompt must reach the engine untouched, with no
            # lora request zipped in.
            _, engine_kwargs = mock_add_request.call_args
            assert engine_kwargs["lora_request"] is None
            assert engine_kwargs["prompt_text"] == AUDIO_PROMPT
|
||||
523
tests/lora/test_fused_moe_lora_kernel.py
Normal file
523
tests/lora/test_fused_moe_lora_kernel.py
Normal file
@@ -0,0 +1,523 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.distributed import (
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tensor_model_parallel_world_size,
|
||||
)
|
||||
from vllm.lora.ops.triton_ops import fused_moe_lora
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    # Pull in the shared ``reset_default_device`` fixture for every test in
    # this module so torch's default device is restored after each test.
    pass
|
||||
|
||||
|
||||
def round_up(x, base):
    """Round ``x`` up to the nearest multiple of ``base`` (``base`` > 0)."""
    remainder = x % base
    return x if remainder == 0 else x + (base - remainder)
|
||||
|
||||
|
||||
def CEILDIV(x, y):
    """Ceiling integer division of ``x`` by ``y`` (``y`` > 0)."""
    # Floor division by the negated divisor implements ceiling division.
    return -(x // -y)
|
||||
|
||||
|
||||
def assign_loras_to_tokens(num_tokens: int, num_sequences: int, max_loras: int):
    """Split tokens into sequences and give each sequence one random LoRA id.

    The tokens are partitioned into ``num_sequences`` contiguous chunks (any
    remainder is spread over the leading chunks). Every token in a chunk is
    assigned the same LoRA index drawn uniformly from ``[0, max_loras)``.

    Args:
        num_tokens: Total number of tokens.
        num_sequences: Number of sequences to split the tokens into.
        max_loras: Total number of available LoRA modules.

    Returns:
        torch.Tensor: int32 tensor of shape [num_tokens] holding the LoRA
        index assigned to each token.
    """
    assert num_sequences > 0 and max_loras > 0
    assert num_tokens >= num_sequences, "num_tokens must be >= num_sequences"

    # Distribute the remainder one extra token each over the first sequences.
    base, extra = divmod(num_tokens, num_sequences)
    token_lora_mapping = torch.empty(num_tokens, dtype=torch.int32)

    cursor = 0
    for seq_idx in range(num_sequences):
        length = base + (1 if seq_idx < extra else 0)
        # One RNG draw per sequence, shared by all of its tokens.
        token_lora_mapping[cursor:cursor + length] = random.randint(
            0, max_loras - 1
        )
        cursor += length

    return token_lora_mapping
|
||||
|
||||
|
||||
def assign_experts_to_tokens(num_tokens: int, num_experts: int, top_k_num: int):
    """Pick ``top_k_num`` distinct experts per token with normalized weights.

    Args:
        num_tokens: Total number of tokens.
        num_experts: Total number of available experts.
        top_k_num: Number of experts to select per token.

    Returns:
        expert_indices: int32 tensor [num_tokens, top_k_num] of expert ids.
        expert_weights: float32 tensor [num_tokens, top_k_num]; rows sum to 1.
    """
    assert top_k_num <= num_experts, "top_k_num must be <= num_experts"

    expert_indices = torch.empty((num_tokens, top_k_num), dtype=torch.int32)
    for row in range(num_tokens):
        # randperm guarantees the chosen expert ids are distinct.
        expert_indices[row] = torch.randperm(num_experts)[:top_k_num]

    # Random routing weights, normalized per token.
    expert_weights = torch.rand((num_tokens, top_k_num), dtype=torch.float32)
    expert_weights = expert_weights / expert_weights.sum(dim=1, keepdim=True)

    return expert_indices, expert_weights
|
||||
|
||||
|
||||
def sample_data(
    num_tokens: int,
    num_sequences: int,
    max_loras: int,
    num_experts: int,
    top_k_num: int,
):
    """Generate expert routing (ids + weights) and a token->LoRA mapping."""
    expert_ids, expert_weights = assign_experts_to_tokens(
        num_tokens, num_experts, top_k_num
    )
    lora_mapping = assign_loras_to_tokens(num_tokens, num_sequences, max_loras)
    return expert_ids, expert_weights, lora_mapping
|
||||
|
||||
|
||||
def use_fused_moe_lora_kernel(
    topk_ids,
    topk_weights,
    token_lora_mapping,
    max_lora_rank,
    top_k_num,
    lora_a_stacked,
    lora_b_stacked,
    hidden_states,
    output,
    max_loras,
    num_experts,
    block_size,
    fully_sharded=False,
    offset=0,
):
    """Block-align the (token, expert, LoRA) routing and launch the fused
    MoE LoRA Triton kernel, writing the result into ``output`` in place."""
    # Worst case padding: every expert may leave up to (block_size - 1)
    # empty slots in its final block.
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
    max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size)

    # init output tensors (filled in by moe_lora_align_block_size below)
    sorted_token_ids = torch.empty(
        (max_loras * max_num_tokens_padded,),
        dtype=torch.int32,
    )
    expert_ids = torch.empty((max_loras * max_num_m_blocks,), dtype=torch.int32)
    num_tokens_post_padded = torch.empty((max_loras,), dtype=torch.int32)
    # All adapter slots are marked enabled for these tests.
    adapter_enabled = torch.ones(max_loras + 1, dtype=torch.int32)
    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32)

    # call kernel
    ops.moe_lora_align_block_size(
        topk_ids,
        token_lora_mapping,
        num_experts,
        block_size,
        max_loras,
        max_num_tokens_padded,
        max_num_m_blocks,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        adapter_enabled,
        lora_ids,
    )

    # Triton launch configuration.
    config = {
        "BLOCK_SIZE_M": 16,
        "BLOCK_SIZE_N": 32,
        "BLOCK_SIZE_K": 64,
        "GROUP_SIZE_M": 1,
        "NUM_WARPS": 4,
        "NUM_STAGES": 3,
        "SPLIT_K": 1,
    }

    mul_routed_weight = False
    # Reshape the flat alignment buffers into one row per adapter slot.
    expert_ids = expert_ids.view(max_loras, -1)
    sorted_token_ids = sorted_token_ids.view(max_loras, -1)

    fused_moe_lora(
        output,
        hidden_states,
        lora_a_stacked,
        lora_b_stacked,
        topk_weights,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        max_lora_rank,
        top_k_num,
        lora_ids,
        adapter_enabled,
        # NOTE(review): the launch config is passed twice — presumably one
        # set per GEMM stage (shrink and expand); confirm against
        # fused_moe_lora's signature.
        config["BLOCK_SIZE_M"],
        config["BLOCK_SIZE_N"],
        config["BLOCK_SIZE_K"],
        config["GROUP_SIZE_M"],
        config["NUM_WARPS"],
        config["NUM_STAGES"],
        config["SPLIT_K"],
        config["BLOCK_SIZE_M"],
        config["BLOCK_SIZE_N"],
        config["BLOCK_SIZE_K"],
        config["GROUP_SIZE_M"],
        config["NUM_WARPS"],
        config["NUM_STAGES"],
        config["SPLIT_K"],
        mul_routed_weight,
        fully_sharded=fully_sharded,
        offset=offset,
    )
|
||||
|
||||
|
||||
def use_torch(
    hidden_states,
    token_lora_mapping,
    topk_ids,
    lora_a_stacked,
    lora_b_stacked,
    top_k_num,
):
    """Token-by-token LoRA-MoE reference computation in plain PyTorch.

    For token ``i`` and its ``k``-th routed expert, computes
    ``hidden_states[i] @ A.T @ B.T`` using the LoRA slot selected by
    ``token_lora_mapping[i]``. Returns a tensor of shape
    ``[num_tokens, top_k_num, N]``.
    """
    per_token = []
    num_tokens = hidden_states.shape[0]
    for token_idx in range(num_tokens):
        slot = token_lora_mapping[token_idx]
        experts = topk_ids[token_idx]
        a_weights = lora_a_stacked[0][slot][experts]
        b_weights = lora_b_stacked[0][slot][experts]
        x = hidden_states[token_idx]
        stacked = torch.stack(
            [x @ a_weights[k].T @ b_weights[k].T for k in range(top_k_num)],
            dim=0,
        )
        per_token.append(stacked)
    return torch.stack(per_token, dim=0)
|
||||
|
||||
|
||||
# Parameter space shared by the kernel tests below.
DTYPES = [torch.float16, torch.bfloat16]
DEVICES = [f"cuda:{0}"]
SEED = [42]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", [100])
@pytest.mark.parametrize("top_k_num", [6, 12])
@pytest.mark.parametrize("num_experts", [64])
@pytest.mark.parametrize("max_loras", [4, 6, 16])
@pytest.mark.parametrize("N", [1408])
@pytest.mark.parametrize("K", [2048])
@pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_fused_moe_lora_kernel(
    num_tokens,
    top_k_num,
    num_experts,
    max_loras,
    N,
    K,
    max_lora_rank,
    block_size,
    dtype,
    device,
    seed,
):
    """Compare the fused MoE LoRA kernel against the token-by-token PyTorch
    reference (``use_torch``) on random routing/weights."""
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
    # the number of randomly generated sentences.
    num_sequences = 10
    # generate data
    topk_ids, topk_weights, token_lora_mapping = sample_data(
        num_tokens, num_sequences, max_loras, num_experts, top_k_num
    )

    # init lora weights
    # lora_a: [max_loras, num_experts, rank, K] (shrink);
    # lora_b: [max_loras, num_experts, N, rank] (expand).
    lora_a_stacked = [
        torch.rand(
            (
                max_loras,
                num_experts,
                max_lora_rank,
                K,
            ),
            dtype=dtype,
        )
    ]
    lora_b_stacked = [
        torch.rand(
            (
                max_loras,
                num_experts,
                N,
                max_lora_rank,
            ),
            dtype=dtype,
        )
    ]
    hidden_states = torch.rand(
        (
            num_tokens,
            K,
        ),
        dtype=dtype,
    )

    # fused_moe_lora_kernel output
    output = torch.zeros((num_tokens, top_k_num, N), dtype=dtype)
    use_fused_moe_lora_kernel(
        topk_ids,
        topk_weights,
        token_lora_mapping,
        max_lora_rank,
        top_k_num,
        lora_a_stacked,
        lora_b_stacked,
        hidden_states,
        output,
        max_loras,
        num_experts,
        block_size,
    )
    # pytorch output
    output2 = use_torch(
        hidden_states,
        token_lora_mapping,
        topk_ids,
        lora_a_stacked,
        lora_b_stacked,
        top_k_num,
    )

    # Loose tolerances: fp16/bf16 accumulation order differs between the
    # fused kernel and the reference loop.
    torch.testing.assert_close(output, output2, atol=1e-1, rtol=1e-1)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("num_tokens", [100])
@pytest.mark.parametrize("top_k_num", [6])
@pytest.mark.parametrize("num_experts", [64])
@pytest.mark.parametrize("max_loras", [4])
@pytest.mark.parametrize("N", [1408])
@pytest.mark.parametrize("K", [2048])
@pytest.mark.parametrize("max_lora_rank", [16, 32, 64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("column_parallel", [True, False])
def test_fused_moe_lora_kernel_fully_sharded(
    num_tokens,
    top_k_num,
    num_experts,
    max_loras,
    N,
    K,
    max_lora_rank,
    block_size,
    dtype,
    seed,
    column_parallel,
):
    """Exercise the fully-sharded fused MoE LoRA path on 2 GPUs for both the
    column-parallel and row-parallel sharding schemes."""
    current_platform.seed_everything(seed)
    # the number of randomly generated sentences.
    num_sequences = 10
    # generate data (shared across worker processes via spawn args)
    topk_ids, topk_weights, token_lora_mapping = sample_data(
        num_tokens, num_sequences, max_loras, num_experts, top_k_num
    )

    def run_torch_spawn(fn, nprocs):
        # One worker process per GPU; the tensors above are pickled into
        # each worker through the spawn args.
        torch.multiprocessing.spawn(
            fn,
            args=(
                nprocs,
                f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}",
                dtype,
                seed,
                N,
                K,
                num_tokens,
                topk_ids,
                topk_weights,
                token_lora_mapping,
                max_lora_rank,
                top_k_num,
                max_loras,
                num_experts,
                block_size,
                column_parallel,
            ),
            nprocs=nprocs,
        )

    run_torch_spawn(use_fused_moe_lora_kernel_tensor_parallel, nprocs=2)
|
||||
|
||||
|
||||
def use_fused_moe_lora_kernel_tensor_parallel(
    local_rank,
    world_size,
    init_method,
    dtype,
    seed,
    N,
    K,
    num_tokens,
    topk_ids,
    topk_weights,
    token_lora_mapping,
    max_lora_rank,
    top_k_num,
    max_loras,
    num_experts,
    block_size,
    column_parallel,
):
    """Per-GPU worker: run the fully-sharded kernel on this rank's weight
    shard and check the gathered/reduced result against the unsharded
    PyTorch reference."""

    def _get_shard_slice(shard_size):
        # This rank's contiguous shard along the partitioned dimension.
        return slice(local_rank * shard_size, (local_rank + 1) * shard_size)

    current_platform.seed_everything(seed)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    init_distributed_environment(
        world_size=world_size,
        rank=local_rank,
        local_rank=local_rank,
        distributed_init_method=init_method,
    )
    initialize_model_parallel(world_size, 1)
    tp_size = get_tensor_model_parallel_world_size()

    # Column-parallel layers consume K and produce N; row-parallel layers
    # are the reverse.
    input_dim = K if column_parallel else N
    output_dim = N if column_parallel else K

    # init lora weights
    # Every rank seeds identically above, so the full (unsharded) weights
    # are the same on all ranks before slicing.
    lora_a = torch.rand(
        (
            max_loras,
            num_experts,
            max_lora_rank,
            input_dim,
        ),
        dtype=dtype,
    )
    lora_b = torch.rand(
        (
            max_loras,
            num_experts,
            output_dim,
            max_lora_rank,
        ),
        dtype=dtype,
    )

    hidden_states = torch.rand(
        (
            num_tokens,
            input_dim,
        ),
        dtype=dtype,
    )

    output = torch.zeros((num_tokens, top_k_num, output_dim), dtype=dtype)
    topk_ids = topk_ids.to(device)
    topk_weights = topk_weights.to(device)
    token_lora_mapping = token_lora_mapping.to(device)

    # Reference result from the full weights, computed before sharding.
    ref_output = use_torch(
        hidden_states,
        token_lora_mapping,
        topk_ids,
        [lora_a],
        [lora_b],
        top_k_num,
    )

    if column_parallel:
        # Column parallel (e.g. gate_up_proj): LoRA A is sliced along the rank dim,
        # and Lora B is sliced along the output dim
        lora_a_shard_size = max_lora_rank // tp_size
        lora_a = lora_a[:, :, _get_shard_slice(lora_a_shard_size), :]
        max_lora_rank = lora_a_shard_size
        offset = 0

        lora_b_shard_size = output_dim // tp_size
        lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :]
        output = output[:, :, _get_shard_slice(lora_b_shard_size)].contiguous()
    else:
        # Row parallel (e.g. down proj): LoRA A is sliced along the input dim,
        # and LoRA B is sliced along the output dim
        lora_a_shard_size = input_dim // tp_size
        lora_a = lora_a[:, :, :, _get_shard_slice(lora_a_shard_size)]
        hidden_states = hidden_states[:, _get_shard_slice(lora_a_shard_size)]

        lora_b_shard_size = output_dim // tp_size
        lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :]
        offset = lora_b_shard_size * local_rank

    use_fused_moe_lora_kernel(
        topk_ids,
        topk_weights,
        token_lora_mapping,
        max_lora_rank,
        top_k_num,
        [lora_a],
        [lora_b],
        hidden_states,
        output,
        max_loras,
        num_experts,
        block_size,
        fully_sharded=True,
        offset=offset,
    )

    # Column-parallel shards partition the output dim -> all-gather;
    # row-parallel shards each hold a partial sum -> all-reduce.
    if column_parallel:
        output = tensor_model_parallel_all_gather(output)
    else:
        output = tensor_model_parallel_all_reduce(output)

    torch.testing.assert_close(output, ref_output, atol=1e-1, rtol=1e-1)
|
||||
@@ -1,46 +0,0 @@
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "google/gemma-7b"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int) -> str:
    """Greedy-decode three quote prompts, optionally with a LoRA attached."""
    quote_prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
        "Quote: So many books,",
    ]
    params = vllm.SamplingParams(temperature=0, max_tokens=32)
    request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None

    outputs = llm.generate(quote_prompts, params, lora_request=request)

    generated_texts = []
    for request_output in outputs:
        text = request_output.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {request_output.prompt!r}, Generated text: {text!r}")
    return generated_texts
|
||||
|
||||
|
||||
def test_gemma_lora(gemma_lora_files):
    """Serve gemma-7b with LoRA and check both adapter ids finish the quotes."""
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
    )

    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
        "so little time\nAuthor: Frank Zappa\n",
    ]

    # Both adapter ids point at the same files, so both must match.
    for lora_id in (1, 2):
        generated = do_sample(llm, gemma_lora_files, lora_id=lora_id)
        for text, expected in zip(generated, expected_lora_output):
            assert text.startswith(expected)
|
||||
106
tests/lora/test_gptoss_tp.py
Normal file
106
tests/lora/test_gptoss_tp.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODEL_PATH = "openai/gpt-oss-20b"

# gpt-oss "harmony" chat template; {context} receives the natural-language
# question for the text-to-SQL task.
PROMPT_TEMPLATE = """<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-10-29

Reasoning: medium

# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
"
##Instruction:
farm contains tables such as city, farm, farm_competition, competition_record. Table city has columns such as City_ID, Official_Name, Status, Area_km_2, Population, Census_Ranking. City_ID is the primary key.
Table farm has columns such as Farm_ID, Year, Total_Horses, Working_Horses, Total_Cattle, Oxen, Bulls, Cows, Pigs, Sheep_and_Goats. Farm_ID is the primary key.
Table farm_competition has columns such as Competition_ID, Year, Theme, Host_city_ID, Hosts. Competition_ID is the primary key.
Table competition_record has columns such as Competition_ID, Farm_ID, Rank. Competition_ID is the primary key.
The Host_city_ID of farm_competition is the foreign key of City_ID of city.
The Farm_ID of competition_record is the foreign key of Farm_ID of farm.
The Competition_ID of competition_record is the foreign key of Competition_ID of farm_competition.


###Input:
{context}

###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501

# Greedy-decoded SQL expected from the adapter (one entry per test prompt).
EXPECTED_LORA_OUTPUT = [
    "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
    "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
]
|
||||
|
||||
|
||||
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
    """Generate SQL for three questions and check the LoRA answers."""
    contexts = [
        "Give the average number of working horses on farms with more than 5000 total horses.",  # noqa: E501
        "What are the maximum and minimum number of cows across all farms.",
        "Return the maximum and minimum number of cows across all farms.",
    ]
    prompts = [PROMPT_TEMPLATE.format(context=c) for c in contexts]
    params = vllm.SamplingParams(temperature=0, max_tokens=64)
    request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None

    outputs = llm.generate(prompts, params, lora_request=request)

    # Print the outputs.
    generated_texts: list[str] = []
    for request_output in outputs:
        text = request_output.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {request_output.prompt!r}, Generated text: {text!r}")

    for expected, actual in zip(EXPECTED_LORA_OUTPUT, generated_texts):
        assert actual.startswith(expected)
|
||||
|
||||
|
||||
def test_gpt_oss_lora(gptoss20b_lora_files):
    """Single-GPU gpt-oss-20b LoRA smoke test with two adapter ids."""
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=8,
        # Avoid OOM by not specializing cudagraphs per-LoRA.
        compilation_config=vllm.config.CompilationConfig(
            cudagraph_specialize_lora=False,
        ),
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for lora_id in (1, 2):
        generate_and_test(llm, gptoss20b_lora_files, lora_id=lora_id)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
    """TP=2 gpt-oss-20b LoRA test, with and without fully-sharded LoRAs."""
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=8,
        max_num_seqs=16,
        tensor_parallel_size=2,
        fully_sharded_loras=fully_sharded_loras,
        # Avoid OOM by not specializing cudagraphs per-LoRA.
        compilation_config=vllm.config.CompilationConfig(
            cudagraph_specialize_lora=False,
        ),
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    for lora_id in (1, 2):
        generate_and_test(llm, gptoss20b_lora_files, lora_id=lora_id)
|
||||
@@ -1,106 +0,0 @@
|
||||
import tempfile
|
||||
from random import sample
|
||||
from typing import List, Optional
|
||||
|
||||
import peft
|
||||
import pytest
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
# Meaning-representation prompts (ViGGO-style): the model must emit a single
# function with attributes describing the target sentence.
PROMPTS = [
    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
    "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
]
|
||||
|
||||
|
||||
def get_lora_model(model_id: str, target_modules: List[str], rank: int):
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id)
|
||||
lora_config = peft.tuners.lora.LoraConfig(target_modules, rank)
|
||||
lora_model = peft.PeftModel(model, lora_config)
|
||||
return lora_model
|
||||
|
||||
|
||||
def do_sample(llm,
|
||||
lora_path: Optional[str] = None,
|
||||
lora_id: Optional[int] = None,
|
||||
logprobs: int = 0,
|
||||
n_tokens: int = 256):
|
||||
prompts = PROMPTS
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=n_tokens,
|
||||
logprobs=logprobs,
|
||||
stop=["[/assistant]"])
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts = []
|
||||
generated_logprobs = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
generated_logprobs.append([
|
||||
list(logprob.keys()) for out in output.outputs
|
||||
for logprob in out.logprobs
|
||||
])
|
||||
return generated_logprobs if logprobs else generated_texts
|
||||
|
||||
|
||||
SUPPORTED_MODULES = [
|
||||
"qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
|
||||
"lm_head"
|
||||
]
|
||||
TARGET_MODULES_LIST = []
|
||||
for length in range(2, 6):
|
||||
TARGET_MODULES_LIST.extend(
|
||||
[sample(SUPPORTED_MODULES, length) for _ in range(3)])
|
||||
|
||||
|
||||
# Test the correctness when layer and rank are varied
|
||||
# step 1: init a base model and serve with LoRA to get the reference results
|
||||
# step 2: merge the same LoRA to the base model, serve the merged model
|
||||
# step 3: compare the results from step 1 and step 2
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST)
|
||||
@pytest.mark.parametrize("rank", [8, 16, 32, 64])
|
||||
def test_layer_variation_correctness(tp_size, target_modules, rank):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=tp_size,
|
||||
worker_use_ray=True)
|
||||
model = get_lora_model(MODEL_PATH, target_modules, rank)
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
model.save_pretrained(tmpdir)
|
||||
merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32)
|
||||
del llm
|
||||
cleanup()
|
||||
reference_id_sets = [set(prob[0]) for prob in merged_probs]
|
||||
|
||||
model = get_lora_model(MODEL_PATH, target_modules, rank)
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
merged_model = model.merge_and_unload()
|
||||
merged_model.save_pretrained(tmpdir)
|
||||
llm = vllm.LLM(tmpdir,
|
||||
tokenizer=MODEL_PATH,
|
||||
enable_lora=False,
|
||||
max_num_seqs=16,
|
||||
tensor_parallel_size=tp_size,
|
||||
worker_use_ray=True)
|
||||
probs = do_sample(llm, logprobs=5, n_tokens=32)
|
||||
del llm
|
||||
cleanup()
|
||||
# verify the top-5 tokens are identical for each token
|
||||
id_sets = [set(prob[0]) for prob in probs]
|
||||
assert id_sets == reference_id_sets
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,148 +0,0 @@
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int):
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=256,
|
||||
stop=["[/assistant]"])
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_llama_lora(sql_lora_files, tp_size):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < tp_size:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=tp_size)
|
||||
|
||||
expected_no_lora_output = [
|
||||
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501
|
||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501
|
||||
"\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501
|
||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501
|
||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501
|
||||
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501
|
||||
]
|
||||
expected_lora_output = [
|
||||
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
|
||||
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501
|
||||
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
|
||||
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
|
||||
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501
|
||||
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501
|
||||
]
|
||||
|
||||
print("lora adapter created")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
|
||||
|
||||
print("lora 1")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output
|
||||
|
||||
print("no lora")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
|
||||
|
||||
print("lora 2")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output
|
||||
|
||||
print("removing lora")
|
||||
|
||||
|
||||
@pytest.mark.skip("Requires multiple GPUs")
|
||||
def test_llama_tensor_parallel_equality(sql_lora_files):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < 4:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
|
||||
|
||||
llm_tp1 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1)
|
||||
output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp1
|
||||
cleanup()
|
||||
|
||||
llm_tp2 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=2)
|
||||
output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp2
|
||||
cleanup()
|
||||
|
||||
assert output_tp1 == output_tp2
|
||||
|
||||
llm_tp4 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=4)
|
||||
output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp4
|
||||
cleanup()
|
||||
|
||||
assert output_tp1 == output_tp4
|
||||
|
||||
|
||||
def test_llama_lora_warmup(sql_lora_files):
|
||||
"""Test that the LLM initialization works with a warmup LORA path and
|
||||
is more conservative"""
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
def get_num_gpu_blocks_lora():
|
||||
llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
|
||||
num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
|
||||
return num_gpu_blocks_lora_warmup
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
def get_num_gpu_blocks_no_lora():
|
||||
llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
|
||||
num_gpu_blocks_no_lora_warmup = (
|
||||
llm.llm_engine.cache_config.num_gpu_blocks)
|
||||
return num_gpu_blocks_no_lora_warmup
|
||||
|
||||
num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
|
||||
num_gpu_blocks_no_lora_warmup = ray.get(
|
||||
get_num_gpu_blocks_no_lora.remote())
|
||||
assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
|
||||
"The warmup with lora should be more "
|
||||
"conservative than without lora, therefore the number of "
|
||||
"memory blocks for the KV cache should be "
|
||||
"less when using lora than when not using lora")
|
||||
231
tests/lora/test_llama_tp.py
Normal file
231
tests/lora/test_llama_tp.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
import vllm.config
|
||||
from vllm import LLM
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
|
||||
from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
PROMPT_TEMPLATE = """<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
|
||||
"
|
||||
##Instruction:
|
||||
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
|
||||
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
|
||||
The People_ID of candidate is the foreign key of People_ID of people.
|
||||
###Input:
|
||||
{context}
|
||||
###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
""" # noqa: E501
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
"SELECT count(*) FROM candidate",
|
||||
"SELECT count(*) FROM candidate",
|
||||
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
]
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct"
|
||||
|
||||
|
||||
def do_sample(
|
||||
llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
tensorizer_config_dict: dict | None = None,
|
||||
) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(context="How many candidates are there?"),
|
||||
PROMPT_TEMPLATE.format(context="Count the number of candidates."),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Which poll resource provided the most number of candidate information?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Return the poll resource associated with the most candidates."
|
||||
),
|
||||
]
|
||||
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, max_tokens=64, stop=["<|im_end|>"]
|
||||
)
|
||||
if tensorizer_config_dict is not None:
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(
|
||||
str(lora_id),
|
||||
lora_id,
|
||||
lora_path,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
)
|
||||
if lora_id
|
||||
else None,
|
||||
)
|
||||
else:
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id
|
||||
else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
def generate_and_test(
|
||||
llm, llama32_lora_files, tensorizer_config_dict: dict | None = None
|
||||
):
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
assert (
|
||||
do_sample(
|
||||
llm,
|
||||
llama32_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=1,
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
|
||||
print("lora 2")
|
||||
assert (
|
||||
do_sample(
|
||||
llm,
|
||||
llama32_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=2,
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
|
||||
print("removing lora")
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("cudagraph_specialize_lora", [True, False])
|
||||
def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=7,
|
||||
max_model_len=1024,
|
||||
max_loras=4,
|
||||
compilation_config=vllm.config.CompilationConfig(
|
||||
cudagraph_specialize_lora=cudagraph_specialize_lora,
|
||||
),
|
||||
)
|
||||
generate_and_test(llm, llama32_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_llama_lora_tp4(llama32_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=7,
|
||||
max_model_len=1024,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
generate_and_test(llm, llama32_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_llama_lora_tp4_fully_sharded_loras(llama32_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=8,
|
||||
max_loras=4,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=4,
|
||||
fully_sharded_loras=True,
|
||||
)
|
||||
generate_and_test(llm, llama32_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_tp2_serialize_and_deserialize_lora(
|
||||
tmp_path,
|
||||
llama32_lora_files,
|
||||
):
|
||||
# Run the tensorizing of the LoRA adapter and the model in a subprocess
|
||||
# to guarantee cleanup
|
||||
|
||||
tp_size = 2
|
||||
model_name = "model-rank-%03d.tensors"
|
||||
|
||||
model_ref = MODEL_PATH
|
||||
lora_path = llama32_lora_files
|
||||
suffix = "test"
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
|
||||
"--model",
|
||||
MODEL_PATH,
|
||||
"--lora-path",
|
||||
lora_path,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"serialize",
|
||||
"--serialized-directory",
|
||||
str(tmp_path),
|
||||
"--suffix",
|
||||
suffix,
|
||||
"--serialization-kwargs",
|
||||
'{"limit_cpu_concurrency": 4}',
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("Tensorizing failed.")
|
||||
print("STDOUT:\n", e.stdout)
|
||||
print("STDERR:\n", e.stderr)
|
||||
raise
|
||||
|
||||
print("STDOUT:\n", result.stdout)
|
||||
|
||||
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
|
||||
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
|
||||
|
||||
loaded_llm = LLM(
|
||||
model=model_ref,
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
enforce_eager=True,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
max_num_seqs=7,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
max_loras=2,
|
||||
)
|
||||
|
||||
tc_as_dict = tensorizer_config.to_serializable()
|
||||
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
assert (
|
||||
do_sample(
|
||||
loaded_llm, llama32_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
187
tests/lora/test_llm_with_multi_loras.py
Normal file
187
tests/lora/test_llm_with_multi_loras.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This script contains:
|
||||
1. test multi loras service with tp >= 2
|
||||
2. test multi loras request
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
LORA_NAME_PATH_MAP = {
|
||||
"Alice": "charent/self_cognition_Alice",
|
||||
"Bob": "charent/self_cognition_Bob",
|
||||
"Cat": "charent/self_cognition_Bob", # same as Bob
|
||||
}
|
||||
|
||||
LORA_NAME_ID_MAP = {}
|
||||
INCREASE_LORA_ID = 0
|
||||
LORA_RANK = 8
|
||||
|
||||
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
|
||||
LORA_TEST_EXPECTED = [
|
||||
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
|
||||
"I am Alice, an AI assistant developed by GitHub/Charent.",
|
||||
]
|
||||
|
||||
|
||||
def format_chatml_messages(prompt: str):
|
||||
return [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": prompt},
|
||||
]
|
||||
|
||||
|
||||
def make_add_lora_request(name: str, path: str):
|
||||
global INCREASE_LORA_ID, LORA_NAME_ID_MAP
|
||||
|
||||
INCREASE_LORA_ID += 1
|
||||
LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID
|
||||
|
||||
return LoRARequest(
|
||||
lora_name=name,
|
||||
lora_int_id=INCREASE_LORA_ID,
|
||||
lora_path=path,
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_multi_loras_with_tp_sync():
|
||||
llm = LLM(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=2, # ensure max_loras < max_cpu_loras
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=512,
|
||||
gpu_memory_utilization=0.5,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=2, # ensure tp >= 2
|
||||
max_cpu_loras=4, # ensure max_cpu_loras >= 2
|
||||
)
|
||||
|
||||
def run_check_lora(fn, args, expected: list):
|
||||
fn(args)
|
||||
assert set(llm.llm_engine.list_loras()) == set(expected)
|
||||
|
||||
# simulate add loras with CLI args
|
||||
# likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob`
|
||||
run_check_lora(
|
||||
llm.llm_engine.add_lora,
|
||||
make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]),
|
||||
[1],
|
||||
)
|
||||
run_check_lora(
|
||||
llm.llm_engine.add_lora,
|
||||
make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]),
|
||||
[1, 2],
|
||||
)
|
||||
run_check_lora(
|
||||
llm.llm_engine.add_lora,
|
||||
make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]),
|
||||
[1, 2, 3],
|
||||
)
|
||||
|
||||
# set temperature = 0 for greedy search
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=64)
|
||||
|
||||
def call_llm_get_outputs(prompt: str, lora_name: str):
|
||||
lora_request = LoRARequest(
|
||||
lora_name=lora_name,
|
||||
lora_int_id=LORA_NAME_ID_MAP[lora_name],
|
||||
lora_path=LORA_NAME_PATH_MAP[lora_name],
|
||||
)
|
||||
messages = format_chatml_messages(prompt)
|
||||
outputs = llm.chat(
|
||||
[messages],
|
||||
sampling_params,
|
||||
chat_template_kwargs={
|
||||
"enable_thinking": False
|
||||
}, # for those loras, ensure enable_thinking=False
|
||||
lora_request=lora_request,
|
||||
use_tqdm=False,
|
||||
)
|
||||
output_text = outputs[0].outputs[0].text
|
||||
return output_text
|
||||
|
||||
def reload_lora(name: str):
|
||||
"""
|
||||
reload a lora to simulate the case:
|
||||
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
|
||||
for dynamic lora loading and unloading
|
||||
"""
|
||||
remove_lora_response = llm.llm_engine.remove_lora(
|
||||
lora_id=LORA_NAME_ID_MAP[name]
|
||||
)
|
||||
|
||||
add_lora_response = llm.llm_engine.add_lora(
|
||||
make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
|
||||
)
|
||||
|
||||
print(f"{remove_lora_response=}, {add_lora_response=}")
|
||||
|
||||
def check_outputs(outputs: str, expected: str):
|
||||
print(f"{prompt=}.\n{expected_output=}\n{output_text=}")
|
||||
print("\n----------------------------\n")
|
||||
assert outputs == expected
|
||||
|
||||
for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
|
||||
output_text = call_llm_get_outputs(prompt, "Alice")
|
||||
check_outputs(output_text, expected_output)
|
||||
|
||||
# call Bob, ignore what it is output
|
||||
call_llm_get_outputs(prompt, "Bob")
|
||||
print("After call Bob:")
|
||||
|
||||
# call Alice
|
||||
output_text = call_llm_get_outputs(prompt, "Alice")
|
||||
check_outputs(output_text, expected_output)
|
||||
|
||||
# reload Bob Lora
|
||||
reload_lora("Bob")
|
||||
print("After reload Bob:")
|
||||
|
||||
# call Alice
|
||||
output_text = call_llm_get_outputs(prompt, "Alice")
|
||||
check_outputs(output_text, expected_output)
|
||||
|
||||
# reload Alice Lora
|
||||
reload_lora("Alice")
|
||||
print("After reload Alice:")
|
||||
|
||||
output_text = call_llm_get_outputs(prompt, "Alice")
|
||||
check_outputs(output_text, expected_output)
|
||||
|
||||
|
||||
def test_multiple_lora_requests():
|
||||
llm = LLM(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=512,
|
||||
gpu_memory_utilization=0.5,
|
||||
enforce_eager=True,
|
||||
)
|
||||
PROMPTS = ["Hello, my name is"] * 2
|
||||
LORA_NAME = "Alice"
|
||||
lora_request = [
|
||||
LoRARequest(LORA_NAME + str(idx), idx + 1, LORA_NAME_PATH_MAP[LORA_NAME])
|
||||
for idx in range(len(PROMPTS))
|
||||
]
|
||||
# Multiple SamplingParams should be matched with each prompt
|
||||
outputs = llm.generate(PROMPTS, lora_request=lora_request)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
# Exception raised, if the size of params does not match the size of prompts
|
||||
with pytest.raises(ValueError):
|
||||
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
|
||||
|
||||
# Single LoRARequest should be applied to every prompt
|
||||
single_lora_request = lora_request[0]
|
||||
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
@@ -1,224 +0,0 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
|
||||
|
||||
from .utils import DummyLoRAManager
|
||||
|
||||
TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
|
||||
QKV_TENSOR_SIZES = [
|
||||
(8192, 1024, 1024),
|
||||
(8192 // 8, 1024 // 8, 1024 // 8),
|
||||
(4096, 4096, 4096),
|
||||
(4096 // 2, 4096 // 2, 4096 // 2),
|
||||
]
|
||||
BATCH_SIZES = [8, 32, 256]
|
||||
RANKS = [8]
|
||||
DTYPES = [torch.float16]
|
||||
TOLERANCES = {
|
||||
torch.float16: (5e-3, 5e-3),
|
||||
torch.bfloat16: (3e-2, 2e-2),
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("n", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("k", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("rank", RANKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
def test_apply_lora(m, n, k, rank, dtype) -> None:
|
||||
manager = DummyLoRAManager()
|
||||
|
||||
module_name = "module"
|
||||
weight = torch.rand([m, n], device="cuda", dtype=dtype)
|
||||
|
||||
manager.init_random_lora(module_name, weight, rank=rank)
|
||||
lora = manager.get_module_lora(module_name)
|
||||
|
||||
input = torch.rand(k, n, device="cuda", dtype=dtype)
|
||||
expected = input @ lora.lora_a @ lora.lora_b * lora.scaling
|
||||
|
||||
lora_a_stack = torch.zeros(8,
|
||||
1,
|
||||
lora.lora_a.shape[1],
|
||||
lora.lora_a.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype)
|
||||
lora_b_stack = torch.zeros(8,
|
||||
1,
|
||||
lora.lora_b.shape[1],
|
||||
lora.lora_b.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype)
|
||||
for i in range(lora_a_stack.shape[0]):
|
||||
lora_a_stack[i][0] = lora.lora_a.T
|
||||
lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T
|
||||
|
||||
output = torch.zeros(k, m, device="cuda", dtype=dtype)
|
||||
_apply_lora(
|
||||
input, lora_a_stack, lora_b_stack,
|
||||
torch.randint(0, lora_a_stack.shape[0], (len(input), ), device="cuda"),
|
||||
output)
|
||||
|
||||
rtol, atol = TOLERANCES[dtype]
|
||||
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
|
||||
|
||||
output[:] = 0
|
||||
_apply_lora(input, lora_a_stack, lora_b_stack,
|
||||
torch.full((len(input), ), -1, device="cuda"), output)
|
||||
assert torch.allclose(torch.zeros_like(output), output)
|
||||
|
||||
manager.reset_lora()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("n", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("k", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("rank", RANKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
|
||||
if m % 2 != 0:
|
||||
pytest.skip("m must be divisible by 2")
|
||||
if m // 2 not in TENSOR_SIZES:
|
||||
pytest.skip("m//2 must be in TENSOR_SIZES")
|
||||
|
||||
manager = DummyLoRAManager()
|
||||
|
||||
module_name = "module"
|
||||
weight = torch.rand([m // 2, n], device="cuda", dtype=dtype)
|
||||
|
||||
manager.init_random_lora(module_name + "1", weight, rank=rank)
|
||||
lora_1 = manager.get_module_lora(module_name + "1")
|
||||
manager.init_random_lora(module_name + "2", weight, rank=rank)
|
||||
lora_2 = manager.get_module_lora(module_name + "2")
|
||||
|
||||
input = torch.rand(k, n, device="cuda", dtype=dtype)
|
||||
expected = torch.cat([
|
||||
input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling,
|
||||
input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling
|
||||
],
|
||||
dim=1)
|
||||
|
||||
lora_a_stacks = [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_1.lora_a.shape[1],
|
||||
lora_1.lora_a.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype) for i in range(2)
|
||||
]
|
||||
lora_b_stacks = [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_1.lora_b.shape[1],
|
||||
lora_1.lora_b.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype) for i in range(2)
|
||||
]
|
||||
for i in range(lora_a_stacks[0].shape[0]):
|
||||
lora_a_stacks[0][i][0] = lora_1.lora_a.T
|
||||
lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T
|
||||
lora_a_stacks[1][i][0] = lora_2.lora_a.T
|
||||
lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T
|
||||
|
||||
output = torch.zeros(k, m, device="cuda", dtype=dtype)
|
||||
_apply_lora_packed_nslice(
|
||||
input, lora_a_stacks, lora_b_stacks,
|
||||
torch.randint(0,
|
||||
lora_a_stacks[0].shape[0], (len(input), ),
|
||||
device="cuda"), output, (m // 2, m // 2))
|
||||
|
||||
rtol, atol = TOLERANCES[dtype]
|
||||
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
|
||||
|
||||
output[:] = 0
|
||||
_apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
|
||||
torch.full((len(input), ), -1, device="cuda"),
|
||||
output, (m // 2, m // 2))
|
||||
assert torch.allclose(torch.zeros_like(output), output)
|
||||
|
||||
manager.reset_lora()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("n", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("k", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("rank", RANKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
|
||||
manager = DummyLoRAManager()
|
||||
|
||||
module_name = "module"
|
||||
weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
|
||||
weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)
|
||||
|
||||
manager.init_random_lora(module_name + "q", weight_q, rank=rank)
|
||||
lora_q = manager.get_module_lora(module_name + "q")
|
||||
manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
|
||||
lora_k = manager.get_module_lora(module_name + "k")
|
||||
manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
|
||||
lora_v = manager.get_module_lora(module_name + "v")
|
||||
|
||||
input = torch.rand(k, n, device="cuda", dtype=dtype)
|
||||
expected = torch.cat([
|
||||
input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling,
|
||||
input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling,
|
||||
input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling
|
||||
],
|
||||
dim=1)
|
||||
|
||||
lora_a_stacks = [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_q.lora_a.shape[1],
|
||||
lora_q.lora_a.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype)
|
||||
] + [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_k.lora_a.shape[1],
|
||||
lora_k.lora_a.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype) for i in range(2)
|
||||
]
|
||||
lora_b_stacks = [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_q.lora_b.shape[1],
|
||||
lora_q.lora_b.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype)
|
||||
] + [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_k.lora_b.shape[1],
|
||||
lora_k.lora_b.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype) for i in range(2)
|
||||
]
|
||||
for i in range(lora_a_stacks[0].shape[0]):
|
||||
lora_a_stacks[0][i][0] = lora_q.lora_a.T
|
||||
lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T
|
||||
lora_a_stacks[1][i][0] = lora_k.lora_a.T
|
||||
lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T
|
||||
lora_a_stacks[2][i][0] = lora_v.lora_a.T
|
||||
lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T
|
||||
|
||||
output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)
|
||||
_apply_lora_packed_nslice(
|
||||
input, lora_a_stacks, lora_b_stacks,
|
||||
torch.randint(0,
|
||||
lora_a_stacks[0].shape[0], (len(input), ),
|
||||
device="cuda"), output, (qkv[0], qkv[1], qkv[2]))
|
||||
|
||||
rtol, atol = TOLERANCES[dtype]
|
||||
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
|
||||
|
||||
output[:] = 0
|
||||
_apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
|
||||
torch.full((len(input), ), -1, device="cuda"),
|
||||
output, (qkv[0], qkv[1], qkv[2]))
|
||||
assert torch.allclose(torch.zeros_like(output), output)
|
||||
|
||||
manager.reset_lora()
|
||||
@@ -1,9 +1,20 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.lora.models import LoRAModel
|
||||
from vllm.lora.lora_model import LoRAModel
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
|
||||
lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
|
||||
lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
|
||||
BAICHUAN_LORA_MODULES = [
|
||||
"W_pack",
|
||||
"o_proj",
|
||||
"gate_up_proj",
|
||||
"down_proj",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lora_name", lora_lst)
|
||||
@@ -11,48 +22,109 @@ def test_load_checkpoints(
|
||||
lora_name,
|
||||
baichuan_lora_files,
|
||||
baichuan_zero_lora_files,
|
||||
baichuan_regex_lora_files,
|
||||
chatglm3_lora_files,
|
||||
):
|
||||
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
|
||||
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
|
||||
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
|
||||
embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
|
||||
expected_lora_modules = []
|
||||
for module in supported_lora_modules:
|
||||
|
||||
expected_lora_lst: list[str] = []
|
||||
for module in BAICHUAN_LORA_MODULES:
|
||||
if module in packed_modules_mapping:
|
||||
expected_lora_modules.extend(packed_modules_mapping[module])
|
||||
expected_lora_lst.extend(packed_modules_mapping[module])
|
||||
else:
|
||||
expected_lora_modules.append(module)
|
||||
expected_lora_lst.append(module)
|
||||
expected_lora_modules = set(expected_lora_lst)
|
||||
if lora_name == "baichuan7B":
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
# For the baichuan7B model, load it's LoRA,
|
||||
# and the test should pass.
|
||||
LoRAModel.from_local_checkpoint(
|
||||
baichuan_lora_files,
|
||||
expected_lora_modules,
|
||||
peft_helper=peft_helper,
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
model_vocab_size=64000,
|
||||
)
|
||||
elif lora_name == "baichuan7B-zero":
|
||||
#Test that the target_modules contain prefix
|
||||
# Test that the target_modules contain prefix
|
||||
# such as "model.layers.0.self_atten.W_pack", and
|
||||
# the test should pass.
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_zero_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
LoRAModel.from_local_checkpoint(
|
||||
baichuan_zero_lora_files,
|
||||
expected_lora_modules,
|
||||
peft_helper=peft_helper,
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
model_vocab_size=64000,
|
||||
)
|
||||
elif lora_name == "baichuan7B-zero-regex":
|
||||
# Test that the `target_modules` in the form of regular expressions,
|
||||
# such as `model\\..*(W_pack|o_proj)`, and the test should pass.
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_regex_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
LoRAModel.from_local_checkpoint(
|
||||
baichuan_regex_lora_files,
|
||||
expected_lora_modules,
|
||||
peft_helper=peft_helper,
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
model_vocab_size=64000,
|
||||
)
|
||||
else:
|
||||
# For the baichuan7B model, load chatglm3-6b's LoRA,
|
||||
# and the test should raise the following error.
|
||||
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
chatglm3_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
with pytest.raises(ValueError, match=expected_error):
|
||||
LoRAModel.from_local_checkpoint(
|
||||
chatglm3_lora_files,
|
||||
expected_lora_modules,
|
||||
peft_helper=peft_helper,
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
model_vocab_size=64000,
|
||||
)
|
||||
|
||||
|
||||
def test_lora_weights_mapping(baichuan_lora_files):
|
||||
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
|
||||
|
||||
expected_lora_lst: list[str] = []
|
||||
for module in BAICHUAN_LORA_MODULES:
|
||||
if module in packed_modules_mapping:
|
||||
expected_lora_lst.extend(packed_modules_mapping[module])
|
||||
else:
|
||||
expected_lora_lst.append(module)
|
||||
expected_lora_modules = set(expected_lora_lst)
|
||||
hf_to_vllm_mapper = WeightsMapper(
|
||||
orig_to_new_prefix={
|
||||
"model.": "language_model.model.",
|
||||
},
|
||||
orig_to_new_substr={
|
||||
".layers.": ".baichuan_layers.",
|
||||
},
|
||||
)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
lora_model = LoRAModel.from_local_checkpoint(
|
||||
baichuan_lora_files,
|
||||
expected_lora_modules,
|
||||
peft_helper=peft_helper,
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
model_vocab_size=64000,
|
||||
weights_mapper=hf_to_vllm_mapper,
|
||||
)
|
||||
for name in lora_model.loras:
|
||||
assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
|
||||
assert ".baichuan_layers." in name
|
||||
|
||||
116
tests/lora/test_lora_functions.py
Normal file
116
tests/lora/test_lora_functions.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args,
|
||||
)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
LORA_MODULE_PATH = "charent/self_cognition_Alice"
|
||||
LORA_RANK = 8
|
||||
|
||||
|
||||
def make_lora_request(lora_id: int):
|
||||
return LoRARequest(
|
||||
lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH
|
||||
)
|
||||
|
||||
|
||||
def test_lora_functions_sync():
|
||||
max_loras = 4
|
||||
# Create engine in eager-mode. Due to high max_loras, the CI can
|
||||
# OOM during cuda-graph capture.
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
llm = LLMEngine.from_engine_args(engine_args)
|
||||
|
||||
def run_check(fn, args, expected: list):
|
||||
fn(args)
|
||||
assert set(llm.list_loras()) == set(expected)
|
||||
|
||||
run_check(llm.add_lora, make_lora_request(1), [1])
|
||||
run_check(llm.add_lora, make_lora_request(2), [1, 2])
|
||||
|
||||
# Pin LoRA 1 and test that it is never removed on subsequent adds.
|
||||
run_check(llm.pin_lora, 1, [1, 2])
|
||||
run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
|
||||
run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
|
||||
run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
|
||||
run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
|
||||
run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
|
||||
run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
|
||||
run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
|
||||
run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])
|
||||
|
||||
# Remove LoRA 1 and continue adding.
|
||||
run_check(llm.remove_lora, 1, [8, 9, 10])
|
||||
run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
|
||||
run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
|
||||
run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
|
||||
|
||||
# Remove all LoRAs.
|
||||
run_check(llm.remove_lora, 13, [12, 10, 11])
|
||||
run_check(llm.remove_lora, 12, [10, 11])
|
||||
run_check(llm.remove_lora, 11, [10])
|
||||
run_check(llm.remove_lora, 10, [])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_lora_functions_async():
|
||||
max_loras = 4
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
async def run_check(fn, args, expected: list):
|
||||
await fn(args)
|
||||
assert set(await llm.list_loras()) == set(expected)
|
||||
|
||||
async with build_async_engine_client_from_engine_args(engine_args) as llm:
|
||||
await run_check(llm.add_lora, make_lora_request(1), [1])
|
||||
await run_check(llm.add_lora, make_lora_request(2), [1, 2])
|
||||
|
||||
# Pin LoRA 1 and test that it is never removed on subsequent adds.
|
||||
await run_check(llm.pin_lora, 1, [1, 2])
|
||||
await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
|
||||
await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
|
||||
await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
|
||||
await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
|
||||
await run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
|
||||
await run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
|
||||
await run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
|
||||
await run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])
|
||||
|
||||
# Remove LoRA 1 and continue adding.
|
||||
await run_check(llm.remove_lora, 1, [8, 9, 10])
|
||||
await run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
|
||||
await run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
|
||||
await run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
|
||||
|
||||
# Remove all LoRAs
|
||||
await run_check(llm.remove_lora, 13, [12, 10, 11])
|
||||
await run_check(llm.remove_lora, 12, [10, 11])
|
||||
await run_check(llm.remove_lora, 11, [10])
|
||||
await run_check(llm.remove_lora, 10, [])
|
||||
48
tests/lora/test_lora_huggingface.py
Normal file
48
tests/lora/test_lora_huggingface.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.lora.lora_model import LoRAModel
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.lora.utils import get_adapter_absolute_path
|
||||
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
|
||||
|
||||
# Provide absolute path and huggingface lora ids
|
||||
lora_fixture_name = ["llama32_lora_files", "llama32_lora_huggingface_id"]
|
||||
LLAMA_LORA_MODULES = [
|
||||
"qkv_proj",
|
||||
"o_proj",
|
||||
"gate_up_proj",
|
||||
"down_proj",
|
||||
"embed_tokens",
|
||||
"lm_head",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
|
||||
def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
|
||||
lora_name = request.getfixturevalue(lora_fixture_name)
|
||||
packed_modules_mapping = Qwen3ForCausalLM.packed_modules_mapping
|
||||
|
||||
expected_lora_lst: list[str] = []
|
||||
for module in LLAMA_LORA_MODULES:
|
||||
if module in packed_modules_mapping:
|
||||
expected_lora_lst.extend(packed_modules_mapping[module])
|
||||
else:
|
||||
expected_lora_lst.append(module)
|
||||
expected_lora_modules = set(expected_lora_lst)
|
||||
lora_path = get_adapter_absolute_path(lora_name)
|
||||
|
||||
# lora loading should work for either absolute path and huggingface id.
|
||||
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
|
||||
lora_model = LoRAModel.from_local_checkpoint(
|
||||
lora_path,
|
||||
expected_lora_modules,
|
||||
peft_helper=peft_helper,
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
)
|
||||
|
||||
# Assertions to ensure the model is loaded correctly
|
||||
assert lora_model is not None, "LoRAModel is not loaded correctly"
|
||||
File diff suppressed because it is too large
Load Diff
121
tests/lora/test_minicpmv_tp.py
Normal file
121
tests/lora/test_minicpmv_tp.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
|
||||
"(<image>./</image>)\nWhat is in the image?<|eot_id|>"
|
||||
"<|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
)
|
||||
|
||||
IMAGE_ASSETS = [
|
||||
ImageAsset("stop_sign"),
|
||||
]
|
||||
|
||||
# After fine-tuning with LoRA, all generated content should start begin `A`.
|
||||
EXPECTED_OUTPUT = [
|
||||
"A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=5,
|
||||
stop_token_ids=[128001, 128009], # eos_id, eot_id
|
||||
)
|
||||
|
||||
inputs = [
|
||||
{
|
||||
"prompt": PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {"image": asset.pil_image},
|
||||
}
|
||||
for asset in IMAGE_ASSETS
|
||||
]
|
||||
|
||||
outputs = llm.generate(
|
||||
inputs,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
def test_minicpmv_lora(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_num_seqs=2,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=8,
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
limit_mm_per_prompt={"image": 2, "video": 0},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_OUTPUT)):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output1[i])
|
||||
output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_OUTPUT)):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output2[i])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=2,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
limit_mm_per_prompt={"image": 2, "video": 0},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_OUTPUT)):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=2,
|
||||
max_loras=2,
|
||||
max_lora_rank=8,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": 1, "video": 0},
|
||||
fully_sharded_loras=True,
|
||||
)
|
||||
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_OUTPUT)):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
|
||||
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_OUTPUT)):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
|
||||
@@ -1,26 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int):
|
||||
prompts = [
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
|
||||
]
|
||||
def do_sample(
|
||||
llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]
|
||||
) -> list[str]:
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts = []
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
@@ -31,23 +32,46 @@ def do_sample(llm, lora_path: str, lora_id: int):
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [4])
|
||||
def test_mixtral_lora(mixtral_lora_files, tp_size):
|
||||
if torch.cuda.device_count() < tp_size:
|
||||
"""Original test, the LoRA model has the common target modules, not all"""
|
||||
if (
|
||||
torch.cuda.device_count() < tp_size
|
||||
and tp_size > 1
|
||||
and current_platform.is_cuda_alike()
|
||||
):
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=tp_size,
|
||||
worker_use_ray=True)
|
||||
|
||||
expected_lora_output = [
|
||||
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
|
||||
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
|
||||
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501
|
||||
prompts = [
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
|
||||
]
|
||||
|
||||
assert do_sample(llm, mixtral_lora_files,
|
||||
lora_id=1) == expected_lora_output
|
||||
assert do_sample(llm, mixtral_lora_files,
|
||||
lora_id=2) == expected_lora_output
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
distributed_executor_backend="ray",
|
||||
tensor_parallel_size=tp_size,
|
||||
)
|
||||
|
||||
expected_lora_output = [
|
||||
[
|
||||
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])" # noqa: E501
|
||||
],
|
||||
[
|
||||
"give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501
|
||||
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
|
||||
],
|
||||
[
|
||||
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])" # noqa: E501
|
||||
],
|
||||
]
|
||||
|
||||
def check_outputs(generated: list[str]):
|
||||
assert len(generated) == len(expected_lora_output)
|
||||
for gen, gt_choices in zip(generated, expected_lora_output):
|
||||
assert gen in gt_choices
|
||||
|
||||
check_outputs(do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts))
|
||||
check_outputs(do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts))
|
||||
|
||||
96
tests/lora/test_moe_lora_align_sum.py
Normal file
96
tests/lora/test_moe_lora_align_sum.py
Normal file
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
|
||||
def round_up(x, base):
|
||||
return ((x + base - 1) // base) * base
|
||||
|
||||
|
||||
def CEILDIV(x, y):
|
||||
return (x + y - 1) // y
|
||||
|
||||
|
||||
def sample_data(num_experts, max_loras, num_tokens, topk_num):
|
||||
topk_ids = torch.zeros((num_tokens, topk_num), dtype=torch.int32)
|
||||
token_lora_mapping = torch.zeros((num_tokens,), dtype=torch.int32)
|
||||
|
||||
for i in range(num_tokens):
|
||||
pool = list(range(num_experts))
|
||||
random.shuffle(pool)
|
||||
for j in range(topk_num):
|
||||
topk_ids[i, j] = pool[j]
|
||||
token_lora_mapping[i] = random.randint(0, max_loras - 1)
|
||||
|
||||
return topk_ids.to("cuda"), token_lora_mapping.to("cuda")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920
|
||||
@pytest.mark.parametrize("topk_num", [6])
|
||||
@pytest.mark.parametrize("num_experts", [64, 128, 256, 512])
|
||||
@pytest.mark.parametrize("max_loras", [2, 32])
|
||||
@pytest.mark.parametrize("block_size", [16])
|
||||
def test_moe_lora_align_block_size(
|
||||
num_tokens, topk_num, num_experts, max_loras, block_size
|
||||
):
|
||||
# sample data
|
||||
random.seed(1)
|
||||
topk_ids, token_lora_mapping = sample_data(
|
||||
num_experts, max_loras, num_tokens, topk_num
|
||||
)
|
||||
|
||||
# compute paddings
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
|
||||
max_num_m_blocks = CEILDIV(max_num_tokens_padded, block_size)
|
||||
|
||||
# init output tensors
|
||||
sorted_token_ids = torch.full(
|
||||
(max_loras * max_num_tokens_padded,),
|
||||
topk_ids.numel(),
|
||||
dtype=torch.int32,
|
||||
device="cuda",
|
||||
)
|
||||
expert_ids = torch.full(
|
||||
(max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
|
||||
)
|
||||
num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
|
||||
adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
|
||||
lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
|
||||
|
||||
# call kernel
|
||||
ops.moe_lora_align_block_size(
|
||||
topk_ids,
|
||||
token_lora_mapping,
|
||||
num_experts,
|
||||
block_size,
|
||||
max_loras,
|
||||
max_num_tokens_padded,
|
||||
max_num_m_blocks,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
num_tokens_post_pad,
|
||||
adapter_enabled,
|
||||
lora_ids,
|
||||
)
|
||||
|
||||
# verify values
|
||||
expert_ids = expert_ids.view(max_loras, -1)
|
||||
sorted_token_ids = sorted_token_ids.view(max_loras, -1, block_size)
|
||||
|
||||
for lora_idx in range(max_loras):
|
||||
for token_idx in range(sorted_token_ids.size(1)):
|
||||
block = sorted_token_ids[lora_idx][token_idx]
|
||||
indices = block[block != topk_ids.numel()]
|
||||
if indices.numel() > 0:
|
||||
expert_id = expert_ids[lora_idx][token_idx]
|
||||
assert torch.all(topk_ids.view(-1)[indices] == expert_id)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__])
|
||||
163
tests/lora/test_olmoe_tp.py
Normal file
163
tests/lora/test_olmoe_tp.py
Normal file
@@ -0,0 +1,163 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
# Base model under test; the LoRA adapters used below are applied on top of it.
MODEL_PATH = "allenai/OLMoE-1B-7B-0125-Instruct"

# Text-to-SQL prompt; `{context}` is filled with the natural-language question.
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
"
##Instruction:
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
The People_ID of candidate is the foreign key of People_ID of people.


###Input:
{context}

###Response:"""  # noqa: E501

# Expected (greedy) generations when a LoRA adapter is active, one per prompt
# built in generate_and_test().
EXPECTED_LORA_OUTPUT = [
    "SELECT count(*) FROM candidate",
    "SELECT count(*) FROM candidate",
    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
]

# Expected generations from the unadapted base model for the same prompts.
EXPECTED_BASE_MODEL_OUTPUT = [
    "SELECT COUNT(Candidate_ID) FROM candidate",
    "SELECT COUNT(Candidate_ID) FROM candidate",
    "SELECT Candidate_ID, COUNT(*) as Total_Candidates\nFROM candidate\nINNER JOIN people ON candidate.People_ID = people.People_ID",  # noqa: E501
    "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1",  # noqa: E501
]
|
||||
|
||||
|
||||
def generate_and_test(
    llm: vllm.LLM,
    lora_path: str,
    lora_id: list[int | None] | int | None,
    compare_lower: bool = False,
) -> None:
    """Run the text-to-SQL prompts through ``llm`` and check the generations.

    ``lora_id`` may be a single adapter id (applied to every prompt), a list
    of per-prompt ids where ``None`` entries fall back to the base model, or
    ``None`` (base model for every prompt).  When ``compare_lower`` is set,
    the expected/generated texts are compared case-insensitively.
    """
    contexts = [
        "How many candidates are there?",
        "Count the number of candidates.",
        "Which poll resource provided the most number of candidate information?",  # noqa: E501
        "Return the poll resource associated with the most candidates.",
    ]
    prompts = [PROMPT_TEMPLATE.format(context=ctx) for ctx in contexts]

    # Build the LoRA request(s) matching the shape of ``lora_id``.
    if isinstance(lora_id, list):
        lora_request = [
            None if i is None else LoRARequest(str(i), i, lora_path)
            for i in lora_id
        ]
    elif isinstance(lora_id, int):
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    else:
        lora_request = None

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    # Collect (and print, for debuggability) the stripped generations.
    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    for i in range(len(EXPECTED_LORA_OUTPUT)):
        req_lora_id = lora_id[i] if isinstance(lora_id, list) else lora_id
        generated_text = generated_texts[i]
        # LoRA-served prompts must match the adapter outputs; base-model
        # prompts must match the base-model outputs.
        if req_lora_id is not None:
            expected_output = EXPECTED_LORA_OUTPUT[i]
        else:
            expected_output = EXPECTED_BASE_MODEL_OUTPUT[i]

        if compare_lower:
            generated_text = generated_text.lower()
            expected_output = expected_output.lower()

        assert generated_text.startswith(expected_output)
|
||||
|
||||
|
||||
def test_olmoe_lora(olmoe_lora_files):
    """Single-GPU OLMoE LoRA smoke test with two different adapters."""
    # enforce_eager=True keeps VRAM usage low enough for the lora-test CI;
    # without it the test can fail with CUDA OOM.
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        enable_chunked_prefill=True,
    )

    for adapter_id in (1, 2):
        generate_and_test(llm, olmoe_lora_files, lora_id=adapter_id)
|
||||
|
||||
|
||||
def test_olmoe_lora_mixed(olmoe_lora_files):
    """Serve LoRA and base-model requests together in one batch."""
    engine_kwargs = dict(
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        enable_chunked_prefill=True,
    )
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # ``None`` entries exercise the base model alongside LoRA requests.
    generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@multi_gpu_test(num_gpus=2)
def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras):
    """OLMoE LoRA under tensor parallelism 2, sharded and unsharded LoRA."""
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        enable_chunked_prefill=True,
        tensor_parallel_size=2,
        fully_sharded_loras=fully_sharded_loras,
    )

    for adapter_id in (1, 2):
        generate_and_test(llm, olmoe_lora_files, lora_id=adapter_id)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
@multi_gpu_test(num_gpus=4)
def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras):
    """OLMoE LoRA under tensor parallelism 4, sharded and unsharded LoRA."""
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enforce_eager=True,
        trust_remote_code=True,
        enable_chunked_prefill=True,
        tensor_parallel_size=4,
        fully_sharded_loras=fully_sharded_loras,
    )
    # Fully-sharded LoRA outputs can differ in casing, so compare
    # case-insensitively in that configuration.
    for adapter_id in (1, 2):
        generate_and_test(
            llm, olmoe_lora_files, lora_id=adapter_id, compare_lower=fully_sharded_loras
        )
|
||||
99
tests/lora/test_peft_helper.py
Normal file
99
tests/lora/test_peft_helper.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import math
|
||||
import shutil
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
|
||||
# Invalid adapter configurations that must be rejected, as
# (test_name, adapter_config override, expected ValueError message fragment).
ERROR_CASES = [
    (
        "test_rank",
        {"r": 1024},
        "is greater than max_lora_rank",
    ),
    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    (
        "test_modules_to_save",
        {"modules_to_save": ["lm_head"]},
        "only supports modules_to_save being None",
    ),
]
|
||||
|
||||
|
||||
def test_peft_helper_pass(llama32_lora_files, tmp_path):
    """PEFTHelper parses a valid adapter config and honors RSLoRA scaling."""
    helper = PEFTHelper.from_local_dir(
        llama32_lora_files, max_position_embeddings=4096
    )
    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
    helper.validate_legal(lora_config)

    assert helper.r == 8
    assert helper.lora_alpha == 32
    assert sorted(helper.target_modules) == [
        "down_proj",
        "embed_tokens",
        "gate_proj",
        "k_proj",
        "lm_head",
        "o_proj",
        "q_proj",
        "up_proj",
        "v_proj",
    ]
    assert helper.vllm_max_position_embeddings == 4096

    # test RSLoRA: copy the adapter, enable use_rslora in its config, reload.
    test_dir = tmp_path / "test_rslora"
    shutil.copytree(llama32_lora_files, test_dir)

    config_path = test_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update(dict(use_rslora=True))
    config_path.write_text(json.dumps(adapter_config))

    helper = PEFTHelper.from_local_dir(test_dir, max_position_embeddings=4096)
    helper.validate_legal(lora_config)
    # RSLoRA scales by alpha / sqrt(r) instead of alpha / r.
    expected_scaling = helper.lora_alpha / math.sqrt(helper.r)
    assert abs(helper.vllm_lora_scaling_factor - expected_scaling) < 1e-3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
def test_peft_helper_error(
    llama32_lora_files,
    tmp_path,
    test_name: str,
    config_change: dict,
    expected_error: str,
):
    """Invalid adapter configurations must raise ValueError on validation."""
    test_dir = tmp_path / test_name
    shutil.copytree(llama32_lora_files, test_dir)

    # Inject the invalid setting into a private copy of the adapter config.
    config_path = test_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update(config_change)
    config_path.write_text(json.dumps(adapter_config))

    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
    # Loading + validating the doctored adapter must fail with the expected
    # error message.
    with pytest.raises(ValueError, match=expected_error):
        PEFTHelper.from_local_dir(
            test_dir, max_position_embeddings=4096
        ).validate_legal(lora_config)
|
||||
@@ -1,231 +0,0 @@
|
||||
# Based on code from https://github.com/punica-ai/punica
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.lora.punica as punica
|
||||
|
||||
|
||||
def assert_close(a, b):
    """Assert two tensors match within a dtype-dependent tolerance."""
    tolerances = {
        torch.float16: (5e-3, 5e-3),
        torch.bfloat16: (3e-2, 2e-2),
        torch.float32: (None, None),  # None -> torch's default tolerances
    }
    rtol, atol = tolerances[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
def _lora_ref_impl(
|
||||
y_final: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
wa_T_all: torch.Tensor,
|
||||
wb_T_all: torch.Tensor,
|
||||
indicies: torch.LongTensor,
|
||||
layer_idx: int,
|
||||
scale: float,
|
||||
):
|
||||
y_stage_1 = torch.empty(
|
||||
(x.size(0), wa_T_all.size(-2)),
|
||||
dtype=torch.float32,
|
||||
device=x.device,
|
||||
)
|
||||
bs = x.shape[0]
|
||||
s = torch.tensor(scale, dtype=torch.float32, device=x.device)
|
||||
for i, lora_idx in zip(range(bs), indicies.cpu().tolist()):
|
||||
xi = x[i].unsqueeze(0).to(torch.float32)
|
||||
wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
|
||||
if wb_T_all is not None:
|
||||
wb = wb_T_all[lora_idx, layer_idx].transpose(-1,
|
||||
-2).to(torch.float32)
|
||||
|
||||
tmp = xi @ wa
|
||||
y_stage_1[i] = tmp.squeeze(0)
|
||||
y_final[i] += ((tmp @ wb).squeeze(0) *
|
||||
s if wb_T_all is not None else y_stage_1[i])
|
||||
return y_final, y_stage_1
|
||||
|
||||
|
||||
# Hidden dimensions swept by the tests below; H1 is the LoRA input width and
# H2 (extended just after) the output width.
H1 = H2 = [
    128,
    256,
    512,
    1024,
    1152,
    1280,
    1536,
    2048,
    2304,
    2560,
    2752,
    3072,
    3456,
    3584,
    4096,
    4608,
    5120,
    5504,
    5632,
    6144,
    6848,
    6912,
    7168,
    8192,
    9216,
    10240,
    11008,
    13824,
    14336,
    15360,
    22016,
    24576,
    27392,
    32000,
    32256,
    32512,
    32768,
    33024,
    36864,
    43264,
    49152,
    64000,
    64256,
    102400,
    102656,
    128000,
    128256,
]
# Output widths additionally cover a very small dimension.
H2 = [64] + H2
# LoRA ranks under test.
R = [1, 2, 4]
SEED = [0xabcdabcd987]
# Run on a second GPU as well when one is available.
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("r", R)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
    """bgmv shrink (LoRA-A only) must match the fp32 reference per shape."""
    torch.manual_seed(seed)
    num_loras, num_layers, bs = 4, 1, 32
    dtype = getattr(torch, dtype_str)
    device = torch.device("cuda")

    wa_T_all = torch.randn(
        num_loras, num_layers, r, h1, dtype=dtype, device=device
    )
    indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype, device=device)
        y = torch.randn(bs, r, dtype=dtype, device=device)

        y_ref = y.clone()
        # wb_T_all=None exercises the shrink-only path of the reference.
        _lora_ref_impl(
            y_ref,
            x,
            wa_T_all,
            None,
            indices,
            layer_idx,
            1.0,
        )

        y_our = y.clone()
        punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0)

        assert_close(y_ref, y_our)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness(dtype_str, h1, h2, seed, device):
    """punica.add_lora (shrink + expand) must match the fp32 reference."""
    torch.manual_seed(seed)
    num_loras, num_layers = 4, 1
    r, bs, scale = 8, 32, 0.123
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype)
    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)

        y_ref = y.clone()
        _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale)

        y_our = y.clone()
        punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx,
                        scale)

        assert_close(y_ref, y_our)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness_slice(dtype_str, h1, h2, seed, device):
    """add_lora_slice on three column slices must match the sliced reference."""
    if h2 % 3 != 0 or h2 // 3 not in H1:
        pytest.skip("h2 must be divisible by 3 and in supported shapes")
    torch.manual_seed(seed)
    num_loras, num_layers = 4, 1
    r, bs, scale = 8, 32, 0.123
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    # Three independent LoRA pairs, one per output slice.  Keep the creation
    # order (all A matrices, then all B matrices) so RNG state matches.
    wa_slices = [
        torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
        for _ in range(3)
    ]
    wb_slices = [
        torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
        for _ in range(3)
    ]

    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)
        s = h2 // 3

        y_ref = y.clone()
        y_our = y.clone()
        for slice_idx, (wa, wb) in enumerate(zip(wa_slices, wb_slices)):
            lo = slice_idx * s
            # Reference writes into a view of the matching output columns.
            _lora_ref_impl(y_ref[:, lo:lo + s], x, wa, wb, indices,
                           layer_idx, scale)
            punica.add_lora_slice(y_our, x, wa, wb, indices,
                                  layer_idx, scale, lo, s)

        for lo in range(0, h2, s):
            assert_close(y_ref[:, lo:lo + s], y_our[:, lo:lo + s])
|
||||
475
tests/lora/test_punica_ops.py
Normal file
475
tests/lora/test_punica_ops.py
Normal file
@@ -0,0 +1,475 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from threading import Lock
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.lora.ops.torch_ops as torch_ops
|
||||
import vllm.lora.ops.triton_ops as triton_ops
|
||||
from vllm.lora.ops.triton_ops import LoRAKernelMeta
|
||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
    # Delegate to the shared `reset_default_device` fixture so every test in
    # this module restores torch's default device when it finishes.
    pass
|
||||
|
||||
|
||||
# Utility shrink and expand operations used as reference implementations.
|
||||
def sgmv_shrink_for_nslices(
    nslices: int,
    inputs_tensor: torch.Tensor,
    lora_weights_lst: list[torch.Tensor],
    out_tensor: torch.Tensor,
    b_seq_start_loc: torch.Tensor,
    seq_len_tensor: torch.Tensor,
    prompt_lora_mapping: torch.Tensor,
    batches: int,
    max_seq_length: int,
    num_tokens: int,
    scaling: float,
):
    """
    Wrapper around torch_ops.sgmv_shrink that handles any nslices: the
    reference shrink op is invoked once per slice.
    """
    seq_meta = (
        b_seq_start_loc,
        seq_len_tensor,
        prompt_lora_mapping,
        batches,
        max_seq_length,
        num_tokens,
    )
    for slice_idx in range(nslices):
        torch_ops.sgmv_shrink(
            inputs_tensor,
            lora_weights_lst[slice_idx],
            out_tensor[slice_idx],
            *seq_meta,
            scaling,
        )
|
||||
|
||||
|
||||
def sgmv_expand_for_nslices(
    nslices: int,
    hidden_size: int,
    inputs_tensor: torch.Tensor,
    lora_weights_lst: list[torch.Tensor],
    out_tensor: torch.Tensor,
    b_seq_start_loc: torch.Tensor,
    seq_len_tensor: torch.Tensor,
    prompt_lora_mapping: torch.Tensor,
    batches: int,
    max_seq_length: int,
    num_tokens: int,
    add_inputs: bool,
) -> None:
    """
    Wrapper around torch_ops.sgmv_expand that handles any nslices: a single
    slice uses the plain expand op, multiple slices use sgmv_expand_slice
    with consecutive ``hidden_size``-wide output offsets.
    """
    seq_meta = (
        b_seq_start_loc,
        seq_len_tensor,
        prompt_lora_mapping,
        batches,
        max_seq_length,
        num_tokens,
    )
    if nslices == 1:
        # Verify the torch's sgmv_expand op
        torch_ops.sgmv_expand(
            inputs_tensor[0],
            lora_weights_lst[0],
            out_tensor,
            *seq_meta,
            add_inputs=add_inputs,
        )
        return

    for index in range(nslices):
        torch_ops.sgmv_expand_slice(
            inputs_tensor[index],
            lora_weights_lst[index],
            out_tensor,
            *seq_meta,
            index * hidden_size,  # slice offset into the output columns
            hidden_size,
            add_inputs=add_inputs,
        )
|
||||
|
||||
|
||||
# Guards clearing of the kernel pointer caches (_LORA_A_PTR_DICT /
# _LORA_B_PTR_DICT) used below, so concurrent tests do not race on them.
_dict_lock = Lock()
|
||||
|
||||
|
||||
def check_lora_shrink_kernel(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seq_length: int,
    scaling: float,
):
    """
    Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
    kernels.
    """
    # Random inputs/weights/outputs for the "shrink" direction, bundled with
    # the sequence/LoRA mapping tensors the kernels need.
    data: PunicaTensors = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        nslices,
        dtype,
        "shrink",
        device,
    )
    max_seq_length, token_nums = data.meta()

    # Setup metadata information for SGMV and reference kernels
    sgmv_meta_args = (
        data.b_seq_start_loc,
        data.seq_len_tensor,
        data.prompt_lora_mapping,
        batches,
        max_seq_length,
        token_nums,
    )

    # Setup metadata information for the LoRA kernel.
    lora_meta = LoRAKernelMeta.make(
        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

    # Triton kernel writes out_tensor; the torch reference writes
    # ref_out_tensor.  Clone so the generated buffer is not shared.
    ref_out_tensor = data.ref_out_tensor
    out_tensor = data.our_out_tensor.clone()

    # Preventing cache error pointer.
    with _dict_lock:
        # lora_shrink kernel: clear the cached weight-pointer table first so
        # entries from a previous test configuration are rebuilt.
        _LORA_A_PTR_DICT.clear()
        triton_ops.lora_shrink(
            data.inputs_tensor,
            data.lora_weights,
            out_tensor,
            *lora_meta.meta_args(token_nums=token_nums),
            scaling,
        )

    # Reference
    sgmv_shrink_for_nslices(
        nslices,
        data.inputs_tensor,
        data.lora_weights,
        ref_out_tensor,
        *sgmv_meta_args,
        scaling,
    )

    assert_close(out_tensor, ref_out_tensor)
|
||||
|
||||
|
||||
def check_lora_expand_kernel(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seq_length: int,
    add_inputs: bool,
):
    """
    Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
    kernels.
    """
    # Random inputs/weights/outputs for the "expand" direction, bundled with
    # the sequence/LoRA mapping tensors the kernels need.
    data: PunicaTensors = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        nslices,
        dtype,
        "expand",
        device,
    )

    max_seq_length, token_nums = data.meta()

    # Setup metadata information for SGMV and reference kernels
    sgmv_meta_args = (
        data.b_seq_start_loc,
        data.seq_len_tensor,
        data.prompt_lora_mapping,
        batches,
        max_seq_length,
        token_nums,
    )

    # Setup metadata information for the LoRA kernel.
    lora_meta = LoRAKernelMeta.make(
        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

    # Setup output tensors
    ref_out_tensor = data.ref_out_tensor
    out_tensor = data.our_out_tensor.clone()

    with _dict_lock:
        # lora_expand kernel: clear the cached weight-pointer table first so
        # entries from a previous test configuration are rebuilt.
        _LORA_B_PTR_DICT.clear()
        triton_ops.lora_expand(
            data.inputs_tensor,
            data.lora_weights,
            out_tensor,
            *lora_meta.meta_args(token_nums=token_nums),
            offset_start=0,
            add_inputs=add_inputs,
        )

    # Reference
    sgmv_expand_for_nslices(
        nslices,
        hidden_size,
        data.inputs_tensor,
        data.lora_weights,
        ref_out_tensor,
        *sgmv_meta_args,
        add_inputs=add_inputs,
    )

    assert_close(out_tensor, ref_out_tensor)
|
||||
|
||||
|
||||
# Tests
# We test the punica kernels along 2 verticals mainly.
# 1. Variations in hidden_dim size
# 2. Variations in all other parameters like (batch_size, max_rank, num_loras
# etc.)

# We have collected the hidden_sizes included in the LoRA models
# currently supported by vLLM. It tests whether the corresponding Triton
# kernel can run normally when tensor parallelism is set to
# [1, 2, 4, 8, 16, 32, 64].
HIDDEN_SIZES = [
    128,
    256,
    512,
    896,
    1024,
    1152,
    1216,
    1280,
    1536,
    1664,
    2048,
    2240,
    2304,
    2368,
    2432,
    2560,
    2752,
    3072,
    3328,
    3456,
    3584,
    3712,
    4096,
    4480,
    4608,
    4736,
    4864,
    5120,
    5504,
    5632,
    5888,
    6144,
    6400,
    6848,
    6912,
    7168,
    7424,
    8192,
    8960,
    9216,
    9472,
    10240,
    11008,
    11264,
    13824,
    14336,
    14784,
    14848,
    15360,
    18944,
    22016,
    22528,
    24576,
    27392,
    27648,
    29568,
    29696,
    32000,
    32256,
    32512,
    32768,
    33024,
    36864,
    43264,
    49152,
    49408,
    60544,
    60672,
    64000,
    64256,
    102400,
    102656,
    128000,
    128256,
]
# The size of TP
divisibility = [1, 2, 8, 16, 64]

# Divide every hidden size by every TP factor and deduplicate.
HIDDEN_SIZES = list(
    {hidden_size // div for div in divisibility for hidden_size in HIDDEN_SIZES}
)

# Test params that focuses on hidden_size variation.
hs_test_params = {
    "hidden_sizes": HIDDEN_SIZES,
    "batches": [4],
    "num_loras": [4],
    "max_ranks": [32],
}

# General tests params that tests for variations in all dimensions
# except hidden_size.
test_params = {
    "hidden_sizes": [2049],
    "batches": [1, 4, 16, 32],
    "num_loras": [1, 8, 32, 128],
    "max_ranks": [1, 4, 8, 16, 32, 64, 128, 256],
}

DTYPES = [torch.float16, torch.bfloat16]
DEVICES = [f"cuda:{0}"]
SEED = [0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", test_params["batches"])
@pytest.mark.parametrize("num_loras", test_params["num_loras"])
@pytest.mark.parametrize("rank", test_params["max_ranks"])
@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_kernels(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seed: int,
    op_type: str,
):
    """
    Tests LoRA kernels across batch/lora/rank variations at a fixed
    hidden_size.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    # Shared arguments for both kernel checkers.
    common = dict(
        batches=batches,
        num_loras=num_loras,
        rank=rank,
        hidden_size=hidden_size,
        nslices=nslices,
        dtype=dtype,
        device=device,
        seq_length=128,
    )
    if op_type == "shrink":
        check_lora_shrink_kernel(**common, scaling=0.5)
    else:
        check_lora_expand_kernel(**common, add_inputs=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", hs_test_params["batches"])
@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"])
@pytest.mark.parametrize("rank", hs_test_params["max_ranks"])
@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_kernels_hidden_size(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    device: str,
    seed: int,
    op_type: str,
):
    """
    Tests SGMV and LoRA kernels across the collected hidden_size variations.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    # Shared arguments for both kernel checkers.
    common = dict(
        batches=batches,
        num_loras=num_loras,
        rank=rank,
        hidden_size=hidden_size,
        nslices=nslices,
        dtype=dtype,
        device=device,
        seq_length=128,
    )
    if op_type == "shrink":
        check_lora_shrink_kernel(**common, scaling=0.5)
    else:
        check_lora_expand_kernel(**common, add_inputs=True)
|
||||
@@ -1,14 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -17,15 +19,28 @@ class ModelWithQuantization:
|
||||
quantization: str
|
||||
|
||||
|
||||
MODELS: List[ModelWithQuantization] = [
|
||||
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
|
||||
quantization="AWQ"),
|
||||
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="GPTQ"),
|
||||
]
|
||||
MODELS: list[ModelWithQuantization]
|
||||
# AWQ quantization is currently not supported in ROCm.
|
||||
if current_platform.is_rocm():
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
|
||||
),
|
||||
]
|
||||
else:
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
|
||||
),
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
|
||||
def do_sample(
|
||||
llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
|
||||
) -> list[str]:
|
||||
raw_prompts = [
|
||||
"Give me an orange-ish brown color",
|
||||
"Give me a neon pink color",
|
||||
@@ -36,16 +51,16 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
|
||||
|
||||
prompts = [format_prompt_tuples(p) for p in raw_prompts]
|
||||
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=max_tokens,
|
||||
stop=["<|im_end|>"])
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
|
||||
)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts = []
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
@@ -55,44 +70,31 @@ def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < tp_size:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
llm = vllm.LLM(model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_model_len=400,
|
||||
tensor_parallel_size=tp_size,
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True)
|
||||
def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
llm = vllm.LLM(
|
||||
model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_model_len=400,
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
tokenizer=tinyllama_lora_files,
|
||||
)
|
||||
|
||||
if model.quantization is None:
|
||||
expected_no_lora_output = [
|
||||
"Here are some examples of orange-brown colors",
|
||||
"I'm sorry, I don't have"
|
||||
]
|
||||
expected_lora_output = [
|
||||
"#ff8050",
|
||||
"#ff8080",
|
||||
]
|
||||
elif model.quantization == "AWQ":
|
||||
expected_no_lora_output = [
|
||||
"I'm sorry, I don't understand",
|
||||
"I'm sorry, I don't understand",
|
||||
]
|
||||
elif model.quantization == "awq":
|
||||
expected_lora_output = [
|
||||
"#f07700: A v",
|
||||
"#f00000: A v",
|
||||
]
|
||||
elif model.quantization == "GPTQ":
|
||||
expected_no_lora_output = [
|
||||
"I'm sorry, I don't have",
|
||||
"I'm sorry, I don't have",
|
||||
]
|
||||
elif model.quantization == "gptq":
|
||||
expected_lora_output = [
|
||||
"#f08800: This is",
|
||||
"#f07788 \n#",
|
||||
@@ -101,79 +103,65 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
|
||||
def expect_match(output, expected_output):
|
||||
# HACK: GPTQ lora outputs are just incredibly unstable.
|
||||
# Assert that the outputs changed.
|
||||
if (model.quantization == "GPTQ"
|
||||
and expected_output is expected_lora_output):
|
||||
assert output != expected_no_lora_output
|
||||
if model.quantization == "gptq" and expected_output is expected_lora_output:
|
||||
for i, o in enumerate(output):
|
||||
assert o.startswith(
|
||||
'#'), f"Expected example {i} to start with # but got {o}"
|
||||
assert o.startswith("#"), (
|
||||
f"Expected example {i} to start with # but got {o}"
|
||||
)
|
||||
return
|
||||
assert output == expected_output
|
||||
|
||||
max_tokens = 10
|
||||
|
||||
print("lora adapter created")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=0,
|
||||
max_tokens=max_tokens)
|
||||
expect_match(output, expected_no_lora_output)
|
||||
|
||||
print("lora 1")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=1,
|
||||
max_tokens=max_tokens)
|
||||
output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
|
||||
expect_match(output, expected_lora_output)
|
||||
|
||||
print("no lora")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=0,
|
||||
max_tokens=max_tokens)
|
||||
expect_match(output, expected_no_lora_output)
|
||||
|
||||
print("lora 2")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=2,
|
||||
max_tokens=max_tokens)
|
||||
output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
|
||||
expect_match(output, expected_lora_output)
|
||||
|
||||
print("removing lora")
|
||||
|
||||
del llm
|
||||
cleanup()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.skip("Requires multiple GPUs")
|
||||
def test_quant_model_tp_equality(tinyllama_lora_files, model):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < 2:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
|
||||
|
||||
llm_tp1 = vllm.LLM(model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True)
|
||||
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
|
||||
if num_gpus_available < 2:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
|
||||
if model.quantization == "gptq":
|
||||
pytest.skip("GPTQ lora outputs are just incredibly unstable")
|
||||
llm_tp1 = vllm.LLM(
|
||||
model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp1
|
||||
cleanup()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
llm_tp2 = vllm.LLM(model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=2,
|
||||
quantization=model.quantization)
|
||||
llm_tp2 = vllm.LLM(
|
||||
model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp2
|
||||
cleanup()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
assert output_tp1 == output_tp2
|
||||
|
||||
177
tests/lora/test_qwen2vl.py
Normal file
177
tests/lora/test_qwen2vl.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
|
||||
import vllm
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestConfig:
|
||||
model_path: str
|
||||
lora_path: str
|
||||
max_num_seqs: int = 2
|
||||
max_loras: int = 2
|
||||
max_lora_rank: int = 16
|
||||
max_model_len: int = 4096
|
||||
mm_processor_kwargs: dict[str, int] | None = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.mm_processor_kwargs is None:
|
||||
self.mm_processor_kwargs = {
|
||||
"min_pixels": 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
}
|
||||
|
||||
|
||||
class Qwen2VLTester:
|
||||
"""Test helper for Qwen2 VL models with LoRA"""
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
|
||||
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
|
||||
"What is in the image?<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
def __init__(self, config: TestConfig):
|
||||
self.config = config
|
||||
self.llm = self._initialize_llm()
|
||||
|
||||
def _initialize_llm(self) -> vllm.LLM:
|
||||
"""Initialize the LLM with given configuration"""
|
||||
return vllm.LLM(
|
||||
model=self.config.model_path,
|
||||
max_num_seqs=self.config.max_num_seqs,
|
||||
enable_lora=True,
|
||||
max_loras=self.config.max_loras,
|
||||
max_lora_rank=self.config.max_lora_rank,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=self.config.mm_processor_kwargs,
|
||||
max_model_len=self.config.max_model_len,
|
||||
)
|
||||
|
||||
def run_test(
|
||||
self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[str],
|
||||
lora_id: int | None = None,
|
||||
temperature: float = 0,
|
||||
max_tokens: int = 5,
|
||||
):
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
inputs = [
|
||||
{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {"image": asset.pil_image},
|
||||
}
|
||||
for asset in images
|
||||
]
|
||||
|
||||
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
|
||||
outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
|
||||
generated_texts = [output.outputs[0].text.strip() for output in outputs]
|
||||
|
||||
# Validate outputs
|
||||
for generated, expected in zip(generated_texts, expected_outputs):
|
||||
assert expected.startswith(generated), (
|
||||
f"Generated text {generated} doesn't "
|
||||
)
|
||||
f"match expected pattern {expected}"
|
||||
|
||||
def run_beam_search_test(
|
||||
self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[list[str]],
|
||||
lora_id: int | None = None,
|
||||
temperature: float = 0,
|
||||
beam_width: int = 2,
|
||||
max_tokens: int = 5,
|
||||
):
|
||||
beam_search_params = BeamSearchParams(
|
||||
beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
|
||||
)
|
||||
|
||||
inputs = [
|
||||
{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {"image": asset.pil_image},
|
||||
}
|
||||
for asset in images
|
||||
]
|
||||
|
||||
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
|
||||
outputs = self.llm.beam_search(
|
||||
inputs, beam_search_params, lora_request=lora_request
|
||||
)
|
||||
|
||||
for output_obj, expected_outs in zip(outputs, expected_outputs):
|
||||
output_texts = [seq.text for seq in output_obj.sequences]
|
||||
assert output_texts == expected_outs, (
|
||||
f"Generated texts {output_texts} do not match expected {expected_outs}"
|
||||
) # noqa: E501
|
||||
|
||||
|
||||
TEST_IMAGES = [
|
||||
ImageAsset("stop_sign"),
|
||||
ImageAsset("cherry_blossom"),
|
||||
]
|
||||
|
||||
EXPECTED_OUTPUTS = [
|
||||
"A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501
|
||||
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
|
||||
]
|
||||
|
||||
# NOTE - beam search .text contains the whole text
|
||||
EXPECTED_BEAM_SEARCH_OUTPUTS = [
|
||||
[
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands", # noqa: E501
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall", # noqa: E501
|
||||
],
|
||||
]
|
||||
|
||||
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
|
||||
def test_qwen2vl_lora(qwen2vl_lora_files):
|
||||
"""Test Qwen 2.0 VL model with LoRA"""
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
for lora_id in [1, 2]:
|
||||
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
|
||||
|
||||
|
||||
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
|
||||
"""Test Qwen 2.0 VL model with LoRA through beam search."""
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
for lora_id in [1, 2]:
|
||||
# NOTE currently, we only test cherry blossom since stop sign
|
||||
# output is slightly different for v1; - the root cause is likely
|
||||
# independent of the intent of this test, which is to ensure beam
|
||||
# search passes through lora through correctly.
|
||||
tester.run_beam_search_test(
|
||||
[ImageAsset("cherry_blossom")],
|
||||
expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
|
||||
lora_id=lora_id,
|
||||
)
|
||||
|
||||
|
||||
def test_qwen25vl_lora(qwen25vl_lora_files):
|
||||
"""Test Qwen 2.5 VL model with LoRA"""
|
||||
config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
for lora_id in [1, 2]:
|
||||
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
|
||||
115
tests/lora/test_qwen3moe_tp.py
Normal file
115
tests/lora/test_qwen3moe_tp.py
Normal file
@@ -0,0 +1,115 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
# NOTE To avoid overloading the CI pipeline, this test script will not
|
||||
# be triggered on CI and is primarily intended for local testing and verification.
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-30B-A3B"
|
||||
|
||||
PROMPT_TEMPLATE = """<|im_start|>user
|
||||
I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
|
||||
"
|
||||
##Instruction:
|
||||
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
|
||||
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
|
||||
The People_ID of candidate is the foreign key of People_ID of people.
|
||||
|
||||
|
||||
###Input:
|
||||
{context}
|
||||
|
||||
###Response:<|im_end|>
|
||||
<|im_start|>assistant""" # noqa: E501
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
"<think>\n\n</think>\n\nSELECT count(*) FROM candidate",
|
||||
"<think>\n\n</think>\n\nSELECT count(*) FROM candidate",
|
||||
"<think>\n\n</think>\n\nSELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
"<think>\n\n</think>\n\nSELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
]
|
||||
|
||||
|
||||
def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(context="How many candidates are there?"),
|
||||
PROMPT_TEMPLATE.format(context="Count the number of candidates."),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Which poll resource provided the most number of candidate information?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Return the poll resource associated with the most candidates."
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
|
||||
|
||||
|
||||
def test_qwen3moe_lora(qwen3moe_lora_files):
|
||||
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
|
||||
# Otherwise, the lora-test will fail due to CUDA OOM.
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
generate_and_test(llm, qwen3moe_lora_files, lora_id=1)
|
||||
generate_and_test(llm, qwen3moe_lora_files, lora_id=2)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_qwen3moe_lora_tp2(qwen3moe_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
tensor_parallel_size=2,
|
||||
)
|
||||
|
||||
generate_and_test(llm, qwen3moe_lora_files, lora_id=1)
|
||||
generate_and_test(llm, qwen3moe_lora_files, lora_id=2)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_qwen3moe_lora_tp4(qwen3moe_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
|
||||
generate_and_test(llm, qwen3moe_lora_files, lora_id=1)
|
||||
generate_and_test(llm, qwen3moe_lora_files, lora_id=2)
|
||||
75
tests/lora/test_resolver.py
Normal file
75
tests/lora/test_resolver.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
|
||||
|
||||
class DummyLoRAResolver(LoRAResolver):
|
||||
"""A dummy LoRA resolver for testing."""
|
||||
|
||||
async def resolve_lora(
|
||||
self, base_model_name: str, lora_name: str
|
||||
) -> LoRARequest | None:
|
||||
if lora_name == "test_lora":
|
||||
return LoRARequest(
|
||||
lora_name=lora_name,
|
||||
lora_path=f"/dummy/path/{base_model_name}/{lora_name}",
|
||||
lora_int_id=abs(hash(lora_name)),
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def test_resolver_registry_registration():
|
||||
"""Test basic resolver registration functionality."""
|
||||
registry = LoRAResolverRegistry
|
||||
resolver = DummyLoRAResolver()
|
||||
|
||||
# Register a new resolver
|
||||
registry.register_resolver("dummy", resolver)
|
||||
assert "dummy" in registry.get_supported_resolvers()
|
||||
|
||||
# Get registered resolver
|
||||
retrieved_resolver = registry.get_resolver("dummy")
|
||||
assert retrieved_resolver is resolver
|
||||
|
||||
|
||||
def test_resolver_registry_duplicate_registration():
|
||||
"""Test registering a resolver with an existing name."""
|
||||
registry = LoRAResolverRegistry
|
||||
resolver1 = DummyLoRAResolver()
|
||||
resolver2 = DummyLoRAResolver()
|
||||
|
||||
registry.register_resolver("dummy", resolver1)
|
||||
registry.register_resolver("dummy", resolver2)
|
||||
|
||||
assert registry.get_resolver("dummy") is resolver2
|
||||
|
||||
|
||||
def test_resolver_registry_unknown_resolver():
|
||||
"""Test getting a non-existent resolver."""
|
||||
registry = LoRAResolverRegistry
|
||||
|
||||
with pytest.raises(KeyError, match="not found"):
|
||||
registry.get_resolver("unknown_resolver")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_dummy_resolver_resolve():
|
||||
"""Test the dummy resolver's resolve functionality."""
|
||||
dummy_resolver = DummyLoRAResolver()
|
||||
base_model_name = "base_model_test"
|
||||
lora_name = "test_lora"
|
||||
|
||||
# Test successful resolution
|
||||
result = await dummy_resolver.resolve_lora(base_model_name, lora_name)
|
||||
assert isinstance(result, LoRARequest)
|
||||
assert result.lora_name == lora_name
|
||||
assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"
|
||||
|
||||
# Test failed resolution
|
||||
result = await dummy_resolver.resolve_lora(base_model_name, "nonexistent_lora")
|
||||
assert result is None
|
||||
@@ -1,55 +0,0 @@
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer import get_lora_tokenizer
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
|
||||
from ..conftest import get_tokenizer_pool_config
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
|
||||
async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
|
||||
reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
|
||||
tokenizer_group = get_tokenizer_group(
|
||||
get_tokenizer_pool_config(tokenizer_group_type),
|
||||
tokenizer_id="gpt2",
|
||||
enable_lora=True,
|
||||
max_num_seqs=1,
|
||||
max_input_length=None,
|
||||
)
|
||||
lora_request = LoRARequest("1", 1, sql_lora_files)
|
||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
||||
request_id="request_id", prompt="prompt", lora_request=lora_request)
|
||||
assert reference_tokenizer.encode(
|
||||
"prompt") == await tokenizer_group.encode_async(
|
||||
request_id="request_id",
|
||||
prompt="prompt",
|
||||
lora_request=lora_request)
|
||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
||||
PreTrainedTokenizerBase)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
None) == await tokenizer_group.get_lora_tokenizer_async(None)
|
||||
|
||||
assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request),
|
||||
PreTrainedTokenizerBase)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
lora_request) != tokenizer_group.get_lora_tokenizer(None)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
lora_request) == await tokenizer_group.get_lora_tokenizer_async(
|
||||
lora_request)
|
||||
|
||||
|
||||
def test_get_lora_tokenizer(sql_lora_files, tmpdir):
|
||||
lora_request = None
|
||||
tokenizer = get_lora_tokenizer(lora_request)
|
||||
assert not tokenizer
|
||||
|
||||
lora_request = LoRARequest("1", 1, sql_lora_files)
|
||||
tokenizer = get_lora_tokenizer(lora_request)
|
||||
assert tokenizer.get_added_vocab()
|
||||
|
||||
lora_request = LoRARequest("1", 1, str(tmpdir))
|
||||
tokenizer = get_lora_tokenizer(lora_request)
|
||||
assert not tokenizer
|
||||
116
tests/lora/test_transformers_model.py
Normal file
116
tests/lora/test_transformers_model.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "hmellor/Ilama-3.2-1B"
|
||||
|
||||
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
"SELECT count(*) FROM singer",
|
||||
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
|
||||
"SELECT DISTINCT Country FROM singer WHERE Age > 20",
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query="What are all distinct countries where singers above age 20 are from?" # noqa: E501
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
output2 = do_sample(llm, ilama_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4(ilama_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
output2 = do_sample(llm, ilama_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
output2 = do_sample(llm, ilama_lora_files, lora_id=2)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
@@ -1,58 +1,141 @@
|
||||
from collections import OrderedDict
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import OrderedDict
|
||||
from typing import NamedTuple
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from huggingface_hub.utils import HfHubHTTPError
|
||||
from torch import nn
|
||||
|
||||
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
|
||||
from vllm.utils import LRUCache
|
||||
from vllm.lora.utils import (
|
||||
get_adapter_absolute_path,
|
||||
parse_fine_tuned_lora_name,
|
||||
replace_submodule,
|
||||
)
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
|
||||
|
||||
def test_parse_fine_tuned_lora_name():
|
||||
fixture = {
|
||||
("base_model.model.lm_head.lora_A.weight", "lm_head", True),
|
||||
("base_model.model.lm_head.lora_B.weight", "lm_head", False),
|
||||
(
|
||||
class LoRANameParserTestConfig(NamedTuple):
|
||||
name: str
|
||||
module_name: str
|
||||
is_lora_a: bool
|
||||
weights_mapper: WeightsMapper | None = None
|
||||
|
||||
|
||||
def test_parse_fine_tuned_lora_name_valid():
|
||||
fixture = [
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.lm_head.lora_A.weight", "lm_head", True, False
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.lm_head.lora_B.weight", "lm_head", False, False
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.embed_tokens.lora_embedding_A",
|
||||
"model.embed_tokens",
|
||||
True,
|
||||
),
|
||||
(
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.embed_tokens.lora_embedding_B",
|
||||
"model.embed_tokens",
|
||||
False,
|
||||
),
|
||||
(
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
|
||||
"model.layers.9.mlp.down_proj",
|
||||
True,
|
||||
),
|
||||
(
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
|
||||
"model.layers.9.mlp.down_proj",
|
||||
False,
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"language_model.layers.9.mlp.down_proj.lora_A.weight",
|
||||
"language_model.layers.9.mlp.down_proj",
|
||||
True,
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"language_model.layers.9.mlp.down_proj.lora_B.weight",
|
||||
"language_model.layers.9.mlp.down_proj",
|
||||
False,
|
||||
),
|
||||
# Test with WeightsMapper
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
|
||||
"language_model.model.layers.9.mlp.down_proj",
|
||||
True,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
|
||||
"language_model.model.layers.9.mlp.down_proj",
|
||||
False,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"model.layers.9.mlp.down_proj.lora_A.weight",
|
||||
"language_model.model.layers.9.mlp.down_proj",
|
||||
True,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"model.layers.9.mlp.down_proj.lora_B.weight",
|
||||
"language_model.model.layers.9.mlp.down_proj",
|
||||
False,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
]
|
||||
for name, module_name, is_lora_a, weights_mapper in fixture:
|
||||
assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(
|
||||
name, weights_mapper
|
||||
)
|
||||
|
||||
|
||||
def test_parse_fine_tuned_lora_name_invalid():
|
||||
fixture = {
|
||||
"base_model.weight",
|
||||
"base_model.model.weight",
|
||||
}
|
||||
for name, module_name, is_lora_a in fixture:
|
||||
assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
|
||||
for name in fixture:
|
||||
with pytest.raises(ValueError, match="unsupported LoRA weight"):
|
||||
parse_fine_tuned_lora_name(name)
|
||||
|
||||
|
||||
def test_replace_submodule():
|
||||
model = nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", nn.Linear(764, 100)),
|
||||
("act1", nn.ReLU()),
|
||||
("dense2", nn.Linear(100, 50)),
|
||||
(
|
||||
"seq1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", nn.Linear(100, 10)),
|
||||
("dense2", nn.Linear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", nn.Linear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
]))
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", nn.Linear(764, 100)),
|
||||
("act1", nn.ReLU()),
|
||||
("dense2", nn.Linear(100, 50)),
|
||||
(
|
||||
"seq1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", nn.Linear(100, 10)),
|
||||
("dense2", nn.Linear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", nn.Linear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
sigmoid = nn.Sigmoid()
|
||||
|
||||
@@ -64,109 +147,52 @@ def test_replace_submodule():
|
||||
assert dict(model.named_modules())["seq1.dense2"] == dense2
|
||||
|
||||
|
||||
class TestLRUCache(LRUCache):
    """LRUCache subclass that counts evictions for the assertions below."""

    def _on_remove(self, key, value):
        # The counter is created lazily so no __init__ override is needed.
        if not hasattr(self, "_remove_counter"):
            self._remove_counter = 0
        self._remove_counter += 1
|
||||
# Unit tests for get_adapter_absolute_path
@patch("os.path.isabs")
def test_get_adapter_absolute_path_absolute(mock_isabs):
    """A path that is already absolute is returned unchanged."""
    lora_path = "/absolute/path/to/lora"
    mock_isabs.return_value = True
    assert get_adapter_absolute_path(lora_path) == lora_path
|
||||
|
||||
|
||||
def test_lru_cache():
|
||||
cache = TestLRUCache(3)
|
||||
@patch("os.path.expanduser")
|
||||
def test_get_adapter_absolute_path_expanduser(mock_expanduser):
|
||||
# Path with ~ that needs to be expanded
|
||||
path = "~/relative/path/to/lora"
|
||||
absolute_path = "/home/user/relative/path/to/lora"
|
||||
mock_expanduser.return_value = absolute_path
|
||||
assert get_adapter_absolute_path(path) == absolute_path
|
||||
|
||||
cache.put(1, 1)
|
||||
assert len(cache) == 1
|
||||
|
||||
cache.put(1, 1)
|
||||
assert len(cache) == 1
|
||||
@patch("os.path.exists")
|
||||
@patch("os.path.abspath")
|
||||
def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
|
||||
# Relative path that exists locally
|
||||
path = "relative/path/to/lora"
|
||||
absolute_path = "/absolute/path/to/lora"
|
||||
mock_exist.return_value = True
|
||||
mock_abspath.return_value = absolute_path
|
||||
assert get_adapter_absolute_path(path) == absolute_path
|
||||
|
||||
cache.put(2, 2)
|
||||
assert len(cache) == 2
|
||||
|
||||
cache.put(3, 3)
|
||||
assert len(cache) == 3
|
||||
assert set(cache.cache) == {1, 2, 3}
|
||||
@patch("huggingface_hub.snapshot_download")
|
||||
@patch("os.path.exists")
|
||||
def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download):
|
||||
# Hugging Face model identifier
|
||||
path = "org/repo"
|
||||
absolute_path = "/mock/snapshot/path"
|
||||
mock_exist.return_value = False
|
||||
mock_snapshot_download.return_value = absolute_path
|
||||
assert get_adapter_absolute_path(path) == absolute_path
|
||||
|
||||
cache.put(4, 4)
|
||||
assert len(cache) == 3
|
||||
assert set(cache.cache) == {2, 3, 4}
|
||||
assert cache._remove_counter == 1
|
||||
assert cache.get(2) == 2
|
||||
|
||||
cache.put(5, 5)
|
||||
assert set(cache.cache) == {2, 4, 5}
|
||||
assert cache._remove_counter == 2
|
||||
|
||||
assert cache.pop(5) == 5
|
||||
assert len(cache) == 2
|
||||
assert set(cache.cache) == {2, 4}
|
||||
assert cache._remove_counter == 3
|
||||
|
||||
cache.pop(10)
|
||||
assert len(cache) == 2
|
||||
assert set(cache.cache) == {2, 4}
|
||||
assert cache._remove_counter == 3
|
||||
|
||||
cache.get(10)
|
||||
assert len(cache) == 2
|
||||
assert set(cache.cache) == {2, 4}
|
||||
assert cache._remove_counter == 3
|
||||
|
||||
cache.put(6, 6)
|
||||
assert len(cache) == 3
|
||||
assert set(cache.cache) == {2, 4, 6}
|
||||
assert 2 in cache
|
||||
assert 4 in cache
|
||||
assert 6 in cache
|
||||
|
||||
cache.remove_oldest()
|
||||
assert len(cache) == 2
|
||||
assert set(cache.cache) == {2, 6}
|
||||
assert cache._remove_counter == 4
|
||||
|
||||
cache.clear()
|
||||
assert len(cache) == 0
|
||||
assert cache._remove_counter == 6
|
||||
|
||||
cache._remove_counter = 0
|
||||
|
||||
cache[1] = 1
|
||||
assert len(cache) == 1
|
||||
|
||||
cache[1] = 1
|
||||
assert len(cache) == 1
|
||||
|
||||
cache[2] = 2
|
||||
assert len(cache) == 2
|
||||
|
||||
cache[3] = 3
|
||||
assert len(cache) == 3
|
||||
assert set(cache.cache) == {1, 2, 3}
|
||||
|
||||
cache[4] = 4
|
||||
assert len(cache) == 3
|
||||
assert set(cache.cache) == {2, 3, 4}
|
||||
assert cache._remove_counter == 1
|
||||
assert cache[2] == 2
|
||||
|
||||
cache[5] = 5
|
||||
assert set(cache.cache) == {2, 4, 5}
|
||||
assert cache._remove_counter == 2
|
||||
|
||||
del cache[5]
|
||||
assert len(cache) == 2
|
||||
assert set(cache.cache) == {2, 4}
|
||||
assert cache._remove_counter == 3
|
||||
|
||||
cache.pop(10)
|
||||
assert len(cache) == 2
|
||||
assert set(cache.cache) == {2, 4}
|
||||
assert cache._remove_counter == 3
|
||||
|
||||
cache[6] = 6
|
||||
assert len(cache) == 3
|
||||
assert set(cache.cache) == {2, 4, 6}
|
||||
assert 2 in cache
|
||||
assert 4 in cache
|
||||
assert 6 in cache
|
||||
@patch("huggingface_hub.snapshot_download")
|
||||
@patch("os.path.exists")
|
||||
def test_get_adapter_absolute_path_huggingface_error(
|
||||
mock_exist, mock_snapshot_download
|
||||
):
|
||||
# Hugging Face model identifier with download error
|
||||
path = "org/repo"
|
||||
mock_exist.return_value = False
|
||||
mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
|
||||
assert get_adapter_absolute_path(path) == path
|
||||
|
||||
@@ -1,69 +1,105 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import random
|
||||
import tempfile
|
||||
from unittest.mock import patch
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
||||
ModelConfig, ParallelConfig, SchedulerConfig)
|
||||
from vllm.lora.models import LoRAMapping
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.model_manager import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.worker.worker import Worker
|
||||
from vllm.v1.worker.gpu_worker import Worker
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
NUM_LORAS = 16
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"RANK": "0"})
|
||||
def test_worker_apply_lora(sql_lora_files):
|
||||
worker = Worker(
|
||||
model_config=ModelConfig(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
),
|
||||
def test_worker_apply_lora(qwen3_lora_files):
|
||||
def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
|
||||
lora_mapping = LoRAMapping([], [])
|
||||
|
||||
worker.model_runner.lora_manager.set_active_adapters(
|
||||
lora_requests, lora_mapping
|
||||
)
|
||||
|
||||
model_config = ModelConfig(
|
||||
MODEL_PATH,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_model_len=127,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
load_config=LoadConfig(
|
||||
download_dir=None,
|
||||
load_format="dummy",
|
||||
),
|
||||
parallel_config=ParallelConfig(1, 1, False),
|
||||
scheduler_config=SchedulerConfig(32, 32, 32),
|
||||
parallel_config=ParallelConfig(
|
||||
pipeline_parallel_size=1,
|
||||
tensor_parallel_size=1,
|
||||
data_parallel_size=1,
|
||||
),
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_model_len=model_config.max_model_len,
|
||||
is_encoder_decoder=model_config.is_encoder_decoder,
|
||||
runner_type="generate",
|
||||
max_num_batched_tokens=32,
|
||||
max_num_seqs=32,
|
||||
max_num_partial_prefills=32,
|
||||
),
|
||||
device_config=DeviceConfig("cuda"),
|
||||
cache_config=CacheConfig(block_size=16,
|
||||
gpu_memory_utilization=1.,
|
||||
swap_space=0,
|
||||
cache_dtype="auto"),
|
||||
cache_config=CacheConfig(
|
||||
block_size=16,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
),
|
||||
lora_config=LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
|
||||
),
|
||||
)
|
||||
worker = Worker(
|
||||
vllm_config=vllm_config,
|
||||
local_rank=0,
|
||||
rank=0,
|
||||
lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
|
||||
max_loras=32),
|
||||
distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
|
||||
)
|
||||
|
||||
worker.init_device()
|
||||
worker.load_model()
|
||||
|
||||
worker.model_runner.set_active_loras([], LoRAMapping([], []))
|
||||
set_active_loras(worker, [])
|
||||
assert worker.list_loras() == set()
|
||||
|
||||
n_loras = 32
|
||||
lora_requests = [
|
||||
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
|
||||
LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS)
|
||||
]
|
||||
|
||||
worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
|
||||
set_active_loras(worker, lora_requests)
|
||||
assert worker.list_loras() == {
|
||||
lora_request.lora_int_id
|
||||
for lora_request in lora_requests
|
||||
lora_request.lora_int_id for lora_request in lora_requests
|
||||
}
|
||||
|
||||
for i in range(32):
|
||||
for i in range(NUM_LORAS):
|
||||
random.seed(i)
|
||||
iter_lora_requests = random.choices(lora_requests,
|
||||
k=random.randint(1, n_loras))
|
||||
iter_lora_requests = random.choices(
|
||||
lora_requests, k=random.randint(1, NUM_LORAS)
|
||||
)
|
||||
random.shuffle(iter_lora_requests)
|
||||
iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
|
||||
worker.model_runner.set_active_loras(iter_lora_requests,
|
||||
LoRAMapping([], []))
|
||||
iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
|
||||
set_active_loras(worker, lora_requests)
|
||||
assert worker.list_loras().issuperset(
|
||||
{lora_request.lora_int_id
|
||||
for lora_request in iter_lora_requests})
|
||||
{lora_request.lora_int_id for lora_request in iter_lora_requests}
|
||||
)
|
||||
|
||||
@@ -1,60 +1,64 @@
|
||||
from typing import List, Optional
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
|
||||
class DummyLoRAManager:
|
||||
|
||||
def __init__(self, device: torch.device = "cuda:0"):
    """Minimal stand-in for a LoRA manager used by unit tests.

    Keeps a name -> LoRALayerWeights registry plus the device that newly
    created random weights are allocated on.
    """
    super().__init__()
    self._loras: dict[str, LoRALayerWeights] = {}
    self._device = device
||||
|
||||
def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
    """Register (or overwrite) the LoRA weights for *module_name*."""
    self._loras[module_name] = lora
|
||||
|
||||
def get_module_lora(self, module_name: str) -> LoRALayerWeights:
    """Return the registered weights for *module_name*.

    Raises KeyError when the module was never registered (callers are
    expected to have populated the registry first).
    """
    return self._loras[module_name]
|
||||
|
||||
def init_random_lora(
    self,
    module_name: str,
    weight: torch.Tensor,
    rank: int = 8,
):
    """Create random LoRA weights shaped for *weight*, register and return them.

    lora_a is (rank, in_features) and lora_b is (out_features, rank) — the
    transposed layout used by the current LoRALayerWeights implementation —
    allocated on the manager's configured device with *weight*'s dtype.
    """
    lora = LoRALayerWeights(
        module_name,
        rank=rank,
        lora_alpha=1,
        lora_a=torch.rand(
            [rank, weight.shape[1]], dtype=weight.dtype, device=self._device
        ),
        lora_b=torch.rand(
            [weight.shape[0], rank], dtype=weight.dtype, device=self._device
        ),
    )
    self.set_module_lora(module_name, lora)

    return lora
|
||||
|
||||
def init_lora(
    self,
    module_name: str,
    input_dim: int,
    output_dim: int,
    rank=8,
    noop=False,
    embeddings_tensor=None,
):
    """Create random LoRA weights with explicit dims, register and return them.

    The `noop` flag is accepted for interface compatibility; the visible body
    does not branch on it.
    """
    lora = LoRALayerWeights(
        module_name,
        rank=rank,
        lora_alpha=1,
        lora_a=torch.rand([rank, input_dim], device="cuda"),
        # NOTE(review): the second dim here looks like it should be `rank`
        # (cf. init_random_lora's (out_features, rank) layout) — confirm
        # against upstream before relying on this shape.
        lora_b=torch.rand([output_dim, input_dim], device="cuda"),
        embeddings_tensor=embeddings_tensor,
    )
    self.set_module_lora(module_name, lora)
    # Returned because callers bind the result (base_lora = self.init_lora(...)).
    return lora
||||
@@ -67,12 +71,12 @@ class DummyLoRAManager:
|
||||
self,
|
||||
module_name: str,
|
||||
input_dim: int,
|
||||
output_dims: List[int],
|
||||
noop_lora_index: List[int] = None,
|
||||
rank=8,
|
||||
output_dims: list[int],
|
||||
noop_lora_index: list[int] | None = None,
|
||||
rank: int = 8,
|
||||
):
|
||||
base_loras = []
|
||||
noop_lora_index = set(noop_lora_index or [])
|
||||
base_loras: list[LoRALayerWeights] = []
|
||||
noop_lora_index_set = set(noop_lora_index or [])
|
||||
|
||||
for i, out_dim in enumerate(output_dims):
|
||||
base_lora = self.init_lora(
|
||||
@@ -80,9 +84,324 @@ class DummyLoRAManager:
|
||||
input_dim,
|
||||
out_dim,
|
||||
rank=rank,
|
||||
noop=i in noop_lora_index,
|
||||
noop=i in noop_lora_index_set,
|
||||
)
|
||||
base_loras.append(base_lora)
|
||||
packed_lora = PackedLoRALayerWeights.pack(base_loras)
|
||||
self.set_module_lora(module_name, packed_lora)
|
||||
return packed_lora
|
||||
|
||||
|
||||
def assert_close(a, b):
    """Assert two tensors match within a dtype-dependent tolerance."""
    tolerances = {
        torch.float16: (6e-2, 6e-2),
        torch.bfloat16: (6e-2, 6e-2),
        torch.float32: (1e-2, 1e-2),
    }
    rtol, atol = tolerances[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PunicaTensors:
|
||||
inputs_tensor: torch.Tensor
|
||||
lora_weights: torch.Tensor | list[torch.Tensor]
|
||||
our_out_tensor: torch.Tensor
|
||||
ref_out_tensor: torch.Tensor
|
||||
b_seq_start_loc: torch.Tensor
|
||||
prompt_lora_mapping: torch.Tensor
|
||||
seq_len_tensor: torch.Tensor
|
||||
token_lora_mapping: torch.Tensor
|
||||
|
||||
def meta(self) -> tuple[int, int]:
|
||||
"""
|
||||
Infer max_seq_length and token_nums from the tensors
|
||||
and return them.
|
||||
"""
|
||||
max_seq_length = self.seq_len_tensor.max()
|
||||
token_nums = self.seq_len_tensor.sum().item()
|
||||
if isinstance(max_seq_length, tuple):
|
||||
max_seq_length = max_seq_length[0].item()
|
||||
else:
|
||||
max_seq_length = max_seq_length.item()
|
||||
return max_seq_length, token_nums
|
||||
|
||||
|
||||
def generate_data(
    batches,
    hidden_size,
    lora_nums,
    max_rank,
    seq_length,
    dtype,
    op_type,
    device,
) -> PunicaTensors:
    """Build random inputs/weights/outputs for a single-slice Punica op.

    `op_type` selects the kernel direction: "shrink" (hidden -> rank) or
    expand (rank -> hidden).
    """
    # Every batch gets exactly `seq_length` tokens.
    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    b_seq_start_loc = torch.cumsum(
        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
        dim=0,
    ).to(device)
    total_tokens = seq_len_tensor.sum()

    if op_type == "shrink":
        inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
        lora_weights = torch.rand(
            (lora_nums, max_rank, hidden_size),  # col-major
            dtype=dtype,
        ).to(device)
        # The shrink kernel accumulates with atomic adds, so outputs start at
        # zero; the kernel under test also emits float32 regardless of dtype.
        ref_out_tensor = torch.zeros(
            (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device
        )
        our_out_tensor = torch.zeros(
            (total_tokens, max_rank), dtype=torch.float32
        ).to(device)
    else:
        inputs_tensor = torch.rand(
            (total_tokens, max_rank),
            dtype=dtype,
        ).to(device)
        lora_weights = torch.rand(
            (lora_nums, hidden_size, max_rank),  # col-major
            dtype=dtype,
        ).to(device)
        # The expand kernel computes y += a @ lora_b, so outputs start random
        # and identical between the reference and the kernel under test.
        ref_out_tensor = torch.rand(
            (total_tokens, hidden_size),
            dtype=dtype,
        ).to(device)
        our_out_tensor = ref_out_tensor.clone()

    # Pick one LoRA index per batch and broadcast it to each of its tokens.
    lora_indices_tensor = torch.randint(
        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
    ).to(device)
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    offset = 0
    for b_id in range(batches):
        batch_lora = lora_indices_tensor[b_id]
        indices[offset : offset + seq_len_tensor[b_id]].copy_(batch_lora)
        offset += seq_len_tensor[b_id].item()

    return PunicaTensors(
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
|
||||
|
||||
|
||||
def generate_data_for_expand_nslices(
    batches,
    hidden_size,
    lora_nums,
    max_rank,
    seq_length,
    dtype,
    nslices,
    device,
) -> PunicaTensors:
    """Build random inputs/weights/outputs for a multi-slice expand op."""
    # Every batch gets exactly `seq_length` tokens.
    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    b_seq_start_loc = torch.cumsum(
        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
        dim=0,
    ).to(device)
    total_tokens = seq_len_tensor.sum()
    inputs_tensor = torch.rand(
        (total_tokens, max_rank),
        dtype=dtype,
    ).to(device)
    # One weight tensor per output slice.
    lora_weights_lst = []
    for _ in range(nslices):
        lora_weights_lst.append(
            torch.rand(
                (lora_nums, hidden_size, max_rank),  # col-major
                dtype=dtype,
            ).to(device)
        )
    # expand computes y += a @ lora_b, so outputs start random and identical
    # between the reference and the kernel under test.
    ref_out_tensor = torch.rand(
        (total_tokens, hidden_size * nslices), dtype=dtype
    ).to(device)
    our_out_tensor = ref_out_tensor.clone()

    # Pick one LoRA index per batch and broadcast it to each of its tokens.
    lora_indices_tensor = torch.randint(
        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
    )
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    offset = 0
    for b_id in range(batches):
        batch_lora = lora_indices_tensor[b_id]
        indices[offset : offset + seq_len_tensor[b_id]] = batch_lora.item()
        offset += seq_len_tensor[b_id].item()

    lora_indices_tensor = lora_indices_tensor.to(device)
    return PunicaTensors(
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
|
||||
|
||||
|
||||
def generate_data_for_nslices(
    batches,
    hidden_size,
    lora_nums,
    max_rank,
    seq_length,
    nslices,
    dtype,
    op_type,
    device,
) -> PunicaTensors:
    """Build random inputs/weights/outputs for a multi-slice shrink/expand op."""
    # Every batch gets exactly `seq_length` tokens.
    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    b_seq_start_loc = torch.cumsum(
        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
        dim=0,
    ).to(device)
    total_tokens = seq_len_tensor.sum()

    lora_weights_lst = []
    if op_type == "shrink":
        inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
        for _ in range(nslices):
            lora_weights_lst.append(
                torch.rand(
                    (lora_nums, max_rank, hidden_size),  # col-major
                    dtype=dtype,
                ).to(device)
            )
        # The shrink kernel accumulates with atomic adds into a float32
        # buffer, so the output starts zeroed.
        our_out_tensor = torch.zeros(
            (nslices, total_tokens, max_rank),
            dtype=torch.float32,
        ).to(device)
    else:
        inputs_tensor = torch.rand(
            (nslices, total_tokens, max_rank),
            dtype=dtype,
        ).to(device)
        for _ in range(nslices):
            lora_weights_lst.append(
                torch.rand(
                    (lora_nums, hidden_size, max_rank),  # col-major
                    dtype=dtype,
                ).to(device)
            )
        # expand computes y += a @ lora_b, so the output starts random.
        our_out_tensor = torch.rand(
            (total_tokens, hidden_size * nslices), dtype=dtype
        ).to(device)

    # Reference and kernel-under-test start from identical outputs.
    ref_out_tensor = our_out_tensor.clone()
    # Pick one LoRA index per batch and broadcast it to each of its tokens.
    lora_indices_tensor = torch.randint(
        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
    )
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    offset = 0
    for b_id in range(batches):
        batch_lora = lora_indices_tensor[b_id]
        indices[offset : offset + seq_len_tensor[b_id]] = batch_lora.item()
        offset += seq_len_tensor[b_id].item()

    lora_indices_tensor = lora_indices_tensor.to(device)
    return PunicaTensors(
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    )
|
||||
|
||||
|
||||
def create_peft_lora(
    model: torch.nn.Module,
    save_dir: str,
    target_modules: list[str],
    rank: int = 8,
    alpha: int = 16,
    dropout: float = 0.1,
    lora_dtype: torch.dtype = torch.float16,
) -> dict[str, torch.Tensor]:
    """Write a minimal PEFT-style LoRA adapter for *model* into *save_dir*.

    Emits adapter_config.json plus adapter_model.safetensors containing a
    (lora_A, lora_B) pair for every module in *target_modules*, and returns
    the weight dict that was saved.

    Raises ValueError when a target module's in/out dimensions cannot be
    determined from its attributes.
    """
    lora_weights = {}
    adapter_config = {
        "peft_type": "LORA",
        "auto_mapping": None,
        "base_model_name_or_path": "dummy_model",
        "revision": None,
        "task_type": "CAUSAL_LM",
        "inference_mode": False,
        "r": rank,
        "lora_alpha": alpha,
        "lora_dropout": dropout,
        "fan_in_fan_out": False,
        "bias": "none",
        "modules_to_save": None,
        "init_lora_weights": True,
        "layers_to_transform": None,
        "layers_pattern": None,
        "target_modules": target_modules,
        "exclude_modules": None,
        "use_rslora": False,
        "use_dora": False,
        "loftq_config": None,
    }

    for module_name in target_modules:
        # Walk the dotted path down to the target submodule.
        module = model
        for attr in module_name.split("."):
            module = getattr(module, attr)

        if hasattr(module, "input_size") and hasattr(module, "output_size"):
            in_features = module.input_size
            out_features = module.output_size
        elif hasattr(module, "embedding_dim") and hasattr(module, "num_embeddings"):
            # ParallelLMHead
            in_features = module.embedding_dim
            out_features = module.num_embeddings
        else:
            raise ValueError(f"Unable to determine dimensions for module {module_name}")

        lora_A = torch.randn(rank, in_features, dtype=lora_dtype)
        torch.nn.init.kaiming_uniform_(lora_A, a=5**0.5)
        # lora_B starts at zero so the freshly created adapter is a no-op.
        lora_B = torch.zeros(out_features, rank, dtype=lora_dtype)

        # PEFT style
        lora_weights[f"base_model.model.{module_name}.lora_A.weight"] = lora_A
        lora_weights[f"base_model.model.{module_name}.lora_B.weight"] = lora_B

    config_path = os.path.join(save_dir, "adapter_config.json")
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(adapter_config, f, indent=2, ensure_ascii=False)

    weights_path = os.path.join(save_dir, "adapter_model.safetensors")
    save_file(lora_weights, weights_path)

    return lora_weights
|
||||
|
||||
Reference in New Issue
Block a user