init
This commit is contained in:
0
tests/lora/__init__.py
Normal file
0
tests/lora/__init__.py
Normal file
179
tests/lora/conftest.py
Normal file
179
tests/lora/conftest.py
Normal file
@@ -0,0 +1,179 @@
|
||||
import contextlib
|
||||
import gc
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.distributed import destroy_model_parallel, initialize_model_parallel
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
|
||||
|
||||
def cleanup():
    """Tear down distributed/GPU state between tests.

    Destroys vLLM's model-parallel groups, destroys the torch.distributed
    process group (suppressing the AssertionError raised when no group is
    initialized), frees CUDA cached memory, and shuts down Ray.
    """
    destroy_model_parallel()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cleanup_fixture():
    """Autouse fixture: run cleanup() after every test in this package."""
    yield
    cleanup()
|
||||
|
||||
|
||||
@pytest.fixture
def dist_init():
    """Initialize a single-process NCCL group and vLLM model parallelism.

    Uses a temp file as the rendezvous point, performs a warm-up
    all_reduce on the GPU, then sets up 1x1 tensor/pipeline parallelism.
    Everything is torn down via cleanup() when the test finishes.
    """
    if not torch.distributed.is_initialized():
        temp_file = tempfile.mkstemp()[1]
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=1,
            rank=0,
            init_method=f"file://{temp_file}",
        )
        # Warm-up collective to make sure NCCL is fully initialized.
        torch.distributed.all_reduce(torch.zeros(1).cuda())
    initialize_model_parallel(1, 1)
    yield
    cleanup()
|
||||
|
||||
|
||||
@pytest.fixture
def dist_init_torch_only():
    """Initialize only the torch.distributed NCCL process group.

    Unlike dist_init, this does not set up vLLM model-parallel groups
    and performs no teardown.
    """
    if torch.distributed.is_initialized():
        return
    temp_file = tempfile.mkstemp()[1]
    torch.distributed.init_process_group(
        backend="nccl",
        world_size=1,
        rank=0,
        init_method=f"file://{temp_file}",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def dummy_model() -> nn.Module:
    """Build a small toy model mixing plain nn modules with vLLM parallel
    linear layers, plus lm_head/logits_processor/sampler, for exercising
    LoRA layer wrapping logic without loading a real model."""
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("output", ColumnParallelLinear(50, 10)),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
            ("sampler", Sampler())
        ]))
    # Stand-in config object for code paths that read model.config.
    model.config = MagicMock()
    return model
|
||||
|
||||
|
||||
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
    """Variant of dummy_model whose output projection is a
    MergedColumnParallelLinear ("gate_up_proj"), for testing LoRA on
    merged/packed column-parallel layers."""
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
            ("sampler", Sampler())
        ]))
    # Stand-in config object for code paths that read model.config.
    model.config = MagicMock()
    return model
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def sql_lora_files():
    """Download (once per session) the Llama-2-7B SQL LoRA test adapter."""
    return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")


@pytest.fixture(scope="session")
def mixtral_lora_files():
    """Download (once per session) a Mixtral LoRA adapter."""
    return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")


@pytest.fixture(scope="session")
def gemma_lora_files():
    """Download (once per session) a Gemma-7B test LoRA adapter."""
    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")


@pytest.fixture(scope="session")
def chatglm3_lora_files():
    """Download (once per session) a ChatGLM3 text2sql LoRA adapter."""
    return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")


@pytest.fixture(scope="session")
def baichuan_lora_files():
    """Download (once per session) a Baichuan-7B text2sql LoRA adapter."""
    return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")


@pytest.fixture(scope="session")
def baichuan_zero_lora_files():
    """Download (once per session) a Baichuan-7B adapter whose lora_B
    weights are zero-initialized (LoRA delta is a no-op)."""
    # all the lora_B weights are initialized to zero.
    return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")


@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Download (once per session) a TinyLlama colorist LoRA adapter."""
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
||||
|
||||
|
||||
@pytest.fixture
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
    """Yield an LLMEngine for Llama-2-7B loaded with a LoRAConfig while
    LoRA is disabled on the engine itself.

    get_model is patched so the model loader always receives a
    LoRAConfig, which makes it allocate the extra (LoRA) embedding
    space even though the engine is created with enable_lora=False.
    """
    cleanup()
    get_model_old = get_model

    def get_model_patched(*, model_config, device_config, **kwargs):
        # Force a LoRAConfig into the loader call.
        kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
        return get_model_old(model_config=model_config,
                             device_config=device_config,
                             **kwargs)

    with patch("vllm.worker.model_runner.get_model", get_model_patched):
        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
    yield engine.llm_engine
    del engine
    cleanup()
|
||||
|
||||
|
||||
@pytest.fixture
def llama_2_7b_model_extra_embeddings(
        llama_2_7b_engine_extra_embeddings) -> nn.Module:
    """Yield the underlying torch model held by the engine fixture."""
    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
           model_runner.model)
|
||||
108
tests/lora/test_baichuan.py
Normal file
108
tests/lora/test_baichuan.py
Normal file
@@ -0,0 +1,108 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "baichuan-inc/Baichuan-7B"
|
||||
|
||||
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int) -> "list[str]":
    """Greedy-decode three fixed spider-style text-to-SQL prompts.

    Args:
        llm: the vllm.LLM instance to generate with.
        lora_path: local path of the LoRA adapter.
        lora_id: id used for the LoRARequest; when falsy (0) no LoRA is
            applied and the base model generates.

    Returns:
        The stripped generated text for each prompt, in prompt order.
    """
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
            query=
            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
            query=
            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
        ),
    ]
    print(prompts)
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
|
||||
|
||||
|
||||
def test_baichuan_lora(baichuan_lora_files):
    """End-to-end check: the Baichuan-7B text2sql adapter must produce
    the exact expected SQL for the fixed spider prompts.

    The same adapter is queried under two distinct lora_ids to also
    exercise LoRA slot management, not just the weights.
    """
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=64,
                   trust_remote_code=True)

    expected_lora_output = [
        "SELECT count(*) FROM singer",
        "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'",  # noqa: E501
        "SELECT name , country , age FROM singer ORDER BY age ASC",
    ]

    for lora_id in (1, 2):
        output = do_sample(llm, baichuan_lora_files, lora_id=lora_id)
        # Guard against silently comparing fewer outputs than expected.
        assert len(output) >= len(expected_lora_output)
        for got, want in zip(output, expected_lora_output):
            assert got == want
|
||||
|
||||
|
||||
@pytest.mark.skip("Requires multiple GPUs")
def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
    """LoRA outputs must be identical across tensor_parallel_size 1/2/4.

    Skipped unconditionally by default since it needs at least 4 GPUs;
    the runtime GPU-count check below is commented out because it would
    initialize torch.cuda before vLLM forks workers.
    """
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 4:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")

    llm_tp1 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       tensor_parallel_size=1,
                       trust_remote_code=True)
    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)

    del llm_tp1
    cleanup()

    llm_tp2 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       tensor_parallel_size=2,
                       trust_remote_code=True)
    # Different lora_id, same weights: results must still match tp1.
    output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)

    del llm_tp2
    cleanup()

    assert output_tp1 == output_tp2

    llm_tp4 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       max_lora_rank=64,
                       tensor_parallel_size=4,
                       trust_remote_code=True)
    output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)

    del llm_tp4
    cleanup()

    assert output_tp1 == output_tp4
|
||||
57
tests/lora/test_chatglm3.py
Normal file
57
tests/lora/test_chatglm3.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||
|
||||
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int) -> "list[str]":
    """Greedy-decode three fixed spider-style text-to-SQL prompts.

    Args:
        llm: the vllm.LLM instance to generate with.
        lora_path: local path of the LoRA adapter.
        lora_id: id used for the LoRARequest; when falsy (0) no LoRA is
            applied and the base model generates.

    Returns:
        The stripped generated text for each prompt, in prompt order.
    """
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
            query=
            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
            query=
            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
        ),
    ]
    print(prompts)
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
|
||||
|
||||
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
    """End-to-end check: the ChatGLM3 text2sql adapter must produce the
    exact expected SQL for the fixed spider prompts.

    The same adapter is queried under two distinct lora_ids to also
    exercise LoRA slot management, not just the weights.
    """
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
                   max_loras=4,
                   max_lora_rank=64,
                   trust_remote_code=True)

    expected_lora_output = [
        "SELECT count(*) FROM singer",
        "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",  # noqa: E501
        "SELECT name , country , age FROM singer ORDER BY age",
    ]

    for lora_id in (1, 2):
        output = do_sample(llm, chatglm3_lora_files, lora_id=lora_id)
        # Guard against silently comparing fewer outputs than expected.
        assert len(output) >= len(expected_lora_output)
        for got, want in zip(output, expected_lora_output):
            assert got == want
|
||||
46
tests/lora/test_gemma.py
Normal file
46
tests/lora/test_gemma.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "google/gemma-7b"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int) -> "list[str]":
    """Greedy-decode three fixed quote-completion prompts.

    Args:
        llm: the vllm.LLM instance to generate with.
        lora_path: local path of the LoRA adapter.
        lora_id: id used for the LoRARequest; when falsy (0) no LoRA is
            applied and the base model generates.

    Returns:
        The stripped generated text for each prompt, in prompt order.
    """
    prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
        "Quote: So many books,",
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
|
||||
|
||||
|
||||
def test_gemma_lora(gemma_lora_files):
    """End-to-end check: the Gemma-7B quote adapter must complete each
    fixed prompt with the expected quote prefix.

    The same adapter is queried under two distinct lora_ids to also
    exercise LoRA slot management, not just the weights.
    """
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
                   enable_lora=True,
                   max_loras=4)

    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
        "so little time\nAuthor: Frank Zappa\n",
    ]

    for lora_id in (1, 2):
        output = do_sample(llm, gemma_lora_files, lora_id=lora_id)
        # Guard against silently comparing fewer outputs than expected.
        assert len(output) >= len(expected_lora_output)
        # Prefix match only: generation may continue past the quote.
        for got, want in zip(output, expected_lora_output):
            assert got.startswith(want)
|
||||
106
tests/lora/test_layer_variation.py
Normal file
106
tests/lora/test_layer_variation.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import tempfile
|
||||
from random import sample
|
||||
from typing import List, Optional
|
||||
|
||||
import peft
|
||||
import pytest
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "Felladrin/Llama-68M-Chat-v1"
|
||||
PROMPTS = [
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
|
||||
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
|
||||
]
|
||||
|
||||
|
||||
def get_lora_model(model_id: str, target_modules: List[str], rank: int):
    """Wrap the HF model `model_id` in a fresh PEFT LoRA adapter.

    Args:
        model_id: HF hub id or local path of the base model.
        target_modules: module names the LoRA adapter is attached to.
        rank: LoRA rank.

    Returns:
        The peft.PeftModel wrapping the base model.
    """
    model = AutoModelForCausalLM.from_pretrained(model_id)
    lora_config = peft.tuners.lora.LoraConfig(target_modules, rank)
    lora_model = peft.PeftModel(model, lora_config)
    return lora_model
|
||||
|
||||
|
||||
def do_sample(llm,
              lora_path: Optional[str] = None,
              lora_id: Optional[int] = None,
              logprobs: int = 0,
              n_tokens: int = 256):
    """Greedy-decode the fixed PROMPTS, optionally through a LoRA adapter.

    Args:
        llm: the vllm.LLM instance to generate with.
        lora_path: local path of the LoRA adapter (None for base model).
        lora_id: id for the LoRARequest; falsy means no LoRA.
        logprobs: number of top logprobs to request per token.
        n_tokens: max tokens to generate.

    Returns:
        When logprobs == 0, the generated text per prompt. Otherwise,
        for each prompt, a list (one entry per generated token) of the
        top-`logprobs` token-id lists.
    """
    prompts = PROMPTS
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=n_tokens,
                                          logprobs=logprobs,
                                          stop=["[/assistant]"])
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts = []
    generated_logprobs = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        generated_logprobs.append([
            list(logprob.keys()) for out in output.outputs
            for logprob in out.logprobs
        ])
    return generated_logprobs if logprobs else generated_texts
|
||||
|
||||
|
||||
# Target-module names the LoRA implementation supports for this model.
SUPPORTED_MODULES = [
    "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
    "lm_head"
]
# Random subsets of the supported modules: three samples each of sizes
# 2..5, giving 12 target-module combinations to parametrize over.
TARGET_MODULES_LIST = []
for length in range(2, 6):
    TARGET_MODULES_LIST.extend(
        [sample(SUPPORTED_MODULES, length) for _ in range(3)])
|
||||
|
||||
|
||||
# Test the correctness when layer and rank are varied
# step 1: init a base model and serve with LoRA to get the reference results
# step 2: merge the same LoRA to the base model, serve the merged model
# step 3: compare the results from step 1 and step 2
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST)
@pytest.mark.parametrize("rank", [8, 16, 32, 64])
def test_layer_variation_correctness(tp_size, target_modules, rank):
    """Serving base+LoRA must match serving the LoRA-merged model.

    Compares the top-5 token-id sets of the first generated token for
    each prompt between the two serving modes.
    """
    llm = vllm.LLM(MODEL_PATH,
                   enable_lora=True,
                   max_num_seqs=16,
                   max_loras=4,
                   tensor_parallel_size=tp_size,
                   worker_use_ray=True)
    model = get_lora_model(MODEL_PATH, target_modules, rank)
    with tempfile.TemporaryDirectory() as tmpdir:
        model.save_pretrained(tmpdir)
        merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32)
    del llm
    cleanup()
    reference_id_sets = [set(prob[0]) for prob in merged_probs]

    # Re-create the adapter (same seed path as above) and merge it into
    # the base weights, then serve the merged model without LoRA.
    model = get_lora_model(MODEL_PATH, target_modules, rank)
    with tempfile.TemporaryDirectory() as tmpdir:
        merged_model = model.merge_and_unload()
        merged_model.save_pretrained(tmpdir)
        llm = vllm.LLM(tmpdir,
                       tokenizer=MODEL_PATH,
                       enable_lora=False,
                       max_num_seqs=16,
                       tensor_parallel_size=tp_size,
                       worker_use_ray=True)
    probs = do_sample(llm, logprobs=5, n_tokens=32)
    del llm
    cleanup()
    # verify the top-5 tokens are identical for each token
    id_sets = [set(prob[0]) for prob in probs]
    assert id_sets == reference_id_sets
|
||||
773
tests/lora/test_layers.py
Normal file
773
tests/lora/test_layers.py
Normal file
@@ -0,0 +1,773 @@
|
||||
import random
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.fully_sharded_layers import (
|
||||
ColumnParallelLinearWithShardedLoRA,
|
||||
MergedColumnParallelLinearWithShardedLoRA,
|
||||
MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||
LogitsProcessorWithLoRA, LoRAMapping,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
MergedQKVParallelLinearWithLora,
|
||||
QKVParallelLinearWithLora,
|
||||
RowParallelLinearWithLoRA,
|
||||
VocabParallelEmbeddingWithLoRA)
|
||||
# yapf: enable
|
||||
from vllm.lora.models import (LoRALayerWeights, PackedLoRALayerWeights,
|
||||
convert_mapping)
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
ParallelLMHead, VocabParallelEmbedding)
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
from .utils import DummyLoRAManager
|
||||
|
||||
# Per-dtype (rtol, atol) pairs used for torch.allclose comparisons below.
TOLERANCES = {
    torch.float16: (5e-3, 5e-3),
    torch.float32: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
}
# One device when a single GPU is present, otherwise exercise two.
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
|
||||
|
||||
|
||||
def get_random_id_to_index(num_loras: int,
                           num_slots: int,
                           log: bool = True) -> List[Optional[int]]:
    """Build a randomized slot -> lora_id assignment.

    Picks `num_loras` distinct slots out of `num_slots` and fills them
    with lora ids 1..num_loras; every other slot stays None (free).

    Args:
        num_loras: number of active loras to place.
        num_slots: total number of slots; must be >= num_loras.
        log: print the resulting mapping when True.

    Raises:
        ValueError: if num_loras exceeds num_slots.
    """

    if num_loras > num_slots:
        raise ValueError(
            f"num_loras is higher than num_slots: {num_loras} > {num_slots}. "
            "num_loras must be less than or equal to num_slots.")

    mapping: List[Optional[int]] = [None for _ in range(num_slots)]
    chosen_slots = torch.randperm(num_slots)[:num_loras].tolist()
    next_id = 1
    for slot in chosen_slots:
        mapping[slot] = next_id
        next_id += 1

    if log:
        print(f"Created lora_id_to_index mapping: {mapping}.")

    return mapping
|
||||
|
||||
|
||||
def populate_loras(
    id_to_index: List[Optional[int]],
    layer: BaseLayerWithLoRA,
    layer_weights: torch.Tensor,
    generate_embeddings_tensor: int = 0,
    repeats: int = 1,
) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]:
    """This method populates the lora layers with lora weights.

    Args:
        id_to_index: a list of lora ids. The index of the lora id
            represents which memory slot the lora matrices are
            stored in. A None value indicates a free slot.
        layer: the LoRAlayer to populate.
        layer_weights: the PyTorch tensor containing the layer's
            weights.
        generate_embeddings_tensor: whether to generate an
            embeddings tensor for each LoRA.
        repeats: must only be set for column parallel packed
            layers. Indicates the number of loras to compose
            together to create a single lora layer.
    """

    # Dictionary that maps the lora ID to the
    # corresponding lora weights.
    lora_dict: Dict[int, LoRALayerWeights] = dict()

    # Dictionary that maps the lora ID to the
    # corresponding subloras.
    sublora_dict: Dict[int, List[LoRALayerWeights]] = dict()

    for slot_idx, lora_id in enumerate(id_to_index):
        if lora_id is not None:
            subloras = []
            sublora_len = layer_weights.shape[0] // repeats
            for i in range(repeats):
                sublora = DummyLoRAManager().init_random_lora(
                    module_name=f"fake_{i}",
                    weight=layer_weights,
                    generate_embeddings_tensor=generate_embeddings_tensor,
                )
                # Keep only this sublora's slice of lora_b columns so the
                # packed result lines up with the packed layer's output.
                sublora.lora_b = sublora.lora_b[:, (sublora_len *
                                                    i):(sublora_len * (i + 1))]
                sublora.optimize()
                subloras.append(sublora)

            # Pack the subloras into one LoRA for packed layers.
            lora = PackedLoRALayerWeights.pack(
                subloras) if repeats > 1 else subloras[0]

            layer.set_lora(
                slot_idx,
                lora_a=lora.lora_a,
                lora_b=lora.lora_b,
                embeddings_tensor=lora.embeddings_tensor,
            )

            lora_dict[lora_id] = lora
            sublora_dict[lora_id] = subloras

    return lora_dict, sublora_dict
|
||||
|
||||
|
||||
def create_random_inputs(
    active_lora_ids: List[int],
    num_inputs: int,
    input_size: Tuple[int, ...],
    input_range: Tuple[float, float],
    input_type: torch.dtype = torch.int,
) -> Tuple[List[torch.Tensor], List[int], List[int]]:
    """Generate random input tensors plus their lora-id mappings.

    Args:
        active_lora_ids: lora IDs of active lora weights; each input is
            assigned one of these at random.
        num_inputs: how many tensors to create.
        input_size: shape of each tensor.
        input_range: values satisfy input_range[0] <= v < input_range[1].
        input_type: torch.int for integer inputs; any other dtype yields
            floats of that dtype.

    Returns:
        (inputs, index_mapping, prompt_mapping): the tensors, a per-token
        lora-id list (input_size[0] entries per input), and a per-input
        lora-id list.
    """

    lo, hi = input_range

    tensors: List[torch.Tensor] = []
    token_lora_ids: List[int] = []
    prompt_lora_ids: List[int] = []

    for _ in range(num_inputs):
        if input_type == torch.int:
            tensor = torch.randint(low=int(lo), high=int(hi), size=input_size)
        else:
            tensor = torch.rand(size=input_size, dtype=input_type) * hi + lo
        tensors.append(tensor)

        chosen_id = random.choice(active_lora_ids)
        token_lora_ids.extend([chosen_id] * input_size[0])
        prompt_lora_ids.append(chosen_id)

    return tensors, token_lora_ids, prompt_lora_ids
|
||||
|
||||
|
||||
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
    """VocabParallelEmbeddingWithLoRA must equal base embedding + A@B.

    For 10 random seeds: populate random LoRAs, compare against the
    manually computed expected result, then reset all slots and check
    the layer falls back to the plain embedding.
    """

    torch.set_default_device(device)
    max_loras = 8
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             lora_dtype=torch.float16)

    def create_random_embedding_layer():
        # Base embedding with random weights; rows past vocab_size are
        # zeroed (padding region).
        embedding = VocabParallelEmbedding(vocab_size, 256)
        embedding.weight.data = torch.rand_like(embedding.weight.data)
        embedding.weight.data[vocab_size:, :] = 0
        lora_embedding = VocabParallelEmbeddingWithLoRA(embedding)
        lora_embedding.create_lora_weights(max_loras, lora_config)

        return embedding, lora_embedding

    for i in range(10):
        set_random_seed(i)

        id_to_index = get_random_id_to_index(num_loras, max_loras)
        embedding, lora_embedding = create_random_embedding_layer()

        lora_dict, _ = populate_loras(
            id_to_index,
            layer=lora_embedding,
            layer_weights=embedding.weight.T,
        )

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=num_loras * 3,
            input_size=(200, ),
            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
                                       vocab_size,
                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info)

        lora_result = lora_embedding(torch.cat(inputs))

        # Expected: base embedding plus the per-input LoRA delta
        # (embedding through lora_a, then projected by lora_b).
        expected_results = []
        for input_, lora_id in zip(inputs, prompt_mapping):
            lora = lora_dict[lora_id]
            result = embedding(input_)
            after_a = F.embedding(
                input_,
                lora.lora_a,
            )
            result += (after_a @ lora.lora_b)
            expected_results.append(result)
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)

        # Check that resetting the lora weights succeeds

        for slot_idx in range(max_loras):
            lora_embedding.reset_lora(slot_idx)

        # lora id 0 means "no LoRA" — the layer must now match the
        # plain embedding exactly.
        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=[0],
            num_inputs=num_loras * 3,
            input_size=(200, ),
            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
                                       vocab_size,
                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info, )

        lora_result = lora_embedding(torch.cat(inputs))
        expected_result = embedding(torch.cat(inputs))

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
# @pytest.mark.skip(
#     reason="Fails when loras are in any slot other than the first.")
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
                                        vocab_size) -> None:
    """VocabParallelEmbeddingWithLoRA with per-lora extra vocab entries.

    Builds an embedding expanded by ``lora_extra_vocab_size`` rows per lora
    slot, injects token ids that fall inside each lora's extra-vocab range,
    and checks the LoRA layer against a reference computed with the plain
    expanded embedding plus an explicit ``x @ A @ B`` term. Finally verifies
    that resetting every slot restores base-embedding behavior.
    """

    torch.set_default_device(device)
    max_loras = 8
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             lora_dtype=torch.float16)

    def create_random_embedding_layer():
        # Base embedding with zeroed rows beyond the real vocab, plus an
        # expanded copy that reserves extra rows for every lora slot.
        embedding = VocabParallelEmbedding(vocab_size, 256)
        embedding_data = torch.rand_like(embedding.weight.data)
        embedding.weight.data = embedding_data
        embedding.weight.data[vocab_size:, :] = 0
        expanded_embedding = VocabParallelEmbedding(
            vocab_size + lora_config.lora_extra_vocab_size * max_loras,
            256,
            org_num_embeddings=vocab_size)
        expanded_embedding.weight.data[:vocab_size, :] = embedding_data
        # We need to deepcopy the embedding as it will be modified
        # in place
        lora_embedding = VocabParallelEmbeddingWithLoRA(
            deepcopy(expanded_embedding))
        lora_embedding.create_lora_weights(max_loras, lora_config)

        return expanded_embedding, lora_embedding

    for i in range(10):
        set_random_seed(i)

        id_to_index = get_random_id_to_index(num_loras, max_loras)
        expanded_embedding, lora_embedding = create_random_embedding_layer()
        lora_dict, _ = populate_loras(
            id_to_index,
            layer=lora_embedding,
            layer_weights=torch.zeros(
                (256, vocab_size + lora_config.lora_extra_vocab_size)),
            generate_embeddings_tensor=256,
        )

        # All embeddings tensors have the same shape.
        embeddings_tensors = [
            lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys())
        ]
        embeddings_tensor_len = embeddings_tensors[0].shape[0]

        # Add empty embeddings_tensors for unoccupied lora slots.
        for _ in range(max_loras - len(embeddings_tensors)):
            embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape))

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=num_loras * 3,
            input_size=(200, ),
            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        # `inputs` gets extra-vocab ids rewritten below; keep the original
        # (base-vocab-relative) ids to feed the LoRA layer.
        original_inputs = deepcopy(inputs)

        # Force some of the inputs to be in the extended embeddings range
        # to guarantee that their behavior is tested.
        for input_, original_input_, lora_id in zip(inputs, original_inputs,
                                                    prompt_mapping):
            embedding_id = lora_id - 1
            input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len)
            original_input_[-1] = vocab_size
            input_[-2] = vocab_size + (
                (embedding_id + 1) * embeddings_tensor_len - 1)
            original_input_[-2] = vocab_size + embeddings_tensor_len - 1

        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
                                       vocab_size,
                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info, )

        # Splice every lora's embeddings tensor into the expanded embedding
        # so it can serve as the reference for extra-vocab lookups.
        expanded_embedding.weight[vocab_size:vocab_size +
                                  (embeddings_tensor_len *
                                   max_loras)] = torch.cat(embeddings_tensors)

        lora_result = lora_embedding(torch.cat(original_inputs))

        expected_results = []
        for input_, original_input_, lora_id in zip(inputs, original_inputs,
                                                    prompt_mapping):
            lora = lora_dict[lora_id]
            result = expanded_embedding(input_)
            after_a = F.embedding(
                original_input_,
                lora.lora_a,
            )
            result += (after_a @ lora.lora_b)
            expected_results.append(result)
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)

        # Check that resetting the lora weights succeeds

        for slot_idx in range(max_loras):
            lora_embedding.reset_lora(slot_idx)

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=[0],
            num_inputs=num_loras * 3,
            input_size=(200, ),
            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        original_inputs = deepcopy(inputs)

        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
                                       vocab_size,
                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info, )

        # After reset, the LoRA layer must match the plain expanded embedding.
        lora_result = lora_embedding(torch.cat(original_inputs))
        expected_result = expanded_embedding(torch.cat(inputs))

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
def test_lm_head_logits_processor(dist_init, num_loras, device,
                                  vocab_size) -> None:
    """LogitsProcessorWithLoRA must match a reference computation.

    The reference splices the shared embeddings tensor into the LM head's
    extra-vocab rows and applies ``x @ A @ B * scaling`` explicitly on top
    of the plain LogitsProcessor. Also checks that resetting every lora
    slot restores base behavior.
    """

    torch.set_default_device(device)
    max_loras = 8
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             lora_dtype=torch.float16)

    def _pretest():
        # LM head with zeroed extra-vocab columns, plus the plain and
        # LoRA-wrapped logits processors built over it.
        linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
                                1024,
                                vocab_size,
                                params_dtype=torch.float16)
        linear.weight.data = torch.rand_like(linear.weight.data)
        linear.weight.data[:, vocab_size:] = 0
        logits_processor = LogitsProcessor(
            vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
        lora_logits_processor = LogitsProcessorWithLoRA(
            logits_processor, 1024, linear.weight.dtype, linear.weight.device)
        lora_logits_processor.create_lora_weights(max_loras, lora_config)

        return linear, logits_processor, lora_logits_processor

    for i in range(10):
        set_random_seed(i)

        id_to_index = get_random_id_to_index(num_loras, max_loras)
        linear, logits_processor, lora_logits_processor = _pretest()

        # NOTE: all the generated loras share the same embeddings tensor.
        lora_dict, _ = populate_loras(
            id_to_index,
            layer=lora_logits_processor,
            layer_weights=linear.weight,
            generate_embeddings_tensor=1024,
        )
        embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor
        embeddings_tensor_len = embeddings_tensor.shape[0]

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=8 * num_loras,  # * 3,
            input_size=(1, 1024),
            input_range=(0, 1),
            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        mapping_info = convert_mapping(
            lora_mapping,
            id_to_index,
            max_loras,
            vocab_size,
            lora_config.lora_extra_vocab_size,
        )
        lora_logits_processor.set_mapping(*mapping_info)

        lora_result = lora_logits_processor._get_logits(
            hidden_states=torch.cat(inputs),
            embedding=linear.weight,
            embedding_bias=None)

        # Keep a pristine copy for the reset check below, then splice the
        # shared embeddings tensor into the extra-vocab rows so the plain
        # logits processor can serve as the reference.
        original_weight = linear.weight.clone()

        linear.weight[logits_processor.
                      org_vocab_size:logits_processor.org_vocab_size +
                      embeddings_tensor_len] = embeddings_tensor

        logits_processor.org_vocab_size = (vocab_size +
                                           lora_config.lora_extra_vocab_size)
        expected_results = []
        for input_, lora_id in zip(inputs, prompt_mapping):
            lora = lora_dict[lora_id]
            result = logits_processor._get_logits(hidden_states=input_,
                                                  embedding=linear.weight,
                                                  embedding_bias=None)
            # Columns past this lora's extra vocab are masked out.
            result[:, vocab_size + embeddings_tensor_len:] = float("-inf")
            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
            expected_results.append(result)
        expected_result = torch.cat(expected_results)
        logits_processor.org_vocab_size = vocab_size

        # FIX: expected_result was computed but never compared against
        # lora_result (and an unused `input_ = torch.rand(20, 1024)` was
        # removed). Add the missing assertion, mirroring the sibling tests.
        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)

        # Check that resetting the lora weights succeeds

        for slot_idx in range(max_loras):
            lora_logits_processor.reset_lora(slot_idx)

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=[0],
            num_inputs=8 * num_loras * 3,
            input_size=(1, 1024),
            input_range=(0, 1),
            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
                                       vocab_size,
                                       lora_config.lora_extra_vocab_size)
        lora_logits_processor.set_mapping(*mapping_info)

        # After reset, the LoRA wrapper must reproduce the base logits;
        # compare only the real-vocab columns.
        lora_result = lora_logits_processor._get_logits(
            hidden_states=torch.cat(inputs),
            embedding=original_weight,
            embedding_bias=None)[:, :vocab_size]
        expected_result = logits_processor._get_logits(
            hidden_states=torch.cat(inputs),
            embedding=original_weight,
            embedding_bias=None)

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("orientation", ["row", "column"])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
                         device) -> None:
    """Row/ColumnParallelLinear with LoRA must match an explicit reference.

    For each populated lora the reference is ``linear(x) + x @ A @ B *
    scaling``; afterwards every slot is reset and the LoRA layer must
    reproduce the bare linear layer.
    """

    torch.set_default_device(device)
    max_loras = 8
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             fully_sharded_loras=fully_shard,
                             lora_dtype=torch.float16)

    def create_random_linear_parallel_layer():
        # Pick the base layer and LoRA wrapper for this orientation /
        # sharding combination.
        if orientation == "row":
            linear = RowParallelLinear(4096,
                                       4096,
                                       bias=False,
                                       params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard
                           else RowParallelLinearWithShardedLoRA(linear))
        else:
            linear = ColumnParallelLinear(4096,
                                          4096,
                                          bias=False,
                                          params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = (ColumnParallelLinearWithLoRA(linear)
                           if not fully_shard else
                           ColumnParallelLinearWithShardedLoRA(linear))
        lora_linear.create_lora_weights(max_loras, lora_config)

        return linear, lora_linear

    for i in range(10):
        set_random_seed(i)

        id_to_index = get_random_id_to_index(num_loras, max_loras)
        linear, lora_linear = create_random_linear_parallel_layer()

        lora_dict, _ = populate_loras(
            id_to_index,
            layer=lora_linear,
            layer_weights=linear.weight,
        )

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        # vocab_size is irrelevant for linear layers; 512 is a placeholder.
        mapping_info = convert_mapping(
            lora_mapping,
            id_to_index,
            max_loras,
            512,
            lora_config.lora_extra_vocab_size,
        )
        lora_linear.set_mapping(*mapping_info, )

        lora_result = lora_linear(torch.cat(inputs))[0]

        expected_results = []
        for input_, lora_id in zip(inputs, prompt_mapping):
            lora = lora_dict[lora_id]
            result = linear(input_)[0]
            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
            expected_results.append(result)
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)

        # Check that resetting the lora weights succeeds

        for slot_idx in range(max_loras):
            lora_linear.reset_lora(slot_idx)

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=[0],
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
                                       512, lora_config.lora_extra_vocab_size)
        lora_linear.set_mapping(*mapping_info, )

        # After reset, the LoRA layer must match the bare linear layer.
        lora_result = lora_linear(torch.cat(inputs))[0]
        expected_result = linear(torch.cat(inputs))[0]

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("repeats", [1, 2, 3])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                device) -> None:
    """Packed column-parallel layers with LoRA must match a per-slice
    reference.

    ``repeats`` selects the layer under test: 2 -> MergedColumnParallelLinear,
    3 -> merged QKV, otherwise a plain QKVParallelLinear. Each sublora is
    applied to its own slice of the output columns in the reference.
    """

    torch.set_default_device(device)
    max_loras = 8
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             fully_sharded_loras=fully_shard,
                             lora_dtype=torch.float16)

    def create_column_parallel_packed_layer():
        # Build the packed base layer and its LoRA wrapper for this
        # repeats / fully_shard combination.
        if repeats == 2:
            linear = MergedColumnParallelLinear(4096, [4096] * repeats,
                                                bias=False,
                                                params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = (MergedColumnParallelLinearWithLoRA(linear)
                           if not fully_shard else
                           MergedColumnParallelLinearWithShardedLoRA(linear))
        elif repeats == 3:
            linear = QKVParallelLinear(4096,
                                       64,
                                       32,
                                       bias=False,
                                       params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = (MergedQKVParallelLinearWithLora(linear)
                           if not fully_shard else
                           MergedQKVParallelLinearWithShardedLora(linear))
        else:
            linear = QKVParallelLinear(4096,
                                       64,
                                       32,
                                       bias=False,
                                       params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = QKVParallelLinearWithLora(linear)

        @dataclass
        class FakeConfig:
            # Minimal stand-in for a model config; only these attributes
            # are read by create_lora_weights.
            hidden_size = 4096
            num_key_value_heads = 32
            num_attention_heads = 32

        lora_linear.create_lora_weights(max_loras,
                                        lora_config,
                                        model_config=FakeConfig())

        return linear, lora_linear

    for i in range(10):
        set_random_seed(i)

        id_to_index = get_random_id_to_index(num_loras, max_loras)

        linear, lora_linear = create_column_parallel_packed_layer()

        lora_dict, sublora_dict = populate_loras(
            id_to_index,
            layer=lora_linear,
            layer_weights=linear.weight,
            repeats=repeats,
        )

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        # vocab_size is irrelevant for linear layers; 512 is a placeholder.
        mapping_info = convert_mapping(
            lora_mapping,
            id_to_index,
            max_loras,
            512,
            lora_config.lora_extra_vocab_size,
        )
        lora_linear.set_mapping(*mapping_info)

        lora_result = lora_linear(torch.cat(inputs))[0]

        expected_results = []
        for input_, lora_id in zip(inputs, prompt_mapping):
            result = linear(input_)[0]
            subloras = sublora_dict[lora_id]
            # FIX: the sublora index previously reused `i`, shadowing the
            # outer seed-loop variable; use a dedicated name.
            for sublora_idx, sublora in enumerate(subloras):
                out_dim = sublora.lora_b.shape[1]
                result[:, out_dim * sublora_idx:out_dim *
                       (sublora_idx + 1)] += (
                           input_ @ sublora.lora_a @ sublora.lora_b *
                           sublora.scaling)
            expected_results.append(result)
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)

        # Check that resetting the lora weights succeeds.
        for slot_idx in range(max_loras):
            lora_linear.reset_lora(slot_idx)

        inputs, index_mapping, prompt_mapping = create_random_inputs(
            active_lora_ids=[0],
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

        mapping_info = convert_mapping(
            lora_mapping,
            id_to_index,
            max_loras,
            512,
            lora_config.lora_extra_vocab_size,
        )
        lora_linear.set_mapping(*mapping_info)

        # After reset, the LoRA layer must match the bare packed layer.
        lora_result = lora_linear(torch.cat(inputs))[0]
        expected_result = linear(torch.cat(inputs))[0]

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)
|
||||
148
tests/lora/test_llama.py
Normal file
148
tests/lora/test_llama.py
Normal file
@@ -0,0 +1,148 @@
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int):
    """Greedy-sample a fixed batch of SQL-generation prompts.

    A non-zero *lora_id* attaches the adapter at *lora_path* via a
    LoRARequest; zero runs the base model. Returns the generated texts in
    prompt order.
    """
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]
    # lora_id == 0 means "no adapter": pass None instead of a LoRARequest.
    lora_request = (LoRARequest(str(lora_id), lora_id, lora_path)
                    if lora_id else None)
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
                                          stop=["[/assistant]"])
    outputs = llm.generate(prompts,
                           sampling_params,
                           lora_request=lora_request)

    # Collect the completions, echoing each prompt/completion pair.
    texts = []
    for out in outputs:
        completion = out.outputs[0].text
        texts.append(completion)
        print(f"Prompt: {out.prompt!r}, Generated text: {completion!r}")
    return texts
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_llama_lora(sql_lora_files, tp_size):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < tp_size:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=tp_size)
|
||||
|
||||
expected_no_lora_output = [
|
||||
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501
|
||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501
|
||||
"\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501
|
||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501
|
||||
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501
|
||||
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501
|
||||
]
|
||||
expected_lora_output = [
|
||||
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
|
||||
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501
|
||||
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
|
||||
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
|
||||
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501
|
||||
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501
|
||||
]
|
||||
|
||||
print("lora adapter created")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
|
||||
|
||||
print("lora 1")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output
|
||||
|
||||
print("no lora")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
|
||||
|
||||
print("lora 2")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output
|
||||
|
||||
print("removing lora")
|
||||
|
||||
|
||||
@pytest.mark.skip("Requires multiple GPUs")
|
||||
def test_llama_tensor_parallel_equality(sql_lora_files):
|
||||
# Cannot use as it will initialize torch.cuda too early...
|
||||
# if torch.cuda.device_count() < 4:
|
||||
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
|
||||
|
||||
llm_tp1 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1)
|
||||
output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp1
|
||||
cleanup()
|
||||
|
||||
llm_tp2 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=2)
|
||||
output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp2
|
||||
cleanup()
|
||||
|
||||
assert output_tp1 == output_tp2
|
||||
|
||||
llm_tp4 = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=4)
|
||||
output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp4
|
||||
cleanup()
|
||||
|
||||
assert output_tp1 == output_tp4
|
||||
|
||||
|
||||
def test_llama_lora_warmup(sql_lora_files):
    """Engine startup with LoRA enabled must reserve fewer KV-cache blocks,
    i.e. the LoRA warmup pass is the more conservative one."""

    @ray.remote(num_gpus=1)
    def _blocks_with_lora():
        # Warm up an engine with LoRA enabled and report its KV block count.
        engine = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
        return engine.llm_engine.cache_config.num_gpu_blocks

    @ray.remote(num_gpus=1)
    def _blocks_without_lora():
        # Same model with LoRA disabled, for comparison.
        engine = vllm.LLM(MODEL_PATH, max_num_seqs=16)
        return engine.llm_engine.cache_config.num_gpu_blocks

    with_lora = ray.get(_blocks_with_lora.remote())
    without_lora = ray.get(_blocks_without_lora.remote())
    assert with_lora < without_lora, (
        "The warmup with lora should be more "
        "conservative than without lora, therefore the number of "
        "memory blocks for the KV cache should be "
        "less when using lora than when not using lora")
|
||||
224
tests/lora/test_lora.py
Normal file
224
tests/lora/test_lora.py
Normal file
@@ -0,0 +1,224 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
|
||||
|
||||
from .utils import DummyLoRAManager
|
||||
|
||||
# Output/input widths exercised by the m/n parametrizations below.
TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
# (q, k, v) output widths for the packed 3-slice tests.
QKV_TENSOR_SIZES = [
    (8192, 1024, 1024),
    (8192 // 8, 1024 // 8, 1024 // 8),
    (4096, 4096, 4096),
    (4096 // 2, 4096 // 2, 4096 // 2),
]
# Number of input rows fed to each lora application.
BATCH_SIZES = [8, 32, 256]
# LoRA ranks under test.
RANKS = [8]
# Dtypes under test.
DTYPES = [torch.float16]
# Per-dtype (rtol, atol) pairs used with torch.allclose.
TOLERANCES = {
    torch.float16: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("n", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("k", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("rank", RANKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
def test_apply_lora(m, n, k, rank, dtype) -> None:
|
||||
manager = DummyLoRAManager()
|
||||
|
||||
module_name = "module"
|
||||
weight = torch.rand([m, n], device="cuda", dtype=dtype)
|
||||
|
||||
manager.init_random_lora(module_name, weight, rank=rank)
|
||||
lora = manager.get_module_lora(module_name)
|
||||
|
||||
input = torch.rand(k, n, device="cuda", dtype=dtype)
|
||||
expected = input @ lora.lora_a @ lora.lora_b * lora.scaling
|
||||
|
||||
lora_a_stack = torch.zeros(8,
|
||||
1,
|
||||
lora.lora_a.shape[1],
|
||||
lora.lora_a.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype)
|
||||
lora_b_stack = torch.zeros(8,
|
||||
1,
|
||||
lora.lora_b.shape[1],
|
||||
lora.lora_b.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype)
|
||||
for i in range(lora_a_stack.shape[0]):
|
||||
lora_a_stack[i][0] = lora.lora_a.T
|
||||
lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T
|
||||
|
||||
output = torch.zeros(k, m, device="cuda", dtype=dtype)
|
||||
_apply_lora(
|
||||
input, lora_a_stack, lora_b_stack,
|
||||
torch.randint(0, lora_a_stack.shape[0], (len(input), ), device="cuda"),
|
||||
output)
|
||||
|
||||
rtol, atol = TOLERANCES[dtype]
|
||||
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
|
||||
|
||||
output[:] = 0
|
||||
_apply_lora(input, lora_a_stack, lora_b_stack,
|
||||
torch.full((len(input), ), -1, device="cuda"), output)
|
||||
assert torch.allclose(torch.zeros_like(output), output)
|
||||
|
||||
manager.reset_lora()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("n", TENSOR_SIZES)
|
||||
@pytest.mark.parametrize("k", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("rank", RANKS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
|
||||
if m % 2 != 0:
|
||||
pytest.skip("m must be divisible by 2")
|
||||
if m // 2 not in TENSOR_SIZES:
|
||||
pytest.skip("m//2 must be in TENSOR_SIZES")
|
||||
|
||||
manager = DummyLoRAManager()
|
||||
|
||||
module_name = "module"
|
||||
weight = torch.rand([m // 2, n], device="cuda", dtype=dtype)
|
||||
|
||||
manager.init_random_lora(module_name + "1", weight, rank=rank)
|
||||
lora_1 = manager.get_module_lora(module_name + "1")
|
||||
manager.init_random_lora(module_name + "2", weight, rank=rank)
|
||||
lora_2 = manager.get_module_lora(module_name + "2")
|
||||
|
||||
input = torch.rand(k, n, device="cuda", dtype=dtype)
|
||||
expected = torch.cat([
|
||||
input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling,
|
||||
input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling
|
||||
],
|
||||
dim=1)
|
||||
|
||||
lora_a_stacks = [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_1.lora_a.shape[1],
|
||||
lora_1.lora_a.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype) for i in range(2)
|
||||
]
|
||||
lora_b_stacks = [
|
||||
torch.zeros(8,
|
||||
1,
|
||||
lora_1.lora_b.shape[1],
|
||||
lora_1.lora_b.shape[0],
|
||||
device="cuda",
|
||||
dtype=dtype) for i in range(2)
|
||||
]
|
||||
for i in range(lora_a_stacks[0].shape[0]):
|
||||
lora_a_stacks[0][i][0] = lora_1.lora_a.T
|
||||
lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T
|
||||
lora_a_stacks[1][i][0] = lora_2.lora_a.T
|
||||
lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T
|
||||
|
||||
output = torch.zeros(k, m, device="cuda", dtype=dtype)
|
||||
_apply_lora_packed_nslice(
|
||||
input, lora_a_stacks, lora_b_stacks,
|
||||
torch.randint(0,
|
||||
lora_a_stacks[0].shape[0], (len(input), ),
|
||||
device="cuda"), output, (m // 2, m // 2))
|
||||
|
||||
rtol, atol = TOLERANCES[dtype]
|
||||
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
|
||||
|
||||
output[:] = 0
|
||||
_apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
|
||||
torch.full((len(input), ), -1, device="cuda"),
|
||||
output, (m // 2, m // 2))
|
||||
assert torch.allclose(torch.zeros_like(output), output)
|
||||
|
||||
manager.reset_lora()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
    """Check _apply_lora_packed_nslice for a packed 3-slice (QKV) projection.

    Builds three independent random LoRAs (q, k, v), computes the dense
    reference ``x @ A @ B * scaling`` per slice concatenated along dim 1,
    and compares it against the packed-kernel output. Also checks that a
    LoRA index of -1 leaves the output at zero.
    """
    manager = DummyLoRAManager()

    module_name = "module"
    # q has its own output dim qkv[0]; k and v share qkv[1] (== qkv[2] here).
    weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
    weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)

    manager.init_random_lora(module_name + "q", weight_q, rank=rank)
    lora_q = manager.get_module_lora(module_name + "q")
    manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
    lora_k = manager.get_module_lora(module_name + "k")
    manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
    lora_v = manager.get_module_lora(module_name + "v")

    input = torch.rand(k, n, device="cuda", dtype=dtype)
    # Dense reference: apply each slice's LoRA separately, concat on dim 1.
    expected = torch.cat([
        input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling,
        input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling,
        input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling
    ],
                         dim=1)

    # Stacks hold 8 LoRA slots x 1 layer; A/B are stored transposed.
    # Slot 0 is for q, slots 1 and 2 (same k shapes) are for k and v.
    lora_a_stacks = [
        torch.zeros(8,
                    1,
                    lora_q.lora_a.shape[1],
                    lora_q.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype)
    ] + [
        torch.zeros(8,
                    1,
                    lora_k.lora_a.shape[1],
                    lora_k.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype) for i in range(2)
    ]
    lora_b_stacks = [
        torch.zeros(8,
                    1,
                    lora_q.lora_b.shape[1],
                    lora_q.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype)
    ] + [
        torch.zeros(8,
                    1,
                    lora_k.lora_b.shape[1],
                    lora_k.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype) for i in range(2)
    ]
    # Fill every slot with the same weights (scaling folded into B), so any
    # randomly chosen index below yields the same expected result.
    for i in range(lora_a_stacks[0].shape[0]):
        lora_a_stacks[0][i][0] = lora_q.lora_a.T
        lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T
        lora_a_stacks[1][i][0] = lora_k.lora_a.T
        lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T
        lora_a_stacks[2][i][0] = lora_v.lora_a.T
        lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T

    output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)
    _apply_lora_packed_nslice(
        input, lora_a_stacks, lora_b_stacks,
        torch.randint(0,
                      lora_a_stacks[0].shape[0], (len(input), ),
                      device="cuda"), output, (qkv[0], qkv[1], qkv[2]))

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 is treated as "no LoRA": the output must stay all-zero.
    output[:] = 0
    _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
                              torch.full((len(input), ), -1, device="cuda"),
                              output, (qkv[0], qkv[1], qkv[2]))
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
|
||||
58
tests/lora/test_lora_checkpoints.py
Normal file
58
tests/lora/test_lora_checkpoints.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import pytest
|
||||
|
||||
from vllm.lora.models import LoRAModel
|
||||
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
|
||||
|
||||
# LoRA checkpoints exercised by test_load_checkpoints; the first two match
# the Baichuan architecture, the last (chatglm3-6b) deliberately does not.
lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lora_name", lora_lst)
def test_load_checkpoints(
    lora_name,
    baichuan_lora_files,
    baichuan_zero_lora_files,
    chatglm3_lora_files,
):
    """Load LoRA checkpoints against the Baichuan module list.

    Matching checkpoints (baichuan7B, baichuan7B-zero) must load; a
    checkpoint for a different architecture (chatglm3-6b) must raise a
    ValueError pointing at the mismatched modules.
    """
    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
    # Expand packed modules (e.g. a fused projection) into the individual
    # module names a LoRA checkpoint is expected to target.
    expected_lora_modules = []
    for module in supported_lora_modules:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:
            expected_lora_modules.append(module)

    def _load_checkpoint(lora_files):
        # Every branch loads with identical settings; only the path differs.
        return LoRAModel.from_local_checkpoint(
            lora_files,
            expected_lora_modules,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
            embedding_padding_modules=embed_padding_modules)

    if lora_name == "baichuan7B":
        # Loading the baichuan7B model's own LoRA should pass.
        _load_checkpoint(baichuan_lora_files)
    elif lora_name == "baichuan7B-zero":
        # target_modules that carry full prefixes such as
        # "model.layers.0.self_attn.W_pack" should also be accepted.
        _load_checkpoint(baichuan_zero_lora_files)
    else:
        # Loading chatglm3-6b's LoRA into the baichuan7B module list
        # should raise the module-mismatch error.
        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
        with pytest.raises(ValueError, match=expected_error):
            _load_checkpoint(chatglm3_lora_files)
|
||||
487
tests/lora/test_lora_manager.py
Normal file
487
tests/lora/test_lora_manager.py
Normal file
@@ -0,0 +1,487 @@
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA)
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
|
||||
WorkerLoRAManager)
|
||||
from vllm.model_executor.layers.linear import RowParallelLinear
|
||||
|
||||
# Maps LoRA checkpoint embedding-module names to the names used for the
# extra-embedding tensors in the "new_embeddings" file.
EMBEDDING_MODULES = {
    "embed_tokens": "input_embeddings",
    "lm_head": "output_embeddings",
}

# Modules whose weights get padded up to the LoRA-extended vocab size.
EMBEDDING_PADDING_MODULES = ["lm_head"]
|
||||
|
||||
|
||||
def test_from_lora_tensors(sql_lora_files):
    """Build a LoRAModel from raw safetensors and sanity-check each layer.

    Verifies rank/alpha metadata, A/B shape compatibility, and that
    embedding modules carry the extra-embedding tensors while all other
    modules have none.
    """
    tensors = load_file(
        os.path.join(sql_lora_files, "adapter_model.safetensors"))
    new_embeddings = load_file(
        os.path.join(sql_lora_files, "new_embeddings.safetensors"))
    # rank=8, lora_alpha=16 must match the checkpoint's adapter config.
    lora_model = LoRAModel.from_lora_tensors(
        1,
        8,
        16,
        tensors,
        "cuda",
        embeddings=new_embeddings,
        embedding_modules=EMBEDDING_MODULES,
        embedding_padding_modules=EMBEDDING_PADDING_MODULES)
    for module_name, lora in lora_model.loras.items():
        assert lora.module_name == module_name
        assert lora.rank == 8
        assert lora.lora_alpha == 16
        assert lora.lora_a is not None
        assert lora.lora_b is not None
        # Inner dims of A and B must agree (both equal the rank).
        assert (lora.lora_a.shape[1] == lora.lora_b.shape[0]
                ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
        assert lora.lora_a.shape[1] == 8
        embeddings_module = next(
            (k for k in EMBEDDING_MODULES if k in module_name), None)
        if embeddings_module:
            # Embedding modules must carry the matching extra-embedding
            # tensor, moved to the same device as the LoRA weights.
            assert torch.equal(
                lora.embeddings_tensor,
                new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
                    device=lora.embeddings_tensor.device))
        else:
            assert lora.embeddings_tensor is None
|
||||
|
||||
|
||||
def create_lora(lora_id: int, model: nn.Module,
                sub_modules: List[str]) -> LoRAModel:
    """Create a rank-8 LoRAModel with random CUDA weights for *sub_modules*.

    Each listed submodule gets a LoRALayerWeights whose A/B shapes are
    derived from that submodule's weight matrix (alpha=16).
    """
    layer_weights = {}
    for module_name in sub_modules:
        base_weight = model.get_submodule(module_name).weight
        lora_a = torch.rand([base_weight.shape[1], 8], device="cuda")
        lora_b = torch.rand([8, base_weight.shape[0]], device="cuda")
        layer_weights[module_name] = LoRALayerWeights(module_name, 8, 16,
                                                      lora_a, lora_b)
    return LoRAModel(lora_id, 8, layer_weights)
|
||||
|
||||
|
||||
def create_packed_lora(
    lora_id: int,
    model: nn.Module,
    module_name,
    replaced_module_names,
    empty_replaced_module_name=None,
) -> LoRAModel:
    """Create a LoRAModel for the sub-slices of a packed module.

    Each name in *replaced_module_names* gets a rank-8 LoRA sized for an
    equal slice of *module_name*'s output dimension. A name equal to
    *empty_replaced_module_name* is skipped, leaving that slice without
    a LoRA.
    """
    base_weight = model.get_submodule(module_name).weight
    slice_out_dim = base_weight.shape[0] // len(replaced_module_names)
    layer_weights = {}
    for replaced_module_name in replaced_module_names:
        if replaced_module_name == empty_replaced_module_name:
            # Deliberately leave this slice empty.
            continue
        lora_a = torch.rand([base_weight.shape[1], 8], device="cuda")
        lora_b = torch.rand([8, slice_out_dim], device="cuda")
        layer_weights[replaced_module_name] = LoRALayerWeights(
            replaced_module_name, 8, 16, lora_a, lora_b)
    return LoRAModel(lora_id, 8, layer_weights)
|
||||
|
||||
|
||||
def test_replace_submodules(dist_init, dummy_model):
    """Check that LoRAModelManager swaps only the declared modules.

    Only modules named in supported_lora_modules (matched by suffix or
    full path) should be replaced by their *WithLoRA counterparts.
    """
    model = dummy_model
    model.supported_lora_modules = ["dense1", "layer1.dense2"]
    model.packed_modules_mapping = {}
    manager = LoRAModelManager(
        model, 1, 1, 1,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8))
    model = manager.model

    # "dense1" matches both the top-level and the nested dense1 modules.
    assert isinstance(model.get_submodule("dense1"),
                      ColumnParallelLinearWithLoRA)
    assert isinstance(model.get_submodule("layer1.dense1"),
                      ColumnParallelLinearWithLoRA)
    # Only the fully-qualified "layer1.dense2" was listed, so the
    # top-level dense2 must remain a plain RowParallelLinear.
    assert isinstance(model.get_submodule("dense2"), RowParallelLinear)
    assert isinstance(model.get_submodule("layer1.dense2"),
                      RowParallelLinearWithLoRA)
|
||||
|
||||
|
||||
def test_lora_model_manager(dist_init, dummy_model):
    """Exercise add/activate/remove bookkeeping of LoRAModelManager.

    The manager has 2 GPU slots (max_loras=2) and room for 3 CPU-resident
    LoRAs; activating a third LoRA without a free slot must raise.
    """
    model = dummy_model
    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
    model.packed_modules_mapping = {}
    model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
    manager = LoRAModelManager(
        model, 2, 2, 2,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
    # No slot is occupied initially.
    assert all(x is None for x in manager.lora_index_to_id)
    assert manager.add_lora(model_lora1)
    assert manager.activate_lora(1)
    assert manager.lora_index_to_id[0] == 1
    # Re-adding / re-activating an existing LoRA is a no-op (returns False).
    assert not manager.add_lora(model_lora1)
    assert not manager.activate_lora(1)
    assert manager.add_lora(model_lora2)
    assert manager.activate_lora(2)
    assert manager.lora_index_to_id[0] == 1
    assert manager.lora_index_to_id[1] == 2
    assert not manager.add_lora(model_lora2)
    assert not manager.activate_lora(2)
    assert manager.add_lora(model_lora3)
    assert manager.lora_index_to_id[0] == 1
    assert manager.lora_index_to_id[1] == 2
    # Both GPU slots are taken: the non-LRU manager refuses to activate.
    with pytest.raises(ValueError):
        assert manager.activate_lora(3)
    assert manager.lora_index_to_id[0] == 1
    assert manager.lora_index_to_id[1] == 2
    # Removing a LoRA frees its slot; removing twice fails.
    assert manager.remove_lora(model_lora2.id)
    assert manager.lora_index_to_id[1] is None
    assert not manager.remove_lora(model_lora2.id)
    assert manager.remove_lora(model_lora1.id)
    assert not manager.remove_lora(model_lora1.id)
    # Adding does not activate: slots stay empty until activate_lora.
    assert manager.add_lora(model_lora1)
    assert manager.lora_index_to_id[0] is None
    assert manager.lora_index_to_id[1] is None
    assert manager.add_lora(model_lora2)
    assert manager.activate_lora(3)
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] is None
    assert manager.activate_lora(2)
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 2
|
||||
|
||||
|
||||
def test_lora_lru_cache_model_manager(dist_init, dummy_model):
    """Same bookkeeping as test_lora_model_manager, with LRU eviction.

    Unlike the plain manager, activating a LoRA when all GPU slots are
    full evicts the least-recently-used one instead of raising.
    """
    model = dummy_model
    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
    model.packed_modules_mapping = {}
    model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
    manager = LRUCacheLoRAModelManager(
        model, 2, 2, 2,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
    assert all(x is None for x in manager.lora_index_to_id)
    assert manager.add_lora(model_lora1)
    assert manager.activate_lora(1)
    assert manager.lora_index_to_id[0] == 1
    # Re-adding / re-activating an existing LoRA returns False.
    assert not manager.add_lora(model_lora1)
    assert not manager.activate_lora(1)
    assert manager.add_lora(model_lora2)
    assert manager.activate_lora(2)
    assert manager.lora_index_to_id[0] == 1
    assert manager.lora_index_to_id[1] == 2
    assert not manager.add_lora(model_lora2)
    assert not manager.activate_lora(2)
    assert manager.add_lora(model_lora3)
    assert manager.lora_index_to_id[0] == 1
    assert manager.lora_index_to_id[1] == 2
    # Activating 3 evicts LoRA 1 (least recently used) from slot 0.
    assert manager.activate_lora(3)
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 2
    assert manager.remove_lora(model_lora2.id)
    assert manager.lora_index_to_id[1] is None
    assert not manager.remove_lora(model_lora2.id)
    assert manager.remove_lora(model_lora1.id)
    assert not manager.remove_lora(model_lora1.id)
    assert manager.add_lora(model_lora1)
    # Slot 1 is free, so LoRA 1 lands there without evicting LoRA 3.
    assert manager.activate_lora(1)
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 1
    assert manager.add_lora(model_lora2)
    assert manager.deactivate_lora(3)
    assert manager.lora_index_to_id[0] is None
    assert manager.lora_index_to_id[1] == 1
    assert manager.activate_lora(2)
    assert manager.lora_index_to_id[0] == 2
    assert manager.lora_index_to_id[1] == 1
    # With both slots full, activating 3 evicts the LRU entry (LoRA 1).
    assert manager.activate_lora(3)
    assert manager.lora_index_to_id[0] == 2
    assert manager.lora_index_to_id[1] == 3
|
||||
|
||||
|
||||
def test_lru_lora_model_manager(dist_init, dummy_model):
    """Focused test of LRU eviction order in LRUCacheLoRAModelManager."""
    # This tests just the LRU cache functionality, everything else is
    # tested in test_lora_model_manager
    model = dummy_model
    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
    model.packed_modules_mapping = {}
    model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
    model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"])
    # Capacity 2 on both GPU (max_loras) and CPU (max_cpu_loras).
    manager = LRUCacheLoRAModelManager(
        model, 2, 2, 2,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2))

    assert all(x is None for x in manager.lora_index_to_id)

    # Add up to capacity
    assert manager.add_lora(model_lora1)
    assert manager.add_lora(model_lora2)
    assert manager.activate_lora(1)
    assert manager.activate_lora(2)

    assert set(manager.list_loras()) == {1, 2}
    assert manager.lora_index_to_id[0] == 1
    assert manager.lora_index_to_id[1] == 2

    # Add over capacity
    assert manager.add_lora(model_lora3)
    assert manager.add_lora(model_lora4)
    assert manager.activate_lora(3)
    assert manager.activate_lora(4)

    # LoRAs 1 and 2 were evicted entirely (CPU cache is also size 2).
    assert set(manager.list_loras()) == {3, 4}
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 4

    # Add 3 again to move it to the top and then add 2
    # should return false since it's in already
    assert not manager.add_lora(model_lora3)
    assert not manager.activate_lora(3)
    assert manager.add_lora(model_lora2)
    assert manager.activate_lora(2)

    # 4 was the LRU entry and is replaced by 2 in slot 1.
    assert set(manager.list_loras()) == {3, 2}
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 2

    # Remove manually
    assert manager.remove_lora(3)
    assert not manager.remove_lora(3)

    assert set(manager.list_loras()) == {2}
    assert manager.lora_index_to_id[0] is None
    assert manager.lora_index_to_id[1] == 2

    assert manager.add_lora(model_lora3)
    assert manager.activate_lora(3)
    assert manager.add_lora(model_lora4)
    assert manager.activate_lora(4)

    assert set(manager.list_loras()) == {3, 4}
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 4

    # remove_oldest_lora drops the least recently used entry first.
    assert manager.remove_oldest_lora()
    assert set(manager.list_loras()) == {4}
    assert manager.lora_index_to_id[0] is None
    assert manager.lora_index_to_id[1] == 4

    assert manager.remove_oldest_lora()
    assert set(manager.list_loras()) == set()
    assert all(x is None for x in manager.lora_index_to_id)

    # Removing from an empty cache returns False and changes nothing.
    assert not manager.remove_oldest_lora()
    assert set(manager.list_loras()) == set()
    assert all(x is None for x in manager.lora_index_to_id)
|
||||
|
||||
|
||||
def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
                                       sql_lora_files):
    """Exercise LRUCacheWorkerLoRAManager across set_active_loras calls.

    The LRU worker manager keeps previously-loaded LoRAs cached (up to
    max_loras=4), evicting the least recently used when new requests
    arrive, and raises when a single batch exceeds capacity.
    """
    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
    worker_lora_manager = LRUCacheWorkerLoRAManager(
        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
        lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
        EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
    worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)

    mapping = LoRAMapping([], [])
    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("2", 2, sql_lora_files)
    ], mapping)
    assert worker_lora_manager.list_loras() == {1, 2}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2

    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("3", 3, sql_lora_files),
        LoRARequest("4", 4, sql_lora_files)
    ], mapping)
    # LoRA 2 stays cached even though it was not requested this time.
    assert worker_lora_manager.list_loras() == {1, 2, 3, 4}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 3
    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4

    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("2", 2, sql_lora_files),
        LoRARequest("5", 5, sql_lora_files)
    ], mapping)
    # LoRA 3 (least recently used) is evicted; 5 takes its slot.
    assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4

    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("1", 1, sql_lora_files)
    ], mapping)
    # Duplicate requests for 1 change nothing; nothing is evicted.
    assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4

    worker_lora_manager.set_active_loras([
        LoRARequest("6", 6, sql_lora_files),
        LoRARequest("7", 7, sql_lora_files),
        LoRARequest("8", 8, sql_lora_files)
    ], mapping)
    # Only the most recently used old LoRA (1) survives three new ones.
    assert worker_lora_manager.list_loras() == {1, 6, 7, 8}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 7
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 8
    assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 6

    # Over capacity
    with pytest.raises(RuntimeError):
        worker_lora_manager.set_active_loras([
            LoRARequest("10", 10, sql_lora_files),
            LoRARequest("11", 11, sql_lora_files),
            LoRARequest("12", 12, sql_lora_files),
            LoRARequest("13", 13, sql_lora_files),
            LoRARequest("14", 14, sql_lora_files)
        ], mapping)
|
||||
|
||||
|
||||
def test_worker_lora_manager(llama_2_7b_model_extra_embeddings,
                             sql_lora_files):
    """Exercise the non-LRU WorkerLoRAManager across set_active_loras calls."""
    # Should remove every LoRA not specified in the request.
    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
    worker_lora_manager = WorkerLoRAManager(
        4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
        lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
        EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
    worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)

    mapping = LoRAMapping([], [])
    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("2", 2, sql_lora_files)
    ], mapping)
    assert worker_lora_manager.list_loras() == {1, 2}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2

    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("3", 3, sql_lora_files),
        LoRARequest("4", 4, sql_lora_files)
    ], mapping)
    # Unlike the LRU variant, LoRA 2 is dropped because it was not requested.
    assert worker_lora_manager.list_loras() == {1, 3, 4}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 3
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 4

    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("2", 2, sql_lora_files),
        LoRARequest("5", 5, sql_lora_files)
    ], mapping)
    assert worker_lora_manager.list_loras() == {1, 2, 5}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5

    worker_lora_manager.set_active_loras([
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("1", 1, sql_lora_files),
        LoRARequest("1", 1, sql_lora_files)
    ], mapping)
    # Duplicates collapse to a single active LoRA; other slots are cleared.
    assert worker_lora_manager.list_loras() == {1}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] is None
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] is None

    worker_lora_manager.set_active_loras([
        LoRARequest("6", 6, sql_lora_files),
        LoRARequest("7", 7, sql_lora_files),
        LoRARequest("8", 8, sql_lora_files)
    ], mapping)
    assert worker_lora_manager.list_loras() == {6, 7, 8}
    assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 8
    assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 6
    assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 7

    # Over capacity
    with pytest.raises(RuntimeError):
        worker_lora_manager.set_active_loras([
            LoRARequest("10", 10, sql_lora_files),
            LoRARequest("11", 11, sql_lora_files),
            LoRARequest("12", 12, sql_lora_files),
            LoRARequest("13", 13, sql_lora_files),
            LoRARequest("14", 14, sql_lora_files)
        ], mapping)
|
||||
|
||||
|
||||
def test_packed_loras(dist_init, dummy_model_gate_up):
    """Check that per-slice LoRAs are packed into PackedLoRALayerWeights.

    gate_proj/up_proj LoRAs must be combined into a single packed LoRA for
    the fused gate_up_proj module, preserving slice order and allowing a
    slice to be absent (None).
    """
    model = dummy_model_gate_up
    model.supported_lora_modules = ["gate_up_proj"]
    model.packed_modules_mapping = {
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }
    # LoRA with both slices present.
    model_lora = create_packed_lora(
        1,
        model,
        module_name="gate_up_proj",
        replaced_module_names=["gate_proj", "up_proj"])
    # LoRA with the gate_proj slice deliberately left empty.
    model_lora1 = create_packed_lora(
        2,
        model,
        module_name="gate_up_proj",
        replaced_module_names=["gate_proj", "up_proj"],
        empty_replaced_module_name="gate_proj",
    )

    manager = LoRAModelManager(
        model, 2, 2, 2,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2))
    model = manager.model

    # The fused module is replaced by its merged-LoRA counterpart.
    assert isinstance(model.get_submodule("gate_up_proj"),
                      MergedColumnParallelLinearWithLoRA)
    # add_lora performs the packing of the per-slice LoRAs.
    assert manager.add_lora(model_lora)
    assert manager.add_lora(model_lora1)

    packed_lora = model_lora.get_lora("gate_up_proj")
    assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)

    # Slice 0 == gate_proj, slice 1 == up_proj (mapping order).
    assert torch.allclose(packed_lora.lora_a[0],
                          model_lora.get_lora("gate_proj").lora_a)
    assert torch.allclose(packed_lora.lora_b[0],
                          model_lora.get_lora("gate_proj").lora_b)
    assert torch.allclose(packed_lora.lora_a[1],
                          model_lora.get_lora("up_proj").lora_a)
    assert torch.allclose(packed_lora.lora_b[1],
                          model_lora.get_lora("up_proj").lora_b)

    packed_lora1 = model_lora1.get_lora("gate_up_proj")
    assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)

    # The missing gate_proj slice is represented as None.
    assert packed_lora1.lora_a[0] is None
    assert packed_lora1.lora_b[0] is None
    assert torch.allclose(packed_lora1.lora_a[1],
                          model_lora1.get_lora("up_proj").lora_a)
    assert torch.allclose(packed_lora1.lora_b[1],
                          model_lora1.get_lora("up_proj").lora_b)
|
||||
53
tests/lora/test_mixtral.py
Normal file
53
tests/lora/test_mixtral.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int):
    """Generate greedy completions for three fixed prompts.

    Attaches the LoRA at *lora_path* when *lora_id* is truthy; a lora_id
    of 0 runs the base model. Returns the stripped generated texts in
    prompt order.
    """
    prompts = [
        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
    ]
    # temperature=0 makes decoding deterministic for exact-match asserts.
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
        prompts,
        sampling_params,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
    generated_texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size):
    """End-to-end LoRA generation on Mixtral with tensor parallelism.

    Skips when fewer GPUs than *tp_size* are available. Both LoRA ids
    point at the same adapter files, so their outputs must be identical.
    """
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    llm = vllm.LLM(MODEL_PATH,
                   enable_lora=True,
                   max_num_seqs=16,
                   max_loras=4,
                   tensor_parallel_size=tp_size,
                   worker_use_ray=True)

    expected_lora_output = [
        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
    ]

    assert do_sample(llm, mixtral_lora_files,
                     lora_id=1) == expected_lora_output
    assert do_sample(llm, mixtral_lora_files,
                     lora_id=2) == expected_lora_output
|
||||
231
tests/lora/test_punica.py
Normal file
231
tests/lora/test_punica.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# Based on code from https://github.com/punica-ai/punica
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.lora.punica as punica
|
||||
|
||||
|
||||
def assert_close(a, b):
    """Assert that tensors *a* and *b* match within per-dtype tolerances.

    float32 uses torch.testing's default tolerances (None, None); raises
    KeyError for any other dtype not listed here.
    """
    tolerances = {
        torch.float16: (5e-3, 5e-3),
        torch.bfloat16: (3e-2, 2e-2),
        torch.float32: (None, None),
    }
    rtol, atol = tolerances[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
def _lora_ref_impl(
|
||||
y_final: torch.Tensor,
|
||||
x: torch.Tensor,
|
||||
wa_T_all: torch.Tensor,
|
||||
wb_T_all: torch.Tensor,
|
||||
indicies: torch.LongTensor,
|
||||
layer_idx: int,
|
||||
scale: float,
|
||||
):
|
||||
y_stage_1 = torch.empty(
|
||||
(x.size(0), wa_T_all.size(-2)),
|
||||
dtype=torch.float32,
|
||||
device=x.device,
|
||||
)
|
||||
bs = x.shape[0]
|
||||
s = torch.tensor(scale, dtype=torch.float32, device=x.device)
|
||||
for i, lora_idx in zip(range(bs), indicies.cpu().tolist()):
|
||||
xi = x[i].unsqueeze(0).to(torch.float32)
|
||||
wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
|
||||
if wb_T_all is not None:
|
||||
wb = wb_T_all[lora_idx, layer_idx].transpose(-1,
|
||||
-2).to(torch.float32)
|
||||
|
||||
tmp = xi @ wa
|
||||
y_stage_1[i] = tmp.squeeze(0)
|
||||
y_final[i] += ((tmp @ wb).squeeze(0) *
|
||||
s if wb_T_all is not None else y_stage_1[i])
|
||||
return y_final, y_stage_1
|
||||
|
||||
|
||||
# Hidden-size values exercised by the punica kernel tests.  H1 is used as the
# input dimension of the LoRA matmuls and H2 as the output dimension; the
# values mirror hidden/intermediate/vocab sizes of common model families.
H1 = H2 = [
    128,
    256,
    512,
    1024,
    1152,
    1280,
    1536,
    2048,
    2304,
    2560,
    2752,
    3072,
    3456,
    3584,
    4096,
    4608,
    5120,
    5504,
    5632,
    6144,
    6848,
    6912,
    7168,
    8192,
    9216,
    10240,
    11008,
    13824,
    14336,
    15360,
    22016,
    24576,
    27392,
    32000,
    32256,
    32512,
    32768,
    33024,
    36864,
    43264,
    49152,
    64000,
    64256,
    102400,
    102656,
    128000,
    128256,
]
# Rebinds H2 to a NEW list with an extra small 64 entry; H1 keeps the
# original list (the aliasing above is intentional and not mutated here).
H2 = [64] + H2
# LoRA ranks to test.
R = [1, 2, 4]
SEED = [0xabcdabcd987]
# Exercise a second CUDA device when more than one GPU is present; with a
# single GPU only cuda:0 is used.
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("r", R)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
    """Check ``punica.bgmv`` against the reference implementation for the
    LoRA-A (shrink) step alone, across the extra (h1, r) shape grid."""
    torch.manual_seed(seed)
    num_loras, num_layers, bs = 4, 1, 32
    dtype = getattr(torch, dtype_str)
    device = torch.device("cuda")

    wa_T_all = torch.randn(num_loras,
                           num_layers,
                           r,
                           h1,
                           dtype=dtype,
                           device=device)
    indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype, device=device)
        y = torch.randn(bs, r, dtype=dtype, device=device)

        # Reference: accumulate the shrink result into a copy of y.
        y_ref = y.clone()
        _lora_ref_impl(
            y_ref,
            x,
            wa_T_all,
            None,
            indices,
            layer_idx,
            1.0,
        )

        # Kernel under test, run on another copy of the same y.
        y_out = y.clone()
        punica.bgmv(y_out, x, wa_T_all, indices, layer_idx, 1.0)

        assert_close(y_ref, y_out)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness(dtype_str, h1, h2, seed, device):
    """``punica.add_lora`` must match the reference shrink-then-expand
    computation for the full (h1, h2) shape grid on each device."""
    torch.manual_seed(seed)
    num_loras, num_layers = 4, 1
    r, bs, scale = 8, 32, 0.123
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype)
    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)

        # Reference result on one copy of y...
        y_ref = y.clone()
        _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale)

        # ...kernel result on another copy of the same y.
        y_out = y.clone()
        punica.add_lora(y_out, x, wa_T_all, wb_T_all, indices, layer_idx,
                        scale)

        assert_close(y_ref, y_out)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness_slice(dtype_str, h1, h2, seed, device):
    """Check ``punica.add_lora_slice`` against the reference implementation.

    The output dimension h2 is split into three equal column slices, each
    with its own independent (wa, wb) LoRA pair — mimicking packed layers
    where each packed sub-projection has its own adapter.
    """
    # Only h2 values divisible by 3 whose slice width is itself a supported
    # shape can be exercised here.
    if h2 % 3 != 0 or h2 // 3 not in H1:
        pytest.skip("h2 must be divisible by 3 and in supported shapes")
    torch.manual_seed(seed)
    num_loras = 4
    num_layers = 1
    r = 8
    bs = 32
    scale = 0.123
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    # One independent LoRA pair per output slice.
    wa_T_all_0 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wa_T_all_1 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wa_T_all_2 = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wb_T_all_0 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
    wb_T_all_1 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
    wb_T_all_2 = torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)

    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)
        # Width of each of the three output slices.
        s = h2 // 3

        # Reference: apply each LoRA pair to its own column slice of y.
        y_ref = y.clone()
        _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices,
                       layer_idx, scale)
        _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices,
                       layer_idx, scale)
        _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices,
                       layer_idx, scale)

        # Kernel under test: same slicing expressed via (offset, width) args.
        y_our = y.clone()
        punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices,
                              layer_idx, scale, 0, s)
        punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices,
                              layer_idx, scale, s, s)
        punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices,
                              layer_idx, scale, s * 2, s)

        # Compare slice by slice so a failure pinpoints the bad slice.
        assert_close(y_ref[:, :s], y_our[:, :s])
        assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2])
        assert_close(y_ref[:, s * 2:], y_our[:, s * 2:])
|
||||
179
tests/lora/test_quant_model.py
Normal file
179
tests/lora/test_quant_model.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# Adapted from
|
||||
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from .conftest import cleanup
|
||||
|
||||
|
||||
@dataclass
class ModelWithQuantization:
    """A model checkpoint paired with the quantization scheme it uses."""
    # HuggingFace repo id (or local path) of the quantized checkpoint.
    model_path: str
    # Quantization method name passed to vllm.LLM (e.g. "AWQ", "GPTQ").
    quantization: str
|
||||
|
||||
|
||||
# Quantized TinyLlama chat checkpoints used to exercise LoRA on quantized
# models (one per supported quantization backend).
MODELS: List[ModelWithQuantization] = [
    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
                          quantization="AWQ"),
    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
                          quantization="GPTQ"),
]
|
||||
|
||||
|
||||
def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
    """Generate greedy completions for two fixed chat prompts.

    A ``lora_id`` of 0 disables LoRA (no LoRARequest is attached).
    Returns the list of generated texts, one per prompt.
    """
    raw_prompts = [
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
    ]

    # Wrap each prompt in the ChatML-style template the model was tuned on.
    prompts = [
        f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
        for prompt in raw_prompts
    ]

    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=max_tokens,
                                          stop=["<|im_end|>"])
    lora_request = (LoRARequest(str(lora_id), lora_id, lora_path)
                    if lora_id else None)
    outputs = llm.generate(prompts, sampling_params,
                           lora_request=lora_request)

    # Collect (and log) the generated texts.
    generated_texts = []
    for output in outputs:
        generated_text = output.outputs[0].text
        generated_texts.append(generated_text)
        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
    return generated_texts
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
    """End-to-end LoRA on quantized (AWQ/GPTQ) TinyLlama.

    Alternates between no-LoRA (lora_id=0) and two LoRA ids and checks the
    generations against per-quantization expected strings.
    """
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < tp_size:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    llm = vllm.LLM(model=model.model_path,
                   enable_lora=True,
                   max_num_seqs=16,
                   max_loras=4,
                   max_model_len=400,
                   tensor_parallel_size=tp_size,
                   quantization=model.quantization,
                   trust_remote_code=True)

    # Expected outputs depend on the quantization backend.  NOTE(review):
    # the `None` branch is dead under the current MODELS list (both entries
    # set a quantization); presumably kept for running against an
    # unquantized base model.
    if model.quantization is None:
        expected_no_lora_output = [
            "Here are some examples of orange-brown colors",
            "I'm sorry, I don't have"
        ]
        expected_lora_output = [
            "#ff8050",
            "#ff8080",
        ]
    elif model.quantization == "AWQ":
        expected_no_lora_output = [
            "I'm sorry, I don't understand",
            "I'm sorry, I don't understand",
        ]
        expected_lora_output = [
            "#f07700: A v",
            "#f00000: A v",
        ]
    elif model.quantization == "GPTQ":
        expected_no_lora_output = [
            "I'm sorry, I don't have",
            "I'm sorry, I don't have",
        ]
        expected_lora_output = [
            "#f08800: This is",
            "#f07788 \n#",
        ]

    def expect_match(output, expected_output):
        # HACK: GPTQ lora outputs are just incredibly unstable.
        # Assert that the outputs changed.
        if (model.quantization == "GPTQ"
                and expected_output is expected_lora_output):
            assert output != expected_no_lora_output
            for i, o in enumerate(output):
                assert o.startswith(
                    '#'), f"Expected example {i} to start with # but got {o}"
            return
        assert output == expected_output

    # Short generations keep the test fast and the expectations stable.
    max_tokens = 10

    # lora_id=0 -> base model, no adapter attached.
    print("lora adapter created")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=0,
                       max_tokens=max_tokens)
    expect_match(output, expected_no_lora_output)

    print("lora 1")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=1,
                       max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    # Switching back to no-LoRA must restore the base-model output.
    print("no lora")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=0,
                       max_tokens=max_tokens)
    expect_match(output, expected_no_lora_output)

    # A second adapter slot loading the same files must match as well.
    print("lora 2")
    output = do_sample(llm,
                       tinyllama_lora_files,
                       lora_id=2,
                       max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("removing lora")

    del llm
    cleanup()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.skip("Requires multiple GPUs")
def test_quant_model_tp_equality(tinyllama_lora_files, model):
    """LoRA generations must be identical for TP=1 and TP=2 on the same
    quantized model (skipped: needs at least two GPUs)."""
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 2:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")

    llm_tp1 = vllm.LLM(model=model.model_path,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       tensor_parallel_size=1,
                       quantization=model.quantization,
                       trust_remote_code=True)
    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

    # Tear down the first engine before starting the TP=2 one to free GPUs.
    del llm_tp1
    cleanup()

    llm_tp2 = vllm.LLM(model=model.model_path,
                       enable_lora=True,
                       max_num_seqs=16,
                       max_loras=4,
                       tensor_parallel_size=2,
                       quantization=model.quantization)
    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

    del llm_tp2
    cleanup()

    assert output_tp1 == output_tp2
|
||||
55
tests/lora/test_tokenizer_group.py
Normal file
55
tests/lora/test_tokenizer_group.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.tokenizer import get_lora_tokenizer
|
||||
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
|
||||
|
||||
from ..conftest import get_tokenizer_pool_config
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
    """Tokenizer groups must serve a LoRA adapter's own tokenizer.

    Encoding through the group with a LoRARequest should match the adapter's
    reference tokenizer (both sync and async), and the per-LoRA tokenizer
    must differ from the base (None) tokenizer.
    """
    reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
    tokenizer_group = get_tokenizer_group(
        get_tokenizer_pool_config(tokenizer_group_type),
        tokenizer_id="gpt2",
        enable_lora=True,
        max_num_seqs=1,
        max_input_length=None,
    )
    lora_request = LoRARequest("1", 1, sql_lora_files)
    # Sync and async encode paths must agree with the adapter's tokenizer.
    assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
        request_id="request_id", prompt="prompt", lora_request=lora_request)
    assert reference_tokenizer.encode(
        "prompt") == await tokenizer_group.encode_async(
            request_id="request_id",
            prompt="prompt",
            lora_request=lora_request)
    # Base tokenizer (no LoRA): sync and async lookups return equal results.
    assert isinstance(tokenizer_group.get_lora_tokenizer(None),
                      PreTrainedTokenizerBase)
    assert tokenizer_group.get_lora_tokenizer(
        None) == await tokenizer_group.get_lora_tokenizer_async(None)

    # LoRA tokenizer differs from the base one, but is consistent between
    # the sync and async lookup paths.
    assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request),
                      PreTrainedTokenizerBase)
    assert tokenizer_group.get_lora_tokenizer(
        lora_request) != tokenizer_group.get_lora_tokenizer(None)
    assert tokenizer_group.get_lora_tokenizer(
        lora_request) == await tokenizer_group.get_lora_tokenizer_async(
            lora_request)
|
||||
|
||||
|
||||
def test_get_lora_tokenizer(sql_lora_files, tmpdir):
    """get_lora_tokenizer yields a tokenizer only for a real adapter path."""
    # No request at all -> no tokenizer.
    assert not get_lora_tokenizer(None)

    # A valid adapter directory yields a tokenizer with added LoRA vocab.
    valid_request = LoRARequest("1", 1, sql_lora_files)
    assert get_lora_tokenizer(valid_request).get_added_vocab()

    # A directory without tokenizer files -> no tokenizer.
    empty_dir_request = LoRARequest("1", 1, str(tmpdir))
    assert not get_lora_tokenizer(empty_dir_request)
|
||||
172
tests/lora/test_utils.py
Normal file
172
tests/lora/test_utils.py
Normal file
@@ -0,0 +1,172 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
from torch import nn
|
||||
|
||||
from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
|
||||
from vllm.utils import LRUCache
|
||||
|
||||
|
||||
def test_parse_fine_tuned_lora_name():
    """parse_fine_tuned_lora_name maps a checkpoint weight name to the
    target module name and an is-lora-A flag."""
    fixture = {
        ("base_model.model.lm_head.lora_A.weight", "lm_head", True),
        ("base_model.model.lm_head.lora_B.weight", "lm_head", False),
        (
            "base_model.model.model.embed_tokens.lora_embedding_A",
            "model.embed_tokens",
            True,
        ),
        (
            "base_model.model.model.embed_tokens.lora_embedding_B",
            "model.embed_tokens",
            False,
        ),
        (
            "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
            "model.layers.9.mlp.down_proj",
            True,
        ),
        (
            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
            "model.layers.9.mlp.down_proj",
            False,
        ),
    }
    for name, expected_module, expected_is_lora_a in fixture:
        assert parse_fine_tuned_lora_name(name) == (expected_module,
                                                    expected_is_lora_a)
|
||||
|
||||
|
||||
def test_replace_submodule():
    """replace_submodule swaps a named child — top-level or nested — and the
    replacement is visible through named_modules()."""
    layers = OrderedDict([
        ("dense1", nn.Linear(764, 100)),
        ("act1", nn.ReLU()),
        ("dense2", nn.Linear(100, 50)),
        (
            "seq1",
            nn.Sequential(
                OrderedDict([
                    ("dense1", nn.Linear(100, 10)),
                    ("dense2", nn.Linear(10, 50)),
                ])),
        ),
        ("act2", nn.ReLU()),
        ("output", nn.Linear(50, 10)),
        ("outact", nn.Sigmoid()),
    ])
    model = nn.Sequential(layers)

    # Replace a top-level module.
    new_act = nn.Sigmoid()
    replace_submodule(model, "act1", new_act)
    assert dict(model.named_modules())["act1"] == new_act

    # Replace a module nested one level down (dotted path).
    new_dense = nn.Linear(1, 5)
    replace_submodule(model, "seq1.dense2", new_dense)
    assert dict(model.named_modules())["seq1.dense2"] == new_dense
|
||||
|
||||
|
||||
class TestLRUCache(LRUCache):
    """LRUCache subclass that counts removals for the assertions below.

    The previous implementation created ``_remove_counter`` lazily inside
    ``_on_remove`` via ``hasattr``, so reading the attribute before the
    first removal raised ``AttributeError``.  Initialize it eagerly instead.
    """

    def __init__(self, *args, **kwargs):
        # Set the counter before delegating, in case the base constructor
        # ever triggers a removal.
        self._remove_counter = 0
        super().__init__(*args, **kwargs)

    def _on_remove(self, key, value):
        # Tally every removal (eviction, pop, delete, clear).
        self._remove_counter += 1
|
||||
|
||||
|
||||
def test_lru_cache():
    """Exercise LRUCache through both its method API (put/get/pop/...) and
    its mapping API ([], del, in), counting removals via TestLRUCache."""
    cache = TestLRUCache(3)

    cache.put(1, 1)
    assert len(cache) == 1

    # Re-putting an existing key must not grow the cache.
    cache.put(1, 1)
    assert len(cache) == 1

    cache.put(2, 2)
    assert len(cache) == 2

    cache.put(3, 3)
    assert len(cache) == 3
    assert set(cache.cache) == {1, 2, 3}

    # Capacity is 3: inserting a 4th key evicts the least recently used (1).
    cache.put(4, 4)
    assert len(cache) == 3
    assert set(cache.cache) == {2, 3, 4}
    assert cache._remove_counter == 1
    # Touch 2 so it becomes most recently used.
    assert cache.get(2) == 2

    # 3 is now the LRU entry and gets evicted.
    cache.put(5, 5)
    assert set(cache.cache) == {2, 4, 5}
    assert cache._remove_counter == 2

    # pop removes the entry and also counts as a removal.
    assert cache.pop(5) == 5
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3

    # Popping a missing key is a no-op (no removal counted).
    cache.pop(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3

    # Getting a missing key is also a no-op.
    cache.get(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3

    cache.put(6, 6)
    assert len(cache) == 3
    assert set(cache.cache) == {2, 4, 6}
    assert 2 in cache
    assert 4 in cache
    assert 6 in cache

    # Explicit eviction of the oldest entry.
    cache.remove_oldest()
    assert len(cache) == 2
    assert set(cache.cache) == {2, 6}
    assert cache._remove_counter == 4

    # clear() removes (and counts) every remaining entry.
    cache.clear()
    assert len(cache) == 0
    assert cache._remove_counter == 6

    cache._remove_counter = 0

    # Repeat the same scenario through the mapping-style API.
    cache[1] = 1
    assert len(cache) == 1

    cache[1] = 1
    assert len(cache) == 1

    cache[2] = 2
    assert len(cache) == 2

    cache[3] = 3
    assert len(cache) == 3
    assert set(cache.cache) == {1, 2, 3}

    cache[4] = 4
    assert len(cache) == 3
    assert set(cache.cache) == {2, 3, 4}
    assert cache._remove_counter == 1
    assert cache[2] == 2

    cache[5] = 5
    assert set(cache.cache) == {2, 4, 5}
    assert cache._remove_counter == 2

    del cache[5]
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3

    cache.pop(10)
    assert len(cache) == 2
    assert set(cache.cache) == {2, 4}
    assert cache._remove_counter == 3

    cache[6] = 6
    assert len(cache) == 3
    assert set(cache.cache) == {2, 4, 6}
    assert 2 in cache
    assert 4 in cache
    assert 6 in cache
||||
69
tests/lora/test_worker.py
Normal file
69
tests/lora/test_worker.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import random
|
||||
import tempfile
|
||||
from unittest.mock import patch
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
||||
ModelConfig, ParallelConfig, SchedulerConfig)
|
||||
from vllm.lora.models import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.worker.worker import Worker
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):
    """A Worker must track exactly the LoRAs set via set_active_loras.

    Loads a dummy-weight Llama-2 worker, registers 32 adapters, then
    repeatedly activates random subsets and checks list_loras() covers them.
    """
    worker = Worker(
        model_config=ModelConfig(
            "meta-llama/Llama-2-7b-hf",
            "meta-llama/Llama-2-7b-hf",
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
            dtype="float16",
            revision=None,
        ),
        # Dummy load format: random weights, no checkpoint download.
        load_config=LoadConfig(
            download_dir=None,
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(1, 1, False),
        scheduler_config=SchedulerConfig(32, 32, 32),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(block_size=16,
                                 gpu_memory_utilization=1.,
                                 swap_space=0,
                                 cache_dtype="auto"),
        local_rank=0,
        rank=0,
        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
                               max_loras=32),
        # Single-process file-based rendezvous.
        distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
    )
    worker.init_device()
    worker.load_model()

    # No adapters active initially.
    worker.model_runner.set_active_loras([], LoRAMapping([], []))
    assert worker.list_loras() == set()

    n_loras = 32
    lora_requests = [
        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
    ]

    # Activating all 32 adapters at once: every id must be listed.
    worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
    assert worker.list_loras() == {
        lora_request.lora_int_id
        for lora_request in lora_requests
    }

    # Activate random (possibly duplicated, shuffled, truncated) subsets;
    # the worker's listed LoRAs must always include the active subset.
    for i in range(32):
        random.seed(i)
        iter_lora_requests = random.choices(lora_requests,
                                            k=random.randint(1, n_loras))
        random.shuffle(iter_lora_requests)
        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
        worker.model_runner.set_active_loras(iter_lora_requests,
                                             LoRAMapping([], []))
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id
             for lora_request in iter_lora_requests})
|
||||
88
tests/lora/utils.py
Normal file
88
tests/lora/utils.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
|
||||
class DummyLoRAManager:
    """Minimal stand-in for a LoRA manager used by the layer tests.

    Stores LoRALayerWeights objects keyed by module name and can fabricate
    random adapters (on CUDA) of a given rank.
    """

    def __init__(self):
        super().__init__()
        # module name -> LoRALayerWeights (or packed variant)
        self._loras = {}

    def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
        """Register *lora* under *module_name*, replacing any previous entry."""
        self._loras[module_name] = lora

    def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
        """Return the LoRA registered for *module_name*, or None."""
        return self._loras.get(module_name, None)

    def init_random_lora(self,
                         module_name: str,
                         weight: torch.Tensor,
                         rank: int = 8,
                         generate_embeddings_tensor: int = 0):
        """Create, register and return a random LoRA sized to *weight*.

        lora_a is (in_features, rank) and lora_b is (rank, out_features),
        matching weight's (out, in) layout.  A nonzero
        *generate_embeddings_tensor* is used as the second dimension of a
        random embeddings tensor attached to the LoRA.
        """
        lora = LoRALayerWeights(
            module_name,
            rank=rank,
            lora_alpha=1,
            lora_a=torch.rand([weight.shape[1], rank],
                              dtype=weight.dtype,
                              device="cuda"),
            lora_b=torch.rand([rank, weight.shape[0]],
                              dtype=weight.dtype,
                              device="cuda"),
        )
        if generate_embeddings_tensor:
            lora.embeddings_tensor = torch.rand(5,
                                                generate_embeddings_tensor,
                                                dtype=weight.dtype,
                                                device="cuda")
        self.set_module_lora(module_name, lora)

        return lora

    def init_lora(self,
                  module_name: str,
                  input_dim: int,
                  output_dim: int,
                  rank=8,
                  noop=False,
                  embeddings_tensor=None):
        """Create, register and return a random LoRA with explicit dims.

        NOTE(review): *noop* is accepted (and forwarded by
        init_packed_lora) but currently unused — a "noop" LoRA is built the
        same way as a regular one.  Kept for interface compatibility.
        """
        lora = LoRALayerWeights(
            module_name,
            rank=rank,
            lora_alpha=1,
            lora_a=torch.rand([input_dim, rank], device="cuda"),
            lora_b=torch.rand([rank, output_dim], device="cuda"),
            embeddings_tensor=embeddings_tensor,
        )
        self.set_module_lora(module_name, lora)
        return lora

    def reset_lora(self):
        """Drop all registered LoRAs."""
        self._loras = {}

    def init_packed_lora(
        self,
        module_name: str,
        input_dim: int,
        output_dims: List[int],
        noop_lora_index: Optional[List[int]] = None,
        rank=8,
    ):
        """Create one LoRA per entry of *output_dims*, pack them, and
        register the packed result under *module_name*.

        *noop_lora_index* lists the slice indices whose LoRA should be a
        noop (forwarded to init_lora's currently-unused *noop* flag).
        """
        base_loras = []
        noop_lora_index = set(noop_lora_index or [])

        for i, out_dim in enumerate(output_dims):
            base_lora = self.init_lora(
                module_name + "_000_" + str(i),
                input_dim,
                out_dim,
                rank=rank,
                noop=i in noop_lora_index,
            )
            base_loras.append(base_lora)
        packed_lora = PackedLoRALayerWeights.pack(base_loras)
        self.set_module_lora(module_name, packed_lora)
        return packed_lora
|
||||
Reference in New Issue
Block a user